@jambudipa/spider 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -16
- package/dist/browser/BrowserManager.d.ts +63 -0
- package/dist/browser/BrowserManager.d.ts.map +1 -0
- package/dist/browser/PlaywrightAdapter.d.ts +166 -0
- package/dist/browser/PlaywrightAdapter.d.ts.map +1 -0
- package/dist/examples/01-basic-crawl-working.d.ts +13 -0
- package/dist/examples/01-basic-crawl-working.d.ts.map +1 -0
- package/dist/examples/02-multiple-urls-working.d.ts +13 -0
- package/dist/examples/02-multiple-urls-working.d.ts.map +1 -0
- package/dist/examples/03-url-filtering.d.ts +13 -0
- package/dist/examples/03-url-filtering.d.ts.map +1 -0
- package/dist/examples/04-robots-compliance.d.ts +14 -0
- package/dist/examples/04-robots-compliance.d.ts.map +1 -0
- package/dist/examples/05-link-extraction-selectors.d.ts +14 -0
- package/dist/examples/05-link-extraction-selectors.d.ts.map +1 -0
- package/dist/examples/06-custom-middleware.d.ts +18 -0
- package/dist/examples/06-custom-middleware.d.ts.map +1 -0
- package/dist/examples/07-resumability-demo.d.ts +14 -0
- package/dist/examples/07-resumability-demo.d.ts.map +1 -0
- package/dist/examples/08-worker-monitoring.d.ts +15 -0
- package/dist/examples/08-worker-monitoring.d.ts.map +1 -0
- package/dist/examples/09-error-handling-recovery.d.ts +15 -0
- package/dist/examples/09-error-handling-recovery.d.ts.map +1 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +2891 -1456
- package/dist/index.js.map +1 -1
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +107 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +58 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +63 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +114 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +83 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +24 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +37 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +239 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/Middleware/types.d.ts +99 -0
- package/dist/lib/Middleware/types.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +178 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +249 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +107 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +110 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors/effect-errors.d.ts +179 -0
- package/dist/lib/errors/effect-errors.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +172 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/dist/lib/utils/FileUtils.d.ts +284 -0
- package/dist/lib/utils/FileUtils.d.ts.map +1 -0
- package/dist/lib/utils/JsonUtils.d.ts +196 -0
- package/dist/lib/utils/JsonUtils.d.ts.map +1 -0
- package/dist/lib/utils/RegexUtils.d.ts +257 -0
- package/dist/lib/utils/RegexUtils.d.ts.map +1 -0
- package/dist/lib/utils/SchemaUtils.d.ts +251 -0
- package/dist/lib/utils/SchemaUtils.d.ts.map +1 -0
- package/dist/lib/utils/UrlUtils.d.ts +223 -0
- package/dist/lib/utils/UrlUtils.d.ts.map +1 -0
- package/dist/lib/utils/effect-migration.d.ts +31 -0
- package/dist/lib/utils/effect-migration.d.ts.map +1 -0
- package/dist/lib/utils/index.d.ts +15 -0
- package/dist/lib/utils/index.d.ts.map +1 -0
- package/dist/lib/utils/url-deduplication.d.ts +108 -0
- package/dist/lib/utils/url-deduplication.d.ts.map +1 -0
- package/dist/lib/utils/url-deduplication.test.d.ts +5 -0
- package/dist/lib/utils/url-deduplication.test.d.ts.map +1 -0
- package/dist/test/infrastructure/EffectTestUtils.d.ts +167 -0
- package/dist/test/infrastructure/EffectTestUtils.d.ts.map +1 -0
- package/package.json +21 -9
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { Effect, Option } from 'effect';
|
|
2
|
+
import { SpiderState, SpiderStateKey } from '../Scheduler/SpiderScheduler.service.js';
|
|
3
|
+
import { HybridPersistenceConfig, PersistenceError, PersistenceStrategy, StateOperation, StorageBackend } from './types.js';
|
|
4
|
+
/**
|
|
5
|
+
* Full state persistence strategy.
|
|
6
|
+
*
|
|
7
|
+
* Saves the complete spider state on every operation. Simple and reliable,
|
|
8
|
+
* but can be inefficient for large crawls with many URLs.
|
|
9
|
+
*
|
|
10
|
+
* @group Strategies
|
|
11
|
+
* @public
|
|
12
|
+
*/
|
|
13
|
+
export declare class FullStatePersistence implements PersistenceStrategy {
|
|
14
|
+
private readonly backend;
|
|
15
|
+
constructor(backend: StorageBackend);
|
|
16
|
+
persist: (operation: StateOperation) => Effect.Effect<void, PersistenceError>;
|
|
17
|
+
restore: (key: SpiderStateKey) => Effect.Effect<Option.Option<SpiderState>, PersistenceError>;
|
|
18
|
+
cleanup: (key: SpiderStateKey) => Effect.Effect<void, PersistenceError>;
|
|
19
|
+
getInfo: () => {
|
|
20
|
+
name: string;
|
|
21
|
+
description: string;
|
|
22
|
+
capabilities: string[];
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Delta persistence strategy.
|
|
27
|
+
*
|
|
28
|
+
* Saves only incremental changes (deltas) instead of the full state.
|
|
29
|
+
* Much more efficient for large crawls, but requires delta replay for restoration.
|
|
30
|
+
*
|
|
31
|
+
* @group Strategies
|
|
32
|
+
* @public
|
|
33
|
+
*/
|
|
34
|
+
export declare class DeltaPersistence implements PersistenceStrategy {
|
|
35
|
+
private readonly backend;
|
|
36
|
+
constructor(backend: StorageBackend);
|
|
37
|
+
persist: (operation: StateOperation) => Effect.Effect<void, PersistenceError>;
|
|
38
|
+
restore: (key: SpiderStateKey) => Effect.Effect<Option.Option<SpiderState>, PersistenceError>;
|
|
39
|
+
cleanup: (key: SpiderStateKey) => Effect.Effect<void, PersistenceError>;
|
|
40
|
+
reconstructStateFromDeltas: (key: SpiderStateKey, deltas: ReadonlyArray<import("./types.js").StateDelta>) => Effect.Effect<SpiderState, PersistenceError>;
|
|
41
|
+
getInfo: () => {
|
|
42
|
+
name: string;
|
|
43
|
+
description: string;
|
|
44
|
+
capabilities: string[];
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Hybrid persistence strategy.
|
|
49
|
+
*
|
|
50
|
+
* Combines delta and full state approaches for optimal performance.
|
|
51
|
+
* Saves deltas for efficiency, with periodic snapshots for fast recovery.
|
|
52
|
+
*
|
|
53
|
+
* @group Strategies
|
|
54
|
+
* @public
|
|
55
|
+
*/
|
|
56
|
+
export declare class HybridPersistence implements PersistenceStrategy {
|
|
57
|
+
private readonly backend;
|
|
58
|
+
private readonly config;
|
|
59
|
+
private operationCount;
|
|
60
|
+
private lastSnapshotSequence;
|
|
61
|
+
private pendingDeltas;
|
|
62
|
+
constructor(backend: StorageBackend, config?: HybridPersistenceConfig);
|
|
63
|
+
persist: (operation: StateOperation) => Effect.Effect<void, PersistenceError>;
|
|
64
|
+
private saveSnapshot;
|
|
65
|
+
private saveDelta;
|
|
66
|
+
private flushPendingDeltas;
|
|
67
|
+
restore: (key: SpiderStateKey) => Effect.Effect<Option.Option<SpiderState>, PersistenceError>;
|
|
68
|
+
private applyDeltasToState;
|
|
69
|
+
cleanup: (key: SpiderStateKey) => Effect.Effect<void, PersistenceError>;
|
|
70
|
+
getInfo: () => {
|
|
71
|
+
name: string;
|
|
72
|
+
description: string;
|
|
73
|
+
capabilities: string[];
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
//# sourceMappingURL=strategies.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strategies.d.ts","sourceRoot":"","sources":["../../../src/lib/Resumability/strategies.ts"],"names":[],"mappings":"AAAA,OAAO,EAAS,MAAM,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC/C,OAAO,EACL,WAAW,EACX,cAAc,EACf,MAAM,yCAAyC,CAAC;AACjD,OAAO,EAEL,uBAAuB,EACvB,gBAAgB,EAChB,mBAAmB,EACnB,cAAc,EACd,cAAc,EACf,MAAM,YAAY,CAAC;AAEpB;;;;;;;;GAQG;AACH,qBAAa,oBAAqB,YAAW,mBAAmB;IAClD,OAAO,CAAC,QAAQ,CAAC,OAAO;gBAAP,OAAO,EAAE,cAAc;IAEpD,OAAO,GACL,WAAW,cAAc,KACxB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAiBtC;IAEF,OAAO,GACL,KAAK,cAAc,KAClB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,gBAAgB,CAAC,CAc5D;IAEF,OAAO,GAAI,KAAK,cAAc,KAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAcpE;IAEF,OAAO;;;;MAKJ;CACJ;AAED;;;;;;;;GAQG;AACH,qBAAa,gBAAiB,YAAW,mBAAmB;IAC9C,OAAO,CAAC,QAAQ,CAAC,OAAO;gBAAP,OAAO,EAAE,cAAc;IAEpD,OAAO,GACL,WAAW,cAAc,KACxB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CActC;IAEF,OAAO,GACL,KAAK,cAAc,KAClB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,gBAAgB,CAAC,CAqB5D;IAEF,OAAO,GAAI,KAAK,cAAc,KAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAmBpE;IAEF,0BAA0B,GACxB,KAAK,cAAc,EACnB,QAAQ,aAAa,CAAC,OAAO,YAAY,EAAE,UAAU,CAAC,KACrD,MAAM,CAAC,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC,CA4D1C;IAEL,OAAO;;;;MAKJ;CACJ;AAED;;;;;;;;GAQG;AACH,qBAAa,iBAAkB,YAAW,mBAAmB;IAMzD,OAAO,CAAC,QAAQ,CAAC,OAAO;IACxB,OAAO,CAAC,QAAQ,CAAC,MAAM;IANzB,OAAO,CAAC,cAAc,CAAK;IAC3B,OAAO,CAAC,oBAAoB,CAAK;IACjC,OAAO,CAAC,aAAa,CAAyC;gBAG3C,OAAO,EAAE,cAAc,EACvB,MAAM,GAAE,uBAA+C;IAG1E,OAAO,GACL,WAAW,cAAc,KACxB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CA+BtC;IAEF,OAAO,CAAC,YAAY,CAkClB;IAEF,OAAO,CAAC,SAAS,CAmBf;IAEF,OAAO,CAAC,kBAAkB,CAwBxB;IAEF,OAAO,GACL,KAAK,cAAc,KAClB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,gBAAgB,CAAC,CA0C5D;IAEF,OAAO,CAAC,kBAAkB,CA0ExB;IAEF,OAAO,GAAI,KAAK,cAAc,KAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAepE;IAEF,OAAO;;;;MAWJ;CACJ"}
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import { Effect, Option, Schema } from 'effect';
|
|
2
|
+
import { PriorityRequest, SpiderState, SpiderStateKey } from '../Scheduler/SpiderScheduler.service.js';
|
|
3
|
+
export { SpiderStateKey, PriorityRequest, SpiderState };
|
|
4
|
+
declare const StateDelta_base: Schema.Class<StateDelta, {
|
|
5
|
+
/** Session this delta applies to */
|
|
6
|
+
stateKey: typeof Schema.String;
|
|
7
|
+
/** Sequence number for ordering deltas */
|
|
8
|
+
sequence: typeof Schema.Number;
|
|
9
|
+
/** When this delta was created */
|
|
10
|
+
timestamp: typeof Schema.Date;
|
|
11
|
+
/** The operation that created this delta */
|
|
12
|
+
operation: Schema.Union<[Schema.Struct<{
|
|
13
|
+
type: Schema.Literal<["enqueue"]>;
|
|
14
|
+
request: typeof PriorityRequest;
|
|
15
|
+
}>, Schema.Struct<{
|
|
16
|
+
type: Schema.Literal<["dequeue"]>;
|
|
17
|
+
fingerprint: typeof Schema.String;
|
|
18
|
+
}>, Schema.Struct<{
|
|
19
|
+
type: Schema.Literal<["mark_visited"]>;
|
|
20
|
+
fingerprint: typeof Schema.String;
|
|
21
|
+
}>]>;
|
|
22
|
+
}, Schema.Struct.Encoded<{
|
|
23
|
+
/** Session this delta applies to */
|
|
24
|
+
stateKey: typeof Schema.String;
|
|
25
|
+
/** Sequence number for ordering deltas */
|
|
26
|
+
sequence: typeof Schema.Number;
|
|
27
|
+
/** When this delta was created */
|
|
28
|
+
timestamp: typeof Schema.Date;
|
|
29
|
+
/** The operation that created this delta */
|
|
30
|
+
operation: Schema.Union<[Schema.Struct<{
|
|
31
|
+
type: Schema.Literal<["enqueue"]>;
|
|
32
|
+
request: typeof PriorityRequest;
|
|
33
|
+
}>, Schema.Struct<{
|
|
34
|
+
type: Schema.Literal<["dequeue"]>;
|
|
35
|
+
fingerprint: typeof Schema.String;
|
|
36
|
+
}>, Schema.Struct<{
|
|
37
|
+
type: Schema.Literal<["mark_visited"]>;
|
|
38
|
+
fingerprint: typeof Schema.String;
|
|
39
|
+
}>]>;
|
|
40
|
+
}>, never, {
|
|
41
|
+
readonly operation: {
|
|
42
|
+
readonly type: "enqueue";
|
|
43
|
+
readonly request: PriorityRequest;
|
|
44
|
+
} | {
|
|
45
|
+
readonly type: "dequeue";
|
|
46
|
+
readonly fingerprint: string;
|
|
47
|
+
} | {
|
|
48
|
+
readonly type: "mark_visited";
|
|
49
|
+
readonly fingerprint: string;
|
|
50
|
+
};
|
|
51
|
+
} & {
|
|
52
|
+
readonly timestamp: Date;
|
|
53
|
+
} & {
|
|
54
|
+
readonly stateKey: string;
|
|
55
|
+
} & {
|
|
56
|
+
readonly sequence: number;
|
|
57
|
+
}, {}, {}>;
|
|
58
|
+
/**
|
|
59
|
+
* Delta operation that represents a single state change.
|
|
60
|
+
*
|
|
61
|
+
* Used for incremental persistence instead of saving the entire state
|
|
62
|
+
* on every operation, which is much more efficient for large crawls.
|
|
63
|
+
*
|
|
64
|
+
* @group Delta Updates
|
|
65
|
+
* @public
|
|
66
|
+
*/
|
|
67
|
+
export declare class StateDelta extends StateDelta_base {
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Represents a state change operation with both the delta and resulting state.
|
|
71
|
+
*
|
|
72
|
+
* This allows persistence strategies to choose whether to save deltas,
|
|
73
|
+
* full state, or both depending on their optimization needs.
|
|
74
|
+
*
|
|
75
|
+
* @group Operations
|
|
76
|
+
* @public
|
|
77
|
+
*/
|
|
78
|
+
export interface StateOperation {
|
|
79
|
+
/** The incremental change */
|
|
80
|
+
readonly delta: StateDelta;
|
|
81
|
+
/** The complete state after applying this operation */
|
|
82
|
+
readonly resultingState: SpiderState;
|
|
83
|
+
/** Whether this operation should trigger a snapshot */
|
|
84
|
+
readonly shouldSnapshot: boolean;
|
|
85
|
+
}
|
|
86
|
+
declare const PersistenceError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
87
|
+
readonly _tag: "PersistenceError";
|
|
88
|
+
} & Readonly<A>;
|
|
89
|
+
/**
|
|
90
|
+
* Error that can occur during persistence operations.
|
|
91
|
+
*
|
|
92
|
+
* @group Errors
|
|
93
|
+
* @public
|
|
94
|
+
*/
|
|
95
|
+
export declare class PersistenceError extends PersistenceError_base<{
|
|
96
|
+
readonly message: string;
|
|
97
|
+
readonly cause?: unknown;
|
|
98
|
+
readonly operation?: string;
|
|
99
|
+
}> {
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Storage backend capabilities that determine optimal persistence strategy.
|
|
103
|
+
*
|
|
104
|
+
* Backends advertise their capabilities so the ResumabilityService can
|
|
105
|
+
* choose the best strategy automatically.
|
|
106
|
+
*
|
|
107
|
+
* @group Storage
|
|
108
|
+
* @public
|
|
109
|
+
*/
|
|
110
|
+
export interface StorageCapabilities {
|
|
111
|
+
/** Can efficiently store and retrieve delta operations */
|
|
112
|
+
readonly supportsDelta: boolean;
|
|
113
|
+
/** Can efficiently store full state snapshots */
|
|
114
|
+
readonly supportsSnapshot: boolean;
|
|
115
|
+
/** Can handle streaming/batch operations */
|
|
116
|
+
readonly supportsStreaming: boolean;
|
|
117
|
+
/** Can handle concurrent access safely */
|
|
118
|
+
readonly supportsConcurrency: boolean;
|
|
119
|
+
/** Estimated latency category */
|
|
120
|
+
readonly latency: 'low' | 'medium' | 'high';
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Generic storage backend interface that persistence strategies use.
|
|
124
|
+
*
|
|
125
|
+
* Backends implement the storage operations they support best.
|
|
126
|
+
* Not all methods need to be implemented - strategies will adapt.
|
|
127
|
+
*
|
|
128
|
+
* @group Storage
|
|
129
|
+
* @public
|
|
130
|
+
*/
|
|
131
|
+
export interface StorageBackend {
|
|
132
|
+
/** Backend capabilities for strategy selection */
|
|
133
|
+
readonly capabilities: StorageCapabilities;
|
|
134
|
+
/** Storage backend identifier */
|
|
135
|
+
readonly name: string;
|
|
136
|
+
/** Initialize the backend (create tables, connections, etc.) */
|
|
137
|
+
initialize(): Effect.Effect<void, PersistenceError>;
|
|
138
|
+
/** Cleanup backend resources */
|
|
139
|
+
cleanup(): Effect.Effect<void, PersistenceError>;
|
|
140
|
+
saveState?(key: SpiderStateKey, state: SpiderState): Effect.Effect<void, PersistenceError>;
|
|
141
|
+
loadState?(key: SpiderStateKey): Effect.Effect<Option.Option<SpiderState>, PersistenceError>;
|
|
142
|
+
deleteState?(key: SpiderStateKey): Effect.Effect<void, PersistenceError>;
|
|
143
|
+
saveDelta?(delta: StateDelta): Effect.Effect<void, PersistenceError>;
|
|
144
|
+
saveDeltas?(deltas: readonly StateDelta[]): Effect.Effect<void, PersistenceError>;
|
|
145
|
+
loadDeltas?(key: SpiderStateKey, fromSequence?: number): Effect.Effect<readonly StateDelta[], PersistenceError>;
|
|
146
|
+
saveSnapshot?(key: SpiderStateKey, state: SpiderState, sequence: number): Effect.Effect<void, PersistenceError>;
|
|
147
|
+
loadLatestSnapshot?(key: SpiderStateKey): Effect.Effect<Option.Option<{
|
|
148
|
+
state: SpiderState;
|
|
149
|
+
sequence: number;
|
|
150
|
+
}>, PersistenceError>;
|
|
151
|
+
compactDeltas?(key: SpiderStateKey, beforeSequence: number): Effect.Effect<void, PersistenceError>;
|
|
152
|
+
listSessions?(): Effect.Effect<readonly SpiderStateKey[], PersistenceError>;
|
|
153
|
+
}
|
|
154
|
+
/**
|
|
155
|
+
* Core strategy interface for different persistence approaches.
|
|
156
|
+
*
|
|
157
|
+
* Strategies implement the logic for when and how to persist state,
|
|
158
|
+
* using the storage backend for actual I/O operations.
|
|
159
|
+
*
|
|
160
|
+
* @group Strategies
|
|
161
|
+
* @public
|
|
162
|
+
*/
|
|
163
|
+
export interface PersistenceStrategy {
|
|
164
|
+
/** Persist a state operation */
|
|
165
|
+
persist(operation: StateOperation): Effect.Effect<void, PersistenceError>;
|
|
166
|
+
/** Restore state from storage */
|
|
167
|
+
restore(key: SpiderStateKey): Effect.Effect<Option.Option<SpiderState>, PersistenceError>;
|
|
168
|
+
/** Clean up old data */
|
|
169
|
+
cleanup(key: SpiderStateKey): Effect.Effect<void, PersistenceError>;
|
|
170
|
+
/** Get strategy information */
|
|
171
|
+
getInfo(): {
|
|
172
|
+
readonly name: string;
|
|
173
|
+
readonly description: string;
|
|
174
|
+
readonly capabilities: string[];
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
/**
|
|
178
|
+
* Configuration for hybrid persistence strategy.
|
|
179
|
+
*
|
|
180
|
+
* Controls when to save snapshots vs deltas for optimal performance.
|
|
181
|
+
*
|
|
182
|
+
* @group Configuration
|
|
183
|
+
* @public
|
|
184
|
+
*/
|
|
185
|
+
export interface HybridPersistenceConfig {
|
|
186
|
+
/** Save a full snapshot every N operations */
|
|
187
|
+
readonly snapshotInterval: number;
|
|
188
|
+
/** Maximum deltas to accumulate before forcing a snapshot */
|
|
189
|
+
readonly maxDeltasBeforeSnapshot: number;
|
|
190
|
+
/** Whether to compact old deltas after snapshots */
|
|
191
|
+
readonly compactionEnabled: boolean;
|
|
192
|
+
/** Batch multiple deltas together for efficiency */
|
|
193
|
+
readonly batchDeltas: boolean;
|
|
194
|
+
/** Batch size for delta operations */
|
|
195
|
+
readonly deltaBatchSize: number;
|
|
196
|
+
}
|
|
197
|
+
/**
|
|
198
|
+
* Default hybrid persistence configuration.
|
|
199
|
+
*/
|
|
200
|
+
export declare const DEFAULT_HYBRID_CONFIG: HybridPersistenceConfig;
|
|
201
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/lib/Resumability/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAQ,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AACtD,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACf,MAAM,yCAAyC,CAAC;AAGjD,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,WAAW,EAAE,CAAC;;IAYtD,oCAAoC;;IAEpC,0CAA0C;;IAE1C,kCAAkC;;IAElC,4CAA4C;;;;;;;;;;;;IAN5C,oCAAoC;;IAEpC,0CAA0C;;IAE1C,kCAAkC;;IAElC,4CAA4C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAhB9C;;;;;;;;GAQG;AACH,qBAAa,UAAW,SAAQ,eAsB9B;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAC3B,uDAAuD;IACvD,QAAQ,CAAC,cAAc,EAAE,WAAW,CAAC;IACrC,uDAAuD;IACvD,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAC;CAClC;;;;AAED;;;;;GAKG;AACH,qBAAa,gBAAiB,SAAQ,sBAAqC;IACzE,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;CAC7B,CAAC;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,mBAAmB;IAClC,0DAA0D;IAC1D,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAChC,iDAAiD;IACjD,QAAQ,CAAC,gBAAgB,EAAE,OAAO,CAAC;IACnC,4CAA4C;IAC5C,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,0CAA0C;IAC1C,QAAQ,CAAC,mBAAmB,EAAE,OAAO,CAAC;IACtC,iCAAiC;IACjC,QAAQ,CAAC,OAAO,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CAC7C;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,kDAAkD;IAClD,QAAQ,CAAC,YAAY,EAAE,mBAAmB,CAAC;IAE3C,iCAAiC;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAEtB,gEAAgE;IAChE,UAAU,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IAEpD,gCAAgC;IAChC,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IAGjD,SAAS,CAAC,CACR,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,GACjB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IACzC,SAAS,CAAC,CACR,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,gBAAgB,CAAC,CAAC;IAC/D,WAAW,CAAC,CACV,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IAGzC,SAAS,CAAC,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IACrE,UAAU,CAAC,CACT,MAAM,EAAE,SAAS,UAAU,EAAE,GAC5B,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IACzC,UAAU,CAAC,CACT,GA
AG,EAAE,cAAc,EACnB,YAAY,CAAC,EAAE,MAAM,GACpB,MAAM,CAAC,MAAM,CAAC,SAAS,UAAU,EAAE,EAAE,gBAAgB,CAAC,CAAC;IAG1D,YAAY,CAAC,CACX,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IACzC,kBAAkB,CAAC,CACjB,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CACd,MAAM,CAAC,MAAM,CAAC;QAAE,KAAK,EAAE,WAAW,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,EACvD,gBAAgB,CACjB,CAAC;IAGF,aAAa,CAAC,CACZ,GAAG,EAAE,cAAc,EACnB,cAAc,EAAE,MAAM,GACrB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IACzC,YAAY,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,SAAS,cAAc,EAAE,EAAE,gBAAgB,CAAC,CAAC;CAC7E;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,mBAAmB;IAClC,gCAAgC;IAChC,OAAO,CACL,SAAS,EAAE,cAAc,GACxB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IAEzC,iCAAiC;IACjC,OAAO,CACL,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,gBAAgB,CAAC,CAAC;IAE/D,wBAAwB;IACxB,OAAO,CAAC,GAAG,EAAE,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IAEpE,+BAA+B;IAC/B,OAAO,IAAI;QACT,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;QACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;QAC7B,QAAQ,CAAC,YAAY,EAAE,MAAM,EAAE,CAAC;KACjC,CAAC;CACH;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,uBAAuB;IACtC,8CAA8C;IAC9C,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,6DAA6D;IAC7D,QAAQ,CAAC,uBAAuB,EAAE,MAAM,CAAC;IACzC,oDAAoD;IACpD,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,oDAAoD;IACpD,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC;IAC9B,sCAAsC;IACtC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;CACjC;AAED;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,uBAMnC,CAAC"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { Effect, MutableHashSet, Option } from 'effect';
|
|
2
|
+
/**
|
|
3
|
+
* Parsed robots.txt rules for a specific user agent.
|
|
4
|
+
*
|
|
5
|
+
* Contains the disallowed paths and crawl delay settings extracted
|
|
6
|
+
* from a robots.txt file for a particular user agent string.
|
|
7
|
+
*
|
|
8
|
+
* @group Data Types
|
|
9
|
+
* @internal
|
|
10
|
+
*/
|
|
11
|
+
interface RobotsRules {
|
|
12
|
+
/** Set of URL paths that are disallowed for this user agent */
|
|
13
|
+
disallowedPaths: MutableHashSet.MutableHashSet<string>;
|
|
14
|
+
/** Optional crawl delay in seconds specified in robots.txt */
|
|
15
|
+
crawlDelay?: number;
|
|
16
|
+
/** The user agent these rules apply to */
|
|
17
|
+
userAgent: string;
|
|
18
|
+
}
|
|
19
|
+
declare const RobotsService_base: Effect.Service.Class<RobotsService, "@jambudipa.io/RobotsService", {
|
|
20
|
+
readonly effect: Effect.Effect<{
|
|
21
|
+
checkUrl: (urlString: string) => Effect.Effect<{
|
|
22
|
+
allowed: boolean;
|
|
23
|
+
crawlDelay?: undefined;
|
|
24
|
+
} | {
|
|
25
|
+
allowed: boolean;
|
|
26
|
+
crawlDelay: number | undefined;
|
|
27
|
+
}, never, never>;
|
|
28
|
+
getRules: (domain: string) => Effect.Effect<Option.Option<RobotsRules>, never, never>;
|
|
29
|
+
}, never, never>;
|
|
30
|
+
}>;
|
|
31
|
+
/**
|
|
32
|
+
* Service for parsing and enforcing robots.txt compliance.
|
|
33
|
+
*
|
|
34
|
+
* The RobotsService handles fetching, parsing, and caching robots.txt files
|
|
35
|
+
* to ensure compliant web crawling. It provides efficient URL checking with
|
|
36
|
+
* automatic caching to minimise network requests.
|
|
37
|
+
*
|
|
38
|
+
* **Key Features:**
|
|
39
|
+
* - Automatic robots.txt fetching and parsing
|
|
40
|
+
* - Intelligent caching to reduce redundant requests
|
|
41
|
+
* - User agent-specific rule enforcement
|
|
42
|
+
* - Crawl delay extraction and enforcement
|
|
43
|
+
* - Graceful error handling for malformed robots.txt files
|
|
44
|
+
*
|
|
45
|
+
* **Standards Compliance:**
|
|
46
|
+
* - Follows the Robots Exclusion Standard (RFC 9309)
|
|
47
|
+
* - Supports User-agent, Disallow, and Crawl-delay directives
|
|
48
|
+
* - Handles wildcard (*) user agent specifications
|
|
49
|
+
* - Case-insensitive user agent matching
|
|
50
|
+
*
|
|
51
|
+
* @example
|
|
52
|
+
* ```typescript
|
|
53
|
+
* const program = Effect.gen(function* () {
|
|
54
|
+
* const robots = yield* RobotsService;
|
|
55
|
+
*
|
|
56
|
+
* // Check if URL is allowed
|
|
57
|
+
* const check = yield* robots.checkUrl('https://example.com/admin');
|
|
58
|
+
* if (!check.allowed) {
|
|
59
|
+
* console.log('URL blocked by robots.txt');
|
|
60
|
+
* return;
|
|
61
|
+
* }
|
|
62
|
+
*
|
|
63
|
+
* // Apply crawl delay if specified
|
|
64
|
+
* if (check.crawlDelay) {
|
|
65
|
+
* yield* Effect.sleep(`${check.crawlDelay} seconds`);
|
|
66
|
+
* }
|
|
67
|
+
*
|
|
68
|
+
* // Proceed with crawling...
|
|
69
|
+
* });
|
|
70
|
+
* ```
|
|
71
|
+
*
|
|
72
|
+
* @group Services
|
|
73
|
+
* @public
|
|
74
|
+
*/
|
|
75
|
+
export declare class RobotsService extends RobotsService_base {
|
|
76
|
+
}
|
|
77
|
+
export {};
|
|
78
|
+
//# sourceMappingURL=Robots.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Robots.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Robots/Robots.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAkB,cAAc,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAGxE;;;;;;;;GAQG;AACH,UAAU,WAAW;IACnB,+DAA+D;IAC/D,eAAe,EAAE,cAAc,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IACvD,8DAA8D;IAC9D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;CACnB;;;8BAuK6B,MAAM;;;;;;;2BA+CT,MAAM;;;AApNjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2CG;AACH,qBAAa,aAAc,SAAQ,kBAiLlC;CAAG"}
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import { Effect, Option, Schema } from 'effect';
|
|
2
|
+
import { CrawlTask } from '../Spider/Spider.service.js';
|
|
3
|
+
import { ConfigurationError } from '../errors.js';
|
|
4
|
+
declare const SpiderStateKey_base: Schema.Class<SpiderStateKey, {
|
|
5
|
+
/** Unique identifier for the session */
|
|
6
|
+
id: typeof Schema.String;
|
|
7
|
+
/** When the session was created */
|
|
8
|
+
timestamp: typeof Schema.Date;
|
|
9
|
+
/** Human-readable name for the session */
|
|
10
|
+
name: typeof Schema.String;
|
|
11
|
+
}, Schema.Struct.Encoded<{
|
|
12
|
+
/** Unique identifier for the session */
|
|
13
|
+
id: typeof Schema.String;
|
|
14
|
+
/** When the session was created */
|
|
15
|
+
timestamp: typeof Schema.Date;
|
|
16
|
+
/** Human-readable name for the session */
|
|
17
|
+
name: typeof Schema.String;
|
|
18
|
+
}>, never, {
|
|
19
|
+
readonly name: string;
|
|
20
|
+
} & {
|
|
21
|
+
readonly timestamp: Date;
|
|
22
|
+
} & {
|
|
23
|
+
readonly id: string;
|
|
24
|
+
}, {}, {}>;
|
|
25
|
+
/**
|
|
26
|
+
* Unique identifier for a spider crawling session.
|
|
27
|
+
*
|
|
28
|
+
* Used to identify and restore specific crawl sessions when using
|
|
29
|
+
* persistent storage. Each crawl session should have a unique key.
|
|
30
|
+
*
|
|
31
|
+
* @group Data Types
|
|
32
|
+
* @public
|
|
33
|
+
*/
|
|
34
|
+
export declare class SpiderStateKey extends SpiderStateKey_base {
|
|
35
|
+
}
|
|
36
|
+
declare const PriorityRequest_base: Schema.Class<PriorityRequest, {
|
|
37
|
+
/** The crawl task containing URL and depth information */
|
|
38
|
+
request: Schema.Struct<{
|
|
39
|
+
url: typeof Schema.String;
|
|
40
|
+
depth: typeof Schema.Number;
|
|
41
|
+
fromUrl: Schema.optional<typeof Schema.String>;
|
|
42
|
+
}>;
|
|
43
|
+
/** Priority level (higher numbers processed first) */
|
|
44
|
+
priority: typeof Schema.Number;
|
|
45
|
+
/** When this request was created */
|
|
46
|
+
timestamp: typeof Schema.Date;
|
|
47
|
+
/** Unique fingerprint for deduplication */
|
|
48
|
+
fingerprint: typeof Schema.String;
|
|
49
|
+
}, Schema.Struct.Encoded<{
|
|
50
|
+
/** The crawl task containing URL and depth information */
|
|
51
|
+
request: Schema.Struct<{
|
|
52
|
+
url: typeof Schema.String;
|
|
53
|
+
depth: typeof Schema.Number;
|
|
54
|
+
fromUrl: Schema.optional<typeof Schema.String>;
|
|
55
|
+
}>;
|
|
56
|
+
/** Priority level (higher numbers processed first) */
|
|
57
|
+
priority: typeof Schema.Number;
|
|
58
|
+
/** When this request was created */
|
|
59
|
+
timestamp: typeof Schema.Date;
|
|
60
|
+
/** Unique fingerprint for deduplication */
|
|
61
|
+
fingerprint: typeof Schema.String;
|
|
62
|
+
}>, never, {
|
|
63
|
+
readonly timestamp: Date;
|
|
64
|
+
} & {
|
|
65
|
+
readonly priority: number;
|
|
66
|
+
} & {
|
|
67
|
+
readonly fingerprint: string;
|
|
68
|
+
} & {
|
|
69
|
+
readonly request: {
|
|
70
|
+
readonly url: string;
|
|
71
|
+
readonly depth: number;
|
|
72
|
+
readonly fromUrl?: string | undefined;
|
|
73
|
+
};
|
|
74
|
+
}, {}, {}>;
|
|
75
|
+
/**
|
|
76
|
+
* A crawl task (`request`) plus the scheduling metadata kept alongside it: `priority`, `timestamp` and `fingerprint`.
|
|
77
|
+
*
|
|
78
|
+
* Requests are processed in priority order (higher numbers first),
|
|
79
|
+
* with FIFO ordering within the same priority level.
|
|
80
|
+
*
|
|
81
|
+
* @group Data Types
|
|
82
|
+
* @public
|
|
83
|
+
*/
|
|
84
|
+
export declare class PriorityRequest extends PriorityRequest_base {
|
|
85
|
+
}
|
|
86
|
+
/**
* Base class for `SpiderState`, declared as an Effect `Schema.Class`.
*
* The fields `key`, `pendingRequests`, `visitedFingerprints` and `totalProcessed`
* are listed as schema fields, repeated inside `Schema.Struct.Encoded<...>`, and
* then given as an intersection of readonly properties with their plain
* TypeScript types (both array fields are `readonly` arrays there).
*/
declare const SpiderState_base: Schema.Class<SpiderState, {
|
|
87
|
+
/** The state key identifying this session */
|
|
88
|
+
key: typeof SpiderStateKey;
|
|
89
|
+
/** All requests waiting to be processed */
|
|
90
|
+
pendingRequests: Schema.Array$<typeof PriorityRequest>;
|
|
91
|
+
/** Fingerprints of URLs already visited (for deduplication) */
|
|
92
|
+
visitedFingerprints: Schema.Array$<typeof Schema.String>;
|
|
93
|
+
/** Total number of requests processed so far */
|
|
94
|
+
totalProcessed: typeof Schema.Number;
|
|
95
|
+
}, Schema.Struct.Encoded<{
|
|
96
|
+
/** The state key identifying this session */
|
|
97
|
+
key: typeof SpiderStateKey;
|
|
98
|
+
/** All requests waiting to be processed */
|
|
99
|
+
pendingRequests: Schema.Array$<typeof PriorityRequest>;
|
|
100
|
+
/** Fingerprints of URLs already visited (for deduplication) */
|
|
101
|
+
visitedFingerprints: Schema.Array$<typeof Schema.String>;
|
|
102
|
+
/** Total number of requests processed so far */
|
|
103
|
+
totalProcessed: typeof Schema.Number;
|
|
104
|
+
}>, never, {
|
|
105
|
+
readonly key: SpiderStateKey;
|
|
106
|
+
} & {
|
|
107
|
+
readonly totalProcessed: number;
|
|
108
|
+
} & {
|
|
109
|
+
readonly pendingRequests: readonly PriorityRequest[];
|
|
110
|
+
} & {
|
|
111
|
+
readonly visitedFingerprints: readonly string[];
|
|
112
|
+
}, {}, {}>;
|
|
113
|
+
/**
|
|
114
|
+
* Complete state snapshot of a spider crawling session.
|
|
115
|
+
*
|
|
116
|
+
* This contains all information needed to resume a crawl session,
|
|
117
|
+
* including pending requests, fingerprints of visited URLs, and the processed-request count.
|
|
118
|
+
*
|
|
119
|
+
* @group Data Types
|
|
120
|
+
* @public
|
|
121
|
+
*/
|
|
122
|
+
export declare class SpiderState extends SpiderState_base {
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Generic interface for persisting spider state.
|
|
126
|
+
*
|
|
127
|
+
* Implementations can use any storage backend (filesystem, database, etc.)
|
|
128
|
+
* to save and restore crawling sessions. All operations are Effect-based
|
|
129
|
+
* for composability and error handling.
|
|
130
|
+
*
|
|
131
|
+
* @example
|
|
132
|
+
* ```typescript
|
|
133
|
+
* class FilePersistence implements StatePersistence {
|
|
134
|
+
* saveState = (key: SpiderStateKey, state: SpiderState) =>
|
|
135
|
+
* Effect.tryPromise(() => fs.writeFile(key.id + '.json', JSON.stringify(state)))
|
|
136
|
+
*
|
|
137
|
+
* loadState = (key: SpiderStateKey) =>
* // Sketch only: loadState must yield an Option. A complete implementation should also
* // yield Option.none when the file is missing and decode the parsed JSON into a SpiderState.
|
|
138
|
+
* Effect.tryPromise(() => fs.readFile(key.id + '.json', 'utf8').then(JSON.parse)).pipe(Effect.map(Option.some))
|
|
139
|
+
*
|
|
140
|
+
* deleteState = (key: SpiderStateKey) =>
|
|
141
|
+
* Effect.tryPromise(() => fs.unlink(key.id + '.json'))
|
|
142
|
+
* }
|
|
143
|
+
* ```
|
|
144
|
+
*
|
|
145
|
+
* @group Interfaces
|
|
146
|
+
* @public
|
|
147
|
+
*/
|
|
148
|
+
export interface StatePersistence {
|
|
149
|
+
/** Saves the complete spider state to persistent storage */
|
|
150
|
+
saveState: (_key: SpiderStateKey, _state: SpiderState) => Effect.Effect<void, Error>;
|
|
151
|
+
/** Loads spider state from persistent storage; succeeds with Option.none if no state is found for the key */
|
|
152
|
+
loadState: (_key: SpiderStateKey) => Effect.Effect<Option.Option<SpiderState>, Error>;
|
|
153
|
+
/** Deletes spider state from persistent storage */
|
|
154
|
+
deleteState: (_key: SpiderStateKey) => Effect.Effect<void, Error>;
|
|
155
|
+
}
|
|
156
|
+
/**
* Base class for `SpiderSchedulerService`, declared as an Effect `Effect.Service.Class`
* under the key "@jambudipa/spiderSchedulerService".
*
* - `effect` yields the service object: persistence wiring (`configurePersistence`,
* `clearPersistence`), queue operations (`enqueue`, `dequeue`, `size`, `isEmpty`)
* and state snapshot/restore (`getState`, `restoreFromState`, `restore`).
* - Building the service requires `SpiderConfigService`; `dependencies` lists a Layer
* that provides it.
* - `getState` is the only operation typed to fail with `ConfigurationError`; the other
* fallible operations fail with `Error`.
* - NOTE(review): what the boolean results of `enqueue` and `restore` signify is not
* visible in these typings; confirm against the implementation.
*/
declare const SpiderSchedulerService_base: Effect.Service.Class<SpiderSchedulerService, "@jambudipa/spiderSchedulerService", {
|
|
157
|
+
readonly effect: Effect.Effect<{
|
|
158
|
+
configurePersistence: (persistence: StatePersistence, stateKey: SpiderStateKey) => Effect.Effect<void, never, never>;
|
|
159
|
+
clearPersistence: () => Effect.Effect<void, never, never>;
|
|
160
|
+
enqueue: (request: CrawlTask, priority?: number) => Effect.Effect<boolean, Error, never>;
|
|
161
|
+
dequeue: () => Effect.Effect<PriorityRequest, Error, never>;
|
|
162
|
+
size: () => Effect.Effect<number, never, never>;
|
|
163
|
+
isEmpty: () => Effect.Effect<boolean, never, never>;
|
|
164
|
+
getState: () => Effect.Effect<SpiderState, ConfigurationError, never>;
|
|
165
|
+
restoreFromState: (state: SpiderState) => Effect.Effect<void, Error>;
|
|
166
|
+
restore: (persistence: StatePersistence, stateKey: SpiderStateKey) => Effect.Effect<boolean, Error, never>;
|
|
167
|
+
}, never, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
|
|
168
|
+
readonly dependencies: readonly [import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>];
|
|
169
|
+
}>;
|
|
170
|
+
/**
|
|
171
|
+
* Manages request scheduling, prioritization, and state persistence for web crawling.
|
|
172
|
+
*
|
|
173
|
+
* The SpiderSchedulerService provides a priority-based request queue with optional persistence
|
|
174
|
+
* capabilities. It handles:
|
|
175
|
+
* - Request deduplication via fingerprinting
|
|
176
|
+
* - Priority-based scheduling (higher numbers processed first)
|
|
177
|
+
* - State persistence for resumable crawling
|
|
178
|
+
* - Atomic state operations
|
|
179
|
+
*
|
|
180
|
+
* @example
|
|
181
|
+
* ```typescript
|
|
182
|
+
* const program = Effect.gen(function* () {
|
|
183
|
+
* const scheduler = yield* SpiderSchedulerService;
|
|
184
|
+
*
|
|
185
|
+
* // Configure persistence with a StatePersistence implementation and a state key
|
|
186
|
+
* const persistence = new FilePersistence('./state');
|
|
187
|
+
* const stateKey = new SpiderStateKey({
|
|
188
|
+
* id: 'my-crawl',
|
|
189
|
+
* timestamp: new Date(),
|
|
190
|
+
* name: 'Example Crawl'
|
|
191
|
+
* });
|
|
192
|
+
*
|
|
193
|
+
* yield* scheduler.configurePersistence(persistence, stateKey);
|
|
194
|
+
*
|
|
195
|
+
* // Queue requests; the optional second argument is the priority
|
|
196
|
+
* yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
|
|
197
|
+
* yield* scheduler.enqueue({ url: 'https://example.com/about', depth: 1 }, 5);
|
|
198
|
+
*
|
|
199
|
+
* // Process requests: dequeue yields a PriorityRequest whose crawl task is under `.request`
|
|
200
|
+
* const request = yield* scheduler.dequeue();
|
|
201
|
+
* console.log(`Processing: ${request.request.url}`);
|
|
202
|
+
* });
|
|
203
|
+
* ```
|
|
204
|
+
*
|
|
205
|
+
* @group Services
|
|
206
|
+
* @public
|
|
207
|
+
*/
|
|
208
|
+
export declare class SpiderSchedulerService extends SpiderSchedulerService_base {
|
|
209
|
+
}
|
|
210
|
+
export {};
|
|
211
|
+
//# sourceMappingURL=SpiderScheduler.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SpiderScheduler.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scheduler/SpiderScheduler.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAY,MAAM,EAAkB,MAAM,EAAS,MAAM,EAAE,MAAM,QAAQ,CAAC;AACjF,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;;IAehD,wCAAwC;;IAExC,mCAAmC;;IAEnC,0CAA0C;;;IAJ1C,wCAAwC;;IAExC,mCAAmC;;IAEnC,0CAA0C;;;;;;;;;AAhB5C;;;;;;;;GAQG;AACH,qBAAa,cAAe,SAAQ,mBASlC;CAAG;;IAcH,0DAA0D;;;;;;IAM1D,sDAAsD;;IAEtD,oCAAoC;;IAEpC,2CAA2C;;;IAV3C,0DAA0D;;;;;;IAM1D,sDAAsD;;IAEtD,oCAAoC;;IAEpC,2CAA2C;;;;;;;;;;;;;;;AAtB7C;;;;;;;;GAQG;AACH,qBAAa,eAAgB,SAAQ,oBAenC;CAAG;;IAYH,6CAA6C;;IAE7C,2CAA2C;;IAE3C,+DAA+D;;IAE/D,gDAAgD;;;IANhD,6CAA6C;;IAE7C,2CAA2C;;IAE3C,+DAA+D;;IAE/D,gDAAgD;;;;;;;;;;;AAhBlD;;;;;;;;GAQG;AACH,qBAAa,WAAY,SAAQ,gBAS/B;CAAG;AAEL;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4DAA4D;IAC5D,SAAS,EAAE,CACT,IAAI,EAAE,cAAc,EACpB,MAAM,EAAE,WAAW,KAChB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IAChC,mFAAmF;IACnF,SAAS,EAAE,CAAC,IAAI,EAAE,cAAc,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,KAAK,CAAC,CAAC;IACtF,mDAAmD;IACnD,WAAW,EAAE,CAAC,IAAI,EAAE,cAAc,KAAK,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;CACnE;;;4CAiMsB,gBAAgB,YACnB,cAAc;;2BAeP,SAAS;;;;;kCAhDrB,WAAW,KACjB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC;+BA4HJ,gBAAgB,YAAY,cAAc;;;;AA5RzE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AACH,qBAAa,sBAAuB,SAAQ,2BAoQ3C;CAAG"}
|