@jambudipa/spider 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -16
- package/dist/browser/BrowserManager.d.ts +63 -0
- package/dist/browser/BrowserManager.d.ts.map +1 -0
- package/dist/browser/PlaywrightAdapter.d.ts +166 -0
- package/dist/browser/PlaywrightAdapter.d.ts.map +1 -0
- package/dist/examples/01-basic-crawl-working.d.ts +13 -0
- package/dist/examples/01-basic-crawl-working.d.ts.map +1 -0
- package/dist/examples/02-multiple-urls-working.d.ts +13 -0
- package/dist/examples/02-multiple-urls-working.d.ts.map +1 -0
- package/dist/examples/03-url-filtering.d.ts +13 -0
- package/dist/examples/03-url-filtering.d.ts.map +1 -0
- package/dist/examples/04-robots-compliance.d.ts +14 -0
- package/dist/examples/04-robots-compliance.d.ts.map +1 -0
- package/dist/examples/05-link-extraction-selectors.d.ts +14 -0
- package/dist/examples/05-link-extraction-selectors.d.ts.map +1 -0
- package/dist/examples/06-custom-middleware.d.ts +18 -0
- package/dist/examples/06-custom-middleware.d.ts.map +1 -0
- package/dist/examples/07-resumability-demo.d.ts +14 -0
- package/dist/examples/07-resumability-demo.d.ts.map +1 -0
- package/dist/examples/08-worker-monitoring.d.ts +15 -0
- package/dist/examples/08-worker-monitoring.d.ts.map +1 -0
- package/dist/examples/09-error-handling-recovery.d.ts +15 -0
- package/dist/examples/09-error-handling-recovery.d.ts.map +1 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +2891 -1456
- package/dist/index.js.map +1 -1
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +107 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +58 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +63 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +114 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +83 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +24 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +37 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +239 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/Middleware/types.d.ts +99 -0
- package/dist/lib/Middleware/types.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +178 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +249 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +107 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +110 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors/effect-errors.d.ts +179 -0
- package/dist/lib/errors/effect-errors.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +172 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/dist/lib/utils/FileUtils.d.ts +284 -0
- package/dist/lib/utils/FileUtils.d.ts.map +1 -0
- package/dist/lib/utils/JsonUtils.d.ts +196 -0
- package/dist/lib/utils/JsonUtils.d.ts.map +1 -0
- package/dist/lib/utils/RegexUtils.d.ts +257 -0
- package/dist/lib/utils/RegexUtils.d.ts.map +1 -0
- package/dist/lib/utils/SchemaUtils.d.ts +251 -0
- package/dist/lib/utils/SchemaUtils.d.ts.map +1 -0
- package/dist/lib/utils/UrlUtils.d.ts +223 -0
- package/dist/lib/utils/UrlUtils.d.ts.map +1 -0
- package/dist/lib/utils/effect-migration.d.ts +31 -0
- package/dist/lib/utils/effect-migration.d.ts.map +1 -0
- package/dist/lib/utils/index.d.ts +15 -0
- package/dist/lib/utils/index.d.ts.map +1 -0
- package/dist/lib/utils/url-deduplication.d.ts +108 -0
- package/dist/lib/utils/url-deduplication.d.ts.map +1 -0
- package/dist/lib/utils/url-deduplication.test.d.ts +5 -0
- package/dist/lib/utils/url-deduplication.test.d.ts.map +1 -0
- package/dist/test/infrastructure/EffectTestUtils.d.ts +167 -0
- package/dist/test/infrastructure/EffectTestUtils.d.ts.map +1 -0
- package/package.json +21 -9
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token Extractor Service
|
|
3
|
+
* Extracts and manages various types of tokens from HTTP responses
|
|
4
|
+
*/
|
|
5
|
+
import { Context, Effect, Layer, Option } from 'effect';
|
|
6
|
+
import { StateManager, TokenType } from '../StateManager/StateManager.service.js';
|
|
7
|
+
import { EnhancedHttpClient, type HttpResponse } from './EnhancedHttpClient.js';
|
|
8
|
+
import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
|
|
9
|
+
import { NetworkError, ParseError, TimeoutError } from '../errors/effect-errors.js';
|
|
10
|
+
declare const TokenNotAvailableError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
11
|
+
readonly _tag: "TokenNotAvailableError";
|
|
12
|
+
} & Readonly<A>;
|
|
13
|
+
export declare class TokenNotAvailableError extends TokenNotAvailableError_base<{
|
|
14
|
+
readonly message: string;
|
|
15
|
+
}> {
|
|
16
|
+
}
|
|
17
|
+
declare const TokenRefreshError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
18
|
+
readonly _tag: "TokenRefreshError";
|
|
19
|
+
} & Readonly<A>;
|
|
20
|
+
export declare class TokenRefreshError extends TokenRefreshError_base<{
|
|
21
|
+
readonly message: string;
|
|
22
|
+
readonly tokenType: TokenType;
|
|
23
|
+
}> {
|
|
24
|
+
}
|
|
25
|
+
declare const NoRefreshUrlError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
26
|
+
readonly _tag: "NoRefreshUrlError";
|
|
27
|
+
} & Readonly<A>;
|
|
28
|
+
export declare class NoRefreshUrlError extends NoRefreshUrlError_base<{
|
|
29
|
+
readonly message: string;
|
|
30
|
+
}> {
|
|
31
|
+
}
|
|
32
|
+
export interface TokenInfo {
|
|
33
|
+
type: TokenType;
|
|
34
|
+
value: string;
|
|
35
|
+
source: 'html' | 'header' | 'script' | 'json';
|
|
36
|
+
selector?: string;
|
|
37
|
+
pattern?: string;
|
|
38
|
+
}
|
|
39
|
+
type HttpRequestError = NetworkError | ParseError | TimeoutError;
|
|
40
|
+
type TokenExtractorError = HttpRequestError | Error | TokenNotAvailableError | TokenRefreshError | NoRefreshUrlError;
|
|
41
|
+
export interface TokenExtractorService {
|
|
42
|
+
/**
|
|
43
|
+
* Extract all tokens from an HTTP response
|
|
44
|
+
*/
|
|
45
|
+
extractTokensFromResponse: (response: HttpResponse) => Effect.Effect<TokenInfo[]>;
|
|
46
|
+
/**
|
|
47
|
+
* Extract CSRF token from response
|
|
48
|
+
*/
|
|
49
|
+
extractCSRFFromResponse: (response: HttpResponse) => Effect.Effect<Option.Option<string>>;
|
|
50
|
+
/**
|
|
51
|
+
* Extract API token from response
|
|
52
|
+
*/
|
|
53
|
+
extractAPIFromResponse: (response: HttpResponse) => Effect.Effect<Option.Option<string>>;
|
|
54
|
+
/**
|
|
55
|
+
* Make authenticated request with automatic token injection
|
|
56
|
+
*/
|
|
57
|
+
authenticatedRequest: (url: string, options?: {
|
|
58
|
+
requireCSRF?: boolean;
|
|
59
|
+
requireAPI?: boolean;
|
|
60
|
+
customHeaders?: Record<string, string>;
|
|
61
|
+
}) => Effect.Effect<HttpResponse, TokenExtractorError>;
|
|
62
|
+
/**
|
|
63
|
+
* Detect and handle token rotation
|
|
64
|
+
*/
|
|
65
|
+
detectTokenRotation: (oldToken: string, response: HttpResponse, type: TokenType) => Effect.Effect<boolean>;
|
|
66
|
+
/**
|
|
67
|
+
* Refresh expired tokens
|
|
68
|
+
*/
|
|
69
|
+
refreshToken: (type: TokenType, refreshUrl?: string) => Effect.Effect<string, TokenExtractorError>;
|
|
70
|
+
}
|
|
71
|
+
export type { TokenExtractorError };
|
|
72
|
+
declare const TokenExtractor_base: Context.TagClass<TokenExtractor, "TokenExtractor", TokenExtractorService>;
|
|
73
|
+
export declare class TokenExtractor extends TokenExtractor_base {
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Create a TokenExtractor service implementation
|
|
77
|
+
*/
|
|
78
|
+
export declare const makeTokenExtractor: Effect.Effect<TokenExtractorService, never, SpiderLogger | EnhancedHttpClient | StateManager>;
|
|
79
|
+
/**
|
|
80
|
+
* TokenExtractor Layer with dependencies
|
|
81
|
+
*/
|
|
82
|
+
export declare const TokenExtractorLive: Layer.Layer<TokenExtractor, never, SpiderLogger | EnhancedHttpClient | StateManager>;
|
|
83
|
+
//# sourceMappingURL=TokenExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TokenExtractor.d.ts","sourceRoot":"","sources":["../../../src/lib/HttpClient/TokenExtractor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAW,MAAM,EAAkB,MAAM,QAAQ,CAAC;AAEjF,OAAO,EACL,YAAY,EACZ,SAAS,EACV,MAAM,yCAAyC,CAAC;AACjD,OAAO,EAAE,kBAAkB,EAAE,KAAK,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAChF,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;;;;AAGpF,qBAAa,sBAAuB,SAAQ,4BAA2C;IACrF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,iBAAkB,SAAQ,uBAAsC;IAC3E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,SAAS,EAAE,SAAS,CAAC;CAC/B,CAAC;CAAG;;;;AAEL,qBAAa,iBAAkB,SAAQ,uBAAsC;IAC3E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;AAEL,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,SAAS,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,GAAG,QAAQ,GAAG,QAAQ,GAAG,MAAM,CAAC;IAC9C,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAGD,KAAK,gBAAgB,GAAG,YAAY,GAAG,UAAU,GAAG,YAAY,CAAC;AAGjE,KAAK,mBAAmB,GAAG,gBAAgB,GAAG,KAAK,GAAG,sBAAsB,GAAG,iBAAiB,GAAG,iBAAiB,CAAC;AAErH,MAAM,WAAW,qBAAqB;IACpC;;OAEG;IACH,yBAAyB,EAAE,CACzB,QAAQ,EAAE,YAAY,KACnB,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IAEhC;;OAEG;IACH,uBAAuB,EAAE,CACvB,QAAQ,EAAE,YAAY,KACnB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;IAE1C;;OAEG;IACH,sBAAsB,EAAE,CACtB,QAAQ,EAAE,YAAY,KACnB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;IAE1C;;OAEG;IACH,oBAAoB,EAAE,CACpB,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;QACR,WAAW,CAAC,EAAE,OAAO,CAAC;QACtB,UAAU,CAAC,EAAE,OAAO,CAAC;QACrB,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KACxC,KACE,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC;IAEtD;;OAEG;IACH,mBAAmB,EAAE,CACnB,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,YAAY,EACtB,IAAI,EAAE,SAAS,KACZ,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE5B;;OAEG;IACH,YAAY,EAAE,CACZ,IAAI,EAAE,SAAS,EACf,UAAU,CAAC,EAAE,MAAM,KAChB,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,mBAAmB,CAAC,CAAC;CACjD;AAED,YAAY,EAAE,mBAAmB,EAAE,CAAC;;AAEpC,qBAAa,cAAe,SAAQ,mBAGjC;CAAG;AAEN;;GAEG;
AACH,eAAO,MAAM,kBAAkB,+FAkd7B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,kBAAkB,sFAG9B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/HttpClient/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,oBAAoB,CAAC;AACnC,cAAc,yBAAyB,CAAC;AACxC,cAAc,mBAAmB,CAAC;AAClC,cAAc,qBAAqB,CAAC"}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { Effect } from 'effect';
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for link extraction behavior.
|
|
4
|
+
*
|
|
5
|
+
* Focuses purely on HOW to extract links from HTML documents,
|
|
6
|
+
* not on processing or validating the extracted URLs.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```typescript
|
|
10
|
+
* // Extract from specific CSS selectors
|
|
11
|
+
* const config: LinkExtractorConfig = {
|
|
12
|
+
* restrictCss: ['a.product-link', 'form[action]'],
|
|
13
|
+
* tags: ['a', 'form'],
|
|
14
|
+
* attrs: ['href', 'action']
|
|
15
|
+
* };
|
|
16
|
+
*
|
|
17
|
+
* // Extract from all standard elements
|
|
18
|
+
* const config: LinkExtractorConfig = {
|
|
19
|
+
* tags: ['a', 'area', 'form', 'frame', 'iframe'],
|
|
20
|
+
* attrs: ['href', 'action', 'src']
|
|
21
|
+
* };
|
|
22
|
+
* ```
|
|
23
|
+
*
|
|
24
|
+
* @group LinkExtractor
|
|
25
|
+
* @public
|
|
26
|
+
*/
|
|
27
|
+
export interface LinkExtractorConfig {
|
|
28
|
+
/**
|
|
29
|
+
* CSS selectors to restrict extraction to specific elements.
|
|
30
|
+
* If specified, only elements matching these selectors will be processed.
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```typescript
|
|
34
|
+
* restrictCss: [
|
|
35
|
+
* 'a.product-link', // Only product links
|
|
36
|
+
* '.content a', // Links within content area
|
|
37
|
+
* 'form[method="post"]' // POST forms only
|
|
38
|
+
* ]
|
|
39
|
+
* ```
|
|
40
|
+
*/
|
|
41
|
+
readonly restrictCss?: string[];
|
|
42
|
+
/**
|
|
43
|
+
* HTML tag names to extract links from.
|
|
44
|
+
* Defaults to common link-containing elements.
|
|
45
|
+
*
|
|
46
|
+
* @example ['a', 'area', 'form', 'frame', 'iframe', 'link']
|
|
47
|
+
*/
|
|
48
|
+
readonly tags?: string[];
|
|
49
|
+
/**
|
|
50
|
+
* HTML attributes to extract URLs from.
|
|
51
|
+
* Defaults to common URL-containing attributes.
|
|
52
|
+
*
|
|
53
|
+
* @example ['href', 'action', 'src', 'data-url']
|
|
54
|
+
*/
|
|
55
|
+
readonly attrs?: string[];
|
|
56
|
+
/**
|
|
57
|
+
* Whether to extract URLs from form input elements.
|
|
58
|
+
* Looks for hidden inputs with URL-like names/values.
|
|
59
|
+
*
|
|
60
|
+
* @default false
|
|
61
|
+
*/
|
|
62
|
+
readonly extractFromInputs?: boolean;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Result of link extraction from an HTML document.
|
|
66
|
+
*
|
|
67
|
+
* Contains the raw extracted URLs without any processing or validation.
|
|
68
|
+
*
|
|
69
|
+
* @group LinkExtractor
|
|
70
|
+
* @public
|
|
71
|
+
*/
|
|
72
|
+
export interface LinkExtractionResult {
|
|
73
|
+
/**
|
|
74
|
+
* Raw URLs extracted from the HTML document.
|
|
75
|
+
* These are unprocessed and may be relative URLs, fragments, etc.
|
|
76
|
+
*/
|
|
77
|
+
readonly links: string[];
|
|
78
|
+
/**
|
|
79
|
+
* Total number of potential URL-containing elements found.
|
|
80
|
+
* Includes elements that didn't yield valid URLs.
|
|
81
|
+
*/
|
|
82
|
+
readonly totalElementsProcessed: number;
|
|
83
|
+
/**
|
|
84
|
+
* Breakdown of extraction by element type.
|
|
85
|
+
* Maps element types to the number of URLs extracted from them.
|
|
86
|
+
*/
|
|
87
|
+
readonly extractionBreakdown: Record<string, number>;
|
|
88
|
+
}
|
|
89
|
+
declare const LinkExtractionError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
90
|
+
readonly _tag: "LinkExtractionError";
|
|
91
|
+
} & Readonly<A>;
|
|
92
|
+
/**
|
|
93
|
+
* Error that can occur during link extraction.
|
|
94
|
+
*
|
|
95
|
+
* @group Errors
|
|
96
|
+
* @public
|
|
97
|
+
*/
|
|
98
|
+
export declare class LinkExtractionError extends LinkExtractionError_base<{
|
|
99
|
+
readonly message: string;
|
|
100
|
+
readonly cause?: unknown;
|
|
101
|
+
}> {
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Service interface for extracting links from HTML documents.
|
|
105
|
+
*
|
|
106
|
+
* This service focuses purely on extraction - it does not process,
|
|
107
|
+
* validate, or filter the extracted URLs in any way.
|
|
108
|
+
*
|
|
109
|
+
* @group Services
|
|
110
|
+
* @public
|
|
111
|
+
*/
|
|
112
|
+
export interface LinkExtractorServiceInterface {
|
|
113
|
+
/**
|
|
114
|
+
* Extracts all URLs from an HTML document based on configuration.
|
|
115
|
+
*
|
|
116
|
+
* This method only extracts URLs from the HTML - it does not:
|
|
117
|
+
* - Validate URLs
|
|
118
|
+
* - Resolve relative URLs to absolute URLs
|
|
119
|
+
* - Apply domain or pattern filtering
|
|
120
|
+
* - Canonicalize URLs
|
|
121
|
+
*
|
|
122
|
+
* URL processing should be handled separately by the consumer.
|
|
123
|
+
*
|
|
124
|
+
* @param html - The HTML content to extract links from
|
|
125
|
+
* @param config - Configuration for extraction behavior
|
|
126
|
+
* @returns Effect containing the extraction result
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* const extractor = yield* LinkExtractorService;
|
|
131
|
+
* const result = yield* extractor.extractLinks(htmlContent, {
|
|
132
|
+
* tags: ['a', 'form'],
|
|
133
|
+
* attrs: ['href', 'action'],
|
|
134
|
+
* restrictCss: ['.content a']
|
|
135
|
+
* });
|
|
136
|
+
*
|
|
137
|
+
* console.log(`Found ${result.links.length} raw URLs`);
|
|
138
|
+
* // URLs may be relative, absolute, fragments, etc.
|
|
139
|
+
* ```
|
|
140
|
+
*/
|
|
141
|
+
extractLinks: (html: string, config?: LinkExtractorConfig) => Effect.Effect<LinkExtractionResult, LinkExtractionError>;
|
|
142
|
+
}
|
|
143
|
+
declare const LinkExtractorService_base: Effect.Service.Class<LinkExtractorService, "@jambudipa.io/LinkExtractorService", {
|
|
144
|
+
readonly effect: Effect.Effect<{
|
|
145
|
+
extractLinks: (html: string, config?: LinkExtractorConfig) => Effect.Effect<LinkExtractionResult, LinkExtractionError, never>;
|
|
146
|
+
}, never, never>;
|
|
147
|
+
}>;
|
|
148
|
+
/**
|
|
149
|
+
* Implementation of the LinkExtractorService.
|
|
150
|
+
*
|
|
151
|
+
* Provides pure HTML link extraction without any URL processing.
|
|
152
|
+
*
|
|
153
|
+
* @group Services
|
|
154
|
+
* @public
|
|
155
|
+
*/
|
|
156
|
+
export declare class LinkExtractorService extends LinkExtractorService_base {
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Default layer for LinkExtractorService.
|
|
160
|
+
*
|
|
161
|
+
* @group Layers
|
|
162
|
+
* @public
|
|
163
|
+
*/
|
|
164
|
+
export declare const LinkExtractorServiceLayer: import("effect/Layer").Layer<LinkExtractorService, never, never>;
|
|
165
|
+
export {};
|
|
166
|
+
//# sourceMappingURL=LinkExtractor.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"LinkExtractor.service.d.ts","sourceRoot":"","sources":["../../../src/lib/LinkExtractor/LinkExtractor.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAe,MAAM,EAAU,MAAM,QAAQ,CAAC;AAIrD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,WAAW,mBAAmB;IAClC;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IAEhC;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAEzB;;;;;OAKG;IACH,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B;;;;;OAKG;IACH,QAAQ,CAAC,iBAAiB,CAAC,EAAE,OAAO,CAAC;CACtC;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;OAGG;IACH,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC;IAEzB;;;OAGG;IACH,QAAQ,CAAC,sBAAsB,EAAE,MAAM,CAAC;IAExC;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACtD;;;;AAED;;;;;GAKG;AACH,qBAAa,mBAAoB,SAAQ,yBAEvC;IACA,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;CAC1B,CAAC;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,6BAA6B;IAC5C;;;;;;;;;;;;;;;;;;;;;;;;;;;OA2BG;IACH,YAAY,EAAE,CACZ,IAAI,EAAE,MAAM,EACZ,MAAM,CAAC,EAAE,mBAAmB,KACzB,MAAM,CAAC,MAAM,CAAC,oBAAoB,EAAE,mBAAmB,CAAC,CAAC;CAC/D;;;6BAyB0B,MAAM,WAAW,mBAAmB;;;AAZ/D;;;;;;;GAOG;AACH,qBAAa,oBAAqB,SAAQ,yBAoBzC;CAAG;AAEJ;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,kEAA+B,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Advanced link extraction functionality for the Spider framework.
|
|
3
|
+
*
|
|
4
|
+
* This module provides Scrapy-equivalent link extraction capabilities with support for:
|
|
5
|
+
* - CSS selector-based extraction
|
|
6
|
+
* - Pattern-based filtering (allow/deny regex patterns)
|
|
7
|
+
* - Domain-based filtering
|
|
8
|
+
* - URL canonicalization
|
|
9
|
+
* - Duplicate removal
|
|
10
|
+
* - Comprehensive extraction statistics
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* import { LinkExtractorService, type LinkExtractorConfig } from '@jambudipa/spider/LinkExtractor';
|
|
15
|
+
*
|
|
16
|
+
* const program = Effect.gen(function* () {
|
|
17
|
+
* const extractor = yield* LinkExtractorService;
|
|
18
|
+
*
|
|
19
|
+
* const result = yield* extractor.extractLinks(
|
|
20
|
+
* htmlContent,
|
|
21
|
+
{
|
|
22
|
+
tags: ['a', 'form'],
|
|
23
|
+
attrs: ['href', 'action'],
|
|
24
|
+
restrictCss: ['.content a'],
|
|
25
|
+
extractFromInputs: false
|
|
26
|
+
* }
|
|
27
|
+
* );
|
|
28
|
+
*
|
|
29
|
+
* console.log(`Extracted ${result.links.length} links`);
|
|
30
|
+
* });
|
|
31
|
+
* ```
|
|
32
|
+
*
|
|
33
|
+
* @group LinkExtractor
|
|
34
|
+
* @public
|
|
35
|
+
*/
|
|
36
|
+
export { LinkExtractorService, LinkExtractorServiceLayer, type LinkExtractorConfig, type LinkExtractionResult, type LinkExtractorServiceInterface, LinkExtractionError, } from './LinkExtractor.service.js';
|
|
37
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/LinkExtractor/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EACzB,KAAK,6BAA6B,EAClC,mBAAmB,GACpB,MAAM,4BAA4B,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { Context, Effect } from 'effect';
|
|
2
|
+
import { SpiderLogger } from './SpiderLogger.service.js';
|
|
3
|
+
declare const FetchError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
4
|
+
readonly _tag: "FetchError";
|
|
5
|
+
} & Readonly<A>;
|
|
6
|
+
/**
|
|
7
|
+
* Tagged error for fetch operations
|
|
8
|
+
*/
|
|
9
|
+
export declare class FetchError extends FetchError_base<{
|
|
10
|
+
readonly url: string;
|
|
11
|
+
readonly reason: 'timeout' | 'network' | 'unknown';
|
|
12
|
+
readonly durationMs: number;
|
|
13
|
+
readonly cause?: unknown;
|
|
14
|
+
}> {
|
|
15
|
+
get message(): string;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Wrapper for fetch that adds comprehensive logging
|
|
19
|
+
*/
|
|
20
|
+
export declare const makeLoggingFetch: Effect.Effect<(url: string, options?: RequestInit) => Effect.Effect<Response, FetchError>, never, SpiderLogger>;
|
|
21
|
+
export type LoggingFetchFn = (url: string, options?: RequestInit) => Effect.Effect<Response, FetchError>;
|
|
22
|
+
export declare const LoggingFetch: Context.Tag<LoggingFetchFn, LoggingFetchFn>;
|
|
23
|
+
export {};
|
|
24
|
+
//# sourceMappingURL=FetchLogger.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FetchLogger.d.ts","sourceRoot":"","sources":["../../../src/lib/Logging/FetchLogger.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAA4B,MAAM,EAAU,MAAM,QAAQ,CAAC;AAC3E,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;;;;AAEzD;;GAEG;AACH,qBAAa,UAAW,SAAQ,gBAA+B;IAC7D,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,MAAM,EAAE,SAAS,GAAG,SAAS,GAAG,SAAS,CAAC;IACnD,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;CAC1B,CAAC;IACA,IAAI,OAAO,IAAI,MAAM,CAEpB;CACF;AAED;;GAEG;AACH,eAAO,MAAM,gBAAgB,sBAGd,MAAM,YAAY,WAAW,KAAG,MAAM,CAAC,MAAM,CAAC,QAAQ,EAAE,UAAU,CAAC,sBA+HhF,CAAC;AAEH,MAAM,MAAM,cAAc,GAAG,CAC3B,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE,WAAW,KAClB,MAAM,CAAC,MAAM,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;AAEzC,eAAO,MAAM,YAAY,6CAAqD,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { Context, Effect, Layer } from 'effect';
|
|
2
|
+
export interface SpiderLogEvent {
|
|
3
|
+
timestamp: string;
|
|
4
|
+
type: 'domain_start' | 'domain_complete' | 'domain_error' | 'page_scraped' | 'queue_status' | 'worker_status' | 'rate_limit' | 'spider_lifecycle' | 'worker_lifecycle' | 'worker_state' | 'completion_monitor' | 'edge_case' | 'crawl_delay_capped';
|
|
5
|
+
domain?: string;
|
|
6
|
+
url?: string;
|
|
7
|
+
workerId?: string;
|
|
8
|
+
fiberId?: string;
|
|
9
|
+
message: string;
|
|
10
|
+
details?: Record<string, unknown>;
|
|
11
|
+
}
|
|
12
|
+
export interface SpiderLoggerService {
|
|
13
|
+
readonly logEvent: (event: Omit<SpiderLogEvent, 'timestamp'>) => Effect.Effect<void>;
|
|
14
|
+
readonly logDomainStart: (domain: string, startUrl: string) => Effect.Effect<void>;
|
|
15
|
+
readonly logDomainComplete: (domain: string, pagesScraped: number, reason: 'max_pages' | 'queue_empty' | 'error') => Effect.Effect<void>;
|
|
16
|
+
readonly logPageScraped: (url: string, domain: string, pageNumber: number) => Effect.Effect<void>;
|
|
17
|
+
readonly logQueueStatus: (domain: string, queueSize: number, activeWorkers: number) => Effect.Effect<void>;
|
|
18
|
+
readonly logRateLimit: (domain: string, requestsInWindow: number) => Effect.Effect<void>;
|
|
19
|
+
readonly logSpiderLifecycle: (event: 'start' | 'complete' | 'error', details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
20
|
+
readonly logWorkerLifecycle: (workerId: string, domain: string, event: 'created' | 'entering_loop' | 'exiting_loop', reason?: string, details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
21
|
+
readonly logWorkerState: (workerId: string, domain: string, event: 'taking_task' | 'marked_active' | 'marked_idle' | 'task_completed', details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
22
|
+
readonly logCompletionMonitor: (domain: string, checkCount: number, queueSize: number, activeWorkers: number, stableCount: number, maxPagesReached: boolean, decision: string) => Effect.Effect<void>;
|
|
23
|
+
readonly logEdgeCase: (domain: string, caseType: string, details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
24
|
+
readonly logDomainStatus: (domain: string, status: {
|
|
25
|
+
pagesScraped: number;
|
|
26
|
+
queueSize: number;
|
|
27
|
+
activeWorkers: number;
|
|
28
|
+
maxWorkers: number;
|
|
29
|
+
}) => Effect.Effect<void>;
|
|
30
|
+
}
|
|
31
|
+
declare const SpiderLogger_base: Context.TagClass<SpiderLogger, "SpiderLogger", SpiderLoggerService>;
|
|
32
|
+
export declare class SpiderLogger extends SpiderLogger_base {
|
|
33
|
+
}
|
|
34
|
+
export declare const makeSpiderLogger: (logDir?: string) => SpiderLoggerService;
|
|
35
|
+
export declare const SpiderLoggerLive: Layer.Layer<SpiderLogger, never, never>;
|
|
36
|
+
export {};
|
|
37
|
+
//# sourceMappingURL=SpiderLogger.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SpiderLogger.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Logging/SpiderLogger.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,OAAO,EAAY,MAAM,EAAE,KAAK,EAAU,MAAM,QAAQ,CAAC;AAI3E,MAAM,WAAW,cAAc;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EACA,cAAc,GACd,iBAAiB,GACjB,cAAc,GACd,cAAc,GACd,cAAc,GACd,eAAe,GACf,YAAY,GACZ,kBAAkB,GAClB,kBAAkB,GAClB,cAAc,GACd,oBAAoB,GACpB,WAAW,GACX,oBAAoB,CAAC;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,QAAQ,EAAE,CACjB,KAAK,EAAE,IAAI,CAAC,cAAc,EAAE,WAAW,CAAC,KACrC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,iBAAiB,EAAE,CAC1B,MAAM,EAAE,MAAM,EACd,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,WAAW,GAAG,aAAa,GAAG,OAAO,KAC1C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,KACf,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,KAClB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,YAAY,EAAE,CACrB,MAAM,EAAE,MAAM,EACd,gBAAgB,EAAE,MAAM,KACrB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,kBAAkB,EAAE,CAC3B,KAAK,EAAE,OAAO,GAAG,UAAU,GAAG,OAAO,EACrC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAGzB,QAAQ,CAAC,kBAAkB,EAAE,CAC3B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,SAAS,GAAG,eAAe,GAAG,cAAc,EACnD,MAAM,CAAC,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,aAAa,GAAG,eAAe,GAAG,aAAa,GAAG,gBAAgB,EACzE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CA
AC,oBAAoB,EAAE,CAC7B,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,EACrB,WAAW,EAAE,MAAM,EACnB,eAAe,EAAE,OAAO,EACxB,QAAQ,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,WAAW,EAAE,CACpB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,eAAe,EAAE,CACxB,MAAM,EAAE,MAAM,EACd,MAAM,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,SAAS,EAAE,MAAM,CAAC;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;KACpB,KACE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CAC1B;;AAED,qBAAa,YAAa,SAAQ,iBAG/B;CAAG;AAiCN,eAAO,MAAM,gBAAgB,GAAI,eAAwB,KAAG,mBA2U3D,CAAC;AAEF,eAAO,MAAM,gBAAgB,yCAAkD,CAAC"}
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import { Effect, Option } from 'effect';
|
|
2
|
+
import { MiddlewareError } from '../errors.js';
|
|
3
|
+
import { SpiderRequest, SpiderResponse } from './types.js';
|
|
4
|
+
export { SpiderRequest, SpiderResponse } from './types.js';
|
|
5
|
+
/**
|
|
6
|
+
* Interface for implementing custom middleware components.
|
|
7
|
+
*
|
|
8
|
+
* Middleware can intercept and modify requests before they're sent,
|
|
9
|
+
* responses after they're received, and handle exceptions that occur
|
|
10
|
+
* during processing. All methods are optional.
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* const loggingMiddleware: SpiderMiddleware = {
|
|
15
|
+
* processRequest: (request) => Effect.gen(function* () {
|
|
16
|
+
* console.log(`Requesting: ${request.task.url}`);
|
|
17
|
+
* return request;
|
|
18
|
+
* }),
|
|
19
|
+
*
|
|
20
|
+
* processResponse: (response, request) => Effect.gen(function* () {
|
|
21
|
+
* console.log(`Response: ${response.statusCode} for ${request.task.url}`);
|
|
22
|
+
* return response;
|
|
23
|
+
* }),
|
|
24
|
+
*
|
|
25
|
+
* processException: (error, request) => Effect.gen(function* () {
|
|
26
|
+
* console.error(`Error processing ${request.task.url}: ${error.message}`);
|
|
27
|
+
* return Option.none(); // Let the error propagate
|
|
28
|
+
* })
|
|
29
|
+
* };
|
|
30
|
+
* ```
|
|
31
|
+
*
|
|
32
|
+
* @group Interfaces
|
|
33
|
+
* @public
|
|
34
|
+
*/
|
|
35
|
+
export interface SpiderMiddleware {
|
|
36
|
+
/**
|
|
37
|
+
* Process a request before it's sent to the target server.
|
|
38
|
+
* Can modify headers, metadata, or reject the request entirely.
|
|
39
|
+
*/
|
|
40
|
+
processRequest?: (_request: SpiderRequest) => Effect.Effect<SpiderRequest, MiddlewareError>;
|
|
41
|
+
/**
|
|
42
|
+
* Process a response after it's received from the target server.
|
|
43
|
+
* Can modify the response data or metadata.
|
|
44
|
+
*/
|
|
45
|
+
processResponse?: (_response: SpiderResponse, _request: SpiderRequest) => Effect.Effect<SpiderResponse, MiddlewareError>;
|
|
46
|
+
/**
|
|
47
|
+
* Handle exceptions that occur during request processing.
|
|
48
|
+
* Can attempt recovery by returning Option.some(SpiderResponse), or return Option.none() to propagate the error.
|
|
49
|
+
*/
|
|
50
|
+
processException?: (_error: Error, _request: SpiderRequest) => Effect.Effect<Option.Option<SpiderResponse>, MiddlewareError>;
|
|
51
|
+
}
|
|
52
|
+
declare const MiddlewareManager_base: Effect.Service.Class<MiddlewareManager, "@jambudipa.io/MiddlewareManager", {
|
|
53
|
+
readonly effect: Effect.Effect<{
|
|
54
|
+
/**
|
|
55
|
+
* Processes a request through the middleware pipeline.
|
|
56
|
+
*
|
|
57
|
+
* Middleware are executed in order from first to last, with each middleware
|
|
58
|
+
* receiving the output of the previous middleware as input.
|
|
59
|
+
*
|
|
60
|
+
* @param request - The initial request to process
|
|
61
|
+
* @param middlewares - Array of middleware to apply
|
|
62
|
+
* @returns Effect containing the processed request
|
|
63
|
+
*/
|
|
64
|
+
processRequest: (request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderRequest, MiddlewareError, never>;
|
|
65
|
+
/**
|
|
66
|
+
* Processes a response through the middleware pipeline in reverse order.
|
|
67
|
+
*
|
|
68
|
+
* Middleware are executed in reverse order (last to first) to provide
|
|
69
|
+
* proper nesting of response processing.
|
|
70
|
+
*
|
|
71
|
+
* @param response - The response to process
|
|
72
|
+
* @param request - The original request (for context)
|
|
73
|
+
* @param middlewares - Array of middleware to apply
|
|
74
|
+
* @returns Effect containing the processed response
|
|
75
|
+
*/
|
|
76
|
+
processResponse: (response: SpiderResponse, request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderResponse, MiddlewareError, never>;
|
|
77
|
+
/**
|
|
78
|
+
* Processes an exception through the middleware pipeline in reverse order.
|
|
79
|
+
*
|
|
80
|
+
* Middleware are given a chance to handle or recover from exceptions.
|
|
81
|
+
* If a middleware returns Option.some(SpiderResponse), it indicates successful recovery.
|
|
82
|
+
* If it returns Option.none(), the exception continues to propagate.
|
|
83
|
+
*
|
|
84
|
+
* @param error - The error that occurred
|
|
85
|
+
* @param request - The request that caused the error
|
|
86
|
+
* @param middlewares - Array of middleware to apply
|
|
87
|
+
* @returns Effect containing a recovered response wrapped in Option
|
|
88
|
+
*/
|
|
89
|
+
processException: (error: Error, request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<Option.Option<SpiderResponse>, MiddlewareError, never>;
|
|
90
|
+
}, never, never>;
|
|
91
|
+
}>;
|
|
92
|
+
/**
|
|
93
|
+
* Manages the middleware pipeline for request and response processing.
|
|
94
|
+
*
|
|
95
|
+
* The MiddlewareManager orchestrates the execution of middleware in the correct order:
|
|
96
|
+
* - Requests are processed forward through the middleware array
|
|
97
|
+
* - Responses are processed in reverse order (last middleware first)
|
|
98
|
+
* - Exceptions are processed in reverse order for proper error handling
|
|
99
|
+
*
|
|
100
|
+
* @example
|
|
101
|
+
* ```typescript
|
|
102
|
+
* const program = Effect.gen(function* () {
|
|
103
|
+
* const manager = yield* MiddlewareManager;
|
|
104
|
+
*
|
|
105
|
+
* const middleware = [
|
|
106
|
+
* rateLimitMiddleware,
|
|
107
|
+
* loggingMiddleware,
|
|
108
|
+
* userAgentMiddleware
|
|
109
|
+
* ];
|
|
110
|
+
*
|
|
111
|
+
* const request: SpiderRequest = {
|
|
112
|
+
* task: { url: 'https://example.com', depth: 0 },
|
|
113
|
+
* headers: {}
|
|
114
|
+
* };
|
|
115
|
+
*
|
|
116
|
+
* const processedRequest = yield* manager.processRequest(request, middleware);
|
|
117
|
+
* console.log('Request processed through middleware pipeline');
|
|
118
|
+
* });
|
|
119
|
+
* ```
|
|
120
|
+
*
|
|
121
|
+
* @group Services
|
|
122
|
+
* @public
|
|
123
|
+
*/
|
|
124
|
+
export declare class MiddlewareManager extends MiddlewareManager_base {
|
|
125
|
+
}
|
|
126
|
+
declare const RateLimitMiddleware_base: Effect.Service.Class<RateLimitMiddleware, "@jambudipa.io/RateLimitMiddleware", {
|
|
127
|
+
readonly effect: Effect.Effect<{
|
|
128
|
+
create: (config: {
|
|
129
|
+
maxConcurrentRequests: number;
|
|
130
|
+
maxRequestsPerSecondPerDomain: number;
|
|
131
|
+
requestDelayMs?: number;
|
|
132
|
+
}) => SpiderMiddleware;
|
|
133
|
+
}, never, never>;
|
|
134
|
+
}>;
|
|
135
|
+
/**
|
|
136
|
+
* Provides rate limiting functionality for respectful crawling.
|
|
137
|
+
*
|
|
138
|
+
* Controls request frequency at both global and per-domain levels to prevent
|
|
139
|
+
* overwhelming target servers and avoid being blocked.
|
|
140
|
+
*
|
|
141
|
+
* @example
|
|
142
|
+
* ```typescript
|
|
143
|
+
* const rateLimiter = yield* RateLimitMiddleware;
|
|
144
|
+
* const middleware = rateLimiter.create({
|
|
145
|
+
* maxConcurrentRequests: 5,
|
|
146
|
+
* maxRequestsPerSecondPerDomain: 2,
|
|
147
|
+
* requestDelayMs: 250
|
|
148
|
+
* });
|
|
149
|
+
* ```
|
|
150
|
+
*
|
|
151
|
+
* @group Middleware
|
|
152
|
+
* @public
|
|
153
|
+
*/
|
|
154
|
+
export declare class RateLimitMiddleware extends RateLimitMiddleware_base {
|
|
155
|
+
}
|
|
156
|
+
declare const LoggingMiddleware_base: Effect.Service.Class<LoggingMiddleware, "@jambudipa.io/LoggingMiddleware", {
|
|
157
|
+
readonly effect: Effect.Effect<{
|
|
158
|
+
create: (config?: {
|
|
159
|
+
logRequests?: boolean;
|
|
160
|
+
logResponses?: boolean;
|
|
161
|
+
logErrors?: boolean;
|
|
162
|
+
logLevel?: "debug" | "info" | "warn" | "error";
|
|
163
|
+
}) => SpiderMiddleware;
|
|
164
|
+
}, never, never>;
|
|
165
|
+
}>;
|
|
166
|
+
/**
|
|
167
|
+
* Provides logging functionality using Effect.Logger.
|
|
168
|
+
*
|
|
169
|
+
* Logs requests, responses, and errors at configurable levels for debugging
|
|
170
|
+
* and monitoring purposes.
|
|
171
|
+
*
|
|
172
|
+
* @example
|
|
173
|
+
* ```typescript
|
|
174
|
+
* const logger = yield* LoggingMiddleware;
|
|
175
|
+
* const middleware = logger.create({
|
|
176
|
+
* logRequests: true,
|
|
177
|
+
* logResponses: true,
|
|
178
|
+
* logLevel: 'info'
|
|
179
|
+
* });
|
|
180
|
+
* ```
|
|
181
|
+
*
|
|
182
|
+
* @group Middleware
|
|
183
|
+
* @public
|
|
184
|
+
*/
|
|
185
|
+
export declare class LoggingMiddleware extends LoggingMiddleware_base {
|
|
186
|
+
}
|
|
187
|
+
declare const UserAgentMiddleware_base: Effect.Service.Class<UserAgentMiddleware, "@jambudipa.io/UserAgentMiddleware", {
|
|
188
|
+
readonly effect: Effect.Effect<{
|
|
189
|
+
create: (userAgent: string) => SpiderMiddleware;
|
|
190
|
+
}, never, never>;
|
|
191
|
+
}>;
|
|
192
|
+
/**
|
|
193
|
+
* Adds User-Agent headers to requests.
|
|
194
|
+
*
|
|
195
|
+
* Sets a consistent User-Agent string for all requests to identify
|
|
196
|
+
* your crawler to web servers.
|
|
197
|
+
*
|
|
198
|
+
* @example
|
|
199
|
+
* ```typescript
|
|
200
|
+
* const userAgent = yield* UserAgentMiddleware;
|
|
201
|
+
* const middleware = userAgent.create('MyBot/1.0 (+https://example.com)');
|
|
202
|
+
* ```
|
|
203
|
+
*
|
|
204
|
+
* @group Middleware
|
|
205
|
+
* @public
|
|
206
|
+
*/
|
|
207
|
+
export declare class UserAgentMiddleware extends UserAgentMiddleware_base {
|
|
208
|
+
}
|
|
209
|
+
declare const StatsMiddleware_base: Effect.Service.Class<StatsMiddleware, "@jambudipa.io/StatsMiddleware", {
|
|
210
|
+
readonly effect: Effect.Effect<{
|
|
211
|
+
create: () => {
|
|
212
|
+
middleware: SpiderMiddleware;
|
|
213
|
+
getStats: () => Effect.Effect<Record<string, number>>;
|
|
214
|
+
};
|
|
215
|
+
}, never, never>;
|
|
216
|
+
}>;
|
|
217
|
+
/**
|
|
218
|
+
* Collects statistics about crawling activity.
|
|
219
|
+
*
|
|
220
|
+
* Tracks various metrics including requests processed, response codes,
|
|
221
|
+
* bytes downloaded, and processing times for monitoring and optimization.
|
|
222
|
+
*
|
|
223
|
+
* @example
|
|
224
|
+
* ```typescript
|
|
225
|
+
* const statsService = yield* StatsMiddleware;
|
|
226
|
+
* const { middleware, getStats } = statsService.create();
|
|
227
|
+
*
|
|
228
|
+
* // Use middleware in your pipeline
|
|
229
|
+
* // Later get statistics
|
|
230
|
+
* const stats = yield* getStats();
|
|
231
|
+
* console.log(`Processed ${stats.requests_processed} requests`);
|
|
232
|
+
* ```
|
|
233
|
+
*
|
|
234
|
+
* @group Middleware
|
|
235
|
+
* @public
|
|
236
|
+
*/
|
|
237
|
+
export declare class StatsMiddleware extends StatsMiddleware_base {
|
|
238
|
+
}
|
|
239
|
+
//# sourceMappingURL=SpiderMiddleware.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SpiderMiddleware.d.ts","sourceRoot":"","sources":["../../../src/lib/Middleware/SpiderMiddleware.ts"],"names":[],"mappings":"AAAA,OAAO,EAAY,MAAM,EAAkB,MAAM,EAAE,MAAM,QAAQ,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC/C,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAE3D,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAE3D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,cAAc,CAAC,EAAE,CACf,QAAQ,EAAE,aAAa,KACpB,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,eAAe,CAAC,CAAC;IAEnD;;;OAGG;IACH,eAAe,CAAC,EAAE,CAChB,SAAS,EAAE,cAAc,EACzB,QAAQ,EAAE,aAAa,KACpB,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;IAEpD;;;OAGG;IACH,gBAAgB,CAAC,EAAE,CACjB,MAAM,EAAE,KAAK,EACb,QAAQ,EAAE,aAAa,KACpB,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,EAAE,eAAe,CAAC,CAAC;CACpE;;;QAsCK;;;;;;;;;WASG;kCAEQ,aAAa,eACT,gBAAgB,EAAE;QAQjC;;;;;;;;;;WAUG;oCAES,cAAc,WACf,aAAa,eACT,gBAAgB,EAAE;QAWjC;;;;;;;;;;;WAWG;kCAEM,KAAK,WACH,aAAa,eACT,gBAAgB,EAAE;;;AAhGvC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,qBAAa,iBAAkB,SAAQ,sBA4EtC;CAAG;;;yBA8BqB;YACf,qBAAqB,EAAE,MAAM,CAAC;YAC9B,6BAA6B,EAAE,MAAM,CAAC;YACtC,cAAc,CAAC,EAAE,MAAM,CAAC;SACzB,KAAG,gBAAgB;;;AAhC5B;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,mBAAoB,SAAQ,wBAqExC;CAAG;;;0BA0BY;YACN,WAAW,CAAC,EAAE,OAAO,CAAC;YACtB,YAAY,CAAC,EAAE,OAAO,CAAC;YACvB,SAAS,CAAC,EAAE,OAAO,CAAC;YACpB,QAAQ,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;SAChD,KACA,gBAAgB;;;AA9BzB;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,iBAAkB,SAAQ,sBA4EtC;CAAG;;;4BAqBsB,MAAM,KAAG,gBAAgB;;;AAnBnD;;;;;;;;;;;;;;GAcG;AACH,qBAAa,mBAAoB,SAAQ,wBAYxC;CAAG;;;sBA6BgB;YACV,UAAU,EAAE,gBAAgB,CAAC;YAC7B,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;SACvD;;;AA9BT;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,eAAgB,SAAQ,oBAqEpC;CAAG"}
|