@jambudipa/spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +426 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4681 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +8 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +194 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +99 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/package.json +108 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { Effect } from 'effect';
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for link extraction behavior.
|
|
4
|
+
*
|
|
5
|
+
* Focuses purely on HOW to extract links from HTML documents,
|
|
6
|
+
* not on processing or validating the extracted URLs.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```typescript
|
|
10
|
+
* // Extract from specific CSS selectors
|
|
11
|
+
* const config: LinkExtractorConfig = {
|
|
12
|
+
* restrictCss: ['a.product-link', 'form[action]'],
|
|
13
|
+
* tags: ['a', 'form'],
|
|
14
|
+
* attrs: ['href', 'action']
|
|
15
|
+
* };
|
|
16
|
+
*
|
|
17
|
+
* // Extract from all standard elements
|
|
18
|
+
* const config: LinkExtractorConfig = {
|
|
19
|
+
* tags: ['a', 'area', 'form', 'frame', 'iframe'],
|
|
20
|
+
* attrs: ['href', 'action', 'src']
|
|
21
|
+
* };
|
|
22
|
+
* ```
|
|
23
|
+
*
|
|
24
|
+
* @group LinkExtractor
|
|
25
|
+
* @public
|
|
26
|
+
*/
|
|
27
|
+
export interface LinkExtractorConfig {
|
|
28
|
+
/**
|
|
29
|
+
* CSS selectors to restrict extraction to specific elements.
|
|
30
|
+
* If specified, only elements matching these selectors will be processed.
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```typescript
|
|
34
|
+
* restrictCss: [
|
|
35
|
+
* 'a.product-link', // Only product links
|
|
36
|
+
* '.content a', // Links within content area
|
|
37
|
+
* 'form[method="post"]' // POST forms only
|
|
38
|
+
* ]
|
|
39
|
+
* ```
|
|
40
|
+
*/
|
|
41
|
+
readonly restrictCss?: string[];
|
|
42
|
+
/**
|
|
43
|
+
* HTML tag names to extract links from.
|
|
44
|
+
* Defaults to common link-containing elements.
|
|
45
|
+
*
|
|
46
|
+
* @example ['a', 'area', 'form', 'frame', 'iframe', 'link']
|
|
47
|
+
*/
|
|
48
|
+
readonly tags?: string[];
|
|
49
|
+
/**
|
|
50
|
+
* HTML attributes to extract URLs from.
|
|
51
|
+
* Defaults to common URL-containing attributes.
|
|
52
|
+
*
|
|
53
|
+
* @example ['href', 'action', 'src', 'data-url']
|
|
54
|
+
*/
|
|
55
|
+
readonly attrs?: string[];
|
|
56
|
+
/**
|
|
57
|
+
* Whether to extract URLs from form input elements.
|
|
58
|
+
* Looks for hidden inputs with URL-like names/values.
|
|
59
|
+
*
|
|
60
|
+
* @default false
|
|
61
|
+
*/
|
|
62
|
+
readonly extractFromInputs?: boolean;
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Result of link extraction from an HTML document.
|
|
66
|
+
*
|
|
67
|
+
* Contains the raw extracted URLs without any processing or validation.
|
|
68
|
+
*
|
|
69
|
+
* @group LinkExtractor
|
|
70
|
+
* @public
|
|
71
|
+
*/
|
|
72
|
+
export interface LinkExtractionResult {
|
|
73
|
+
/**
|
|
74
|
+
* Raw URLs extracted from the HTML document.
|
|
75
|
+
* These are unprocessed and may be relative URLs, fragments, etc.
|
|
76
|
+
*/
|
|
77
|
+
readonly links: string[];
|
|
78
|
+
/**
|
|
79
|
+
* Total number of potential URL-containing elements found.
|
|
80
|
+
* Includes elements that didn't yield valid URLs.
|
|
81
|
+
*/
|
|
82
|
+
readonly totalElementsProcessed: number;
|
|
83
|
+
/**
|
|
84
|
+
* Breakdown of extraction by element type.
|
|
85
|
+
* Maps element types to the number of URLs extracted from them.
|
|
86
|
+
*/
|
|
87
|
+
readonly extractionBreakdown: Record<string, number>;
|
|
88
|
+
}
|
|
89
|
+
declare const LinkExtractionError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
90
|
+
readonly _tag: "LinkExtractionError";
|
|
91
|
+
} & Readonly<A>;
|
|
92
|
+
/**
|
|
93
|
+
* Error that can occur during link extraction.
|
|
94
|
+
*
|
|
95
|
+
* @group Errors
|
|
96
|
+
* @public
|
|
97
|
+
*/
|
|
98
|
+
export declare class LinkExtractionError extends LinkExtractionError_base<{
|
|
99
|
+
readonly message: string;
|
|
100
|
+
readonly cause?: unknown;
|
|
101
|
+
}> {
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Service interface for extracting links from HTML documents.
|
|
105
|
+
*
|
|
106
|
+
* This service focuses purely on extraction - it does not process,
|
|
107
|
+
* validate, or filter the extracted URLs in any way.
|
|
108
|
+
*
|
|
109
|
+
* @group Services
|
|
110
|
+
* @public
|
|
111
|
+
*/
|
|
112
|
+
export interface LinkExtractorServiceInterface {
|
|
113
|
+
/**
|
|
114
|
+
* Extracts all URLs from an HTML document based on configuration.
|
|
115
|
+
*
|
|
116
|
+
* This method only extracts URLs from the HTML - it does not:
|
|
117
|
+
* - Validate URLs
|
|
118
|
+
* - Resolve relative URLs to absolute URLs
|
|
119
|
+
* - Apply domain or pattern filtering
|
|
120
|
+
* - Canonicalize URLs
|
|
121
|
+
*
|
|
122
|
+
* URL processing should be handled separately by the consumer.
|
|
123
|
+
*
|
|
124
|
+
* @param html - The HTML content to extract links from
|
|
125
|
+
* @param config - Configuration for extraction behavior
|
|
126
|
+
* @returns Effect containing the extraction result
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* const extractor = yield* LinkExtractorService;
|
|
131
|
+
* const result = yield* extractor.extractLinks(htmlContent, {
|
|
132
|
+
* tags: ['a', 'form'],
|
|
133
|
+
* attrs: ['href', 'action'],
|
|
134
|
+
* restrictCss: ['.content a']
|
|
135
|
+
* });
|
|
136
|
+
*
|
|
137
|
+
* console.log(`Found ${result.links.length} raw URLs`);
|
|
138
|
+
* // URLs may be relative, absolute, fragments, etc.
|
|
139
|
+
* ```
|
|
140
|
+
*/
|
|
141
|
+
extractLinks(html: string, config?: LinkExtractorConfig): Effect.Effect<LinkExtractionResult, LinkExtractionError>;
|
|
142
|
+
}
|
|
143
|
+
declare const LinkExtractorService_base: Effect.Service.Class<LinkExtractorService, "@jambudipa.io/LinkExtractorService", {
|
|
144
|
+
readonly effect: Effect.Effect<{
|
|
145
|
+
extractLinks: (html: string, config?: LinkExtractorConfig) => Effect.Effect<LinkExtractionResult, LinkExtractionError, never>;
|
|
146
|
+
}, never, never>;
|
|
147
|
+
}>;
|
|
148
|
+
/**
|
|
149
|
+
* Implementation of the LinkExtractorService.
|
|
150
|
+
*
|
|
151
|
+
* Provides pure HTML link extraction without any URL processing.
|
|
152
|
+
*
|
|
153
|
+
* @group Services
|
|
154
|
+
* @public
|
|
155
|
+
*/
|
|
156
|
+
export declare class LinkExtractorService extends LinkExtractorService_base {
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Default layer for LinkExtractorService.
|
|
160
|
+
*
|
|
161
|
+
* @group Layers
|
|
162
|
+
* @public
|
|
163
|
+
*/
|
|
164
|
+
export declare const LinkExtractorServiceLayer: import("effect/Layer").Layer<LinkExtractorService, never, never>;
|
|
165
|
+
export {};
|
|
166
|
+
//# sourceMappingURL=LinkExtractor.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"LinkExtractor.service.d.ts","sourceRoot":"","sources":["../../../src/lib/LinkExtractor/LinkExtractor.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAQ,MAAM,EAAE,MAAM,QAAQ,CAAC;AAItC;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,WAAW,mBAAmB;IAClC;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IAEhC;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAEzB;;;;;OAKG;IACH,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B;;;;;OAKG;IACH,QAAQ,CAAC,iBAAiB,CAAC,EAAE,OAAO,CAAC;CACtC;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;OAGG;IACH,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC;IAEzB;;;OAGG;IACH,QAAQ,CAAC,sBAAsB,EAAE,MAAM,CAAC;IAExC;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACtD;;;;AAED;;;;;GAKG;AACH,qBAAa,mBAAoB,SAAQ,yBAEvC;IACA,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;CAC1B,CAAC;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,6BAA6B;IAC5C;;;;;;;;;;;;;;;;;;;;;;;;;;;OA2BG;IACH,YAAY,CACV,IAAI,EAAE,MAAM,EACZ,MAAM,CAAC,EAAE,mBAAmB,GAC3B,MAAM,CAAC,MAAM,CAAC,oBAAoB,EAAE,mBAAmB,CAAC,CAAC;CAC7D;;;6BAyB0B,MAAM,WAAW,mBAAmB;;;AAZ/D;;;;;;;GAOG;AACH,qBAAa,oBAAqB,SAAQ,yBAsBzC;CAAG;AAEJ;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,kEAA+B,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Advanced link extraction functionality for the Spider framework.
|
|
3
|
+
*
|
|
4
|
+
* This module provides raw link extraction from HTML documents with support for:
|
|
5
|
+
* - CSS selector-based extraction
|
|
6
|
+
* - Tag-based extraction (`tags`)
|
|
7
|
+
* - Attribute-based extraction (`attrs`)
|
|
8
|
+
* - Optional extraction from form input elements (`extractFromInputs`)
|
|
9
|
+
* - Duplicate removal
|
|
10
|
+
* - Comprehensive extraction statistics
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* import { LinkExtractorService, type LinkExtractorConfig } from '@jambudipa/spider';
|
|
15
|
+
*
|
|
16
|
+
* const program = Effect.gen(function* () {
|
|
17
|
+
* const extractor = yield* LinkExtractorService;
|
|
18
|
+
*
|
|
19
|
+
* const result = yield* extractor.extractLinks(
|
|
20
|
+
* htmlContent,
|
|
21
|
+
* {
|
|
22
|
+
* tags: ['a'],
|
|
23
|
+
* attrs: ['href'],
|
|
24
|
+
* restrictCss: ['.content a'],
|
|
25
|
+
* extractFromInputs: false
|
|
26
|
+
* }
|
|
27
|
+
* );
|
|
28
|
+
*
|
|
29
|
+
* console.log(`Extracted ${result.links.length} links`);
|
|
30
|
+
* });
|
|
31
|
+
* ```
|
|
32
|
+
*
|
|
33
|
+
* @group LinkExtractor
|
|
34
|
+
* @public
|
|
35
|
+
*/
|
|
36
|
+
export { LinkExtractorService, LinkExtractorServiceLayer, type LinkExtractorConfig, type LinkExtractionResult, type LinkExtractorServiceInterface, LinkExtractionError, } from './LinkExtractor.service.js';
|
|
37
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/LinkExtractor/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EACzB,KAAK,6BAA6B,EAClC,mBAAmB,GACpB,MAAM,4BAA4B,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { Context, Effect } from 'effect';
|
|
2
|
+
import { SpiderLogger } from './SpiderLogger.service.js';
|
|
3
|
+
/**
|
|
4
|
+
* Wrapper for fetch that adds comprehensive logging
|
|
5
|
+
*/
|
|
6
|
+
export declare const makeLoggingFetch: Effect.Effect<(url: string, options?: RequestInit) => Promise<Response>, never, SpiderLogger>;
|
|
7
|
+
export declare const LoggingFetch: Context.Tag<(url: string, options?: RequestInit) => Promise<Response>, (url: string, options?: RequestInit) => Promise<Response>>;
|
|
8
|
+
//# sourceMappingURL=FetchLogger.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FetchLogger.d.ts","sourceRoot":"","sources":["../../../src/lib/Logging/FetchLogger.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AAEzD;;GAEG;AACH,eAAO,MAAM,gBAAgB,sBAGd,MAAM,YAAY,WAAW,4CAgG1C,CAAC;AAEH,eAAO,MAAM,YAAY,oBACE,MAAM,YAAY,WAAW,KAAK,OAAO,CAAC,QAAQ,CAAC,QAAnD,MAAM,YAAY,WAAW,KAAK,OAAO,CAAC,QAAQ,CAAC,CAE3E,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { Context, Effect, Layer } from 'effect';
|
|
2
|
+
export interface SpiderLogEvent {
|
|
3
|
+
timestamp: string;
|
|
4
|
+
type: 'domain_start' | 'domain_complete' | 'domain_error' | 'page_scraped' | 'queue_status' | 'worker_status' | 'rate_limit' | 'spider_lifecycle' | 'worker_lifecycle' | 'worker_state' | 'completion_monitor' | 'edge_case' | 'crawl_delay_capped';
|
|
5
|
+
domain?: string;
|
|
6
|
+
url?: string;
|
|
7
|
+
workerId?: string;
|
|
8
|
+
fiberId?: string;
|
|
9
|
+
message: string;
|
|
10
|
+
details?: Record<string, unknown>;
|
|
11
|
+
}
|
|
12
|
+
export interface SpiderLogger {
|
|
13
|
+
readonly logEvent: (event: Omit<SpiderLogEvent, 'timestamp'>) => Effect.Effect<void>;
|
|
14
|
+
readonly logDomainStart: (domain: string, startUrl: string) => Effect.Effect<void>;
|
|
15
|
+
readonly logDomainComplete: (domain: string, pagesScraped: number, reason: 'max_pages' | 'queue_empty' | 'error') => Effect.Effect<void>;
|
|
16
|
+
readonly logPageScraped: (url: string, domain: string, pageNumber: number) => Effect.Effect<void>;
|
|
17
|
+
readonly logQueueStatus: (domain: string, queueSize: number, activeWorkers: number) => Effect.Effect<void>;
|
|
18
|
+
readonly logRateLimit: (domain: string, requestsInWindow: number) => Effect.Effect<void>;
|
|
19
|
+
readonly logSpiderLifecycle: (event: 'start' | 'complete' | 'error', details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
20
|
+
readonly logWorkerLifecycle: (workerId: string, domain: string, event: 'created' | 'entering_loop' | 'exiting_loop', reason?: string, details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
21
|
+
readonly logWorkerState: (workerId: string, domain: string, event: 'taking_task' | 'marked_active' | 'marked_idle' | 'task_completed', details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
22
|
+
readonly logCompletionMonitor: (domain: string, checkCount: number, queueSize: number, activeWorkers: number, stableCount: number, maxPagesReached: boolean, decision: string) => Effect.Effect<void>;
|
|
23
|
+
readonly logEdgeCase: (domain: string, caseType: string, details?: Record<string, unknown>) => Effect.Effect<void>;
|
|
24
|
+
readonly logDomainStatus: (domain: string, status: {
|
|
25
|
+
pagesScraped: number;
|
|
26
|
+
queueSize: number;
|
|
27
|
+
activeWorkers: number;
|
|
28
|
+
maxWorkers: number;
|
|
29
|
+
}) => Effect.Effect<void>;
|
|
30
|
+
}
|
|
31
|
+
export declare const SpiderLogger: Context.Tag<SpiderLogger, SpiderLogger>;
|
|
32
|
+
export declare const makeSpiderLogger: (logDir?: string) => SpiderLogger;
|
|
33
|
+
export declare const SpiderLoggerLive: Layer.Layer<SpiderLogger, never, never>;
|
|
34
|
+
//# sourceMappingURL=SpiderLogger.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SpiderLogger.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Logging/SpiderLogger.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,QAAQ,CAAC;AAIzD,MAAM,WAAW,cAAc;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EACA,cAAc,GACd,iBAAiB,GACjB,cAAc,GACd,cAAc,GACd,cAAc,GACd,eAAe,GACf,YAAY,GACZ,kBAAkB,GAClB,kBAAkB,GAClB,cAAc,GACd,oBAAoB,GACpB,WAAW,GACX,oBAAoB,CAAC;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,QAAQ,EAAE,CACjB,KAAK,EAAE,IAAI,CAAC,cAAc,EAAE,WAAW,CAAC,KACrC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,iBAAiB,EAAE,CAC1B,MAAM,EAAE,MAAM,EACd,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,WAAW,GAAG,aAAa,GAAG,OAAO,KAC1C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,KACf,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,KAClB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,YAAY,EAAE,CACrB,MAAM,EAAE,MAAM,EACd,gBAAgB,EAAE,MAAM,KACrB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,kBAAkB,EAAE,CAC3B,KAAK,EAAE,OAAO,GAAG,UAAU,GAAG,OAAO,EACrC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAGzB,QAAQ,CAAC,kBAAkB,EAAE,CAC3B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,SAAS,GAAG,eAAe,GAAG,cAAc,EACnD,MAAM,CAAC,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,aAAa,GAAG,eAAe,GAAG,aAAa,GAAG,gBAAgB,EACzE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC
,oBAAoB,EAAE,CAC7B,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,EACrB,WAAW,EAAE,MAAM,EACnB,eAAe,EAAE,OAAO,EACxB,QAAQ,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,WAAW,EAAE,CACpB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,eAAe,EAAE,CACxB,MAAM,EAAE,MAAM,EACd,MAAM,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,SAAS,EAAE,MAAM,CAAC;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;KACpB,KACE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CAC1B;AAED,eAAO,MAAM,YAAY,yCAAmD,CAAC;AAE7E,eAAO,MAAM,gBAAgB,GAAI,eAAwB,KAAG,YA2R3D,CAAC;AAEF,eAAO,MAAM,gBAAgB,yCAAkD,CAAC"}
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import { Effect } from 'effect';
|
|
2
|
+
import { CrawlTask } from '../Spider/Spider.service.js';
|
|
3
|
+
import { PageData } from '../PageData/PageData.js';
|
|
4
|
+
import { MiddlewareError } from '../errors.js';
|
|
5
|
+
/**
|
|
6
|
+
* Request object used in the middleware pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Contains the crawl task along with optional headers and metadata
|
|
9
|
+
* that can be modified by middleware during processing.
|
|
10
|
+
*
|
|
11
|
+
* @group Interfaces
|
|
12
|
+
* @public
|
|
13
|
+
*/
|
|
14
|
+
export interface SpiderRequest {
|
|
15
|
+
/** The crawl task containing URL and depth information */
|
|
16
|
+
task: CrawlTask;
|
|
17
|
+
/** HTTP headers to include with the request */
|
|
18
|
+
headers?: Record<string, string>;
|
|
19
|
+
/** Additional metadata that can be used by middleware */
|
|
20
|
+
meta?: Record<string, unknown>;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Response object used in the middleware pipeline.
|
|
24
|
+
*
|
|
25
|
+
* Contains the extracted page data along with optional HTTP response
|
|
26
|
+
* information and metadata from middleware processing.
|
|
27
|
+
*
|
|
28
|
+
* @group Interfaces
|
|
29
|
+
* @public
|
|
30
|
+
*/
|
|
31
|
+
export interface SpiderResponse {
|
|
32
|
+
/** The extracted page data including content, links, and metadata */
|
|
33
|
+
pageData: PageData;
|
|
34
|
+
/** HTTP status code of the response */
|
|
35
|
+
statusCode?: number;
|
|
36
|
+
/** HTTP response headers */
|
|
37
|
+
headers?: Record<string, string>;
|
|
38
|
+
/** Additional metadata added by middleware */
|
|
39
|
+
meta?: Record<string, unknown>;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Interface for implementing custom middleware components.
|
|
43
|
+
*
|
|
44
|
+
* Middleware can intercept and modify requests before they're sent,
|
|
45
|
+
* responses after they're received, and handle exceptions that occur
|
|
46
|
+
* during processing. All methods are optional.
|
|
47
|
+
*
|
|
48
|
+
* @example
|
|
49
|
+
* ```typescript
|
|
50
|
+
* const loggingMiddleware: SpiderMiddleware = {
|
|
51
|
+
* processRequest: (request) => Effect.gen(function* () {
|
|
52
|
+
* console.log(`Requesting: ${request.task.url}`);
|
|
53
|
+
* return request;
|
|
54
|
+
* }),
|
|
55
|
+
*
|
|
56
|
+
* processResponse: (response, request) => Effect.gen(function* () {
|
|
57
|
+
* console.log(`Response: ${response.statusCode} for ${request.task.url}`);
|
|
58
|
+
* return response;
|
|
59
|
+
* }),
|
|
60
|
+
*
|
|
61
|
+
* processException: (error, request) => Effect.gen(function* () {
|
|
62
|
+
* console.error(`Error processing ${request.task.url}: ${error.message}`);
|
|
63
|
+
* return null; // Let the error propagate
|
|
64
|
+
* })
|
|
65
|
+
* };
|
|
66
|
+
* ```
|
|
67
|
+
*
|
|
68
|
+
* @group Interfaces
|
|
69
|
+
* @public
|
|
70
|
+
*/
|
|
71
|
+
export interface SpiderMiddleware {
|
|
72
|
+
/**
|
|
73
|
+
* Process a request before it's sent to the target server.
|
|
74
|
+
* Can modify headers, metadata, or reject the request entirely.
|
|
75
|
+
*/
|
|
76
|
+
processRequest?: (request: SpiderRequest) => Effect.Effect<SpiderRequest, MiddlewareError>;
|
|
77
|
+
/**
|
|
78
|
+
* Process a response after it's received from the target server.
|
|
79
|
+
* Can modify the response data or metadata.
|
|
80
|
+
*/
|
|
81
|
+
processResponse?: (response: SpiderResponse, request: SpiderRequest) => Effect.Effect<SpiderResponse, MiddlewareError>;
|
|
82
|
+
/**
|
|
83
|
+
* Handle exceptions that occur during request processing.
|
|
84
|
+
* Can attempt recovery by returning a SpiderResponse, or return null to propagate the error.
|
|
85
|
+
*/
|
|
86
|
+
processException?: (error: Error, request: SpiderRequest) => Effect.Effect<SpiderResponse | null, MiddlewareError>;
|
|
87
|
+
}
|
|
88
|
+
declare const MiddlewareManager_base: Effect.Service.Class<MiddlewareManager, "@jambudipa.io/MiddlewareManager", {
|
|
89
|
+
readonly effect: Effect.Effect<{
|
|
90
|
+
/**
|
|
91
|
+
* Processes a request through the middleware pipeline.
|
|
92
|
+
*
|
|
93
|
+
* Middleware are executed in order from first to last, with each middleware
|
|
94
|
+
* receiving the output of the previous middleware as input.
|
|
95
|
+
*
|
|
96
|
+
* @param request - The initial request to process
|
|
97
|
+
* @param middlewares - Array of middleware to apply
|
|
98
|
+
* @returns Effect containing the processed request
|
|
99
|
+
*/
|
|
100
|
+
processRequest: (request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderRequest, MiddlewareError, never>;
|
|
101
|
+
/**
|
|
102
|
+
* Processes a response through the middleware pipeline in reverse order.
|
|
103
|
+
*
|
|
104
|
+
* Middleware are executed in reverse order (last to first) to provide
|
|
105
|
+
* proper nesting of response processing.
|
|
106
|
+
*
|
|
107
|
+
* @param response - The response to process
|
|
108
|
+
* @param request - The original request (for context)
|
|
109
|
+
* @param middlewares - Array of middleware to apply
|
|
110
|
+
* @returns Effect containing the processed response
|
|
111
|
+
*/
|
|
112
|
+
processResponse: (response: SpiderResponse, request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderResponse, MiddlewareError, never>;
|
|
113
|
+
/**
|
|
114
|
+
* Processes an exception through the middleware pipeline in reverse order.
|
|
115
|
+
*
|
|
116
|
+
* Middleware are given a chance to handle or recover from exceptions.
|
|
117
|
+
* If a middleware returns a SpiderResponse, it indicates successful recovery.
|
|
118
|
+
* If it returns null, the exception continues to propagate.
|
|
119
|
+
*
|
|
120
|
+
* @param error - The error that occurred
|
|
121
|
+
* @param request - The request that caused the error
|
|
122
|
+
* @param middlewares - Array of middleware to apply
|
|
123
|
+
* @returns Effect containing a recovered response or null
|
|
124
|
+
*/
|
|
125
|
+
processException: (error: Error, request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderResponse | null, MiddlewareError, never>;
|
|
126
|
+
}, never, never>;
|
|
127
|
+
}>;
|
|
128
|
+
/**
|
|
129
|
+
* Manages the middleware pipeline for request and response processing.
|
|
130
|
+
*
|
|
131
|
+
* The MiddlewareManager orchestrates the execution of middleware in the correct order:
|
|
132
|
+
* - Requests are processed forward through the middleware array
|
|
133
|
+
* - Responses are processed in reverse order (last middleware first)
|
|
134
|
+
* - Exceptions are processed in reverse order for proper error handling
|
|
135
|
+
*
|
|
136
|
+
* @example
|
|
137
|
+
* ```typescript
|
|
138
|
+
* const program = Effect.gen(function* () {
|
|
139
|
+
* const manager = yield* MiddlewareManager;
|
|
140
|
+
*
|
|
141
|
+
* const middleware = [
|
|
142
|
+
* rateLimitMiddleware,
|
|
143
|
+
* loggingMiddleware,
|
|
144
|
+
* userAgentMiddleware
|
|
145
|
+
* ];
|
|
146
|
+
*
|
|
147
|
+
* const request: SpiderRequest = {
|
|
148
|
+
* task: { url: 'https://example.com', depth: 0 },
|
|
149
|
+
* headers: {}
|
|
150
|
+
* };
|
|
151
|
+
*
|
|
152
|
+
* const processedRequest = yield* manager.processRequest(request, middleware);
|
|
153
|
+
* console.log('Request processed through middleware pipeline');
|
|
154
|
+
* });
|
|
155
|
+
* ```
|
|
156
|
+
*
|
|
157
|
+
* @group Services
|
|
158
|
+
* @public
|
|
159
|
+
*/
|
|
160
|
+
export declare class MiddlewareManager extends MiddlewareManager_base {
|
|
161
|
+
}
|
|
162
|
+
declare const RateLimitMiddleware_base: Effect.Service.Class<RateLimitMiddleware, "@jambudipa.io/RateLimitMiddleware", {
|
|
163
|
+
readonly effect: Effect.Effect<{
|
|
164
|
+
create: (config: {
|
|
165
|
+
maxConcurrentRequests: number;
|
|
166
|
+
maxRequestsPerSecondPerDomain: number;
|
|
167
|
+
requestDelayMs?: number;
|
|
168
|
+
}) => SpiderMiddleware;
|
|
169
|
+
}, never, never>;
|
|
170
|
+
}>;
|
|
171
|
+
/**
|
|
172
|
+
* Provides rate limiting functionality for respectful crawling.
|
|
173
|
+
*
|
|
174
|
+
* Controls request frequency at both global and per-domain levels to prevent
|
|
175
|
+
* overwhelming target servers and avoid being blocked.
|
|
176
|
+
*
|
|
177
|
+
* @example
|
|
178
|
+
* ```typescript
|
|
179
|
+
* const rateLimiter = yield* RateLimitMiddleware;
|
|
180
|
+
* const middleware = rateLimiter.create({
|
|
181
|
+
* maxConcurrentRequests: 5,
|
|
182
|
+
* maxRequestsPerSecondPerDomain: 2,
|
|
183
|
+
* requestDelayMs: 250
|
|
184
|
+
* });
|
|
185
|
+
* ```
|
|
186
|
+
*
|
|
187
|
+
* @group Middleware
|
|
188
|
+
* @public
|
|
189
|
+
*/
|
|
190
|
+
export declare class RateLimitMiddleware extends RateLimitMiddleware_base {
|
|
191
|
+
}
|
|
192
|
+
declare const LoggingMiddleware_base: Effect.Service.Class<LoggingMiddleware, "@jambudipa.io/LoggingMiddleware", {
|
|
193
|
+
readonly effect: Effect.Effect<{
|
|
194
|
+
create: (config?: {
|
|
195
|
+
logRequests?: boolean;
|
|
196
|
+
logResponses?: boolean;
|
|
197
|
+
logErrors?: boolean;
|
|
198
|
+
logLevel?: "debug" | "info" | "warn" | "error";
|
|
199
|
+
}) => SpiderMiddleware;
|
|
200
|
+
}, never, never>;
|
|
201
|
+
}>;
|
|
202
|
+
/**
|
|
203
|
+
* Provides logging functionality using Effect.Logger.
|
|
204
|
+
*
|
|
205
|
+
* Logs requests, responses, and errors at configurable levels for debugging
|
|
206
|
+
* and monitoring purposes.
|
|
207
|
+
*
|
|
208
|
+
* @example
|
|
209
|
+
* ```typescript
|
|
210
|
+
* const logger = yield* LoggingMiddleware;
|
|
211
|
+
* const middleware = logger.create({
|
|
212
|
+
* logRequests: true,
|
|
213
|
+
* logResponses: true,
|
|
214
|
+
* logLevel: 'info'
|
|
215
|
+
* });
|
|
216
|
+
* ```
|
|
217
|
+
*
|
|
218
|
+
* @group Middleware
|
|
219
|
+
* @public
|
|
220
|
+
*/
|
|
221
|
+
export declare class LoggingMiddleware extends LoggingMiddleware_base {
|
|
222
|
+
}
|
|
223
|
+
/**
 * Base class type for the `UserAgentMiddleware` service, keyed as
 * "@jambudipa.io/UserAgentMiddleware".
 *
 * The service is built from `effect`, whose success value is an object with a
 * single member, `create`. The error and requirements type parameters of that
 * effect are both `never`.
 */
declare const UserAgentMiddleware_base: Effect.Service.Class<UserAgentMiddleware, "@jambudipa.io/UserAgentMiddleware", {
|
|
224
|
+
readonly effect: Effect.Effect<{
|
|
225
|
+
/** Takes the required `userAgent` string and returns a `SpiderMiddleware`. */
create: (userAgent: string) => SpiderMiddleware;
|
|
226
|
+
}, never, never>;
|
|
227
|
+
}>;
|
|
228
|
+
/**
|
|
229
|
+
* Adds User-Agent headers to requests.
|
|
230
|
+
*
|
|
231
|
+
* Sets a consistent User-Agent string for all requests to identify
|
|
232
|
+
* your crawler to web servers.
|
|
233
|
+
*
|
|
234
|
+
* @example
|
|
235
|
+
* ```typescript
|
|
236
|
+
* const userAgent = yield* UserAgentMiddleware;
|
|
237
|
+
* const middleware = userAgent.create('MyBot/1.0 (+https://example.com)');
|
|
238
|
+
* ```
|
|
239
|
+
*
|
|
240
|
+
* @group Middleware
|
|
241
|
+
* @public
|
|
242
|
+
*/
|
|
243
|
+
export declare class UserAgentMiddleware extends UserAgentMiddleware_base {
|
|
244
|
+
}
|
|
245
|
+
/**
 * Base class type for the `StatsMiddleware` service, keyed as
 * "@jambudipa.io/StatsMiddleware".
 *
 * The service is built from `effect`, whose success value is an object with a
 * single member, `create`. The error and requirements type parameters of that
 * effect are both `never`.
 */
declare const StatsMiddleware_base: Effect.Service.Class<StatsMiddleware, "@jambudipa.io/StatsMiddleware", {
|
|
246
|
+
readonly effect: Effect.Effect<{
|
|
247
|
+
/** Takes no arguments; returns a middleware together with a `getStats` accessor. */
create: () => {
|
|
248
|
+
/** The `SpiderMiddleware` half of the pair; presumably the one that records the counters (TODO confirm in the implementation). */
middleware: SpiderMiddleware;
|
|
249
|
+
/** Returns an Effect that succeeds with a string-keyed map of numbers; the class example reads `requests_processed` from it. */
getStats: () => Effect.Effect<Record<string, number>>;
|
|
250
|
+
};
|
|
251
|
+
}, never, never>;
|
|
252
|
+
}>;
|
|
253
|
+
/**
|
|
254
|
+
* Collects statistics about crawling activity.
|
|
255
|
+
*
|
|
256
|
+
* Tracks various metrics including requests processed, response codes,
|
|
257
|
+
* bytes downloaded, and processing times for monitoring and optimization.
|
|
258
|
+
*
|
|
259
|
+
* @example
|
|
260
|
+
* ```typescript
|
|
261
|
+
* const statsService = yield* StatsMiddleware;
|
|
262
|
+
* const { middleware, getStats } = statsService.create();
|
|
263
|
+
*
|
|
264
|
+
* // Use middleware in your pipeline
|
|
265
|
+
* // Later get statistics
|
|
266
|
+
* const stats = yield* getStats();
|
|
267
|
+
* console.log(`Processed ${stats.requests_processed} requests`);
|
|
268
|
+
* ```
|
|
269
|
+
*
|
|
270
|
+
* @group Middleware
|
|
271
|
+
* @public
|
|
272
|
+
*/
|
|
273
|
+
export declare class StatsMiddleware extends StatsMiddleware_base {
|
|
274
|
+
}
|
|
275
|
+
export {};
|
|
276
|
+
//# sourceMappingURL=SpiderMiddleware.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SpiderMiddleware.d.ts","sourceRoot":"","sources":["../../../src/lib/Middleware/SpiderMiddleware.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAA0B,MAAM,QAAQ,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAE/C;;;;;;;;GAQG;AACH,MAAM,WAAW,aAAa;IAC5B,0DAA0D;IAC1D,IAAI,EAAE,SAAS,CAAC;IAChB,+CAA+C;IAC/C,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,yDAAyD;IACzD,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAChC;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,qEAAqE;IACrE,QAAQ,EAAE,QAAQ,CAAC;IACnB,uCAAuC;IACvC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,4BAA4B;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,8CAA8C;IAC9C,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAChC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,cAAc,CAAC,EAAE,CACf,OAAO,EAAE,aAAa,KACnB,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,eAAe,CAAC,CAAC;IAEnD;;;OAGG;IACH,eAAe,CAAC,EAAE,CAChB,QAAQ,EAAE,cAAc,EACxB,OAAO,EAAE,aAAa,KACnB,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;IAEpD;;;OAGG;IACH,gBAAgB,CAAC,EAAE,CACjB,KAAK,EAAE,KAAK,EACZ,OAAO,EAAE,aAAa,KACnB,MAAM,CAAC,MAAM,CAAC,cAAc,GAAG,IAAI,EAAE,eAAe,CAAC,CAAC;CAC5D;;;QAsCK;;;;;;;;;WASG;kCAEQ,aAAa,eACT,gBAAgB,EAAE;QAQjC;;;;;;;;;;WAUG;oCAES,cAAc,WACf,aAAa,eACT,gBAAgB,EAAE;QAWjC;;;;;;;;;;;WAWG;kCAEM,KAAK,WACH,aAAa,eACT,gBAAgB,EAAE;;;AAhGvC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,qBAAa,iBAAkB,SAAQ,sBA4EtC;CAAG;;;yBA8BqB;YACf,qBAAqB,EAAE,MAAM,CAAC;YAC9B,6BAA6B,EAAE,MAAM,CAAC;YACtC,cAAc,CAAC,EAAE,MAAM,CAAC;SACzB,KAAG,gBAAgB;;;AAhC5B;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,mBAAoB,SAAQ,wBAmExC;CAAG;;;0BA0BY;YACN,WAAW,CAAC,EAAE,OAAO,CAAC;YACtB,YAAY,CAAC,EAAE,OAAO,CAAC;YACvB,SAAS,CAAC,EAAE,OAAO,CAAC;YACpB,QAAQ,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;SAChD,KACA,gBAAgB;;;AA9BzB;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,iBAAkB,SAAQ,sBA4EtC;CAAG;;;4BAqBsB,MAAM,KAAG,gBAAgB;;;AAnBnD;;;;;;;;;;;;;;GAcG;AACH,qBAAa,mBAAoB,SAAQ,wBAgBxC;CAAG;;
;sBA0Bc;YACV,UAAU,EAAE,gBAAgB,CAAC;YAC7B,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;SACvD;;;AA3BP;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,eAAgB,SAAQ,oBA4DpC;CAAG"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { Schema } from 'effect';
|
|
2
|
+
/**
 * Effect `Schema.Struct` declaring the fields of a page record.
 *
 * Only the field types are visible in this declaration file; the rules behind
 * the `Schema.filter` wrappers live in the implementation.
 */
export declare const PageDataSchema: Schema.Struct<{
|
|
3
|
+
/** String with one additional filter applied; the filter's rule is not visible in this declaration. */
url: Schema.filter<typeof Schema.String>;
|
|
4
|
+
/** Required plain string; presumably the page's HTML source (TODO confirm against the implementation). */
html: typeof Schema.String;
|
|
5
|
+
/** Optional string. */
title: Schema.optional<typeof Schema.String>;
|
|
6
|
+
/** All available metadata from meta tags */
|
|
7
|
+
metadata: Schema.Record$<typeof Schema.String, typeof Schema.String>;
|
|
8
|
+
/** Commonly used metadata fields for convenience */
|
|
9
|
+
// The nested struct is optional as a whole, and each of its four members is an optional string.
commonMetadata: Schema.optional<Schema.Struct<{
|
|
10
|
+
description: Schema.optional<typeof Schema.String>;
|
|
11
|
+
keywords: Schema.optional<typeof Schema.String>;
|
|
12
|
+
author: Schema.optional<typeof Schema.String>;
|
|
13
|
+
robots: Schema.optional<typeof Schema.String>;
|
|
14
|
+
}>>;
|
|
15
|
+
/** Number with two stacked filters; the exact constraints are not visible in this declaration. */
statusCode: Schema.filter<Schema.filter<typeof Schema.Number>>;
|
|
16
|
+
/** All response headers */
|
|
17
|
+
headers: Schema.Record$<typeof Schema.String, typeof Schema.String>;
|
|
18
|
+
/** When the fetch operation started */
|
|
19
|
+
fetchedAt: typeof Schema.DateFromSelf;
|
|
20
|
+
/** How long the entire fetch and parse operation took in milliseconds */
|
|
21
|
+
scrapeDurationMs: typeof Schema.Number;
|
|
22
|
+
/** The crawl depth (number of hops from the starting URL) */
|
|
23
|
+
depth: Schema.filter<Schema.filter<typeof Schema.Number>>;
|
|
24
|
+
/** Optional extracted data from the page */
|
|
25
|
+
extractedData: Schema.optional<Schema.Record$<typeof Schema.String, typeof Schema.Unknown>>;
|
|
26
|
+
}>;
|
|
27
|
+
/** The `Type` side of `PageDataSchema`, extracted with `Schema.Schema.Type`, so it stays in sync with the schema definition. */
export type PageData = Schema.Schema.Type<typeof PageDataSchema>;
|
|
28
|
+
//# sourceMappingURL=PageData.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PageData.d.ts","sourceRoot":"","sources":["../../../src/lib/PageData/PageData.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAEhC,eAAO,MAAM,cAAc;;;;IAkBzB,4CAA4C;;IAE5C,oDAAoD;;;;;;;;IAUpD,2BAA2B;;IAE3B,uCAAuC;;IAEvC,yEAAyE;;IAEzE,6DAA6D;;IAE7D,4CAA4C;;EAI5C,CAAC;AAEH,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,cAAc,CAAC,CAAC"}
|