@jambudipa/spider 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -35
- package/dist/browser/BrowserManager.d.ts +63 -0
- package/dist/browser/BrowserManager.d.ts.map +1 -0
- package/dist/browser/PlaywrightAdapter.d.ts +166 -0
- package/dist/browser/PlaywrightAdapter.d.ts.map +1 -0
- package/dist/examples/01-basic-crawl-working.d.ts +13 -0
- package/dist/examples/01-basic-crawl-working.d.ts.map +1 -0
- package/dist/examples/02-multiple-urls-working.d.ts +13 -0
- package/dist/examples/02-multiple-urls-working.d.ts.map +1 -0
- package/dist/examples/03-url-filtering.d.ts +13 -0
- package/dist/examples/03-url-filtering.d.ts.map +1 -0
- package/dist/examples/04-robots-compliance.d.ts +14 -0
- package/dist/examples/04-robots-compliance.d.ts.map +1 -0
- package/dist/examples/05-link-extraction-selectors.d.ts +14 -0
- package/dist/examples/05-link-extraction-selectors.d.ts.map +1 -0
- package/dist/examples/06-custom-middleware.d.ts +18 -0
- package/dist/examples/06-custom-middleware.d.ts.map +1 -0
- package/dist/examples/07-resumability-demo.d.ts +14 -0
- package/dist/examples/07-resumability-demo.d.ts.map +1 -0
- package/dist/examples/08-worker-monitoring.d.ts +15 -0
- package/dist/examples/08-worker-monitoring.d.ts.map +1 -0
- package/dist/examples/09-error-handling-recovery.d.ts +15 -0
- package/dist/examples/09-error-handling-recovery.d.ts.map +1 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3596 -1440
- package/dist/index.js.map +1 -1
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +107 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +58 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +63 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +114 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +83 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +24 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +37 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +239 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/Middleware/types.d.ts +99 -0
- package/dist/lib/Middleware/types.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +178 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +249 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +107 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +110 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors/effect-errors.d.ts +179 -0
- package/dist/lib/errors/effect-errors.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +172 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/dist/lib/utils/FileUtils.d.ts +284 -0
- package/dist/lib/utils/FileUtils.d.ts.map +1 -0
- package/dist/lib/utils/JsonUtils.d.ts +196 -0
- package/dist/lib/utils/JsonUtils.d.ts.map +1 -0
- package/dist/lib/utils/RegexUtils.d.ts +257 -0
- package/dist/lib/utils/RegexUtils.d.ts.map +1 -0
- package/dist/lib/utils/SchemaUtils.d.ts +251 -0
- package/dist/lib/utils/SchemaUtils.d.ts.map +1 -0
- package/dist/lib/utils/UrlUtils.d.ts +223 -0
- package/dist/lib/utils/UrlUtils.d.ts.map +1 -0
- package/dist/lib/utils/effect-migration.d.ts +31 -0
- package/dist/lib/utils/effect-migration.d.ts.map +1 -0
- package/dist/lib/utils/index.d.ts +15 -0
- package/dist/lib/utils/index.d.ts.map +1 -0
- package/dist/lib/utils/url-deduplication.d.ts +108 -0
- package/dist/lib/utils/url-deduplication.d.ts.map +1 -0
- package/dist/lib/utils/url-deduplication.test.d.ts +5 -0
- package/dist/lib/utils/url-deduplication.test.d.ts.map +1 -0
- package/dist/test/infrastructure/EffectTestUtils.d.ts +167 -0
- package/dist/test/infrastructure/EffectTestUtils.d.ts.map +1 -0
- package/package.json +23 -9
package/dist/lib/Scraper/Scraper.service.d.ts
@@ -0,0 +1,123 @@
+import { Effect } from 'effect';
+import { NetworkError, ResponseError, ContentTypeError, RequestAbortError } from '../errors.js';
+import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+declare const ScraperService_base: Effect.Service.Class<ScraperService, "@jambudipa.io/ScraperService", {
+    readonly effect: Effect.Effect<{
+        /**
+         * Fetches a URL and parses the HTML to extract basic page information.
+         *
+         * This method performs the following operations:
+         * 1. Fetches the URL with configurable timeout (30 seconds)
+         * 2. Validates content type (skips binary files)
+         * 3. Parses HTML content with cheerio
+         * 4. Extracts basic page metadata (title, description, etc.)
+         * 5. Returns structured PageData object
+         *
+         * The method uses AbortController for proper timeout handling to prevent
+         * workers from hanging on malformed URLs or slow responses.
+         *
+         * @param url - The URL to fetch and parse
+         * @param depth - The crawl depth for logging purposes (default: 0)
+         * @returns Effect containing PageData with extracted information
+         * @throws NetworkError for network-related failures
+         * @throws ResponseError for HTTP error responses
+         *
+         * @example
+         * Basic usage:
+         * ```typescript
+         * const pageData = yield* scraper.fetchAndParse('https://example.com');
+         * console.log(`Page title: ${pageData.title}`);
+         * ```
+         *
+         * With depth tracking:
+         * ```typescript
+         * const pageData = yield* scraper.fetchAndParse('https://example.com/page', 2);
+         * ```
+         *
+         * Error handling:
+         * ```typescript
+         * const result = yield* scraper.fetchAndParse('https://example.com').pipe(
+         *   Effect.catchTags({
+         *     NetworkError: (error) => {
+         *       console.log('Network error:', error.message);
+         *       return Effect.succeed(null);
+         *     },
+         *     ResponseError: (error) => {
+         *       console.log('HTTP error:', error.statusCode);
+         *       return Effect.succeed(null);
+         *     }
+         *   })
+         * );
+         * ```
+         *
+         * @performance
+         * - Request timeout: 30 seconds
+         * - Response parsing timeout: 10 seconds
+         * - Memory usage: ~2-5MB per page depending on content size
+         *
+         * @security
+         * - Validates content types to prevent processing binary files
+         * - Uses AbortController to prevent hanging requests
+         * - No execution of JavaScript content (static HTML parsing only)
+         */
+        fetchAndParse: (url: string, depth?: number) => Effect.Effect<{
+            readonly url: string;
+            readonly html: string;
+            readonly title?: string | undefined;
+            readonly metadata: {
+                readonly [x: string]: string;
+            };
+            readonly commonMetadata?: {
+                readonly description?: string | undefined;
+                readonly keywords?: string | undefined;
+                readonly author?: string | undefined;
+                readonly robots?: string | undefined;
+            } | undefined;
+            readonly statusCode: number;
+            readonly headers: {
+                readonly [x: string]: string;
+            };
+            readonly fetchedAt: Date;
+            readonly scrapeDurationMs: number;
+            readonly depth: number;
+            readonly extractedData?: {
+                readonly [x: string]: unknown;
+            } | undefined;
+        }, NetworkError | ResponseError | ContentTypeError | RequestAbortError | import("effect/ParseResult").ParseError, SpiderLogger>;
+    }, never, never>;
+}>;
+/**
+ * Service responsible for fetching HTML content and parsing basic page information.
+ *
+ * The ScraperService handles the core HTTP fetching and HTML parsing functionality
+ * for the Spider framework. It provides robust error handling, timeout management,
+ * and content type validation to ensure reliable data extraction.
+ *
+ * **Key Features:**
+ * - Automatic timeout handling with AbortController
+ * - Content type validation (skips binary files)
+ * - Comprehensive error handling with typed errors
+ * - Performance monitoring and logging
+ * - Effect integration for composability
+ *
+ * **Note:** This service focuses solely on fetching and parsing HTML content.
+ * Link extraction is handled separately by LinkExtractorService for better
+ * separation of concerns and modularity.
+ *
+ * @example
+ * ```typescript
+ * const program = Effect.gen(function* () {
+ *   const scraper = yield* ScraperService;
+ *   const pageData = yield* scraper.fetchAndParse('https://example.com', 0);
+ *   console.log(`Title: ${pageData.title}`);
+ *   console.log(`Content length: ${pageData.html.length}`);
+ * });
+ * ```
+ *
+ * @group Services
+ * @public
+ */
+export declare class ScraperService extends ScraperService_base {
+}
+export {};
+//# sourceMappingURL=Scraper.service.d.ts.map
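
The fetchAndParse declaration above fixes the full success shape and the typed error channel. A minimal usage sketch — assuming the service is re-exported from the package root and that the `.Default` layers Effect.Service classes normally generate are available for both ScraperService and SpiderLogger (neither export is shown in this diff):

```typescript
import { Effect } from 'effect';
// Hypothetical root imports; this diff only shows the dist declaration files.
import { ScraperService, SpiderLogger } from '@jambudipa/spider';

const program = Effect.gen(function* () {
  const scraper = yield* ScraperService;
  // Failures here are typed: NetworkError | ResponseError | ContentTypeError | ...
  const page = yield* scraper.fetchAndParse('https://example.com');
  yield* Effect.log(`${page.url} -> ${page.statusCode} in ${page.scrapeDurationMs}ms`);
});

// SpiderLogger satisfies the context requirement visible in the
// Effect.Effect<..., SpiderLogger> return type of fetchAndParse.
const runnable = program.pipe(
  Effect.provide(ScraperService.Default),
  Effect.provide(SpiderLogger.Default)
);
```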

package/dist/lib/Scraper/Scraper.service.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"Scraper.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scraper/Scraper.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsB,MAAM,EAAkB,MAAM,QAAQ,CAAC;AAGpE,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAChG,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;;;QAqC5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDG;6BACkB,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;AA5FjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,qBAAa,cAAe,SAAQ,mBAoOnC;CAAG"}

package/dist/lib/Spider/Spider.service.d.ts
@@ -0,0 +1,249 @@
+import { Effect, Sink } from 'effect';
+import { UrlDeduplicatorService } from '../UrlDeduplicator/UrlDeduplicator.service.js';
+import { ScraperService } from '../Scraper/Scraper.service.js';
+import { PageData } from '../PageData/PageData.js';
+import { RobotsService } from '../Robots/Robots.service.js';
+import { type LinkExtractorConfig, LinkExtractorService } from '../LinkExtractor/index.js';
+import { SpiderSchedulerService } from '../Scheduler/SpiderScheduler.service.js';
+import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+/**
+ * Configuration for extracting a nested field from an element.
+ *
+ * @group Data Types
+ * @public
+ */
+interface NestedFieldConfig {
+    /** CSS selector to find the nested element */
+    readonly selector: string;
+    /** HTML attribute to extract (if not specified, extracts text content) */
+    readonly attribute?: string;
+}
+/**
+ * Configuration for extracting a single field from the page.
+ *
+ * @group Data Types
+ * @public
+ */
+interface FieldExtractionConfig {
+    /** CSS selector to find the element */
+    readonly selector: string;
+    /** Extract text content (default: true) */
+    readonly text?: boolean;
+    /** HTML attribute to extract instead of text */
+    readonly attribute?: string;
+    /** Extract multiple matching elements */
+    readonly multiple?: boolean;
+    /** Check if element exists (returns boolean) */
+    readonly exists?: boolean;
+    /** Nested fields to extract from each matched element */
+    readonly fields?: Record<string, NestedFieldConfig>;
+}
+/**
+ * Data extraction configuration - either a simple CSS selector string
+ * or a detailed field extraction configuration.
+ *
+ * @group Data Types
+ * @public
+ */
+type DataExtractionFieldConfig = string | FieldExtractionConfig;
+/**
+ * Configuration for extracting structured data from pages.
+ *
+ * @group Data Types
+ * @public
+ */
+type DataExtractionConfig = Record<string, DataExtractionFieldConfig>;
+/**
+ * Represents a single crawling task with URL and depth information.
+ *
+ * @group Data Types
+ * @public
+ */
+interface CrawlTask {
+    /** The URL to be crawled */
+    url: string;
+    /** The depth level of this URL relative to the starting URL */
+    depth: number;
+    /** The URL from which this URL was discovered (optional) */
+    fromUrl?: string;
+    /** Optional metadata to be passed through to the result */
+    metadata?: Record<string, unknown>;
+    /** Optional data extraction configuration */
+    extractData?: DataExtractionConfig;
+}
+/**
+ * The result of a successful crawl operation.
+ *
+ * Contains all extracted information from a crawled page along with
+ * metadata about when and at what depth it was processed.
+ *
+ * @group Data Types
+ * @public
+ */
+interface CrawlResult {
+    /** The extracted page data including content, links, and metadata */
+    pageData: PageData;
+    /** The depth at which this page was crawled */
+    depth: number;
+    /** When this page was crawled */
+    timestamp: Date;
+    /** Optional metadata passed through from the original request */
+    metadata?: Record<string, unknown>;
+}
+/**
+ * The main Spider service that orchestrates web crawling operations.
+ *
+ * This service provides the core functionality for crawling websites, including:
+ * - URL validation and filtering based on configuration
+ * - Robots.txt compliance checking
+ * - Concurrent crawling with configurable worker pools
+ * - Request scheduling and rate limiting
+ * - Result streaming through Effect sinks
+ *
+ * @example
+ * ```typescript
+ * const program = Effect.gen(function* () {
+ *   const spider = yield* Spider;
+ *   const collectSink = Sink.forEach<CrawlResult>(result =>
+ *     Effect.sync(() => console.log(result.pageData.url))
+ *   );
+ *
+ *   const stats = yield* spider.crawl('https://example.com', collectSink);
+ *   console.log(`Crawled ${stats.totalPages} pages`);
+ * });
+ * ```
+ *
+ * @group Services
+ * @public
+ */
+/**
+ * Options for enhanced link extraction during crawling.
+ *
+ * @group Configuration
+ * @public
+ */
+export interface SpiderLinkExtractionOptions {
+    /** Configuration for the LinkExtractorService */
+    readonly linkExtractorConfig?: LinkExtractorConfig;
+    /** Whether to use enhanced extraction in addition to basic extraction (default: false) */
+    readonly useEnhancedExtraction?: boolean;
+    /** Whether to replace basic extraction with enhanced extraction (default: true) */
+    readonly replaceBasicExtraction?: boolean;
+    /** Data extraction configuration for structured data extraction */
+    readonly extractData?: DataExtractionConfig;
+}
+declare const SpiderService_base: Effect.Service.Class<SpiderService, "@jambudipa/spider", {
+    readonly effect: Effect.Effect<{
+        /**
+         * Starts crawling from the specified URL and processes results through the provided sink.
+         *
+         * This method:
+         * 1. Validates the starting URL against configuration rules
+         * 2. Starts a configurable number of worker fibers
+         * 3. Each worker processes URLs from a shared queue
+         * 4. Results are streamed through the provided sink
+         * 5. New URLs discovered are queued for processing
+         *
+         * @param startingUrls - The starting URL(s) for crawling (single string or array)
+         * @param sink - Sink to process crawl results as they're produced
+         * @param options - Optional enhanced link extraction configuration
+         * @returns Effect containing crawl statistics (total pages, completion status)
+         *
+         * @example
+         * Basic usage:
+         * ```typescript
+         * const collectSink = Sink.forEach<CrawlResult>(result =>
+         *   Effect.sync(() => console.log(`Found: ${result.pageData.title}`))
+         * );
+         *
+         * const stats = yield* spider.crawl('https://example.com', collectSink);
+         * ```
+         *
+         * With multiple starting URLs:
+         * ```typescript
+         * const stats = yield* spider.crawl([
+         *   'https://example.com',
+         *   'https://other-domain.com'
+         * ], collectSink);
+         * ```
+         *
+         * With enhanced link extraction:
+         * ```typescript
+         * const stats = yield* spider.crawl('https://example.com', collectSink, {
+         *   useEnhancedExtraction: true,
+         *   linkExtractorConfig: {
+         *     allowPatterns: [/\/articles\//],
+         *     restrictCss: ['.content a']
+         *   }
+         * });
+         * ```
+         */
+        crawl: <A, E, R>(startingUrls: string | string[] | {
+            url: string;
+            metadata?: Record<string, unknown>;
+        } | {
+            url: string;
+            metadata?: Record<string, unknown>;
+        }[], sink: Sink.Sink<A, CrawlResult, E, R>, options?: SpiderLinkExtractionOptions) => Effect.Effect<{
+            completed: boolean;
+        }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger>;
+        crawlSingle: <A, E, R>(urlString: string, sink: Sink.Sink<A, CrawlResult, E, R>, options?: SpiderLinkExtractionOptions, initialMetadata?: Record<string, unknown>, restrictToStartingDomain?: boolean) => Effect.Effect<{
+            completed: boolean;
+            pagesScraped: number;
+            domain: string;
+        }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger>;
+        /**
+         * Resume a previous crawling session from persistent storage.
+         *
+         * This method requires resumability to be enabled in the SpiderConfig and
+         * a StatePersistence implementation to be configured. It will restore the
+         * crawling state and continue processing from where it left off.
+         *
+         * @param stateKey - The unique identifier for the session to resume
+         * @param sink - Sink to process crawl results as they're produced
+         * @param persistence - Optional persistence implementation (uses configured one if not provided)
+         * @returns Effect containing crawl statistics
+         *
+         * @example
+         * ```typescript
+         * const stateKey = new SpiderStateKey({
+         *   id: 'my-crawl-session',
+         *   timestamp: new Date('2024-01-01'),
+         *   name: 'Example Crawl'
+         * });
+         *
+         * const collectSink = Sink.forEach<CrawlResult>(result =>
+         *   Effect.sync(() => console.log(`Resumed: ${result.pageData.title}`))
+         * );
+         *
+         * const stats = yield* spider.resume(stateKey, collectSink);
+         * ```
+         */
+        resume: <A, E, R>(stateKey: import("../Scheduler/SpiderScheduler.service.js").SpiderStateKey, resumeSink: Sink.Sink<A, CrawlResult, E, R>, _persistence?: import("../Scheduler/SpiderScheduler.service.js").StatePersistence) => Effect.Effect<{
+            resumed: boolean;
+            sessionId: string;
+            completed: boolean;
+            urlsProcessed?: undefined;
+        } | {
+            completed: boolean;
+            resumed: boolean;
+            sessionId: string;
+            urlsProcessed: number;
+        }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger | SpiderSchedulerService>;
+        /**
+         * Returns the list of URLs that have been visited during crawling.
+         *
+         * @returns Effect containing array of visited URLs
+         *
+         * @remarks
+         * This is currently a placeholder implementation. In a future version,
+         * this will return the actual list of visited URLs from the current session.
+         */
+        getVisitedUrls: () => Effect.Effect<string[]>;
+    }, never, SpiderLogger | ScraperService | RobotsService | LinkExtractorService>;
+    readonly dependencies: readonly [import("effect/Layer").Layer<RobotsService, never, never>, import("effect/Layer").Layer<ScraperService, never, never>, import("effect/Layer").Layer<UrlDeduplicatorService, never, never>, import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>, import("effect/Layer").Layer<LinkExtractorService, never, never>, import("effect/Layer").Layer<SpiderLogger, never, never>];
+}>;
+export declare class SpiderService extends SpiderService_base {
+}
+export type { CrawlResult, CrawlTask };
+//# sourceMappingURL=Spider.service.d.ts.map
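
The crawl signature above accepts a plain string, an array, or URL-plus-metadata records, and the DataExtractionConfig types drive structured extraction. A sketch against these declarations — the root import path and the field names ('headline', 'links') are illustrative assumptions, not from the package:

```typescript
import { Effect, Sink } from 'effect';
// Hypothetical root import; the diff shows only the dist declaration files.
import { SpiderService, type CrawlResult } from '@jambudipa/spider';

// A DataExtractionConfig per the types above: a bare string is a simple
// selector; the object form opts into attributes and multiple matches.
const extractData = {
  headline: 'h1',
  links: { selector: '.content a', attribute: 'href', multiple: true }
};

const program = Effect.gen(function* () {
  const spider = yield* SpiderService;
  // Same Sink.forEach shape the JSDoc examples above use.
  const sink = Sink.forEach<CrawlResult>((result) =>
    Effect.sync(() => console.log(result.pageData.url, result.depth))
  );
  // extractData rides along via SpiderLinkExtractionOptions.
  return yield* spider.crawl('https://example.com', sink, { extractData });
});
```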

package/dist/lib/Spider/Spider.service.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"Spider.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Spider/Spider.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,MAAM,EASN,IAAI,EAEL,MAAM,QAAQ,CAAC;AAIhB,OAAO,EAAE,sBAAsB,EAAE,MAAM,+CAA+C,CAAC;AACvF,OAAO,EAAE,cAAc,EAAE,MAAM,+BAA+B,CAAC;AAC/D,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EACL,KAAK,mBAAmB,EACxB,oBAAoB,EACrB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,sBAAsB,EAAE,MAAM,yCAAyC,CAAC;AAEjF,OAAO,EACL,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAG5C;;;;;GAKG;AACH,UAAU,iBAAiB;IACzB,8CAA8C;IAC9C,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,0EAA0E;IAC1E,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED;;;;;GAKG;AACH,UAAU,qBAAqB;IAC7B,uCAAuC;IACvC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,yCAAyC;IACzC,QAAQ,CAAC,QAAQ,CAAC,EAAE,OAAO,CAAC;IAC5B,gDAAgD;IAChD,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC;IAC1B,yDAAyD;IACzD,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,iBAAiB,CAAC,CAAC;CACrD;AAED;;;;;;GAMG;AACH,KAAK,yBAAyB,GAAG,MAAM,GAAG,qBAAqB,CAAC;AAEhE;;;;;GAKG;AACH,KAAK,oBAAoB,GAAG,MAAM,CAAC,MAAM,EAAE,yBAAyB,CAAC,CAAC;AAEtE;;;;;GAKG;AACH,UAAU,SAAS;IACjB,4BAA4B;IAC5B,GAAG,EAAE,MAAM,CAAC;IACZ,+DAA+D;IAC/D,KAAK,EAAE,MAAM,CAAC;IACd,4DAA4D;IAC5D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnC,6CAA6C;IAC7C,WAAW,CAAC,EAAE,oBAAoB,CAAC;CACpC;AAED;;;;;;;;GAQG;AACH,UAAU,WAAW;IACnB,qEAAqE;IACrE,QAAQ,EAAE,QAAQ,CAAC;IACnB,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAC;IACd,iCAAiC;IACjC,SAAS,EAAE,IAAI,CAAC;IAChB,iEAAiE;IACjE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH;;;;;GAKG;AACH,MAAM,WAAW,2BAA2B;IAC1C,iDAAiD;IACjD,QAAQ,CAAC,mBAAmB,CAAC,EAAE,mBAAmB,CAAC;IACnD,0FAA0F;IAC1F,QAAQ,CAAC,qBAAqB,CAAC,EAAE,OAAO,CAAC;IACzC,mFAAmF;IACnF,QAAQ,CAAC,sBAAsB,CAAC,EAAE,OAAO,CAAC;IAC1C,mEAAmE;IACnE,QAAQ,CAAC,WAAW,CAAC,EAAE,oBAAoB,CAAC;CAC7C;;;QAqBO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WA2CG;gBACK,CAAC,EAAE,CAAC,EAAE,CAAC,gBAET,MAAM,GACN,MAAM,EAAE,GACR;YAAE,GAAG,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;SAAE,GACnD;YAAE,GAAG,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;SAAE,EAAE,QACnD,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,YAC3B,2BAA2B;;;sBA0HzB,CAAC,EAAE,CAAC,EAAE,CAAC,aACR,MAAM,QACX,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,YAC3B,2BAA2B,oBACnB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,6BACd,OAAO;;;;;QA6gCpC;;;;;;;;;;;;;;;;;;;;;;;;;;WA0BG;iBACM,CAAC,EAAE,CAAC,EAAE,CAAC,YACJ,OAAO,yCAAyC,EAAE,cAAc,cAC9D,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,iBAC5B,OAAO,yCAAyC,EAAE,gBAAgB;;;;;;;;;;;QAyInF;;;;;;;;WAQG;8BACiB,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;;;;AAl4CnD,qBAAa,aAAc,SAAQ,kBAg5ClC;CAAG;AAEJ,YAAY,EAAE,WAAW,EAAE,SAAS,EAAE,CAAC"}

package/dist/lib/StateManager/StateManager.service.d.ts
@@ -0,0 +1,107 @@
+/**
+ * State Manager Service
+ * Manages tokens, sessions, and client-side storage simulation
+ */
+import { Context, Effect, Layer } from 'effect';
+declare const CSRFTokenNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+    readonly _tag: "CSRFTokenNotFoundError";
+} & Readonly<A>;
+export declare class CSRFTokenNotFoundError extends CSRFTokenNotFoundError_base<{
+    readonly message: string;
+}> {
+}
+declare const APITokenNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+    readonly _tag: "APITokenNotFoundError";
+} & Readonly<A>;
+export declare class APITokenNotFoundError extends APITokenNotFoundError_base<{
+    readonly message: string;
+}> {
+}
+declare const TokenNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+    readonly _tag: "TokenNotFoundError";
+} & Readonly<A>;
+export declare class TokenNotFoundError extends TokenNotFoundError_base<{
+    readonly message: string;
+    readonly tokenType: TokenType;
+}> {
+}
+declare const TokenExpiredError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+    readonly _tag: "TokenExpiredError";
+} & Readonly<A>;
+export declare class TokenExpiredError extends TokenExpiredError_base<{
+    readonly message: string;
+    readonly tokenType: TokenType;
+}> {
+}
+declare const StorageKeyNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+    readonly _tag: "StorageKeyNotFoundError";
+} & Readonly<A>;
+export declare class StorageKeyNotFoundError extends StorageKeyNotFoundError_base<{
+    readonly message: string;
+    readonly key: string;
+    readonly storageType: 'local' | 'session';
+}> {
+}
+export declare enum TokenType {
+    CSRF = "csrf",
+    API = "api",
+    AUTH = "auth",
+    REFRESH = "refresh"
+}
+export interface Token {
+    type: TokenType;
+    value: string;
+    expiry?: Date;
+    scope?: string[];
+}
+export interface StateManagerService {
+    /**
+     * Extract CSRF token from HTML
+     */
+    extractCSRFToken: (html: string) => Effect.Effect<string, CSRFTokenNotFoundError>;
+    /**
+     * Extract API token from JavaScript
+     */
+    extractAPIToken: (scripts: string[]) => Effect.Effect<string, APITokenNotFoundError>;
+    /**
+     * Store a token
+     */
+    storeToken: (type: TokenType, token: string, expiry?: Date) => Effect.Effect<void>;
+    /**
+     * Get a stored token
+     */
+    getToken: (type: TokenType) => Effect.Effect<string, TokenNotFoundError | TokenExpiredError>;
+    /**
+     * Check if token is valid (not expired)
+     */
+    isTokenValid: (type: TokenType) => Effect.Effect<boolean>;
+    /**
+     * Simulate local storage
+     */
+    setLocalStorage: (key: string, value: string) => Effect.Effect<void>;
+    getLocalStorage: (key: string) => Effect.Effect<string, StorageKeyNotFoundError>;
+    clearLocalStorage: () => Effect.Effect<void>;
+    /**
+     * Simulate session storage
+     */
+    setSessionStorage: (key: string, value: string) => Effect.Effect<void>;
+    getSessionStorage: (key: string) => Effect.Effect<string, StorageKeyNotFoundError>;
+    clearSessionStorage: () => Effect.Effect<void>;
+    /**
+     * Clear all state
+     */
+    clearState: () => Effect.Effect<void>;
+}
+declare const StateManager_base: Context.TagClass<StateManager, "StateManager", StateManagerService>;
+export declare class StateManager extends StateManager_base {
+}
+/**
+ * Create a StateManager service implementation
+ */
+export declare const makeStateManager: () => Effect.Effect<StateManagerService>;
+/**
+ * StateManager Layer
+ */
+export declare const StateManagerLive: Layer.Layer<StateManager, never, never>;
+export {};
+//# sourceMappingURL=StateManager.service.d.ts.map
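
StateManager is a plain Context.Tag backed by the StateManagerLive layer, so it composes in the usual Effect way. A token round-trip sketch grounded in the declarations above (the one-minute expiry and the root import path are illustrative):

```typescript
import { Effect } from 'effect';
// Hypothetical root import; the declarations live under dist/lib/StateManager.
import { StateManager, StateManagerLive, TokenType } from '@jambudipa/spider';

const program = Effect.gen(function* () {
  const state = yield* StateManager;
  // Store a CSRF token that expires in one minute (arbitrary choice).
  yield* state.storeToken(TokenType.CSRF, 'token-123', new Date(Date.now() + 60_000));
  // getToken fails with the typed TokenNotFoundError | TokenExpiredError.
  const csrf = yield* state.getToken(TokenType.CSRF);
  return csrf;
}).pipe(Effect.provide(StateManagerLive));
```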

package/dist/lib/StateManager/StateManager.service.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"StateManager.service.d.ts","sourceRoot":"","sources":["../../../src/lib/StateManager/StateManager.service.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAkB,MAAM,EAAW,KAAK,EAAe,MAAM,QAAQ,CAAC;;;;AAItF,qBAAa,sBAAuB,SAAQ,4BAA2C;IACrF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,qBAAsB,SAAQ,2BAA0C;IACnF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,kBAAmB,SAAQ,wBAAuC;IAC7E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,SAAS,EAAE,SAAS,CAAC;CAC/B,CAAC;CAAG;;;;AAEL,qBAAa,iBAAkB,SAAQ,uBAAsC;IAC3E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,SAAS,EAAE,SAAS,CAAC;CAC/B,CAAC;CAAG;;;;AAEL,qBAAa,uBAAwB,SAAQ,6BAA4C;IACvF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,WAAW,EAAE,OAAO,GAAG,SAAS,CAAC;CAC3C,CAAC;CAAG;AAEL,oBAAY,SAAS;IACnB,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;CACpB;AAED,MAAM,WAAW,KAAK;IACpB,IAAI,EAAE,SAAS,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,IAAI,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;CAClB;AAED,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,gBAAgB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,sBAAsB,CAAC,CAAC;IAElF;;OAEG;IACH,eAAe,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,qBAAqB,CAAC,CAAC;IAErF;;OAEG;IACH,UAAU,EAAE,CACV,IAAI,EAAE,SAAS,EACf,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,IAAI,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEzB;;OAEG;IACH,QAAQ,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,kBAAkB,GAAG,iBAAiB,CAAC,CAAC;IAE7F;;OAEG;IACH,YAAY,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE1D;;OAEG;IACH,eAAe,EAAE,CACf,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,eAAe,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,uBAAuB,CAAC,CAAC;IACjF,iBAAiB,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE7C;;OAEG;IACH,iBAAiB,EAAE,CACjB,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,iBAAiB,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,uBAAuB,CAAC,CAAC;IACnF,mBAAmB,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE/C;;OAEG;IACH,UAAU,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CACvC;;AAED,qBAAa,YAAa,SAAQ,iBAG/B;CAAG;AAEN;;GAEG;AACH,eAAO,MAAM,gBAAgB,QAAO,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAqNjE,CAAC;AAEL;;GAEG;AACH,eAAO,MAAM,gBAAgB,yCAAiD,CAAC"}

package/dist/lib/StateManager/index.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/StateManager/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,2BAA2B,CAAC"}

package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts
@@ -0,0 +1,58 @@
+import { Effect, MutableHashSet } from 'effect';
+/**
+ * Thread-safe URL deduplication service with built-in normalization.
+ *
+ * Provides atomic operations for checking and adding URLs to prevent
+ * race conditions in concurrent environments. URLs are normalized
+ * before storage to ensure consistent deduplication.
+ *
+ * @group Services
+ * @public
+ */
+export interface IUrlDeduplicator {
+    /**
+     * Attempts to add a URL to the deduplication set.
+     *
+     * @param url - The URL to add
+     * @returns Effect containing boolean - true if URL was added (first time seen), false if already exists
+     */
+    tryAdd(_url: string): Effect.Effect<boolean>;
+    /**
+     * Checks if a URL has already been seen.
+     *
+     * @param url - The URL to check
+     * @returns Effect containing boolean - true if URL exists, false otherwise
+     */
+    contains(_url: string): Effect.Effect<boolean>;
+    /**
+     * Returns the current number of unique URLs in the set.
+     *
+     * @returns Effect containing the count
+     */
+    size(): Effect.Effect<number>;
+    /**
+     * Clears all URLs from the deduplication set.
+     *
+     * @returns Effect containing void
+     */
+    clear(): Effect.Effect<void>;
+}
+declare const UrlDeduplicatorService_base: Effect.Service.Class<UrlDeduplicatorService, "@jambudipa.io/UrlDeduplicatorService", {
+    readonly effect: Effect.Effect<{
+        tryAdd: (url: string) => Effect.Effect<boolean, never, never>;
+        contains: (url: string) => Effect.Effect<boolean, never, never>;
+        size: () => Effect.Effect<number, never, never>;
+        clear: () => Effect.Effect<MutableHashSet.MutableHashSet<string>, never, never>;
+    }, never, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
+    readonly dependencies: readonly [import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>];
+}>;
+/**
+ * URL deduplication service as an Effect Service.
+ *
+ * @group Services
+ * @public
+ */
+export declare class UrlDeduplicatorService extends UrlDeduplicatorService_base {
+}
+export {};
+//# sourceMappingURL=UrlDeduplicator.service.d.ts.map
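
tryAdd is the atomic check-and-insert that lets concurrent workers share one visited-set without a separate contains/add race. A sketch, assuming the service is re-exported from the package root and that the generated UrlDeduplicatorService.Default layer (which, per the dependencies list above, should already bundle SpiderConfigService) is available:

```typescript
import { Effect } from 'effect';
// Hypothetical root import; only the declaration file appears in this diff.
import { UrlDeduplicatorService } from '@jambudipa/spider';

const program = Effect.gen(function* () {
  const dedup = yield* UrlDeduplicatorService;
  const first = yield* dedup.tryAdd('https://example.com/a');  // true: first sighting
  const repeat = yield* dedup.tryAdd('https://example.com/a'); // false: already seen
  const count = yield* dedup.size();                           // 1
  return { first, repeat, count };
}).pipe(Effect.provide(UrlDeduplicatorService.Default));
```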

package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"UrlDeduplicator.service.d.ts","sourceRoot":"","sources":["../../../src/lib/UrlDeduplicator/UrlDeduplicator.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,QAAQ,CAAC;AAGhD;;;;;;;;;GASG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE7C;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/C;;;;OAIG;IACH,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9B;;;;OAIG;IACH,KAAK,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CAC9B;;;sBA4EqB,MAAM;wBAcJ,MAAM;;;;;;AAxF9B;;;;;GAKG;AACH,qBAAa,sBAAuB,SAAQ,2BAuG3C;CAAG"}