@jambudipa/spider 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +12 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +321 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export type { ISpider, ISpiderScheduler, IMiddlewareManager, IRateLimitMiddleware, ILoggingMiddleware, IUserAgentMiddleware, IStatsMiddleware, } from './lib/api-facades.js';
|
|
2
2
|
export * from './lib/Spider/Spider.service.js';
|
|
3
|
+
export { SPIDER_DEFAULTS } from './lib/Spider/Spider.defaults.js';
|
|
3
4
|
export * from './lib/Robots/Robots.service.js';
|
|
4
5
|
export * from './lib/Scraper/Scraper.service.js';
|
|
5
6
|
export * from './lib/PageData/PageData.js';
|
|
@@ -20,16 +21,24 @@ export { StateDelta, PersistenceError as ResumabilityError, DEFAULT_HYBRID_CONFI
|
|
|
20
21
|
export { ResumabilityService, ResumabilityConfigs, createStateOperation, } from './lib/Resumability/Resumability.service.js';
|
|
21
22
|
export { FullStatePersistence, DeltaPersistence, HybridPersistence, } from './lib/Resumability/strategies.js';
|
|
22
23
|
export { FileStorageBackend } from './lib/Resumability/backends/FileStorageBackend.js';
|
|
24
|
+
export type { DatabaseClientInterface, PostgresStorageConfig, } from './lib/Resumability/backends/PostgresStorageBackend.js';
|
|
25
|
+
export { PostgresStorageBackend } from './lib/Resumability/backends/PostgresStorageBackend.js';
|
|
23
26
|
export { NetworkError, ResponseError, RobotsTxtError, ConfigurationError, MiddlewareError, FileSystemError, PersistenceError, ContentTypeError, RequestAbortError, AdapterNotInitialisedError, BrowserError, BrowserCleanupError, TimeoutError, ParseError, ValidationError, PageError, StateError, SessionError, CrawlError, QueueError, ConfigError, isSpiderError, isNetworkError, isBrowserError, } from './lib/errors/effect-errors.js';
|
|
24
27
|
export type { SpiderError, AllSpiderErrors } from './lib/errors/effect-errors.js';
|
|
25
28
|
export type { SpiderLogEvent, SpiderLogger, } from './lib/Logging/SpiderLogger.service.js';
|
|
26
29
|
export { SpiderLogger as SpiderLoggerTag, makeSpiderLogger, SpiderLoggerLive, } from './lib/Logging/SpiderLogger.service.js';
|
|
30
|
+
export type { LoggingFetchFn } from './lib/Logging/FetchLogger.js';
|
|
31
|
+
export { FetchError, makeLoggingFetch, LoggingFetch, } from './lib/Logging/FetchLogger.js';
|
|
27
32
|
export type { CookieManagerService, EnhancedHttpClientService, HttpRequestOptions, HttpResponse, Session, Credentials, SessionStoreService, TokenInfo, TokenExtractorService, } from './lib/HttpClient/index.js';
|
|
28
33
|
export { CookieManager, makeCookieManager, CookieManagerLive, EnhancedHttpClient, makeEnhancedHttpClient, EnhancedHttpClientLive, SessionStore, makeSessionStore, SessionStoreLive, TokenExtractor, makeTokenExtractor, TokenExtractorLive, } from './lib/HttpClient/index.js';
|
|
29
34
|
export type { Token, StateManagerService } from './lib/StateManager/index.js';
|
|
30
|
-
export { TokenType, StateManager, makeStateManager, StateManagerLive, } from './lib/StateManager/index.js';
|
|
35
|
+
export { TokenType, StateManager, makeStateManager, StateManagerLive, CSRFTokenNotFoundError, APITokenNotFoundError, TokenNotFoundError, TokenExpiredError, StorageKeyNotFoundError, } from './lib/StateManager/index.js';
|
|
31
36
|
export type { BrowserEngineConfig, BrowserEngineServiceInterface, PageElement, } from './lib/BrowserEngine/BrowserEngine.service.js';
|
|
32
37
|
export { BrowserEngineService, BrowserEngineLive, BrowserEngineWithConfig, withBrowser, } from './lib/BrowserEngine/BrowserEngine.service.js';
|
|
33
|
-
export type { LoginCredentials, ScrapingSession, WebScrapingEngineService, } from './lib/WebScrapingEngine/index.js';
|
|
34
|
-
export { WebScrapingEngine, makeWebScrapingEngine, WebScrapingEngineLive, } from './lib/WebScrapingEngine/index.js';
|
|
38
|
+
export type { LoginCredentials, ScrapingSession, WebScrapingEngineService, WebScrapingEngineError, HttpOperationError, HttpPostOperationError, } from './lib/WebScrapingEngine/index.js';
|
|
39
|
+
export { WebScrapingEngine, makeWebScrapingEngine, WebScrapingEngineLive, LoginError, SessionNotValidError, SessionLoadError, } from './lib/WebScrapingEngine/index.js';
|
|
40
|
+
export { WorkerHealthMonitor } from './lib/WorkerHealth/WorkerHealthMonitor.service.js';
|
|
41
|
+
export type { DeduplicationStrategy, UrlWithMetadata, NormalizedUrl, } from './lib/utils/url-deduplication.js';
|
|
42
|
+
export { DEFAULT_DEDUPLICATION_STRATEGY, parseUrl, normalizeUrl, deduplicateUrls, createUrlDeduplicator, } from './lib/utils/url-deduplication.js';
|
|
43
|
+
export { safeJsonParse, toOption, fromPromise, cleanupResources, matchOption, } from './lib/utils/effect-migration.js';
|
|
35
44
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,YAAY,EACV,OAAO,EACP,gBAAgB,EAChB,kBAAkB,EAClB,oBAAoB,EACpB,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,sBAAsB,CAAC;AAG9B,cAAc,gCAAgC,CAAC;AAC/C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,4BAA4B,CAAC;AAG3C,YAAY,EACV,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,YAAY,EACZ,gBAAgB,GACjB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EAAE,gBAAgB,EAAE,MAAM,kDAAkD,CAAC;AACzF,OAAO,EAAE,sBAAsB,EAAE,MAAM,kDAAkD,CAAC;AAG1F,YAAY,EAAE,gBAAgB,EAAE,MAAM,4CAA4C,CAAC;AACnF,OAAO,EACL,sBAAsB,EACtB,cAAc,EACd,eAAe,EACf,WAAW,GACZ,MAAM,4CAA4C,CAAC;AAGpD,YAAY,EACV,gBAAgB,EAChB,aAAa,EACb,cAAc,GACf,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,mBAAmB,EACnB,eAAe,GAChB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,6BAA6B,GAC9B,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,WAAW,EACX,SAAS,EACT,2BAA2B,GAC5B,MAAM,gCAAgC,CAAC;AAGxC,YAAY,EACV,mBAAmB,EACnB,cAAc,EACd,cAAc,EACd,mBAAmB,EACnB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AACrC,YAAY,EAAE,kBAAkB,EAAE,MAAM,4CAA4C,CAAC;AACrF,OAAO,EACL,UAAU,EACV,gBAAgB,IAAI,iBAAiB,EACrC,qBAAqB,GACtB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EACL,mBAAmB,EACnB,mBAAmB,EACnB,oBAAoB,GACrB,MAAM,4CAA4C,CAAC;AACpD,OAAO,EACL,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,kBAAkB,EAAE,MAAM,mDAAmD,CAAC;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,YAAY,EACV,OAAO,EACP,gBAAgB,EAChB,kBAAkB,EAClB,oBAAoB,EACpB,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,sBAAsB,CAAC;AAG9B,cAAc,gCAAgC,CAAC;AAC/C,OAAO,EAAE,eAAe,EAAE,MAAM,iCAAiC,CAAC;AAClE,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,4BAA4B,CAAC;AAG3C,YAAY,EACV,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,YAAY,EACZ,gBAAgB,GACjB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EAAE,gBAAgB,EAAE,MAAM,kDAAkD,CAAC;AACzF,OAAO,EAAE,sBAAsB,EAAE,MAAM,kDAAkD,CAAC;AAG1F,YAAY,EAAE,gBAAgB,EAAE,MAAM,4CAA4C,CAAC;AACnF,OAAO,EACL,sBAAsB,EACtB,cAAc,EACd,eAAe,EACf,WAAW,GACZ,MAAM,4CAA4C,CAAC;AAGpD,YAAY,EACV,gBAAgB,EAChB,aAAa,EACb,cAAc,GACf,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,mBAAmB,EACnB,eAAe,GAChB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,6BAA6B,GAC9B,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,WAAW,EACX,SAAS,EACT,2BAA2B,GAC5B,MAAM,gCAAgC,CAAC;AAGxC,YAAY,EACV,mBAAmB,EACnB,cAAc,EACd,cAAc,EACd,mBAAmB,EACnB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AACrC,YAAY,EAAE,kBAAkB,EAAE,MAAM,4CAA4C,CAAC;AACrF,OAAO,EACL,UAAU,EACV,gBAAgB,IAAI,iBAAiB,EACrC,qBAAqB,GACtB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EACL,mBAAmB,EACnB,mBAAmB,EACnB,oBAAoB,GACrB,MAAM,4CAA4C,CAAC;AACpD,OAAO,EACL,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,kBAAkB,EAAE,MAAM,mDAAmD,CAAC;AACvF,YAAY,EACV,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,uDAAuD,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,uDAAuD,CAAC;AAG/F,OAAO,EACL,YAAY,EACZ,aAAa,EACb,cAAc,EACd,kBAAkB,EAClB,eAAe,EACf,eAAe,EACf,gBAAgB,EAChB,gBAAgB,EAChB,iBAAiB,EACjB,0BAA0B,EAC1B,YAAY,EACZ,mBAAmB,EACnB,YAAY,EACZ,UAAU,EACV,eAAe,EACf,SAAS,EACT,UAAU,EACV,YAAY,EACZ,UAAU,EACV,UAAU,EACV,WAAW,EACX,aAAa,EACb,cAAc,EACd,cAAc,GACf,MAAM,+BAA+B,CAAC;AACvC,YAAY,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAGlF,YAAY,EACV,cAAc,EACd,YAAY,GACb,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EACL,YAAY,IAAI,eAAe,EAC/B,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,uCAAuC,CAAC;AAG/C,YAAY,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AACnE,OAAO,EACL,UAAU,EACV,gBAAgB,EAChB,YAAY,GACb,MAAM,8BAA8B,CAAC;AAGtC,YAAY,EACV,oBAAoB,EACpB,yBAAyB,EACzB,kBAAkB,EAClB,YAAY,EACZ,OAAO,EACP,WAAW,EACX,mBAAmB,EACnB,SAAS,EACT,qBAAqB,GACtB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,sBAAsB,EACtB,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,cAAc,EACd,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,2BAA2B,CAAC;AAGnC,YAAY,EAAE,KAAK,EAAE,mBAAmB,EAAE,MAAM,6BAA6B,CAAC;AAC9E,OAAO,EACL,SAAS,EACT,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,sBAAsB,EACtB,qBAAqB,EACrB,kBAAkB,EAClB,iBAAiB,EACjB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AAGrC,YAAY,EACV,mBAAmB,EACnB,6BAA6B,EAC7B,WAAW,GACZ,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,GACZ,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,gBAAgB,EAChB,eAAe,EACf,wBAAwB,EACxB,sBAAsB,EACtB,kBAAkB,EAClB,sBAAsB,GACvB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EACL,iBAAiB,EACjB,qBAAqB,EACrB,qBAAqB,EACrB,UAAU,EACV,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,kCAAkC,CAAC;AAG1C,OAAO,EAAE,mBAAmB,EAAE,MAAM,mDAAmD,CAAC;AAGxF,YAAY,EACV,qBAAqB,EACrB,eAAe,EACf,aAAa,GACd,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EACL,8BAA8B,EAC9B,QAAQ,EACR,YAAY,EACZ,eAAe,EACf,qBAAqB,GACtB,MAAM,kCAAkC,CAAC;AAG1C,OAAO,EACL,aAAa,EACb,QAAQ,EACR,WAAW,EACX,gBAAgB,EAChB,WAAW,GACZ,MAAM,iCAAiC,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct
|
|
1
|
+
import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, Ref, HashSet, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct } from "effect";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
3
|
import * as fs from "fs";
|
|
4
4
|
import * as path from "path";
|
|
@@ -1747,6 +1747,62 @@ const deduplicateUrls = (urls, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Eff
|
|
|
1747
1747
|
}
|
|
1748
1748
|
};
|
|
1749
1749
|
});
|
|
1750
|
+
const createUrlDeduplicator = (strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
|
|
1751
|
+
const seenUrls = yield* Ref.make(HashSet.empty());
|
|
1752
|
+
const urlStats = yield* Ref.make({
|
|
1753
|
+
processed: 0,
|
|
1754
|
+
unique: 0,
|
|
1755
|
+
duplicates: 0
|
|
1756
|
+
});
|
|
1757
|
+
return {
|
|
1758
|
+
/**
|
|
1759
|
+
* Check if a URL has been seen (after normalization)
|
|
1760
|
+
*/
|
|
1761
|
+
hasSeenUrl: (url) => Effect.gen(function* () {
|
|
1762
|
+
const normalized = yield* normalizeUrl(url, strategy);
|
|
1763
|
+
const seen = yield* Ref.get(seenUrls);
|
|
1764
|
+
return HashSet.has(seen, normalized.normalized);
|
|
1765
|
+
}),
|
|
1766
|
+
/**
|
|
1767
|
+
* Add a URL to the seen set
|
|
1768
|
+
*/
|
|
1769
|
+
markUrlSeen: (url) => Effect.gen(function* () {
|
|
1770
|
+
const normalized = yield* normalizeUrl(url, strategy);
|
|
1771
|
+
const seen = yield* Ref.get(seenUrls);
|
|
1772
|
+
if (HashSet.has(seen, normalized.normalized)) {
|
|
1773
|
+
yield* Ref.update(urlStats, (stats) => ({
|
|
1774
|
+
...stats,
|
|
1775
|
+
processed: stats.processed + 1,
|
|
1776
|
+
duplicates: stats.duplicates + 1
|
|
1777
|
+
}));
|
|
1778
|
+
return false;
|
|
1779
|
+
} else {
|
|
1780
|
+
yield* Ref.set(seenUrls, HashSet.add(seen, normalized.normalized));
|
|
1781
|
+
yield* Ref.update(urlStats, (stats) => ({
|
|
1782
|
+
...stats,
|
|
1783
|
+
processed: stats.processed + 1,
|
|
1784
|
+
unique: stats.unique + 1
|
|
1785
|
+
}));
|
|
1786
|
+
return true;
|
|
1787
|
+
}
|
|
1788
|
+
}),
|
|
1789
|
+
/**
|
|
1790
|
+
* Get deduplication statistics
|
|
1791
|
+
*/
|
|
1792
|
+
getStats: () => Ref.get(urlStats),
|
|
1793
|
+
/**
|
|
1794
|
+
* Reset the deduplicator
|
|
1795
|
+
*/
|
|
1796
|
+
reset: () => Effect.gen(function* () {
|
|
1797
|
+
yield* Ref.set(seenUrls, HashSet.empty());
|
|
1798
|
+
yield* Ref.set(urlStats, {
|
|
1799
|
+
processed: 0,
|
|
1800
|
+
unique: 0,
|
|
1801
|
+
duplicates: 0
|
|
1802
|
+
});
|
|
1803
|
+
})
|
|
1804
|
+
};
|
|
1805
|
+
});
|
|
1750
1806
|
const SPIDER_DEFAULTS = Object.freeze({
|
|
1751
1807
|
/** Threshold in ms after which a worker is considered stale (60s) */
|
|
1752
1808
|
STALE_WORKER_THRESHOLD_MS: 6e4,
|
|
@@ -5080,6 +5136,120 @@ const ResumabilityConfigs = {
|
|
|
5080
5136
|
backend: new PostgresStorageBackend(dbClient, config)
|
|
5081
5137
|
})
|
|
5082
5138
|
};
|
|
5139
|
+
class FetchError extends Data.TaggedError("FetchError") {
|
|
5140
|
+
get message() {
|
|
5141
|
+
return `Fetch failed for ${this.url}: ${this.reason} after ${this.durationMs}ms`;
|
|
5142
|
+
}
|
|
5143
|
+
}
|
|
5144
|
+
const makeLoggingFetch = Effect.gen(function* () {
|
|
5145
|
+
const logger = yield* SpiderLogger;
|
|
5146
|
+
return (url, options) => Effect.gen(function* () {
|
|
5147
|
+
const startTime = yield* DateTime.now;
|
|
5148
|
+
const startMs = DateTime.toEpochMillis(startTime);
|
|
5149
|
+
const domain = new URL(url).hostname;
|
|
5150
|
+
const optionDetails = Option.fromNullable(options).pipe(
|
|
5151
|
+
Option.map((opts) => ({
|
|
5152
|
+
method: opts.method,
|
|
5153
|
+
headers: Object.keys(opts.headers ?? {})
|
|
5154
|
+
}))
|
|
5155
|
+
);
|
|
5156
|
+
yield* logger.logEvent({
|
|
5157
|
+
type: "edge_case",
|
|
5158
|
+
domain,
|
|
5159
|
+
url,
|
|
5160
|
+
message: "[FETCH_START] Starting fetch request",
|
|
5161
|
+
details: {
|
|
5162
|
+
case: "fetch_start",
|
|
5163
|
+
url,
|
|
5164
|
+
timestamp: DateTime.formatIso(startTime),
|
|
5165
|
+
options: Option.getOrUndefined(optionDetails)
|
|
5166
|
+
}
|
|
5167
|
+
});
|
|
5168
|
+
const fetchEffect = Effect.tryPromise({
|
|
5169
|
+
try: () => globalThis.fetch(url, options),
|
|
5170
|
+
catch: (error) => new FetchError({
|
|
5171
|
+
url,
|
|
5172
|
+
reason: "network",
|
|
5173
|
+
durationMs: 0,
|
|
5174
|
+
// Duration will be calculated in error handler
|
|
5175
|
+
cause: error
|
|
5176
|
+
})
|
|
5177
|
+
});
|
|
5178
|
+
const timeoutDuration = Duration.seconds(30);
|
|
5179
|
+
const fetchWithTimeout = fetchEffect.pipe(
|
|
5180
|
+
Effect.timeoutOption(timeoutDuration),
|
|
5181
|
+
Effect.flatMap(
|
|
5182
|
+
(maybeResponse) => Option.match(maybeResponse, {
|
|
5183
|
+
onNone: () => Effect.gen(function* () {
|
|
5184
|
+
const currentTime = yield* DateTime.now;
|
|
5185
|
+
const durationMs2 = DateTime.toEpochMillis(currentTime) - startMs;
|
|
5186
|
+
yield* logger.logEvent({
|
|
5187
|
+
type: "edge_case",
|
|
5188
|
+
domain,
|
|
5189
|
+
url,
|
|
5190
|
+
message: `[FETCH_ABORT] Aborting fetch after ${durationMs2}ms`,
|
|
5191
|
+
details: {
|
|
5192
|
+
case: "fetch_abort",
|
|
5193
|
+
url,
|
|
5194
|
+
durationMs: durationMs2,
|
|
5195
|
+
reason: "timeout"
|
|
5196
|
+
}
|
|
5197
|
+
});
|
|
5198
|
+
return yield* Effect.fail(
|
|
5199
|
+
new FetchError({
|
|
5200
|
+
url,
|
|
5201
|
+
reason: "timeout",
|
|
5202
|
+
durationMs: Number(durationMs2)
|
|
5203
|
+
})
|
|
5204
|
+
);
|
|
5205
|
+
}),
|
|
5206
|
+
onSome: (response2) => Effect.succeed(response2)
|
|
5207
|
+
})
|
|
5208
|
+
)
|
|
5209
|
+
);
|
|
5210
|
+
const response = yield* fetchWithTimeout.pipe(
|
|
5211
|
+
Effect.catchAll(
|
|
5212
|
+
(error) => Effect.gen(function* () {
|
|
5213
|
+
const currentTime = yield* DateTime.now;
|
|
5214
|
+
const durationMs2 = DateTime.toEpochMillis(currentTime) - startMs;
|
|
5215
|
+
yield* logger.logEvent({
|
|
5216
|
+
type: "edge_case",
|
|
5217
|
+
domain,
|
|
5218
|
+
url,
|
|
5219
|
+
message: `[FETCH_ERROR] Failed after ${durationMs2}ms`,
|
|
5220
|
+
details: {
|
|
5221
|
+
case: "fetch_failed",
|
|
5222
|
+
url,
|
|
5223
|
+
durationMs: durationMs2,
|
|
5224
|
+
error: error._tag,
|
|
5225
|
+
message: error.message,
|
|
5226
|
+
isAborted: error.reason === "timeout"
|
|
5227
|
+
}
|
|
5228
|
+
});
|
|
5229
|
+
return yield* Effect.fail(error);
|
|
5230
|
+
})
|
|
5231
|
+
)
|
|
5232
|
+
);
|
|
5233
|
+
const endTime = yield* DateTime.now;
|
|
5234
|
+
const durationMs = DateTime.toEpochMillis(endTime) - startMs;
|
|
5235
|
+
yield* logger.logEvent({
|
|
5236
|
+
type: "edge_case",
|
|
5237
|
+
domain,
|
|
5238
|
+
url,
|
|
5239
|
+
message: `[FETCH_SUCCESS] Got response in ${durationMs}ms`,
|
|
5240
|
+
details: {
|
|
5241
|
+
case: "fetch_success",
|
|
5242
|
+
url,
|
|
5243
|
+
durationMs,
|
|
5244
|
+
status: response.status,
|
|
5245
|
+
statusText: response.statusText,
|
|
5246
|
+
contentType: response.headers.get("content-type")
|
|
5247
|
+
}
|
|
5248
|
+
});
|
|
5249
|
+
return response;
|
|
5250
|
+
});
|
|
5251
|
+
});
|
|
5252
|
+
const LoggingFetch = Context.GenericTag("LoggingFetch");
|
|
5083
5253
|
class JsonParseError extends Data.TaggedError("JsonParseError") {
|
|
5084
5254
|
get message() {
|
|
5085
5255
|
const preview = this.input.length > 100 ? `${this.input.substring(0, 100)}...` : this.input;
|
|
@@ -6980,23 +7150,153 @@ const WebScrapingEngineLive = Layer.effect(
|
|
|
6980
7150
|
WebScrapingEngine,
|
|
6981
7151
|
makeWebScrapingEngine
|
|
6982
7152
|
);
|
|
7153
|
+
class WorkerHealthMonitor extends Effect.Service()(
|
|
7154
|
+
"@jambudipa.io/WorkerHealthMonitor",
|
|
7155
|
+
{
|
|
7156
|
+
effect: Effect.gen(function* () {
|
|
7157
|
+
const logger = yield* SpiderLogger;
|
|
7158
|
+
const workers = yield* Ref.make(HashMap.empty());
|
|
7159
|
+
const stuckThresholdMs = 6e4;
|
|
7160
|
+
return {
|
|
7161
|
+
/**
|
|
7162
|
+
* Register a worker's activity
|
|
7163
|
+
*/
|
|
7164
|
+
recordActivity: (workerId, domain, activity) => Effect.gen(function* () {
|
|
7165
|
+
const now = DateTime.unsafeNow();
|
|
7166
|
+
yield* Ref.update(workers, (map) => {
|
|
7167
|
+
const current = HashMap.get(map, workerId).pipe(
|
|
7168
|
+
(opt) => opt._tag === "Some" ? opt.value : {
|
|
7169
|
+
workerId,
|
|
7170
|
+
domain,
|
|
7171
|
+
lastActivity: now
|
|
7172
|
+
}
|
|
7173
|
+
);
|
|
7174
|
+
const updated = {
|
|
7175
|
+
...current,
|
|
7176
|
+
domain,
|
|
7177
|
+
lastActivity: now,
|
|
7178
|
+
currentUrl: activity.url ?? current.currentUrl,
|
|
7179
|
+
fetchStartTime: activity.fetchStart ? now : current.fetchStartTime
|
|
7180
|
+
};
|
|
7181
|
+
return HashMap.set(map, workerId, updated);
|
|
7182
|
+
});
|
|
7183
|
+
}),
|
|
7184
|
+
/**
|
|
7185
|
+
* Remove a worker from monitoring
|
|
7186
|
+
*/
|
|
7187
|
+
removeWorker: (workerId) => Ref.update(workers, (map) => HashMap.remove(map, workerId)),
|
|
7188
|
+
/**
|
|
7189
|
+
* Get stuck workers
|
|
7190
|
+
*/
|
|
7191
|
+
getStuckWorkers: Effect.gen(function* () {
|
|
7192
|
+
const now = DateTime.unsafeNow();
|
|
7193
|
+
const workerMap = yield* Ref.get(workers);
|
|
7194
|
+
const stuck = [];
|
|
7195
|
+
for (const [, status] of workerMap) {
|
|
7196
|
+
const inactiveMs = DateTime.toEpochMillis(now) - DateTime.toEpochMillis(status.lastActivity);
|
|
7197
|
+
if (inactiveMs > stuckThresholdMs) {
|
|
7198
|
+
stuck.push(status);
|
|
7199
|
+
}
|
|
7200
|
+
}
|
|
7201
|
+
return stuck;
|
|
7202
|
+
}),
|
|
7203
|
+
/**
|
|
7204
|
+
* Monitor workers and log stuck ones
|
|
7205
|
+
*/
|
|
7206
|
+
startMonitoring: Effect.gen(function* () {
|
|
7207
|
+
const self = {
|
|
7208
|
+
getStuckWorkers: Effect.gen(function* () {
|
|
7209
|
+
const now = DateTime.unsafeNow();
|
|
7210
|
+
const workerMap = yield* Ref.get(workers);
|
|
7211
|
+
const stuck = [];
|
|
7212
|
+
for (const [, status] of workerMap) {
|
|
7213
|
+
const inactiveMs = DateTime.toEpochMillis(now) - DateTime.toEpochMillis(status.lastActivity);
|
|
7214
|
+
if (inactiveMs > stuckThresholdMs) {
|
|
7215
|
+
stuck.push(status);
|
|
7216
|
+
}
|
|
7217
|
+
}
|
|
7218
|
+
return stuck;
|
|
7219
|
+
})
|
|
7220
|
+
};
|
|
7221
|
+
yield* Effect.repeat(
|
|
7222
|
+
Effect.gen(function* () {
|
|
7223
|
+
const stuck = yield* self.getStuckWorkers;
|
|
7224
|
+
if (stuck.length > 0) {
|
|
7225
|
+
for (const worker of stuck) {
|
|
7226
|
+
const nowMillis = DateTime.toEpochMillis(DateTime.unsafeNow());
|
|
7227
|
+
const inactiveMs = nowMillis - DateTime.toEpochMillis(worker.lastActivity);
|
|
7228
|
+
yield* logger.logEdgeCase(
|
|
7229
|
+
worker.domain,
|
|
7230
|
+
"worker_stuck_detected",
|
|
7231
|
+
{
|
|
7232
|
+
workerId: worker.workerId,
|
|
7233
|
+
currentUrl: worker.currentUrl,
|
|
7234
|
+
lastActivity: DateTime.formatIso(worker.lastActivity),
|
|
7235
|
+
inactiveMs,
|
|
7236
|
+
fetchStartTime: Option.fromNullable(worker.fetchStartTime).pipe(
|
|
7237
|
+
Option.map(DateTime.formatIso),
|
|
7238
|
+
Option.getOrElse(() => "N/A")
|
|
7239
|
+
)
|
|
7240
|
+
}
|
|
7241
|
+
);
|
|
7242
|
+
}
|
|
7243
|
+
}
|
|
7244
|
+
}),
|
|
7245
|
+
Schedule.fixed(Duration.seconds(30))
|
|
7246
|
+
);
|
|
7247
|
+
})
|
|
7248
|
+
};
|
|
7249
|
+
})
|
|
7250
|
+
}
|
|
7251
|
+
) {
|
|
7252
|
+
}
|
|
7253
|
+
const safeJsonParse = (data, onError) => Schema.decodeUnknown(Schema.parseJson(Schema.Unknown))(data).pipe(
|
|
7254
|
+
Effect.mapError(onError)
|
|
7255
|
+
);
|
|
7256
|
+
const toOption = (value, logContext) => {
|
|
7257
|
+
const result = Option.fromNullable(value);
|
|
7258
|
+
if (logContext && Option.isNone(result)) {
|
|
7259
|
+
return Effect.logDebug(`[Migration] Null value encountered: ${logContext}`).pipe(
|
|
7260
|
+
Effect.map(() => result)
|
|
7261
|
+
);
|
|
7262
|
+
}
|
|
7263
|
+
return Effect.succeed(result);
|
|
7264
|
+
};
|
|
7265
|
+
const fromPromise = (promise, onError) => Effect.tryPromise({
|
|
7266
|
+
try: promise,
|
|
7267
|
+
catch: onError
|
|
7268
|
+
});
|
|
7269
|
+
const cleanupResources = (resources) => Effect.all(
|
|
7270
|
+
resources.map(
|
|
7271
|
+
({ id, cleanup, onError }) => Effect.tryPromise({
|
|
7272
|
+
try: cleanup,
|
|
7273
|
+
catch: (error) => onError(id, error)
|
|
7274
|
+
})
|
|
7275
|
+
),
|
|
7276
|
+
{ mode: "either" }
|
|
7277
|
+
);
|
|
7278
|
+
const matchOption = (option, onNone, onSome) => Option.match(option, { onNone, onSome });
|
|
6983
7279
|
export {
|
|
7280
|
+
APITokenNotFoundError,
|
|
6984
7281
|
AdapterNotInitialisedError,
|
|
6985
7282
|
BrowserCleanupError,
|
|
6986
7283
|
BrowserEngineLive,
|
|
6987
7284
|
BrowserEngineService,
|
|
6988
7285
|
BrowserEngineWithConfig,
|
|
6989
7286
|
BrowserError,
|
|
7287
|
+
CSRFTokenNotFoundError,
|
|
6990
7288
|
ConfigError,
|
|
6991
7289
|
ConfigurationError,
|
|
6992
7290
|
ContentTypeError,
|
|
6993
7291
|
CookieManager,
|
|
6994
7292
|
CookieManagerLive,
|
|
6995
7293
|
CrawlError,
|
|
7294
|
+
DEFAULT_DEDUPLICATION_STRATEGY,
|
|
6996
7295
|
DEFAULT_HYBRID_CONFIG,
|
|
6997
7296
|
DeltaPersistence,
|
|
6998
7297
|
EnhancedHttpClient,
|
|
6999
7298
|
EnhancedHttpClientLive,
|
|
7299
|
+
FetchError,
|
|
7000
7300
|
FileStorageBackend,
|
|
7001
7301
|
FileSystemError,
|
|
7002
7302
|
FullStatePersistence,
|
|
@@ -7004,7 +7304,9 @@ export {
|
|
|
7004
7304
|
LinkExtractionError,
|
|
7005
7305
|
LinkExtractorService,
|
|
7006
7306
|
LinkExtractorServiceLayer,
|
|
7307
|
+
LoggingFetch,
|
|
7007
7308
|
LoggingMiddleware,
|
|
7309
|
+
LoginError,
|
|
7008
7310
|
MiddlewareError,
|
|
7009
7311
|
MiddlewareManager,
|
|
7010
7312
|
NetworkError,
|
|
@@ -7012,6 +7314,7 @@ export {
|
|
|
7012
7314
|
PageError,
|
|
7013
7315
|
ParseError,
|
|
7014
7316
|
PersistenceError$1 as PersistenceError,
|
|
7317
|
+
PostgresStorageBackend,
|
|
7015
7318
|
PriorityRequest,
|
|
7016
7319
|
QueueError,
|
|
7017
7320
|
RateLimitMiddleware,
|
|
@@ -7022,8 +7325,11 @@ export {
|
|
|
7022
7325
|
ResumabilityService,
|
|
7023
7326
|
RobotsService,
|
|
7024
7327
|
RobotsTxtError,
|
|
7328
|
+
SPIDER_DEFAULTS,
|
|
7025
7329
|
ScraperService,
|
|
7026
7330
|
SessionError$1 as SessionError,
|
|
7331
|
+
SessionLoadError,
|
|
7332
|
+
SessionNotValidError,
|
|
7027
7333
|
SessionStore,
|
|
7028
7334
|
SessionStoreLive,
|
|
7029
7335
|
SpiderConfig,
|
|
@@ -7038,27 +7344,41 @@ export {
|
|
|
7038
7344
|
StateManager,
|
|
7039
7345
|
StateManagerLive,
|
|
7040
7346
|
StatsMiddleware,
|
|
7347
|
+
StorageKeyNotFoundError,
|
|
7041
7348
|
TimeoutError,
|
|
7349
|
+
TokenExpiredError,
|
|
7042
7350
|
TokenExtractor,
|
|
7043
7351
|
TokenExtractorLive,
|
|
7352
|
+
TokenNotFoundError,
|
|
7044
7353
|
TokenType,
|
|
7045
7354
|
UrlDeduplicatorService,
|
|
7046
7355
|
UserAgentMiddleware,
|
|
7047
7356
|
ValidationError,
|
|
7048
7357
|
WebScrapingEngine,
|
|
7049
7358
|
WebScrapingEngineLive,
|
|
7359
|
+
WorkerHealthMonitor,
|
|
7360
|
+
cleanupResources,
|
|
7050
7361
|
createStateOperation,
|
|
7362
|
+
createUrlDeduplicator,
|
|
7363
|
+
deduplicateUrls,
|
|
7364
|
+
fromPromise,
|
|
7051
7365
|
isBrowserError,
|
|
7052
7366
|
isNetworkError,
|
|
7053
7367
|
isSpiderError,
|
|
7054
7368
|
makeCookieManager,
|
|
7055
7369
|
makeEnhancedHttpClient,
|
|
7370
|
+
makeLoggingFetch,
|
|
7056
7371
|
makeSessionStore,
|
|
7057
7372
|
makeSpiderConfig,
|
|
7058
7373
|
makeSpiderLogger,
|
|
7059
7374
|
makeStateManager,
|
|
7060
7375
|
makeTokenExtractor,
|
|
7061
7376
|
makeWebScrapingEngine,
|
|
7377
|
+
matchOption,
|
|
7378
|
+
normalizeUrl,
|
|
7379
|
+
parseUrl,
|
|
7380
|
+
safeJsonParse,
|
|
7381
|
+
toOption,
|
|
7062
7382
|
withBrowser
|
|
7063
7383
|
};
|
|
7064
7384
|
//# sourceMappingURL=index.js.map
|