@jambudipa/spider 0.1.0
- package/LICENSE +21 -0
- package/README.md +426 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4681 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +8 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +194 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +99 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/package.json +108 -0
package/dist/index.js
ADDED
@@ -0,0 +1,4681 @@
import { Effect, Layer, MutableHashSet, Schema, Data, Context, Console, MutableHashMap, Option, Queue, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Ref } from "effect";
import * as cheerio from "cheerio";
import * as fs from "fs";
import * as path from "path";
import * as fs$1 from "fs/promises";
import { CookieJar } from "tough-cookie";
class SpiderConfig extends Effect.Service()(
  "@jambudipa.io/SpiderConfig",
  {
    effect: Effect.sync(() => makeSpiderConfig({}))
  }
) {
  /**
   * Creates a Layer that provides SpiderConfig with custom options
   * @param config - The configuration options or a pre-made SpiderConfigService
   */
  static Live = (config) => Layer.effect(
    SpiderConfig,
    Effect.succeed("getOptions" in config ? config : makeSpiderConfig(config))
  );
}
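// Illustrative usage sketch (not part of the published file): SpiderConfig.Live
// accepts either a plain options object or a pre-built service (detected via the
// "getOptions" probe above). Option names mirror the defaults in makeSpiderConfig
// below; the values here are arbitrary examples.
//
//   const configLayer = SpiderConfig.Live({
//     userAgent: "MyBot/1.0",
//     requestDelayMs: 500,
//     maxConcurrentWorkers: 2
//   });
//   // Effects that need SpiderConfig can then be run with
//   // Effect.provide(program, configLayer).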
const FILE_EXTENSION_CATEGORIES = {
  /** Archive files (8 extensions) */
  archives: [".7z", ".7zip", ".bz2", ".rar", ".tar", ".tar.gz", ".xz", ".zip"],
  /** Image files (20 extensions) */
  images: [
    ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif",
    ".tiff", ".ai", ".drw", ".dxf", ".eps", ".ps", ".svg", ".cdr", ".ico", ".webp"
  ],
  /** Audio files (9 extensions) */
  audio: [".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".au", ".aiff"],
  /** Video files (15 extensions) */
  video: [
    ".3gp", ".asf", ".asx", ".avi", ".mov", ".mp4", ".mpg", ".qt", ".rm", ".swf",
    ".wmv", ".m4a", ".m4v", ".flv", ".webm"
  ],
  /** Office documents (21 extensions) */
  officeDocuments: [
    ".xls", ".xlsm", ".xlsx", ".xltm", ".xltx", ".potm", ".potx", ".ppt", ".pptm", ".pptx",
    ".pps", ".doc", ".docb", ".docm", ".docx", ".dotm", ".dotx", ".odt", ".ods", ".odg", ".odp"
  ],
  /** Other files (18 extensions) */
  other: [
    ".css", ".pdf", ".exe", ".bin", ".rss", ".dmg", ".iso", ".apk", ".jar",
    ".sh", ".rb", ".js", ".hta", ".bat", ".cpl", ".msi", ".msp", ".py"
  ]
};
const generateSkipExtensions = (filters) => {
  const skipExtensions = [];
  if (filters.filterArchives) {
    skipExtensions.push(...FILE_EXTENSION_CATEGORIES.archives);
  }
  if (filters.filterImages) {
    skipExtensions.push(...FILE_EXTENSION_CATEGORIES.images);
  }
  if (filters.filterAudio) {
    skipExtensions.push(...FILE_EXTENSION_CATEGORIES.audio);
  }
  if (filters.filterVideo) {
    skipExtensions.push(...FILE_EXTENSION_CATEGORIES.video);
  }
  if (filters.filterOfficeDocuments) {
    skipExtensions.push(...FILE_EXTENSION_CATEGORIES.officeDocuments);
  }
  if (filters.filterOther) {
    skipExtensions.push(...FILE_EXTENSION_CATEGORIES.other);
  }
  return skipExtensions;
};
const makeSpiderConfig = (options = {}) => {
  const defaultFileExtensionFilters = {
    filterArchives: true,
    filterImages: true,
    filterAudio: true,
    filterVideo: true,
    filterOfficeDocuments: true,
    filterOther: true
  };
  const defaultTechnicalFilters = {
    filterUnsupportedSchemes: true,
    filterLongUrls: true,
    maxUrlLength: 2083,
    // Scrapy's default
    filterMalformedUrls: true
  };
  const defaultOptions = {
    ignoreRobotsTxt: false,
    maxConcurrentWorkers: 5,
    concurrency: 4,
    requestDelayMs: 1e3,
    maxRobotsCrawlDelayMs: 2e3,
    // Maximum 2 seconds for robots.txt crawl delay
    userAgent: "JambudipaSpider/1.0",
    allowedProtocols: ["http:", "https:", "file:", "ftp:"],
    // Scrapy's allowed schemes
    followRedirects: true,
    respectNoFollow: true,
    fileExtensionFilters: defaultFileExtensionFilters,
    technicalFilters: defaultTechnicalFilters,
    maxConcurrentRequests: 10,
    maxRequestsPerSecondPerDomain: 2,
    normalizeUrlsForDeduplication: true,
    enableResumability: false
  };
  const config = {
    ...defaultOptions,
    ...options,
    // Merge nested objects properly
    fileExtensionFilters: options.fileExtensionFilters ? {
      ...defaultOptions.fileExtensionFilters,
      ...options.fileExtensionFilters
    } : defaultOptions.fileExtensionFilters,
    technicalFilters: options.technicalFilters ? {
      ...defaultOptions.technicalFilters,
      ...options.technicalFilters
    } : defaultOptions.technicalFilters
  };
  const skipExtensions = config.skipFileExtensions || generateSkipExtensions(
    config.fileExtensionFilters ?? defaultFileExtensionFilters
  );
  return {
    getOptions: () => Effect.succeed(config),
    shouldFollowUrl: (urlString, fromUrl, restrictToStartingDomain) => Effect.sync(() => {
      try {
        const url = new URL(urlString);
        const fromUrlParsed = fromUrl ? new URL(fromUrl) : void 0;
        const techFilters = config.technicalFilters ?? defaultTechnicalFilters;
        if (restrictToStartingDomain) {
          const startingDomain = new URL(restrictToStartingDomain).hostname;
          const isAllowedDomain = url.hostname === startingDomain || url.hostname.endsWith(`.${startingDomain}`);
          if (!isAllowedDomain) {
            return {
              follow: false,
              reason: `Domain ${url.hostname} restricted to starting domain ${startingDomain}`
            };
          }
        }
        if (techFilters.filterLongUrls && urlString.length > techFilters.maxUrlLength) {
          return {
            follow: false,
            reason: `URL length ${urlString.length} exceeds maximum ${techFilters.maxUrlLength}`
          };
        }
        if (techFilters.filterUnsupportedSchemes && !config.allowedProtocols.includes(url.protocol)) {
          return {
            follow: false,
            reason: `Protocol ${url.protocol} not in allowed schemes: ${config.allowedProtocols.join(", ")}`
          };
        }
        if (config.allowedDomains && config.allowedDomains.length > 0) {
          const isDomainAllowed = config.allowedDomains.some(
            (domain) => url.hostname === domain || url.hostname.endsWith(`.${domain}`)
          );
          if (!isDomainAllowed) {
            return {
              follow: false,
              reason: `Domain ${url.hostname} not in allowlist`
            };
          }
        }
        if (config.blockedDomains && config.blockedDomains.length > 0) {
          const isDomainBlocked = config.blockedDomains.some(
            (domain) => url.hostname === domain || url.hostname.endsWith(`.${domain}`)
          );
          if (isDomainBlocked) {
            return {
              follow: false,
              reason: `Domain ${url.hostname} is blocked`
            };
          }
        }
        if (config.customUrlFilters && config.customUrlFilters.length > 0) {
          for (const pattern of config.customUrlFilters) {
            if (pattern.test(urlString)) {
              return {
                follow: false,
                reason: `URL matches custom filter pattern: ${pattern}`
              };
            }
          }
        }
        if (fromUrlParsed && url.hostname === fromUrlParsed.hostname && url.pathname === fromUrlParsed.pathname && url.search === fromUrlParsed.search && url.hash) {
          return {
            follow: false,
            reason: "Fragment-only link to same page"
          };
        }
        const pathname = url.pathname.toLowerCase();
        if (skipExtensions.some((ext) => pathname.endsWith(ext.toLowerCase()))) {
          const filterReasons = [];
          if (config.fileExtensionFilters?.filterArchives && FILE_EXTENSION_CATEGORIES.archives.some(
            (ext) => pathname.endsWith(ext.toLowerCase())
          )) {
            filterReasons.push("archive");
          }
          if (config.fileExtensionFilters?.filterImages && FILE_EXTENSION_CATEGORIES.images.some(
            (ext) => pathname.endsWith(ext.toLowerCase())
          )) {
            filterReasons.push("image");
          }
          if (config.fileExtensionFilters?.filterAudio && FILE_EXTENSION_CATEGORIES.audio.some(
            (ext) => pathname.endsWith(ext.toLowerCase())
          )) {
            filterReasons.push("audio");
          }
          if (config.fileExtensionFilters?.filterVideo && FILE_EXTENSION_CATEGORIES.video.some(
            (ext) => pathname.endsWith(ext.toLowerCase())
          )) {
            filterReasons.push("video");
          }
          if (config.fileExtensionFilters?.filterOfficeDocuments && FILE_EXTENSION_CATEGORIES.officeDocuments.some(
            (ext) => pathname.endsWith(ext.toLowerCase())
          )) {
            filterReasons.push("office document");
          }
          if (config.fileExtensionFilters?.filterOther && FILE_EXTENSION_CATEGORIES.other.some(
            (ext) => pathname.endsWith(ext.toLowerCase())
          )) {
            filterReasons.push("other file type");
          }
          const reason = filterReasons.length > 0 ? `Filtered ${filterReasons.join("/")} file extension` : "File extension not suitable for crawling";
          return {
            follow: false,
            reason
          };
        }
        return { follow: true };
      } catch (error) {
        if (config.technicalFilters?.filterMalformedUrls) {
          return {
            follow: false,
            reason: `Malformed URL: ${error instanceof Error ? error.message : "Unknown parsing error"}`
          };
        } else {
          return { follow: true };
        }
      }
    }),
    getUserAgent: () => Effect.succeed(config.userAgent),
    getRequestDelay: () => Effect.succeed(config.requestDelayMs),
    getMaxRobotsCrawlDelay: () => Effect.succeed(config.maxRobotsCrawlDelayMs),
    shouldIgnoreRobotsTxt: () => Effect.succeed(config.ignoreRobotsTxt),
    getMaxConcurrentWorkers: () => Effect.succeed(config.maxConcurrentWorkers),
    getMaxDepth: () => Effect.succeed(config.maxDepth),
    getMaxPages: () => Effect.succeed(config.maxPages),
    shouldFollowRedirects: () => Effect.succeed(config.followRedirects),
    shouldRespectNoFollow: () => Effect.succeed(config.respectNoFollow),
    getSkipFileExtensions: () => Effect.succeed(config.skipFileExtensions || []),
    getMaxConcurrentRequests: () => Effect.succeed(config.maxConcurrentRequests),
    getMaxRequestsPerSecondPerDomain: () => Effect.succeed(config.maxRequestsPerSecondPerDomain),
    shouldNormalizeUrlsForDeduplication: () => Effect.succeed(config.normalizeUrlsForDeduplication),
    getConcurrency: () => Effect.succeed(config.concurrency),
    isResumabilityEnabled: () => Effect.succeed(config.enableResumability)
  };
};
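// Illustrative sketch (not part of the published file) of the decision object
// shouldFollowUrl returns, assuming the defaults above; the URL and blocklist
// are example values:
//
//   const cfg = makeSpiderConfig({ blockedDomains: ["ads.example.com"] });
//   const decision = Effect.runSync(
//     cfg.shouldFollowUrl("https://ads.example.com/banner")
//   );
//   // -> { follow: false, reason: "Domain ads.example.com is blocked" }
//   // A crawlable page yields { follow: true } instead.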
class UrlDeduplicatorService extends Effect.Service()(
  "@jambudipa.io/UrlDeduplicatorService",
  {
    effect: Effect.gen(function* () {
      const config = yield* SpiderConfig;
      const shouldNormalize = yield* config.shouldNormalizeUrlsForDeduplication();
      const seenUrls = MutableHashSet.empty();
      const mutex = yield* Effect.makeSemaphore(1);
      const normalizeUrl = (url) => {
        if (!shouldNormalize) {
          return url;
        }
        try {
          const parsed = new URL(url);
          let normalizedPath = parsed.pathname.replace(/\/+/g, "/").replace(/\/$/, "");
          if (normalizedPath === "") {
            normalizedPath = "/";
          }
          parsed.pathname = normalizedPath;
          parsed.hash = "";
          if (parsed.protocol === "http:" && parsed.port === "80" || parsed.protocol === "https:" && parsed.port === "443") {
            parsed.port = "";
          }
          if (parsed.search) {
            const params = new URLSearchParams(parsed.search);
            const sortedParams = new URLSearchParams();
            Array.from(params.keys()).sort().forEach((key) => {
              params.getAll(key).forEach((value) => {
                sortedParams.append(key, value);
              });
            });
            parsed.search = sortedParams.toString();
          }
          return parsed.toString();
        } catch {
          return url;
        }
      };
      return {
        tryAdd: (url) => mutex.withPermits(1)(
          Effect.sync(() => {
            const normalizedUrl = normalizeUrl(url);
            if (MutableHashSet.has(seenUrls, normalizedUrl)) {
              return false;
            }
            MutableHashSet.add(seenUrls, normalizedUrl);
            return true;
          })
        ),
        contains: (url) => mutex.withPermits(1)(
          Effect.sync(() => {
            const normalizedUrl = normalizeUrl(url);
            return MutableHashSet.has(seenUrls, normalizedUrl);
          })
        ),
        size: () => mutex.withPermits(1)(
          Effect.sync(() => MutableHashSet.size(seenUrls))
        ),
        clear: () => mutex.withPermits(1)(
          Effect.sync(() => MutableHashSet.clear(seenUrls))
        )
      };
    }),
    dependencies: [SpiderConfig.Default]
  }
) {
}
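// Illustrative sketch (not part of the published file) of the
// normalisation-based deduplication above: with normalizeUrlsForDeduplication
// enabled, these two URLs collapse to the same entry, so the second tryAdd
// returns false.
//
//   const program = Effect.gen(function* () {
//     const dedup = yield* UrlDeduplicatorService;
//     yield* dedup.tryAdd("https://example.com/a/?b=2&a=1"); // true
//     return yield* dedup.tryAdd("https://example.com/a?a=1&b=2"); // false
//   });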
const PageDataSchema = Schema.Struct({
  url: Schema.String.pipe(
    Schema.filter(
      (s) => {
        try {
          new URL(s);
          return true;
        } catch {
          return false;
        }
      },
      {
        message: () => "Invalid URL format"
      }
    )
  ),
  html: Schema.String,
  title: Schema.optional(Schema.String),
  /** All available metadata from meta tags */
  metadata: Schema.Record({ key: Schema.String, value: Schema.String }),
  /** Commonly used metadata fields for convenience */
  commonMetadata: Schema.optional(
    Schema.Struct({
      description: Schema.optional(Schema.String),
      keywords: Schema.optional(Schema.String),
      author: Schema.optional(Schema.String),
      robots: Schema.optional(Schema.String)
    })
  ),
  statusCode: Schema.Number.pipe(Schema.int(), Schema.between(100, 599)),
  /** All response headers */
  headers: Schema.Record({ key: Schema.String, value: Schema.String }),
  /** When the fetch operation started */
  fetchedAt: Schema.DateFromSelf,
  /** How long the entire fetch and parse operation took in milliseconds */
  scrapeDurationMs: Schema.Number,
  /** The crawl depth (number of hops from the starting URL) */
  depth: Schema.Number.pipe(Schema.int(), Schema.greaterThanOrEqualTo(0)),
  /** Optional extracted data from the page */
  extractedData: Schema.optional(
    Schema.Record({ key: Schema.String, value: Schema.Unknown })
  )
});
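// Illustrative sketch (not part of the published file): PageDataSchema
// validates scraped pages before they leave the scraper (see ScraperService
// below). Decoding fails if, say, the URL is malformed or statusCode falls
// outside 100-599. The input object here is an example value.
//
//   const decoded = Schema.decodeUnknownEither(PageDataSchema)({
//     url: "not a url", html: "<html></html>", metadata: {},
//     statusCode: 200, headers: {}, fetchedAt: new Date(),
//     scrapeDurationMs: 12, depth: 0
//   });
//   // -> Left(ParseError mentioning "Invalid URL format")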
class NetworkError extends Data.TaggedError("NetworkError") {
  static fromCause(url, cause) {
    return new NetworkError({
      url,
      cause,
      message: `Failed to fetch ${url}: ${cause}`
    });
  }
}
class ResponseError extends Data.TaggedError("ResponseError") {
  static fromCause(url, cause) {
    return new ResponseError({
      url,
      cause,
      message: `Failed to read response from ${url}: ${cause}`
    });
  }
}
class RobotsTxtError extends Data.TaggedError("RobotsTxtError") {
  static fromCause(url, cause) {
    return new RobotsTxtError({
      url,
      cause,
      message: `Failed to fetch robots.txt: ${cause}`
    });
  }
}
class ConfigurationError extends Data.TaggedError("ConfigurationError") {
}
class MiddlewareError extends Data.TaggedError("MiddlewareError") {
  static transform(middlewareName, cause) {
    return new MiddlewareError({
      phase: "transform",
      middlewareName,
      cause,
      message: `Middleware '${middlewareName}' failed during transform: ${cause}`
    });
  }
  static error(middlewareName, cause) {
    return new MiddlewareError({
      phase: "error",
      middlewareName,
      cause,
      message: `Middleware '${middlewareName}' failed during error handling: ${cause}`
    });
  }
}
class FileSystemError extends Data.TaggedError("FileSystemError") {
  static write(path2, cause) {
    return new FileSystemError({
      operation: "write",
      path: path2,
      cause,
      message: `Failed to write file ${path2}: ${cause}`
    });
  }
  static create(path2, cause) {
    return new FileSystemError({
      operation: "create",
      path: path2,
      cause,
      message: `Failed to create directory ${path2}: ${cause}`
    });
  }
}
let PersistenceError$1 = class PersistenceError extends Data.TaggedError("PersistenceError") {
  static save(cause, key) {
    return new PersistenceError({
      operation: "save",
      key,
      cause,
      message: key ? `Failed to save state for key ${key}: ${cause}` : `Failed to save state: ${cause}`
    });
  }
  static load(cause, key) {
    return new PersistenceError({
      operation: "load",
      key,
      cause,
      message: key ? `Failed to load state for key ${key}: ${cause}` : `Failed to load state: ${cause}`
    });
  }
  static delete(cause, key) {
    return new PersistenceError({
      operation: "delete",
      key,
      cause,
      message: key ? `Failed to delete state for key ${key}: ${cause}` : `Failed to delete state: ${cause}`
    });
  }
};
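// Illustrative sketch (not part of the published file): because these are
// Data.TaggedError classes, callers can branch on the _tag with
// Effect.catchTag / Effect.catchTags. For example, treating a failed state
// load as "no saved state" (someLoad stands in for any effect that can fail
// with PersistenceError):
//
//   const loaded = someLoad.pipe(
//     Effect.catchTag("PersistenceError", () => Effect.succeed(null))
//   );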
const SpiderLogger = Context.GenericTag("SpiderLogger");
const makeSpiderLogger = (logDir = "./spider-logs") => {
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
  }
  const logFileName = `spider-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}.jsonl`;
  const logFilePath = path.join(logDir, logFileName);
  const summaryFilePath = path.join(logDir, "spider-summary.json");
  const writeLogEvent = (event) => Effect.sync(() => {
    const logLine = JSON.stringify(event) + "\n";
    fs.appendFileSync(logFilePath, logLine);
    const importantTypes = [
      "domain_start",
      "domain_complete",
      "spider_lifecycle",
      "domain_error"
    ];
    if (importantTypes.includes(event.type)) {
      const prefix = `[${event.type}]`;
      const domainInfo = event.domain ? ` [${event.domain}]` : "";
      Console.log(`${prefix}${domainInfo} ${event.message}`).pipe(
        Effect.runSync
      );
    }
  });
  const updateSummary = (update) => Effect.sync(() => {
    let summary = {};
    if (fs.existsSync(summaryFilePath)) {
      const content = fs.readFileSync(summaryFilePath, "utf-8");
      try {
        const parsed = JSON.parse(content);
        summary = typeof parsed === "object" && parsed !== null ? parsed : {};
      } catch {
        summary = {};
      }
    }
    summary = update(summary);
    fs.writeFileSync(summaryFilePath, JSON.stringify(summary, null, 2));
  });
  return {
    logEvent: (event) => Effect.gen(function* () {
      const fullEvent = {
        ...event,
        timestamp: (/* @__PURE__ */ new Date()).toISOString()
      };
      yield* writeLogEvent(fullEvent);
    }),
    logDomainStart: (domain, startUrl) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "domain_start",
        domain,
        url: startUrl,
        message: `Starting crawl for domain: ${domain}`,
        details: { startUrl }
      });
      yield* updateSummary((summary) => ({
        ...summary,
        domains: {
          ...summary.domains || {},
          [domain]: {
            status: "running",
            startTime: (/* @__PURE__ */ new Date()).toISOString(),
            startUrl,
            pagesScraped: 0
          }
        }
      }));
    }),
    logDomainComplete: (domain, pagesScraped, reason) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "domain_complete",
        domain,
        message: `Domain ${domain} completed: ${pagesScraped} pages scraped (reason: ${reason})`,
        details: { pagesScraped, reason }
      });
      yield* updateSummary((summary) => {
        const domains = summary.domains || {};
        const existingDomain = domains[domain] || {};
        return {
          ...summary,
          domains: {
            ...domains,
            [domain]: {
              ...existingDomain,
              status: "completed",
              endTime: (/* @__PURE__ */ new Date()).toISOString(),
              pagesScraped,
              completionReason: reason
            }
          }
        };
      });
    }),
    logPageScraped: (url, domain, pageNumber) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "page_scraped",
        domain,
        url,
        message: `Scraped page #${pageNumber} from ${domain}`,
        details: { pageNumber }
      });
      yield* updateSummary((summary) => {
        const domains = summary.domains || {};
        const existingDomain = domains[domain] || {};
        return {
          ...summary,
          domains: {
            ...domains,
            [domain]: {
              ...existingDomain,
              pagesScraped: pageNumber
            }
          }
        };
      });
    }),
    logQueueStatus: (domain, queueSize, activeWorkers) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "queue_status",
        domain,
        message: `Queue status - size: ${queueSize}, active workers: ${activeWorkers}`,
        details: { queueSize, activeWorkers }
      });
    }),
    logRateLimit: (domain, requestsInWindow) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "rate_limit",
        domain,
        message: `Rate limit applied - ${requestsInWindow} requests in window`,
        details: { requestsInWindow }
      });
    }),
    logSpiderLifecycle: (event, details) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "spider_lifecycle",
        message: `Spider ${event}`,
        details
      });
      if (event === "start") {
        yield* updateSummary((summary) => ({
          ...summary,
          spiderStartTime: (/* @__PURE__ */ new Date()).toISOString(),
          status: "running"
        }));
      } else if (event === "complete" || event === "error") {
        yield* updateSummary((summary) => ({
          ...summary,
          spiderEndTime: (/* @__PURE__ */ new Date()).toISOString(),
          status: event === "complete" ? "completed" : "error",
          ...details && { finalDetails: details }
        }));
      }
    }),
    // Enhanced diagnostic logging methods
    logWorkerLifecycle: (workerId, domain, event, reason, details) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "worker_lifecycle",
        domain,
        workerId,
        message: `[WORKER_LIFECYCLE] Worker ${workerId} ${event}${reason ? ` - reason: ${reason}` : ""} (domain: ${domain})`,
        details: { event, reason, ...details }
      });
    }),
    logWorkerState: (workerId, domain, event, details) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "worker_state",
        domain,
        workerId,
        message: `[WORKER_STATE] Worker ${workerId} ${event} (domain: ${domain})`,
        details: { event, ...details }
      });
    }),
    logCompletionMonitor: (domain, checkCount, queueSize, activeWorkers, stableCount, maxPagesReached, decision) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "completion_monitor",
        domain,
        message: `[COMPLETION_MONITOR] Check #${checkCount}: queue=${queueSize}, active=${activeWorkers}, stable=${stableCount}, maxPages=${maxPagesReached} -> ${decision}`,
        details: {
          checkCount,
          queueSize,
          activeWorkers,
          stableCount,
          maxPagesReached,
          decision
        }
      });
    }),
    logEdgeCase: (domain, caseType, details) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "edge_case",
        domain,
        message: `[EDGE_CASE] ${caseType} (domain: ${domain})`,
        details: { case: caseType, ...details }
      });
    }),
    logDomainStatus: (domain, status) => Effect.gen(function* () {
      yield* writeLogEvent({
        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
        type: "domain_start",
        // Reuse existing type for now
        domain,
        message: `[DOMAIN_STATUS] ${domain}: ${status.pagesScraped} pages, queue=${status.queueSize}, workers=${status.activeWorkers}/${status.maxWorkers}`,
        details: status
      });
      yield* updateSummary((summary) => {
        const domains = summary.domains || {};
        const existingDomain = domains[domain] || {};
        return {
          ...summary,
          domains: {
            ...domains,
            [domain]: {
              ...existingDomain,
              pagesScraped: Math.max(0, status.pagesScraped || 0),
              queueSize: Math.max(0, status.queueSize || 0),
              activeWorkers: Math.max(0, status.activeWorkers || 0),
              maxWorkers: Math.max(1, status.maxWorkers || 5)
            }
          }
        };
      });
    })
  };
};
const SpiderLoggerLive = Layer.succeed(SpiderLogger, makeSpiderLogger());
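// Illustrative sketch (not part of the published file): SpiderLogger is a
// plain Context tag, so a custom log directory can be supplied in place of
// SpiderLoggerLive's default ("./my-logs" is an example path):
//
//   const CustomLoggerLive = Layer.succeed(SpiderLogger, makeSpiderLogger("./my-logs"));
//   const program = Effect.gen(function* () {
//     const logger = yield* SpiderLogger;
//     yield* logger.logSpiderLifecycle("start");
//   });
//   // Effect.provide(program, CustomLoggerLive) appends JSONL events and
//   // maintains spider-summary.json under ./my-logs.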
class ScraperService extends Effect.Service()(
  "@jambudipa.io/ScraperService",
  {
    effect: Effect.sync(() => ({
      /**
       * Fetches a URL and parses the HTML to extract basic page information.
       *
       * This method performs the following operations:
       * 1. Fetches the URL with configurable timeout (30 seconds)
       * 2. Validates content type (skips binary files)
       * 3. Parses HTML content with cheerio
       * 4. Extracts basic page metadata (title, description, etc.)
       * 5. Returns structured PageData object
       *
       * The method uses AbortController for proper timeout handling to prevent
       * workers from hanging on malformed URLs or slow responses.
       *
       * @param url - The URL to fetch and parse
       * @param depth - The crawl depth for logging purposes (default: 0)
       * @returns Effect containing PageData with extracted information
       * @throws NetworkError for network-related failures
       * @throws ResponseError for HTTP error responses
       *
       * @example
       * Basic usage:
       * ```typescript
       * const pageData = yield* scraper.fetchAndParse('https://example.com');
       * console.log(`Page title: ${pageData.title}`);
       * ```
       *
       * With depth tracking:
       * ```typescript
       * const pageData = yield* scraper.fetchAndParse('https://example.com/page', 2);
       * ```
       *
       * Error handling:
       * ```typescript
       * const result = yield* scraper.fetchAndParse('https://example.com').pipe(
       *   Effect.catchTags({
       *     NetworkError: (error) => {
       *       console.log('Network error:', error.message);
       *       return Effect.succeed(null);
       *     },
       *     ResponseError: (error) => {
       *       console.log('HTTP error:', error.statusCode);
       *       return Effect.succeed(null);
       *     }
       *   })
       * );
       * ```
       *
       * @performance
       * - Request timeout: 30 seconds
       * - Response parsing timeout: 10 seconds
       * - Memory usage: ~2-5MB per page depending on content size
       *
       * @security
       * - Validates content types to prevent processing binary files
       * - Uses AbortController to prevent hanging requests
       * - No execution of JavaScript content (static HTML parsing only)
       */
      fetchAndParse: (url, depth = 0) => Effect.gen(function* () {
        const startTime = yield* Effect.sync(() => /* @__PURE__ */ new Date());
        const startMs = startTime.getTime();
        const logger = yield* SpiderLogger;
        const domain = new URL(url).hostname;
        const controller = new AbortController();
        const timeoutMs = 3e4;
        const timeoutId = setTimeout(() => {
          const duration = Date.now() - startMs;
          Effect.runSync(
            logger.logEdgeCase(domain, "fetch_abort_triggered", {
              url,
              durationMs: duration,
              reason: "timeout",
              timeoutMs
            })
          );
          controller.abort();
        }, timeoutMs);
        const response = yield* Effect.tryPromise({
          try: async () => {
            try {
              const resp = await fetch(url, { signal: controller.signal });
              clearTimeout(timeoutId);
              const contentType = resp.headers.get("content-type") || "";
              if (!contentType.includes("text/html") && !contentType.includes("application/xhtml") && !contentType.includes("text/") && contentType !== "") {
                throw new Error(`Skipping non-HTML content: ${contentType}`);
              }
              return resp;
            } catch (error) {
              clearTimeout(timeoutId);
              if (error instanceof Error && error.name === "AbortError") {
                throw new Error(
                  `Request aborted after ${Date.now() - startMs}ms`
                );
              }
              throw error;
            }
          },
          catch: (error) => NetworkError.fromCause(url, error)
        });
        const textController = new AbortController();
        const textTimeoutMs = 1e4;
        const textTimeoutId = setTimeout(() => {
          const duration = Date.now() - startMs;
          Effect.runSync(
            logger.logEdgeCase(domain, "response_text_abort_triggered", {
              url,
              durationMs: duration,
              reason: "timeout",
              timeoutMs: textTimeoutMs
            })
          );
          textController.abort();
        }, textTimeoutMs);
        const html = yield* Effect.tryPromise({
          try: async () => {
            try {
              const reader = response.body?.getReader();
              if (!reader) throw new Error("No response body");
              const decoder = new TextDecoder();
              let html2 = "";
              while (true) {
                const { done, value } = await reader.read();
                if (done) break;
                html2 += decoder.decode(value, { stream: true });
                if (textController.signal.aborted) {
                  reader.cancel();
                  throw new Error("Response parsing aborted");
                }
              }
              clearTimeout(textTimeoutId);
              return html2;
            } catch (error) {
              clearTimeout(textTimeoutId);
              throw error;
            }
          },
          catch: (error) => ResponseError.fromCause(url, error)
        });
        const $ = cheerio.load(html);
        const metadata = {};
        $("meta").each((_, element) => {
          const $meta = $(element);
          const name = $meta.attr("name") || $meta.attr("property") || $meta.attr("http-equiv");
          const content = $meta.attr("content");
          if (name && content) {
            metadata[name] = content;
          }
        });
        const commonMetadata = {
          description: metadata["description"],
          keywords: metadata["keywords"],
          author: metadata["author"],
          robots: metadata["robots"]
        };
        const headers = {};
        response.headers.forEach((value, key) => {
          headers[key] = value;
        });
        const endTime = yield* Effect.sync(() => /* @__PURE__ */ new Date());
        const durationMs = endTime.getTime() - startTime.getTime();
        const pageData = {
          url,
          html,
          title: $("title").text() || void 0,
          metadata,
          commonMetadata: Object.values(commonMetadata).some((v) => v) ? commonMetadata : void 0,
          statusCode: response.status,
          headers,
          fetchedAt: startTime,
          scrapeDurationMs: durationMs,
          depth
        };
        return yield* Schema.decode(PageDataSchema)(pageData);
      })
    }))
  }
) {
}
class RobotsService extends Effect.Service()(
  "@jambudipa.io/RobotsService",
  {
    effect: Effect.sync(() => {
      const robotsCache = MutableHashMap.empty();
      const parseRobotsTxt = (content, userAgent = "*") => {
        const lines = content.split("\n");
        const rules = {
          disallowedPaths: /* @__PURE__ */ new Set(),
          userAgent
        };
        let currentUserAgent = "";
        let isRelevantSection = false;
        for (const line of lines) {
          const trimmed = line.trim();
          if (trimmed.startsWith("#") || !trimmed) continue;
          const [directive, ...valueParts] = trimmed.split(":");
          const value = valueParts.join(":").trim();
          if (directive.toLowerCase() === "user-agent") {
            currentUserAgent = value;
            isRelevantSection = currentUserAgent === "*" || currentUserAgent.toLowerCase() === userAgent.toLowerCase();
          } else if (isRelevantSection) {
            if (directive.toLowerCase() === "disallow" && value) {
              rules.disallowedPaths.add(value);
            } else if (directive.toLowerCase() === "crawl-delay") {
              rules.crawlDelay = parseInt(value);
            }
          }
        }
        return rules;
      };
      const fetchRobotsTxt = (baseUrl) => {
        const robotsUrl = new URL("/robots.txt", baseUrl);
        return Effect.tryPromise({
          try: async () => {
            const response = await fetch(robotsUrl.toString());
            if (!response.ok) {
              return null;
            }
            return await response.text();
          },
          catch: (error) => RobotsTxtError.fromCause(robotsUrl.toString(), error)
        });
      };
      const isPathAllowed = (url, rules) => {
        const path2 = url.pathname;
        for (const disallowedPath of rules.disallowedPaths) {
          if (disallowedPath === "/") return false;
          try {
            const pattern = disallowedPath.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
            if (new RegExp(`^${pattern}`).test(path2)) {
              return false;
            }
          } catch {
            if (disallowedPath.endsWith("*")) {
              const prefix = disallowedPath.slice(0, -1);
              if (path2.startsWith(prefix)) {
                return false;
              }
            } else if (path2.startsWith(disallowedPath)) {
              return false;
            }
          }
        }
        return true;
      };
      return {
        checkUrl: (urlString) => Effect.gen(function* () {
          let url;
          let baseUrl;
          try {
            url = new URL(urlString);
            baseUrl = new URL(`${url.protocol}//${url.host}`);
          } catch (error) {
            yield* Effect.logWarning(
              `Invalid URL "${urlString}": ${error instanceof Error ? error.message : String(error)}. Allowing access.`
            );
            return { allowed: true };
          }
          const cacheKey = baseUrl.toString();
          const cachedRules = MutableHashMap.get(robotsCache, cacheKey);
          let rules;
          if (Option.isNone(cachedRules)) {
            const robotsContent = yield* fetchRobotsTxt(baseUrl).pipe(
              Effect.catchAll(
                (error) => Effect.logWarning(
                  `Failed to fetch robots.txt for ${baseUrl}: ${error.message}. Allowing access.`
                ).pipe(Effect.map(() => null))
              )
            );
            if (robotsContent) {
              try {
                rules = parseRobotsTxt(robotsContent);
              } catch {
                rules = { disallowedPaths: /* @__PURE__ */ new Set(), userAgent: "*" };
              }
            } else {
              rules = { disallowedPaths: /* @__PURE__ */ new Set(), userAgent: "*" };
            }
            MutableHashMap.set(robotsCache, cacheKey, rules);
          } else {
            rules = cachedRules.value;
          }
          return {
            allowed: isPathAllowed(url, rules),
            crawlDelay: rules.crawlDelay
          };
        }),
        getRules: (domain) => Effect.sync(() => {
          const baseUrl = new URL(domain);
          const cacheKey = baseUrl.toString();
          return MutableHashMap.get(robotsCache, cacheKey);
        })
      };
    })
  }
) {
}
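// Illustrative sketch (not part of the published file): checkUrl caches one
// rule set per origin, so repeated checks against the same host hit the cache
// rather than refetching /robots.txt. The URL is an example value.
//
//   const program = Effect.gen(function* () {
//     const robots = yield* RobotsService;
//     const { allowed, crawlDelay } = yield* robots.checkUrl("https://example.com/private/page");
//     // allowed reflects the Disallow rules parsed above; crawlDelay is
//     // undefined unless robots.txt declared Crawl-delay for this user agent.
//   });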
class LinkExtractionError extends Data.TaggedError(
  "LinkExtractionError"
) {
}
const DEFAULT_CONFIG = {
  restrictCss: [],
  tags: ["a", "area", "form", "frame", "iframe", "link"],
  attrs: ["href", "action", "src"],
  extractFromInputs: false
};
class LinkExtractorService extends Effect.Service()(
  "@jambudipa.io/LinkExtractorService",
  {
    effect: Effect.succeed({
      extractLinks: (html, config) => Effect.gen(function* () {
        const finalConfig = { ...DEFAULT_CONFIG, ...config };
        try {
          const result = extractRawLinks(html, finalConfig);
          return result;
        } catch (error) {
          return yield* Effect.fail(
            new LinkExtractionError({
              message: `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`,
              cause: error
            })
          );
        }
      })
    })
  }
) {
}
const LinkExtractorServiceLayer = LinkExtractorService.Default;
const extractRawLinks = (html, config) => {
  const $ = cheerio.load(html);
  const foundUrls = [];
  const extractionBreakdown = {};
  let totalElementsProcessed = 0;
  const extractUrlFromAttribute = (element, attr) => {
    const value = $(element).attr(attr);
    if (!value || !value.trim()) return null;
    return value.trim();
  };
  const trackExtraction = (elementType, url) => {
    totalElementsProcessed++;
    if (url) {
      foundUrls.push(url);
      extractionBreakdown[elementType] = (extractionBreakdown[elementType] || 0) + 1;
    }
  };
  if (config.restrictCss.length > 0) {
    config.restrictCss.forEach((cssSelector) => {
      $(cssSelector).each((_, element) => {
        const tagName = element.name?.toLowerCase() || "unknown";
        config.attrs.forEach((attr) => {
          const url = extractUrlFromAttribute(element, attr);
          if (url) trackExtraction(tagName, url);
        });
      });
    });
  } else {
    config.tags.forEach((tag) => {
      config.attrs.forEach((attr) => {
        $(`${tag}[${attr}]`).each((_, element) => {
          const url = extractUrlFromAttribute(element, attr);
          trackExtraction(tag, url);
        });
      });
    });
  }
  if (config.extractFromInputs) {
    $('input[type="hidden"]').each((_, element) => {
      const name = $(element).attr("name")?.toLowerCase() || "";
      const value = $(element).attr("value");
      if ((name.includes("url") || name.includes("redirect") || name.includes("next")) && value?.trim()) {
        trackExtraction("input", value.trim());
      }
    });
  }
  return {
    links: foundUrls,
    totalElementsProcessed,
    extractionBreakdown
  };
};
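// Illustrative sketch (not part of the published file) of extractLinks with
// the defaults above (a/area/form/frame/iframe/link tags; href/action/src
// attributes). The HTML snippet is an example value.
//
//   const program = Effect.gen(function* () {
//     const extractor = yield* LinkExtractorService;
//     const result = yield* extractor.extractLinks(
//       '<a href="/next">next</a><form action="/submit"></form>'
//     );
//     // result.links -> ["/next", "/submit"]
//     // result.extractionBreakdown -> { a: 1, form: 1 }
//   });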
class SpiderStateKey extends Schema.Class(
  "SpiderStateKey"
)({
  /** Unique identifier for the session */
  id: Schema.String,
  /** When the session was created */
  timestamp: Schema.Date,
  /** Human-readable name for the session */
  name: Schema.String
}) {
}
class PriorityRequest extends Schema.Class(
  "PriorityRequest"
)({
  /** The crawl task containing URL and depth information */
  request: Schema.Struct({
    url: Schema.String,
    depth: Schema.Number,
    fromUrl: Schema.optional(Schema.String)
  }),
  /** Priority level (higher numbers processed first) */
  priority: Schema.Number,
  /** When this request was created */
  timestamp: Schema.Date,
  /** Unique fingerprint for deduplication */
  fingerprint: Schema.String
}) {
}
class SpiderState extends Schema.Class("SpiderState")({
  /** The state key identifying this session */
  key: SpiderStateKey,
  /** All requests waiting to be processed */
  pendingRequests: Schema.Array(PriorityRequest),
  /** Fingerprints of URLs already visited (for deduplication) */
  visitedFingerprints: Schema.Array(Schema.String),
  /** Total number of requests processed so far */
  totalProcessed: Schema.Number
}) {
}
class SpiderSchedulerService extends Effect.Service()(
  "@jambudipa.io/SpiderSchedulerService",
  {
    effect: Effect.gen(function* () {
      const config = yield* SpiderConfig;
      const shouldNormalizeUrls = yield* config.shouldNormalizeUrlsForDeduplication();
      const memoryQueue = yield* Queue.unbounded();
      const seenFingerprints = MutableHashMap.empty();
      const pendingRequestsForPersistence = [];
      let totalProcessed = 0;
      let persistenceLayer = null;
      let currentStateKey = null;
      const normalizeUrl = (url) => {
        if (!shouldNormalizeUrls) {
          return url;
        }
        try {
          const parsed = new URL(url);
          let normalizedPath = parsed.pathname.replace(/\/+/g, "/").replace(/\/$/, "");
          if (normalizedPath === "") {
            normalizedPath = "/";
          }
          parsed.pathname = normalizedPath;
          parsed.hash = "";
          if (parsed.protocol === "http:" && parsed.port === "80" || parsed.protocol === "https:" && parsed.port === "443") {
            parsed.port = "";
          }
          if (parsed.search) {
            const params = new URLSearchParams(parsed.search);
            const sortedParams = new URLSearchParams();
            Array.from(params.keys()).sort().forEach((key) => {
              params.getAll(key).forEach((value) => {
                sortedParams.append(key, value);
              });
            });
            parsed.search = sortedParams.toString();
          }
          return parsed.toString();
        } catch {
          return url;
        }
      };
      const generateFingerprint = (request) => {
        const normalizedUrl = normalizeUrl(request.url);
        return `${normalizedUrl}:${request.depth}`;
      };
      const createPriorityRequest = (request, priority) => new PriorityRequest({
        request,
        priority,
        timestamp: /* @__PURE__ */ new Date(),
        fingerprint: generateFingerprint(request)
      });
      const persistState = () => Effect.gen(function* () {
        if (!persistenceLayer || !currentStateKey) {
          return;
        }
        const state = new SpiderState({
          key: currentStateKey,
          pendingRequests: [...pendingRequestsForPersistence],
          visitedFingerprints: Array.from(
            MutableHashMap.keys(seenFingerprints)
          ),
          totalProcessed
        });
        yield* persistenceLayer.saveState(currentStateKey, state);
      });
      const restoreFromStateImpl = (state) => Effect.gen(function* () {
        const currentSize = yield* Queue.size(memoryQueue);
        for (let i = 0; i < currentSize; i++) {
          yield* Queue.take(memoryQueue).pipe(Effect.ignore);
        }
        MutableHashMap.clear(seenFingerprints);
        pendingRequestsForPersistence.length = 0;
        state.visitedFingerprints.forEach((fp) => {
          MutableHashMap.set(seenFingerprints, fp, true);
        });
        const sortedRequests = [...state.pendingRequests].sort(
          (a, b) => b.priority - a.priority
        );
        pendingRequestsForPersistence.push(...sortedRequests);
        yield* Effect.forEach(
          sortedRequests,
          (req) => Queue.offer(memoryQueue, req)
        );
        totalProcessed = state.totalProcessed;
        currentStateKey = state.key;
      });
      return {
        // Configure persistence layer for resumable scraping
        configurePersistence: (persistence, stateKey) => Effect.sync(() => {
          persistenceLayer = persistence;
          currentStateKey = stateKey;
        }),
        // Remove persistence configuration
        clearPersistence: () => Effect.sync(() => {
          persistenceLayer = null;
          currentStateKey = null;
        }),
        // Enqueue a request with priority
        enqueue: (request, priority = 0) => Effect.gen(function* () {
          const fingerprint = generateFingerprint(request);
          if (MutableHashMap.has(seenFingerprints, fingerprint)) {
            return false;
          }
          MutableHashMap.set(seenFingerprints, fingerprint, true);
          const priorityRequest = createPriorityRequest(request, priority);
          yield* Queue.offer(memoryQueue, priorityRequest);
          pendingRequestsForPersistence.push(priorityRequest);
          if (persistenceLayer && currentStateKey) {
            yield* persistState();
          }
          return true;
        }),
        // Dequeue highest priority request
        dequeue: () => Effect.gen(function* () {
          const request = yield* Queue.take(memoryQueue);
          totalProcessed++;
          const index = pendingRequestsForPersistence.findIndex(
            (r) => r.fingerprint === request.fingerprint
          );
          if (index !== -1) {
            pendingRequestsForPersistence.splice(index, 1);
          }
          if (persistenceLayer && currentStateKey) {
            yield* persistState();
          }
          return request;
        }),
        // Get queue size
        size: () => Queue.size(memoryQueue),
        // Check if queue is empty
        isEmpty: () => Queue.size(memoryQueue).pipe(Effect.map((size) => size === 0)),
        // Get current state for persistence
        getState: () => Effect.gen(function* () {
          if (!currentStateKey) {
            return yield* Effect.fail(
              new ConfigurationError({
                message: "No state key configured",
                details: "State key is required for persistence operations"
              })
            );
          }
          return new SpiderState({
            key: currentStateKey,
            pendingRequests: [...pendingRequestsForPersistence],
            visitedFingerprints: Array.from(
              MutableHashMap.keys(seenFingerprints)
            ),
            totalProcessed
          });
        }),
        // Restore from state
        restoreFromState: restoreFromStateImpl,
        // Generic restore method that can work with any persistence implementation
        restore: (persistence, stateKey) => Effect.gen(function* () {
          const state = yield* persistence.loadState(stateKey);
          if (state) {
            persistenceLayer = persistence;
            yield* restoreFromStateImpl(state);
            return true;
          }
          return false;
        })
      };
    }),
    dependencies: [SpiderConfig.Default]
  }
) {
}
|
|
1360
|
+
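/*
 * Illustrative sketch (not part of the bundle): how the scheduler API above is
 * typically driven. The `persistence` and `stateKey` values and the request
 * shape are assumptions for the example, not confirmed package API.
 *
 * ```typescript
 * const program = Effect.gen(function* () {
 *   const scheduler = yield* SpiderSchedulerService;
 *   // Persist queue state under a session key so a crashed crawl can resume.
 *   yield* scheduler.configurePersistence(persistence, stateKey);
 *   // enqueue returns false when the request's fingerprint was already seen.
 *   const added = yield* scheduler.enqueue({ url: 'https://example.com/' }, 10);
 *   const next = yield* scheduler.dequeue();
 *   const snapshot = yield* scheduler.getState();
 * });
 * ```
 */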
const SpiderScheduler_service = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.defineProperty({
  __proto__: null,
  PriorityRequest,
  SpiderSchedulerService,
  SpiderState,
  SpiderStateKey
}, Symbol.toStringTag, { value: "Module" }));
class SpiderService extends Effect.Service()(
  "@jambudipa.io/Spider",
  {
    effect: Effect.gen(function* () {
      const robots = yield* RobotsService;
      const scraper = yield* ScraperService;
      const logger = yield* SpiderLogger;
      const linkExtractor = yield* LinkExtractorService;
      const maybeScheduler = yield* Effect.serviceOption(
        SpiderSchedulerService
      );
      const scheduler = Option.isSome(maybeScheduler) ? maybeScheduler.value : null;
      const self = {
        /**
         * Starts crawling from the specified URL and processes results through the provided sink.
         *
         * This method:
         * 1. Validates the starting URL against configuration rules
         * 2. Starts a configurable number of worker fibers
         * 3. Each worker processes URLs from a shared queue
         * 4. Results are streamed through the provided sink
         * 5. New URLs discovered are queued for processing
         *
         * @param startingUrls - The starting URL(s) for crawling (single string or array)
         * @param sink - Sink to process crawl results as they're produced
         * @param options - Optional enhanced link extraction configuration
         * @returns Effect containing crawl statistics (total pages, completion status)
         *
         * @example
         * Basic usage:
         * ```typescript
         * const collectSink = Sink.forEach<CrawlResult>(result =>
         *   Effect.sync(() => console.log(`Found: ${result.pageData.title}`))
         * );
         *
         * const stats = yield* spider.crawl('https://example.com', collectSink);
         * ```
         *
         * With multiple starting URLs:
         * ```typescript
         * const stats = yield* spider.crawl([
         *   'https://example.com',
         *   'https://other-domain.com'
         * ], collectSink);
         * ```
         *
         * With enhanced link extraction:
         * ```typescript
         * const stats = yield* spider.crawl('https://example.com', collectSink, {
         *   useEnhancedExtraction: true,
         *   linkExtractorConfig: {
         *     allowPatterns: [/\/articles\//],
         *     restrictCss: ['.content a']
         *   }
         * });
         * ```
         */
        crawl: (startingUrls, sink, options) => Effect.gen(function* () {
          const config = yield* SpiderConfig;
          if (!config) {
            return yield* Effect.fail(
              new Error("SpiderConfig is required for crawling operations")
            );
          }
          const normalizeUrlInput = (input) => {
            if (typeof input === "string") {
              return [{ url: input }];
            }
            if (Array.isArray(input)) {
              return input.map(
                (item) => typeof item === "string" ? { url: item } : item
              );
            }
            return [input];
          };
          const urlsWithMetadata = normalizeUrlInput(startingUrls);
          const domainMap = /* @__PURE__ */ new Map();
          for (const urlObj of urlsWithMetadata) {
            try {
              const url = new URL(urlObj.url);
              const domain = url.hostname.toLowerCase();
              const normalizedDomain = domain.replace(/^www\./, "");
              if (!domainMap.has(normalizedDomain)) {
                domainMap.set(normalizedDomain, urlObj);
              } else {
                console.warn(
                  `Skipping duplicate domain: ${domain} (normalized: ${normalizedDomain}, URL: ${urlObj.url})`
                );
              }
            } catch (e) {
              console.error(`Invalid URL skipped: ${urlObj.url}`, e);
            }
          }
          const deduplicatedUrls = Array.from(domainMap.values());
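          /*
           * Worked example of the normalisation above (illustrative URLs):
           * ['https://www.Example.com/a', 'https://example.com/b', 'https://other.org']
           * lowercases hostnames and strips a leading "www.", so the first two map to
           * the same key "example.com"; only the first is kept and the duplicate is
           * reported via console.warn. deduplicatedUrls would then hold two entries.
           */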
          const concurrency = yield* config.getConcurrency();
          if (deduplicatedUrls.length > 1) {
            const configOptions = yield* config.getOptions();
            if (configOptions.allowedDomains || configOptions.blockedDomains) {
              console.warn(
                "Warning: Multiple starting URLs detected with allowedDomains/blockedDomains configured. Domain restrictions will be ignored - each URL will be restricted to its own domain instead."
              );
            }
          }
          yield* logger.logSpiderLifecycle("start", {
            totalUrls: deduplicatedUrls.length,
            urls: deduplicatedUrls.map((u) => u.url),
            originalCount: urlsWithMetadata.length,
            deduplicatedCount: deduplicatedUrls.length
          });
          const restrictToStartingDomain = true;
          const results = yield* Effect.all(
            deduplicatedUrls.map(
              ({ url, metadata }) => self.crawlSingle(
                url,
                sink,
                options,
                metadata,
                restrictToStartingDomain
              )
            ),
            { concurrency }
          );
          yield* logger.logSpiderLifecycle("complete", {
            totalDomains: results.length,
            totalPages: results.reduce(
              (sum, r) => sum + (r.pagesScraped || 0),
              0
            )
          });
          return {
            completed: true
          };
        }),
        // Single URL crawling - each gets its own queue, workers, and deduplicator
        crawlSingle: (urlString, sink, options, initialMetadata, restrictToStartingDomain) => Effect.gen(function* () {
          const config = yield* SpiderConfig;
          let domain;
          try {
            const url = new URL(urlString);
            domain = url.hostname;
          } catch {
            domain = "invalid-url";
          }
          yield* logger.logDomainStart(domain, urlString);
          const localDeduplicator = yield* Effect.provide(
            UrlDeduplicatorService,
            UrlDeduplicatorService.Default
          );
          const urlQueue = yield* Queue.unbounded();
          const resultPubSub = yield* PubSub.unbounded();
          const activeWorkers = MutableRef.make(0);
          const maxPagesReached = MutableRef.make(false);
          const domainCompleted = MutableRef.make(false);
          const queueMutex = yield* Effect.makeSemaphore(1);
          const workerHealthChecks = MutableRef.make(
            /* @__PURE__ */ new Map()
          );
          const reportWorkerHealth = (workerId) => Effect.sync(() => {
            const healthMap = MutableRef.get(workerHealthChecks);
            healthMap.set(workerId, /* @__PURE__ */ new Date());
            return healthMap;
          });
          const workerHealthMonitor = Effect.gen(function* () {
            const healthMap = MutableRef.get(workerHealthChecks);
            const now = Date.now();
            const staleThreshold = 6e4;
            for (const [workerId, lastCheck] of healthMap) {
              const elapsed = now - lastCheck.getTime();
              if (elapsed > staleThreshold) {
                yield* logger.logEdgeCase(domain, "worker_death_detected", {
                  workerId,
                  lastSeen: elapsed + "ms ago",
                  message: `DEAD WORKER: ${workerId} - No heartbeat for ${Math.round(elapsed / 1e3)}s`
                });
                healthMap.delete(workerId);
              }
            }
          }).pipe(
            Effect.repeat(Schedule.fixed("15 seconds"))
            // Check every 15 seconds
          );
          const queueManager = {
            // Atomic take: either returns task and increments active count, or detects completion
            takeTaskOrComplete: queueMutex.withPermits(1)(
              Effect.gen(function* () {
                const isCompleted = MutableRef.get(domainCompleted);
                if (isCompleted) {
                  return {
                    type: "completed",
                    reason: "already_completed",
                    wasFirstToComplete: false
                  };
                }
                const hasMaxPages = MutableRef.get(maxPagesReached);
                if (hasMaxPages) {
                  const wasCompleted = MutableRef.compareAndSet(
                    domainCompleted,
                    false,
                    true
                  );
                  return {
                    type: "completed",
                    reason: "max_pages",
                    wasFirstToComplete: wasCompleted
                  };
                }
                const pollResult = yield* Queue.poll(urlQueue);
                if (pollResult._tag === "Some") {
                  const activeCount = MutableRef.updateAndGet(
                    activeWorkers,
                    (n) => n + 1
                  );
                  return {
                    type: "task",
                    task: pollResult.value,
                    activeCount
                  };
                } else {
                  const currentActive = MutableRef.get(activeWorkers);
                  if (currentActive === 0) {
                    const wasCompleted = MutableRef.compareAndSet(
                      domainCompleted,
                      false,
                      true
                    );
                    return {
                      type: "completed",
                      reason: "no_more_urls",
                      wasFirstToComplete: wasCompleted
                    };
                  } else {
                    return {
                      type: "empty_but_active",
                      activeWorkers: currentActive
                    };
                  }
                }
              })
            ),
            // Add task to queue
            addTask: (task) => Queue.offer(urlQueue, task),
            // Mark worker as idle (decrement active count with bounds checking)
            markIdle: () => Effect.sync(
              () => MutableRef.updateAndGet(
                activeWorkers,
                (n) => Math.max(0, n - 1)
              )
            ),
            // Get queue size for logging (with defensive bounds checking)
            size: () => Effect.map(Queue.size(urlQueue), (size) => Math.max(0, size))
          };
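          /*
           * The union returned by takeTaskOrComplete, written out for clarity
           * (a sketch inferred from the branches above, not an exported type;
           * `CrawlTask` is a hypothetical name for the queued task shape):
           *
           * ```typescript
           * type TakeResult =
           *   | { type: 'task'; task: CrawlTask; activeCount: number }
           *   | { type: 'empty_but_active'; activeWorkers: number }
           *   | { type: 'completed';
           *       reason: 'already_completed' | 'max_pages' | 'no_more_urls';
           *       wasFirstToComplete: boolean };
           * ```
           *
           * Holding the semaphore while polling and incrementing activeWorkers is
           * what makes "queue empty AND zero active workers" a safe completion signal.
           */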
          const generateWorkerId = () => Effect.gen(function* () {
            const random = yield* Random.nextIntBetween(1e3, 9999);
            return `${domain}-worker-${random}`;
          });
          const worker = (workerId) => Effect.gen(function* () {
            yield* logger.logWorkerLifecycle(
              workerId,
              domain,
              "entering_loop"
            );
            while (true) {
              yield* reportWorkerHealth(workerId);
              const queueSize = yield* queueManager.size();
              const memUsage = process.memoryUsage();
              if (memUsage.heapUsed > 1024 * 1024 * 1024) {
                yield* logger.logEdgeCase(domain, "high_memory_usage", {
                  workerId,
                  heapUsed: Math.round(memUsage.heapUsed / 1024 / 1024) + "MB",
                  heapTotal: Math.round(memUsage.heapTotal / 1024 / 1024) + "MB",
                  queueSize
                });
              }
              if (queueSize > 1e4) {
                yield* logger.logEdgeCase(domain, "excessive_queue_size", {
                  workerId,
                  queueSize,
                  message: "Queue size exceeds 10,000 items - potential memory issue"
                });
              }
              yield* logger.logWorkerState(
                workerId,
                domain,
                "taking_task",
                {
                  queueSize
                }
              );
              const result = yield* queueManager.takeTaskOrComplete.pipe(
                Effect.timeout("10 seconds"),
                Effect.tap(
                  () => logger.logEdgeCase(domain, "task_acquisition_success", {
                    workerId,
                    message: "Task acquired successfully"
                  })
                ),
                Effect.tapError(
                  (error) => logger.logEdgeCase(domain, "deadlock_detected", {
                    workerId,
                    error: String(error),
                    message: "DEADLOCK: Task acquisition timed out - worker stuck in atomic operation",
                    timestamp: (/* @__PURE__ */ new Date()).toISOString()
                  })
                ),
                Effect.catchAll(
                  (error) => Effect.gen(function* () {
                    yield* logger.logEdgeCase(
                      domain,
                      "task_acquisition_failed",
                      {
                        workerId,
                        error: String(error),
                        isTimeout: error?.name === "TimeoutException",
                        message: "Task acquisition failed, marking worker as idle and retrying"
                      }
                    );
                    yield* queueManager.markIdle();
                    return {
                      type: "empty_but_active",
                      activeWorkers: 0
                    };
                  })
                )
              );
              if (result.type === "completed") {
                if ("wasFirstToComplete" in result && result.wasFirstToComplete) {
                  const reason = result.reason || "unknown";
                  yield* logger.logEvent({
                    type: "domain_complete",
                    domain,
                    message: `Worker ${workerId} detected domain completion - ${reason}`,
                    details: { reason }
                  });
                }
                yield* logger.logWorkerLifecycle(
                  workerId,
                  domain,
                  "exiting_loop",
                  "detected_completion"
                );
                break;
              } else if (result.type === "empty_but_active") {
                const backoffMs = Math.min(
                  1e3 * Math.pow(2, Math.floor(Math.random() * 3)),
                  5e3
                );
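                // Worked values: Math.floor(Math.random() * 3) is 0, 1 or 2, so the
                // backoff is 1000, 2000 or 4000 ms; the 5000 ms cap is never the
                // binding constraint here.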
                yield* Effect.sleep(`${backoffMs} millis`);
                continue;
              } else if (result.type === "task") {
                const task2 = result.task;
                yield* logger.logWorkerState(
                  workerId,
                  domain,
                  "marked_active",
                  {
                    taskUrl: task2.url,
                    activeWorkers: result.activeCount
                  }
                );
                const wasAdded = yield* localDeduplicator.tryAdd(task2.url);
                if (!wasAdded) {
                  const postIdleCount = yield* queueManager.markIdle();
                  yield* logger.logWorkerState(
                    workerId,
                    domain,
                    "marked_idle",
                    {
                      taskUrl: task2.url,
                      activeWorkers: postIdleCount,
                      reason: "duplicate_url"
                    }
                  );
                  continue;
                }
              } else {
                yield* Effect.sleep("1 second");
                continue;
              }
              const task = result.task;
              yield* logger.logEdgeCase(domain, "before_shouldFollowUrl", {
                workerId,
                url: task.url,
                message: "About to check shouldFollowUrl"
              });
              const shouldFollow = yield* config.shouldFollowUrl(
                task.url,
                task.fromUrl,
                restrictToStartingDomain ? urlString : void 0
              );
              yield* logger.logEdgeCase(domain, "after_shouldFollowUrl", {
                workerId,
                url: task.url,
                follow: shouldFollow.follow,
                reason: shouldFollow.reason,
                message: "Completed shouldFollowUrl check"
              });
              if (!shouldFollow.follow) {
                const newIdleCount2 = yield* queueManager.markIdle();
                yield* logger.logWorkerState(
                  workerId,
                  domain,
                  "marked_idle",
                  {
                    reason: "shouldNotFollow",
                    activeWorkers: newIdleCount2
                  }
                );
                continue;
              }
              const ignoreRobots = yield* config.shouldIgnoreRobotsTxt();
              if (!ignoreRobots) {
                yield* logger.logEdgeCase(domain, "before_robots_check", {
                  workerId,
                  url: task.url,
                  message: "About to check robots.txt"
                });
                const robotsCheck = yield* robots.checkUrl(task.url);
                yield* logger.logEdgeCase(domain, "after_robots_check", {
                  workerId,
                  url: task.url,
                  allowed: robotsCheck.allowed,
                  crawlDelay: robotsCheck.crawlDelay,
                  message: "Completed robots.txt check"
                });
                if (!robotsCheck.allowed) {
                  const newIdleCount2 = yield* queueManager.markIdle();
                  yield* logger.logWorkerState(
                    workerId,
                    domain,
                    "marked_idle",
                    {
                      reason: "robotsBlocked",
                      activeWorkers: newIdleCount2
                    }
                  );
                  continue;
                }
                if (robotsCheck.crawlDelay) {
                  const maxCrawlDelayMs = yield* config.getMaxRobotsCrawlDelay();
                  const maxCrawlDelaySeconds = maxCrawlDelayMs / 1e3;
                  const effectiveCrawlDelay = Math.min(
                    robotsCheck.crawlDelay,
                    maxCrawlDelaySeconds
                  );
                  if (effectiveCrawlDelay < robotsCheck.crawlDelay) {
                    yield* logger.logEvent({
                      type: "crawl_delay_capped",
                      domain,
                      workerId,
                      message: `[CRAWL_DELAY] Capping robots.txt delay from ${robotsCheck.crawlDelay}s to ${effectiveCrawlDelay}s`,
                      details: {
                        robotsCrawlDelay: robotsCheck.crawlDelay,
                        maxCrawlDelay: maxCrawlDelaySeconds,
                        effectiveDelay: effectiveCrawlDelay
                      }
                    });
                  }
                  yield* Effect.sleep(`${effectiveCrawlDelay} seconds`);
                }
              }
              const requestDelay = yield* config.getRequestDelay();
              yield* Effect.sleep(`${requestDelay} millis`);
              const fetchStartTime = Date.now();
              yield* logger.logEdgeCase(domain, "before_fetch", {
                workerId,
                url: task.url,
                depth: task.depth,
                message: "About to fetch and parse page",
                timestamp: (/* @__PURE__ */ new Date()).toISOString(),
                fetchStartMs: fetchStartTime
              });
              const pageData = yield* scraper.fetchAndParse(task.url, task.depth).pipe(
                // Add overall timeout to prevent workers from hanging
                Effect.timeout("45 seconds"),
                Effect.retry({
                  times: 2,
                  // Reduced retries to prevent long hangs
                  schedule: Schedule.exponential("1 second")
                }),
                Effect.catchAll(
                  (error) => Effect.gen(function* () {
                    const fetchDuration = Date.now() - fetchStartTime;
                    if (error?.name === "TimeoutException") {
                      yield* logger.logEdgeCase(domain, "fetch_timeout", {
                        workerId,
                        url: task.url,
                        message: `Fetch operation timed out after ${fetchDuration}ms`,
                        durationMs: fetchDuration,
                        timeoutExpectedMs: 45e3
                      });
                    } else {
                      yield* logger.logEdgeCase(domain, "fetch_error", {
                        workerId,
                        url: task.url,
                        error: String(error),
                        errorName: error?.name || "Unknown",
                        message: `Fetch operation failed after ${fetchDuration}ms`,
                        durationMs: fetchDuration
                      });
                    }
                    return null;
                  })
                )
              );
              if (pageData) {
                const fetchDuration = Date.now() - fetchStartTime;
                if (task.extractData) {
                  const extractedData = yield* Effect.sync(() => {
                    const $ = cheerio.load(pageData.html);
                    const result2 = {};
                    for (const [fieldName, fieldConfig] of Object.entries(
                      task.extractData
                    )) {
                      if (typeof fieldConfig === "string") {
                        result2[fieldName] = $(fieldConfig).text().trim() || void 0;
                      } else if (typeof fieldConfig === "object") {
                        const fc = fieldConfig;
                        const {
                          selector,
                          text,
                          attribute,
                          multiple,
                          exists
                        } = fc;
                        if (exists) {
                          result2[fieldName] = $(selector).length > 0;
                        } else if (multiple) {
                          const elements = $(selector);
                          const values = [];
                          elements.each((_, el) => {
                            const $el = $(el);
                            if (fc.fields) {
                              const nestedResult = {};
                              for (const [
                                nestedName,
                                nestedConfig
                              ] of Object.entries(fc.fields)) {
                                if (typeof nestedConfig === "object") {
                                  const nc = nestedConfig;
                                  const $nested = $el.find(nc.selector);
                                  if (nc.attribute) {
                                    nestedResult[nestedName] = $nested.attr(
                                      nc.attribute
                                    );
                                  } else {
                                    nestedResult[nestedName] = $nested.text().trim();
                                  }
                                }
                              }
                              values.push(nestedResult);
                            } else if (attribute) {
                              values.push($el.attr(attribute));
                            } else {
                              values.push($el.text().trim());
                            }
                          });
                          result2[fieldName] = values.length > 0 ? values : void 0;
                        } else {
                          const $el = $(selector);
                          if (attribute) {
                            result2[fieldName] = $el.attr(attribute);
                          } else {
                            result2[fieldName] = $el.text().trim() || void 0;
                          }
                        }
                      }
                    }
                    return result2;
                  });
                  pageData.extractedData = extractedData;
                }
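                /*
                 * Shape of a task.extractData config handled above (selectors and
                 * field names are illustrative only, not package-defined values):
                 *
                 * ```typescript
                 * const extractData = {
                 *   title: 'h1',                                        // string shorthand: trimmed text
                 *   canonical: { selector: 'link[rel=canonical]', attribute: 'href' },
                 *   hasPaywall: { selector: '.paywall', exists: true }, // boolean presence check
                 *   authors: { selector: '.byline a', multiple: true }, // array of trimmed texts
                 *   items: {
                 *     selector: '.result', multiple: true,
                 *     fields: {                                         // nested per-element extraction
                 *       name: { selector: '.name' },
                 *       link: { selector: 'a', attribute: 'href' }
                 *     }
                 *   }
                 * };
                 * ```
                 */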
                const currentPageCount = yield* localDeduplicator.size();
                yield* logger.logEdgeCase(domain, "fetch_success", {
                  workerId,
                  url: task.url,
                  message: `Fetch completed successfully`,
                  durationMs: fetchDuration
                });
                yield* logger.logPageScraped(
                  task.url,
                  domain,
                  currentPageCount
                );
                yield* PubSub.publish(resultPubSub, {
                  pageData,
                  depth: task.depth,
                  timestamp: /* @__PURE__ */ new Date(),
                  metadata: task.metadata
                });
                const maxDepth = yield* config.getMaxDepth();
                if (!maxDepth || task.depth < maxDepth) {
                  let linksToProcess = [];
                  const extractionResult = linkExtractor ? yield* (() => {
                    const extractorConfig = options?.linkExtractorConfig || {};
                    return linkExtractor.extractLinks(pageData.html, extractorConfig).pipe(
                      Effect.catchAll(
                        () => Effect.succeed({
                          links: [],
                          totalElementsProcessed: 0,
                          extractionBreakdown: {}
                        })
                      )
                    );
                  })() : {
                    links: []
                  };
                  linksToProcess = extractionResult.links.map((url) => {
                    try {
                      return new URL(url, pageData.url).toString();
                    } catch {
                      return null;
                    }
                  }).filter((url) => url !== null);
                  for (const link of linksToProcess) {
                    const linkShouldFollow = yield* config.shouldFollowUrl(
                      link,
                      task.url,
                      restrictToStartingDomain ? urlString : void 0
                    );
                    if (!linkShouldFollow.follow) {
                      continue;
                    }
                    const alreadySeen = yield* localDeduplicator.contains(link);
                    if (!alreadySeen) {
                      yield* queueManager.addTask({
                        url: link,
                        depth: task.depth + 1,
                        fromUrl: task.url,
                        metadata: task.metadata
                      });
                      const newQueueSize = yield* queueManager.size();
                      if (newQueueSize % 10 === 0 || newQueueSize <= 5) {
                        yield* logger.logEvent({
                          type: "queue_status",
                          domain,
                          workerId,
                          message: `[QUEUE_STATE] URL added to queue: ${link}`,
                          details: {
                            queueSize: newQueueSize,
                            addedUrl: link,
                            fromUrl: task.url
                          }
                        });
                      }
                    }
                  }
                }
              }
              const newIdleCount = yield* queueManager.markIdle();
              yield* logger.logWorkerState(
                workerId,
                domain,
                "task_completed",
                {
                  taskUrl: task.url,
                  activeWorkers: newIdleCount,
                  pageProcessed: !!pageData
                }
              );
              const maxPages2 = yield* config.getMaxPages();
              if (maxPages2) {
                const currentPageCount = yield* localDeduplicator.size();
                if (currentPageCount >= maxPages2) {
                  const wasFirstToReachMax = MutableRef.compareAndSet(
                    maxPagesReached,
                    false,
                    true
                  );
                  if (wasFirstToReachMax) {
                    yield* logger.logPageScraped(
                      task.url,
                      domain,
                      currentPageCount
                    );
                    yield* logger.logEvent({
                      type: "domain_complete",
                      domain,
                      message: `Domain ${domain} reached max pages limit: ${currentPageCount}`,
                      details: {
                        currentPageCount,
                        maxPages: maxPages2,
                        reason: "max_pages_reached"
                      }
                    });
                  }
                  yield* logger.logWorkerLifecycle(
                    workerId,
                    domain,
                    "exiting_loop",
                    "max_pages_reached",
                    {
                      currentPageCount,
                      maxPages: maxPages2
                    }
                  );
                  break;
                }
              }
              const pageCount = yield* localDeduplicator.size();
              if (pageCount % 10 === 0) {
                const queueSize2 = yield* queueManager.size();
                const activeCount = MutableRef.get(activeWorkers);
                const maxWorkers2 = yield* config.getMaxConcurrentWorkers();
                yield* logger.logDomainStatus(domain, {
                  pagesScraped: pageCount,
                  queueSize: queueSize2,
                  activeWorkers: activeCount,
                  maxWorkers: maxWorkers2
                });
              }
            }
            yield* logger.logWorkerLifecycle(
              workerId,
              domain,
              "exiting_loop",
              "normal_completion"
            );
          }).pipe(
            // Ensure this runs even if the worker is interrupted/crashes
            Effect.ensuring(
              logger.logWorkerLifecycle(
                workerId,
                domain,
                "exiting_loop",
                "effect_ensuring_cleanup"
              )
            ),
            // Add catchAll to handle any unhandled errors
            Effect.catchAll(
              (error) => Effect.gen(function* () {
                yield* logger.logEdgeCase(domain, "worker_crash", {
                  workerId,
                  error: String(error),
                  message: `Worker ${workerId} crashed with error: ${error}`,
                  timestamp: (/* @__PURE__ */ new Date()).toISOString()
                });
                yield* logger.logWorkerLifecycle(
                  workerId,
                  domain,
                  "exiting_loop",
                  "error_exit"
                );
              })
            )
          );
          yield* queueManager.addTask({
            url: urlString,
            depth: 0,
            metadata: initialMetadata,
            extractData: options?.extractData
          });
          yield* logger.logEvent({
            type: "queue_status",
            domain,
            message: `[QUEUE_STATE] Initial URL queued: ${urlString}`,
            details: { queueSize: 1, initialUrl: urlString }
          });
          const maxWorkers = yield* config.getMaxConcurrentWorkers();
          const workerFibers = [];
          for (let i = 0; i < maxWorkers; i++) {
            const workerId = yield* generateWorkerId();
            yield* logger.logWorkerLifecycle(
              workerId,
              domain,
              "created",
              void 0,
              {
                workerIndex: i,
                totalWorkers: maxWorkers
              }
            );
            const fiber = yield* Effect.fork(worker(workerId));
            workerFibers.push(fiber);
          }
          const healthMonitorFiber = yield* Effect.fork(workerHealthMonitor);
          const resultStream = Stream.fromPubSub(resultPubSub);
          const sinkFiber = yield* Effect.fork(
            Stream.run(resultStream, sink)
          );
          const failureDetector = Effect.gen(function* () {
            let lastPageCount = 0;
            let stuckIterations = 0;
            while (!MutableRef.get(domainCompleted)) {
              yield* Effect.sleep("30 seconds");
              const pageCount = yield* localDeduplicator.size();
              const queueSize = yield* queueManager.size();
              const activeCount = MutableRef.get(activeWorkers);
              const hasQueueItems = queueSize > 0;
              const hasNoActiveWorkers = activeCount === 0;
              const hasNegativeQueue = queueSize < 0;
              const noProgressMade = pageCount === lastPageCount;
              if (hasNegativeQueue) {
                yield* logger.logEdgeCase(domain, "negative_queue_detected", {
                  queueSize,
                  activeWorkers: activeCount,
                  pageCount
                });
              }
              const criticalFailures = [
                hasNoActiveWorkers && hasQueueItems && pageCount > 0,
                // 0 workers with queue items
                hasNegativeQueue,
                // Invalid queue state
                activeCount === 0 && pageCount <= 1 && stuckIterations >= 2
                // Completely stuck
              ];
              if (criticalFailures.some(Boolean)) {
                const reason = hasNoActiveWorkers && hasQueueItems ? "no_workers_with_queue_items" : hasNegativeQueue ? "negative_queue_size" : "no_progress_for_60s";
                yield* logger.logEdgeCase(
                  domain,
                  "critical_failure_detected",
                  {
                    timeElapsed: `${(stuckIterations + 1) * 30}s`,
                    pageCount,
                    queueSize,
                    activeWorkers: activeCount,
                    reason
                  }
                );
                const wasCompleted = MutableRef.compareAndSet(
                  domainCompleted,
                  false,
                  true
                );
                if (wasCompleted) {
                  yield* logger.logDomainComplete(domain, pageCount, "error");
                }
                break;
              }
              if (noProgressMade) {
                stuckIterations++;
              } else {
                stuckIterations = 0;
                lastPageCount = pageCount;
              }
            }
          });
          const failureDetectorFiber = yield* Effect.fork(failureDetector);
          yield* Effect.all(
            workerFibers.map((f) => Fiber.join(f)),
            { concurrency: "unbounded" }
          );
          yield* Fiber.interrupt(failureDetectorFiber).pipe(Effect.ignore);
          yield* Fiber.interrupt(healthMonitorFiber).pipe(Effect.ignore);
          yield* logger.logEvent({
            type: "queue_status",
            domain,
            message: `[QUEUE_STATE] Shutting down queue for domain completion`,
            details: { finalQueueSize: yield* queueManager.size() }
          });
          const finalPageCount = yield* localDeduplicator.size();
          const maxPages = yield* config.getMaxPages();
          const completionReason = maxPages && finalPageCount >= maxPages ? "max_pages" : "queue_empty";
          yield* logger.logDomainComplete(
            domain,
            finalPageCount,
            completionReason
          );
          yield* PubSub.shutdown(resultPubSub);
          yield* logger.logEvent({
            type: "spider_lifecycle",
            domain,
            message: `Waiting for sink to process remaining results...`
          });
          yield* Fiber.join(sinkFiber);
          yield* logger.logEvent({
            type: "spider_lifecycle",
            domain,
            message: `Sink processing complete. All ${finalPageCount} pages saved.`
          });
          return {
            completed: true,
            pagesScraped: finalPageCount,
            domain
          };
        }),
        /**
         * Resume a previous crawling session from persistent storage.
         *
         * This method requires resumability to be enabled in the SpiderConfig and
         * a StatePersistence implementation to be configured. It will restore the
         * crawling state and continue processing from where it left off.
         *
         * @param stateKey - The unique identifier for the session to resume
         * @param sink - Sink to process crawl results as they're produced
         * @param persistence - Optional persistence implementation (uses configured one if not provided)
         * @returns Effect containing crawl statistics
         *
         * @example
         * ```typescript
         * const stateKey = new SpiderStateKey({
         *   id: 'my-crawl-session',
         *   timestamp: new Date('2024-01-01'),
         *   name: 'Example Crawl'
         * });
         *
         * const collectSink = Sink.forEach<CrawlResult>(result =>
         *   Effect.sync(() => console.log(`Resumed: ${result.pageData.title}`))
         * );
         *
         * const stats = yield* spider.resume(stateKey, collectSink);
         * ```
         */
        resume: (stateKey, _sink, _persistence) => Effect.gen(function* () {
          if (!scheduler) {
            return yield* Effect.fail(
              new Error(
                "Resume functionality requires SpiderSchedulerService to be available. Make sure resumability is enabled in SpiderConfig and SpiderSchedulerService is provided."
              )
            );
          }
          const config = yield* SpiderConfig;
          if (!config) {
            return yield* Effect.fail(
              new Error(
                "SpiderConfig is required for resumability operations"
              )
            );
          }
          const resumabilityEnabled = yield* config.isResumabilityEnabled();
          if (!resumabilityEnabled) {
            return yield* Effect.fail(
              new Error(
                "Resume functionality requires resumability to be enabled in SpiderConfig. Set enableResumability: true in your spider configuration."
              )
            );
          }
          console.log(`Resuming session: ${stateKey.id}`);
          return {
            completed: true,
            resumed: true
          };
        }),
        /**
         * Returns the list of URLs that have been visited during crawling.
         *
         * @returns Effect containing array of visited URLs
         *
         * @remarks
         * This is currently a placeholder implementation. In a future version,
         * this will return the actual list of visited URLs from the current session.
         */
        getVisitedUrls: () => Effect.sync(() => [])
      };
      return self;
    }),
    dependencies: [
      RobotsService.Default,
      ScraperService.Default,
      UrlDeduplicatorService.Default,
      SpiderConfig.Default,
      LinkExtractorService.Default,
      SpiderLoggerLive
    ]
  }
) {
}
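/*
 * End-to-end sketch of driving the service above (illustrative wiring; the
 * Sink combinator comes from Effect, but the exact layer composition is an
 * assumption, not confirmed package documentation):
 *
 * ```typescript
 * const program = Effect.gen(function* () {
 *   const spider = yield* SpiderService;
 *   const sink = Sink.forEach((result) =>
 *     Effect.sync(() => console.log(result.pageData.url))
 *   );
 *   return yield* spider.crawl('https://example.com', sink);
 * }).pipe(Effect.provide(SpiderService.Default));
 * ```
 */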
class MiddlewareManager extends Effect.Service()(
  "@jambudipa.io/MiddlewareManager",
  {
    effect: Effect.sync(() => ({
      /**
       * Processes a request through the middleware pipeline.
       *
       * Middleware are executed in order from first to last, with each middleware
       * receiving the output of the previous middleware as input.
       *
       * @param request - The initial request to process
       * @param middlewares - Array of middleware to apply
       * @returns Effect containing the processed request
       */
      processRequest: (request, middlewares) => Effect.reduce(
        middlewares,
        request,
        (req, middleware) => middleware.processRequest ? middleware.processRequest(req) : Effect.succeed(req)
      ),
      /**
       * Processes a response through the middleware pipeline in reverse order.
       *
       * Middleware are executed in reverse order (last to first) to provide
       * proper nesting of response processing.
       *
       * @param response - The response to process
       * @param request - The original request (for context)
       * @param middlewares - Array of middleware to apply
       * @returns Effect containing the processed response
       */
      processResponse: (response, request, middlewares) => Effect.reduce(
        middlewares.slice().reverse(),
        response,
        (res, middleware) => middleware.processResponse ? middleware.processResponse(res, request) : Effect.succeed(res)
      ),
      /**
       * Processes an exception through the middleware pipeline in reverse order.
       *
       * Middleware are given a chance to handle or recover from exceptions.
       * If a middleware returns a SpiderResponse, it indicates successful recovery.
       * If it returns null, the exception continues to propagate.
       *
       * @param error - The error that occurred
       * @param request - The request that caused the error
       * @param middlewares - Array of middleware to apply
       * @returns Effect containing a recovered response or null
       */
      processException: (error, request, middlewares) => Effect.reduce(
        middlewares.slice().reverse(),
        null,
        (res, middleware) => middleware.processException ? middleware.processException(error, request) : Effect.succeed(res)
      )
    }))
  }
) {
}
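/*
 * Pipeline-order sketch for the manager above: requests flow first -> last,
 * responses flow last -> first, so middleware nest like function composition.
 * With middlewares [a, b] (hypothetical instances):
 *
 * ```typescript
 * // request:  a.processRequest  then b.processRequest
 * // response: b.processResponse then a.processResponse
 * const processed = yield* manager.processRequest(request, [a, b]);
 * ```
 */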
class RateLimitMiddleware extends Effect.Service()(
  "@jambudipa.io/RateLimitMiddleware",
  {
    effect: Effect.sync(() => {
      const domainLastRequest = MutableHashMap.empty();
      const domainRequestCount = MutableHashMap.empty();
      const domainWindowStart = MutableHashMap.empty();
      return {
        create: (config) => ({
          processRequest: (request) => Effect.gen(function* () {
            const url = new URL(request.task.url);
            const domain = url.hostname;
            const now = Date.now();
            if (config.requestDelayMs) {
              yield* Effect.sleep(`${config.requestDelayMs} millis`);
            }
            const windowDuration = 1e3;
            const windowStart = Option.getOrElse(
              MutableHashMap.get(domainWindowStart, domain),
              () => now
            );
            const currentCount = Option.getOrElse(
              MutableHashMap.get(domainRequestCount, domain),
              () => 0
            );
            if (now - windowStart >= windowDuration) {
              MutableHashMap.set(domainWindowStart, domain, now);
              MutableHashMap.set(domainRequestCount, domain, 0);
            } else if (currentCount >= config.maxRequestsPerSecondPerDomain) {
              const waitTime = windowDuration - (now - windowStart);
              yield* Effect.sleep(`${waitTime} millis`);
              MutableHashMap.set(domainWindowStart, domain, Date.now());
              MutableHashMap.set(domainRequestCount, domain, 0);
            }
            const newCount = Option.getOrElse(
              MutableHashMap.get(domainRequestCount, domain),
              () => 0
            ) + 1;
            MutableHashMap.set(domainRequestCount, domain, newCount);
            MutableHashMap.set(domainLastRequest, domain, Date.now());
            yield* Effect.logDebug(
              `Rate limit: ${domain} - ${newCount}/${config.maxRequestsPerSecondPerDomain} requests in window`
            );
            return request;
          })
        })
      };
    })
  }
) {
}
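/*
 * Sketch of configuring the rate limiter above (values are illustrative):
 *
 * ```typescript
 * const rateLimit = rateLimitMiddleware.create({
 *   requestDelayMs: 250,              // fixed pause before every request
 *   maxRequestsPerSecondPerDomain: 4  // per-domain cap within the 1s window
 * });
 * ```
 *
 * The window bookkeeping is keyed by hostname, so two domains are throttled
 * independently of each other.
 */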
class LoggingMiddleware extends Effect.Service()(
  "@jambudipa.io/LoggingMiddleware",
  {
    effect: Effect.sync(() => ({
      create: (config = {}) => {
        const {
          logRequests = true,
          logResponses = true,
          logErrors = true,
          logLevel = "info"
        } = config;
        return {
          processRequest: (request) => Effect.gen(function* () {
            if (logRequests) {
              const logMessage = `Processing request: ${request.task.url} (depth: ${request.task.depth})`;
              switch (logLevel) {
                case "debug":
                  yield* Effect.logDebug(logMessage);
                  break;
                case "info":
                  yield* Effect.logInfo(logMessage);
                  break;
                case "warn":
                  yield* Effect.logWarning(logMessage);
                  break;
                case "error":
                  yield* Effect.logError(logMessage);
                  break;
              }
            }
            return request;
          }),
          processResponse: (response, request) => Effect.gen(function* () {
            if (logResponses) {
              const logMessage = `Received response: ${request.task.url} (status: ${response.statusCode || "unknown"}, size: ${response.pageData.html.length} bytes)`;
              switch (logLevel) {
                case "debug":
                  yield* Effect.logDebug(logMessage);
                  break;
                case "info":
                  yield* Effect.logInfo(logMessage);
                  break;
                case "warn":
                  yield* Effect.logWarning(logMessage);
                  break;
                case "error":
                  yield* Effect.logError(logMessage);
                  break;
              }
            }
            return response;
          }),
          processException: (error, request) => Effect.gen(function* () {
            if (logErrors) {
              const logMessage = `Error processing request: ${request.task.url} - ${error.message}`;
              yield* Effect.logError(logMessage);
            }
            return null;
          })
        };
      }
    }))
  }
) {
}
class UserAgentMiddleware extends Effect.Service()(
  "@jambudipa.io/UserAgentMiddleware",
  {
    effect: Effect.sync(() => ({
      create: (userAgent) => ({
        processRequest: (request) => Effect.succeed({
          ...request,
          headers: {
            ...request.headers,
            "User-Agent": userAgent
          }
        })
      })
    }))
  }
) {
}
class StatsMiddleware extends Effect.Service()(
  "@jambudipa.io/StatsMiddleware",
  {
    effect: Effect.sync(() => ({
      create: () => {
        const stats = MutableHashMap.empty();
        const startTime = Date.now();
        const incr = (key, count = 1) => {
          const current = Option.getOrElse(
            MutableHashMap.get(stats, key),
            () => 0
          );
          MutableHashMap.set(stats, key, current + count);
        };
        return {
          middleware: {
            processRequest: (request) => Effect.sync(() => {
              incr("requests_processed");
              incr(`requests_depth_${request.task.depth}`);
              return request;
            }),
            processResponse: (response) => Effect.sync(() => {
              incr("responses_received");
              if (response.statusCode) {
                incr(`status_${response.statusCode}`);
                if (response.statusCode >= 200 && response.statusCode < 300) {
                  incr("responses_success");
                } else if (response.statusCode >= 400) {
                  incr("responses_error");
                }
              }
              incr("bytes_downloaded", response.pageData.html.length);
              return response;
            }),
            processException: (error) => Effect.sync(() => {
              incr("exceptions");
              incr(`exception_${error.constructor.name}`);
              return null;
            })
          },
          getStats: () => Effect.sync(() => ({
            ...Object.fromEntries(Array.from(stats)),
            runtime_seconds: (Date.now() - startTime) / 1e3
          }))
        };
      }
    }))
  }
) {
}
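/*
 * Usage sketch for the stats collector above: create() returns both the
 * middleware to install and a reader for the counters it accumulates.
 *
 * ```typescript
 * const { middleware, getStats } = statsMiddleware.create();
 * // ...run a crawl with `middleware` installed, then:
 * const stats = yield* getStats();
 * // e.g. { requests_processed: 120, responses_success: 117,
 * //        bytes_downloaded: 3481902, runtime_seconds: 42.5 } (illustrative numbers)
 * ```
 */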
class StateDelta extends Schema.Class("StateDelta")({
  /** Session this delta applies to */
  stateKey: Schema.String,
  /** Sequence number for ordering deltas */
  sequence: Schema.Number,
  /** When this delta was created */
  timestamp: Schema.Date,
  /** The operation that created this delta */
  operation: Schema.Union(
    Schema.Struct({
      type: Schema.Literal("enqueue"),
      request: PriorityRequest
    }),
    Schema.Struct({
      type: Schema.Literal("dequeue"),
      fingerprint: Schema.String
    }),
    Schema.Struct({
      type: Schema.Literal("mark_visited"),
      fingerprint: Schema.String
    })
  )
}) {
}
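/*
 * Example delta instance for the schema above (the fingerprint format shown
 * is an assumption for illustration, not the package's actual format):
 *
 * ```typescript
 * const delta = new StateDelta({
 *   stateKey: 'session-2024-01-01',
 *   sequence: 42,
 *   timestamp: new Date(),
 *   operation: { type: 'mark_visited', fingerprint: 'GET:https://example.com/' }
 * });
 * ```
 */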
class PersistenceError2 extends Data.TaggedError("PersistenceError") {
}
const DEFAULT_HYBRID_CONFIG = {
  snapshotInterval: 1e3,
  maxDeltasBeforeSnapshot: 500,
  compactionEnabled: true,
  batchDeltas: true,
  deltaBatchSize: 10
};
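/*
 * Worked cadence for these defaults: a snapshot is taken on every 1000th
 * operation, or sooner once 500+ operations have accrued since the last
 * snapshot; between snapshots, deltas are buffered and flushed in batches of
 * 10. So a 10,000-operation crawl writes roughly 10 snapshots and ~1,000 delta
 * batches rather than 10,000 full-state saves.
 */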
class FullStatePersistence {
  constructor(backend) {
    this.backend = backend;
  }
  persist = (operation) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.saveState) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support full state persistence`,
            operation: "persist"
          })
        );
      }
      yield* self.backend.saveState(
        operation.resultingState.key,
        operation.resultingState
      );
    });
  };
  restore = (key) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.loadState) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support state loading`,
            operation: "restore"
          })
        );
      }
      return yield* self.backend.loadState(key);
    });
  };
  cleanup = (key) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.deleteState) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support state deletion`,
            operation: "cleanup"
          })
        );
      }
      yield* self.backend.deleteState(key);
    });
  };
  getInfo = () => ({
    name: "FullStatePersistence",
    description: "Saves complete state on every operation. Simple but potentially inefficient for large crawls.",
    capabilities: ["full-state-save", "full-state-restore", "simple-cleanup"]
  });
}
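/*
 * Strategy-selection sketch: FullStatePersistence trades write volume for
 * simplicity, suiting small crawls or debugging, while the delta and hybrid
 * strategies below suit long crawls. The capability check mirrors the guards
 * inside each strategy:
 *
 * ```typescript
 * const strategy = backend.saveDelta
 *   ? new DeltaPersistence(backend)
 *   : new FullStatePersistence(backend); // fallback: backend only saves full state
 * ```
 */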
class DeltaPersistence {
  constructor(backend) {
    this.backend = backend;
  }
  persist = (operation) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.saveDelta) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support delta persistence`,
            operation: "persist"
          })
        );
      }
      yield* self.backend.saveDelta(operation.delta);
    });
  };
  restore = (key) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.loadDeltas) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support delta loading`,
            operation: "restore"
          })
        );
      }
      const deltas = yield* self.backend.loadDeltas(key);
      if (deltas.length === 0) {
        return null;
      }
      return yield* self.reconstructStateFromDeltas(key, deltas);
    });
  };
  cleanup = (key) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.loadDeltas || !self.backend.compactDeltas) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support delta cleanup`,
            operation: "cleanup"
          })
        );
      }
      const deltas = yield* self.backend.loadDeltas(key);
      if (deltas.length > 0) {
        const maxSequence = Math.max(...deltas.map((d) => d.sequence));
        yield* self.backend.compactDeltas(key, maxSequence + 1);
      }
    });
  };
  reconstructStateFromDeltas = (key, deltas) => Effect.gen(function* () {
    const sortedDeltas = [...deltas].sort((a, b) => a.sequence - b.sequence);
    const pendingRequests = [];
    const visitedFingerprints = [];
    let totalProcessed = 0;
    for (const delta of sortedDeltas) {
      switch (delta.operation.type) {
        case "enqueue":
          pendingRequests.push(delta.operation.request);
          break;
        case "dequeue": {
          const operation = delta.operation;
          if (operation.type === "dequeue") {
            const dequeueIndex = pendingRequests.findIndex(
              (req) => req.fingerprint === operation.fingerprint
            );
            if (dequeueIndex >= 0) {
              pendingRequests.splice(dequeueIndex, 1);
              totalProcessed++;
            }
          }
          break;
        }
        case "mark_visited": {
          const operation = delta.operation;
          if (operation.type === "mark_visited") {
            if (!visitedFingerprints.includes(operation.fingerprint)) {
              visitedFingerprints.push(operation.fingerprint);
            }
          }
          break;
        }
      }
    }
    return yield* Effect.tryPromise({
      try: async () => {
        const { SpiderState: SpiderState2 } = await Promise.resolve().then(() => SpiderScheduler_service);
        return new SpiderState2({
          key,
          pendingRequests,
          visitedFingerprints,
          totalProcessed
        });
      },
      catch: (error) => new PersistenceError2({
        message: "Failed to import SpiderState",
        cause: error,
        operation: "reconstructStateFromDeltas"
      })
    });
  });
  getInfo = () => ({
    name: "DeltaPersistence",
    description: "Saves only incremental changes. Efficient for large crawls but requires delta replay.",
    capabilities: ["delta-save", "delta-restore", "state-reconstruction"]
  });
}
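/*
 * Delta replay, worked through for the reconstruction above: given deltas
 * enqueue(r1), enqueue(r2), dequeue(r1.fingerprint), mark_visited(f1) in
 * sequence order, replay leaves pendingRequests = [r2], totalProcessed = 1 and
 * visitedFingerprints = [f1]. The hybrid strategy below applies the same replay
 * on top of the latest snapshot instead of an empty state.
 */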
class HybridPersistence {
  constructor(backend, config = DEFAULT_HYBRID_CONFIG) {
    this.backend = backend;
    this.config = config;
  }
  operationCount = 0;
  lastSnapshotSequence = 0;
  pendingDeltas = [];
  persist = (operation) => {
    const self = this;
    return Effect.gen(function* () {
      self.operationCount++;
      if (self.config.batchDeltas) {
        self.pendingDeltas.push(operation.delta);
      }
      const shouldSnapshot = operation.shouldSnapshot || self.operationCount % self.config.snapshotInterval === 0 || self.operationCount - self.lastSnapshotSequence >= self.config.maxDeltasBeforeSnapshot;
      if (shouldSnapshot) {
        yield* self.saveSnapshot(operation);
      } else {
        yield* self.saveDelta(operation);
      }
      if (self.config.batchDeltas && self.pendingDeltas.length >= self.config.deltaBatchSize) {
        yield* self.flushPendingDeltas();
      }
    });
  };
  saveSnapshot = (operation) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.backend.saveSnapshot) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support snapshots`,
            operation: "saveSnapshot"
          })
        );
      }
      yield* self.backend.saveSnapshot(
        operation.resultingState.key,
        operation.resultingState,
        operation.delta.sequence
      );
      self.lastSnapshotSequence = operation.delta.sequence;
      if (self.config.compactionEnabled && self.backend.compactDeltas) {
        yield* self.backend.compactDeltas(
          operation.resultingState.key,
          operation.delta.sequence
        );
      }
      self.pendingDeltas = [];
    });
  };
  saveDelta = (operation) => {
    const self = this;
    return Effect.gen(function* () {
      if (!self.config.batchDeltas) {
        if (!self.backend.saveDelta) {
          return yield* Effect.fail(
            new PersistenceError2({
              message: `Backend ${self.backend.name} does not support delta persistence`,
              operation: "saveDelta"
            })
          );
        }
        yield* self.backend.saveDelta(operation.delta);
      }
    });
  };
  flushPendingDeltas = () => {
    const self = this;
    return Effect.gen(function* () {
      if (self.pendingDeltas.length === 0) return;
      if (self.backend.saveDeltas) {
        yield* self.backend.saveDeltas([...self.pendingDeltas]);
      } else if (self.backend.saveDelta) {
        for (const delta of self.pendingDeltas) {
          yield* self.backend.saveDelta(delta);
        }
      } else {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support delta persistence`,
            operation: "flushPendingDeltas"
          })
        );
      }
      self.pendingDeltas = [];
    });
  };
  restore = (key) => {
    const self = this;
    return Effect.gen(function* () {
      let baseState = null;
      let fromSequence = 0;
      if (self.backend.loadLatestSnapshot) {
        const snapshot = yield* self.backend.loadLatestSnapshot(key);
        if (snapshot) {
          baseState = snapshot.state;
          fromSequence = snapshot.sequence + 1;
        }
      }
      if (!self.backend.loadDeltas) {
        if (baseState) {
          return baseState;
        }
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Backend ${self.backend.name} does not support delta loading`,
            operation: "restore"
          })
        );
      }
      const deltas = yield* self.backend.loadDeltas(key, fromSequence);
      if (!baseState && deltas.length === 0) {
        return null;
      }
      if (deltas.length === 0) {
        return baseState;
      }
      return yield* self.applyDeltasToState(key, baseState, deltas);
    });
  };
  applyDeltasToState = (key, baseState, deltas) => {
    const self = this;
    return Effect.gen(function* () {
      if (!baseState) {
        const deltaStrategy = new DeltaPersistence(self.backend);
        return yield* deltaStrategy.reconstructStateFromDeltas(key, deltas);
      }
      const sortedDeltas = [...deltas].sort((a, b) => a.sequence - b.sequence);
      const pendingRequests = [...baseState.pendingRequests];
      const visitedFingerprints = [...baseState.visitedFingerprints];
|
|
2894
|
+
let totalProcessed = baseState.totalProcessed;
|
|
2895
|
+
for (const delta of sortedDeltas) {
|
|
2896
|
+
switch (delta.operation.type) {
|
|
2897
|
+
case "enqueue":
|
|
2898
|
+
pendingRequests.push(delta.operation.request);
|
|
2899
|
+
break;
|
|
2900
|
+
case "dequeue": {
|
|
2901
|
+
const operation = delta.operation;
|
|
2902
|
+
if (operation.type === "dequeue") {
|
|
2903
|
+
const dequeueIndex = pendingRequests.findIndex(
|
|
2904
|
+
(req) => req.fingerprint === operation.fingerprint
|
|
2905
|
+
);
|
|
2906
|
+
if (dequeueIndex >= 0) {
|
|
2907
|
+
pendingRequests.splice(dequeueIndex, 1);
|
|
2908
|
+
totalProcessed++;
|
|
2909
|
+
}
|
|
2910
|
+
}
|
|
2911
|
+
break;
|
|
2912
|
+
}
|
|
2913
|
+
case "mark_visited": {
|
|
2914
|
+
const operation = delta.operation;
|
|
2915
|
+
if (operation.type === "mark_visited") {
|
|
2916
|
+
if (!visitedFingerprints.includes(operation.fingerprint)) {
|
|
2917
|
+
visitedFingerprints.push(operation.fingerprint);
|
|
2918
|
+
}
|
|
2919
|
+
}
|
|
2920
|
+
break;
|
|
2921
|
+
}
|
|
2922
|
+
}
|
|
2923
|
+
}
|
|
2924
|
+
return yield* Effect.tryPromise({
|
|
2925
|
+
try: async () => {
|
|
2926
|
+
const { SpiderState: SpiderState2 } = await Promise.resolve().then(() => SpiderScheduler_service);
|
|
2927
|
+
return new SpiderState2({
|
|
2928
|
+
key,
|
|
2929
|
+
pendingRequests,
|
|
2930
|
+
visitedFingerprints,
|
|
2931
|
+
totalProcessed
|
|
2932
|
+
});
|
|
2933
|
+
},
|
|
2934
|
+
catch: (error) => new PersistenceError2({
|
|
2935
|
+
message: "Failed to import SpiderState",
|
|
2936
|
+
cause: error,
|
|
2937
|
+
operation: "applyDeltasToState"
|
|
2938
|
+
})
|
|
2939
|
+
});
|
|
2940
|
+
});
|
|
2941
|
+
};
|
|
2942
|
+
cleanup = (key) => {
|
|
2943
|
+
const self = this;
|
|
2944
|
+
return Effect.gen(function* () {
|
|
2945
|
+
yield* self.flushPendingDeltas();
|
|
2946
|
+
if (self.backend.deleteState) {
|
|
2947
|
+
yield* self.backend.deleteState(key);
|
|
2948
|
+
}
|
|
2949
|
+
if (self.backend.compactDeltas) {
|
|
2950
|
+
yield* self.backend.compactDeltas(key, Number.MAX_SAFE_INTEGER);
|
|
2951
|
+
}
|
|
2952
|
+
});
|
|
2953
|
+
};
|
|
2954
|
+
getInfo = () => ({
|
|
2955
|
+
name: "HybridPersistence",
|
|
2956
|
+
description: "Combines deltas and snapshots for optimal performance and recovery speed.",
|
|
2957
|
+
capabilities: [
|
|
2958
|
+
"delta-save",
|
|
2959
|
+
"snapshot-save",
|
|
2960
|
+
"batch-deltas",
|
|
2961
|
+
"fast-recovery",
|
|
2962
|
+
"automatic-compaction"
|
|
2963
|
+
]
|
|
2964
|
+
});
|
|
2965
|
+
}
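/*
 * Cadence sketch (illustrative; the config values below are hypothetical,
 * see DEFAULT_HYBRID_CONFIG for the real defaults): with
 *   { snapshotInterval: 100, maxDeltasBeforeSnapshot: 500,
 *     batchDeltas: true, deltaBatchSize: 10, compactionEnabled: true }
 * operations are persisted as deltas (flushed in batches of 10), every 100th
 * operation triggers saveSnapshot, and compaction then removes deltas whose
 * sequence is below the snapshot's.
 */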
class ResumabilityService extends Effect.Service()(
  "@jambudipa.io/ResumabilityService",
  {
    effect: Effect.gen(function* () {
      let strategy = null;
      let backend = null;
      const service = {
        /**
         * Configure the resumability service with a specific strategy and backend.
         *
         * This method initializes the storage backend and creates the appropriate
         * persistence strategy based on the configuration.
         *
         * @param config - Resumability configuration
         * @returns Effect that completes when configuration is applied
         */
        configure: (config) => Effect.gen(function* () {
          backend = config.backend;
          yield* backend.initialize();
          strategy = yield* createStrategy(config);
        }),
        /**
         * Persist a state operation using the configured strategy.
         *
         * @param operation - State operation to persist
         * @returns Effect that completes when operation is persisted
         */
        persistOperation: (operation) => Effect.gen(function* () {
          if (!strategy) {
            return yield* Effect.fail(
              new PersistenceError2({
                message: "ResumabilityService not configured. Call configure() first.",
                operation: "persistOperation"
              })
            );
          }
          yield* strategy.persist(operation);
        }),
        /**
         * Restore spider state from persistent storage.
         *
         * @param key - State key identifying the session to restore
         * @returns Effect containing the restored state, or null if not found
         */
        restore: (key) => Effect.gen(function* () {
          if (!strategy) {
            return yield* Effect.fail(
              new PersistenceError2({
                message: "ResumabilityService not configured. Call configure() first.",
                operation: "restore"
              })
            );
          }
          return yield* strategy.restore(key);
        }),
        /**
         * Clean up old state data for a session.
         *
         * @param key - State key identifying the session to clean up
         * @returns Effect that completes when cleanup is finished
         */
        cleanup: (key) => Effect.gen(function* () {
          if (!strategy) {
            return yield* Effect.fail(
              new PersistenceError2({
                message: "ResumabilityService not configured. Call configure() first.",
                operation: "cleanup"
              })
            );
          }
          yield* strategy.cleanup(key);
        }),
        /**
         * List all available sessions in storage.
         *
         * @returns Effect containing array of session keys
         */
        listSessions: () => Effect.gen(function* () {
          if (!backend) {
            return yield* Effect.fail(
              new PersistenceError2({
                message: "ResumabilityService not configured. Call configure() first.",
                operation: "listSessions"
              })
            );
          }
          if (!backend.listSessions) {
            return yield* Effect.fail(
              new PersistenceError2({
                message: `Backend ${backend.name} does not support listing sessions`,
                operation: "listSessions"
              })
            );
          }
          return yield* backend.listSessions();
        }),
        /**
         * Get information about the current configuration.
         *
         * @returns Information about strategy and backend
         */
        getInfo: () => Effect.gen(function* () {
          if (!strategy || !backend) {
            return yield* Effect.fail(
              new PersistenceError2({
                message: "ResumabilityService not configured. Call configure() first.",
                operation: "getInfo"
              })
            );
          }
          return {
            strategy: strategy.getInfo(),
            backend: {
              name: backend.name,
              capabilities: backend.capabilities
            }
          };
        }),
        /**
         * Reconfigure the service with new settings.
         *
         * This will clean up the current backend and reinitialize with new config.
         *
         * @param config - New configuration
         * @returns Effect that completes when reconfiguration is finished
         */
        reconfigure: (config) => Effect.gen(function* () {
          if (backend) {
            yield* backend.cleanup();
          }
          yield* service.configure(config);
        })
      };
      return service;
    })
  }
) {
  /**
   * Create a ResumabilityService layer from configuration.
   *
   * This is the primary way to create and configure the ResumabilityService.
   *
   * @param config - Resumability configuration
   * @returns Effect layer providing the configured ResumabilityService
   */
  static fromConfig = (config) => Effect.gen(function* () {
    const service = yield* ResumabilityService;
    yield* service.configure(config);
    return service;
  }).pipe(Effect.provide(ResumabilityService.Default));
}
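/*
 * Usage sketch (illustrative; the key value is hypothetical): configure the
 * service from a config and restore a previous crawl in one step.
 *
 * const program = Effect.gen(function* () {
 *   const resumability = yield* ResumabilityService.fromConfig(
 *     ResumabilityConfigs.file("./state")
 *   );
 *   const restored = yield* resumability.restore({ id: "my-crawl" });
 *   // restored is the persisted SpiderState, or null on a first run
 * });
 */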
const createStrategy = (config) => Effect.gen(function* () {
  const { strategy: strategyType, backend, hybridConfig } = config;
  switch (strategyType) {
    case "full-state":
      return new FullStatePersistence(backend);
    case "delta":
      return new DeltaPersistence(backend);
    case "hybrid":
      return new HybridPersistence(
        backend,
        hybridConfig || DEFAULT_HYBRID_CONFIG
      );
    case "auto":
      const capabilities = backend.capabilities;
      if (capabilities.supportsDelta && capabilities.supportsSnapshot) {
        return new HybridPersistence(
          backend,
          hybridConfig || DEFAULT_HYBRID_CONFIG
        );
      } else if (capabilities.supportsDelta) {
        return new DeltaPersistence(backend);
      } else {
        return new FullStatePersistence(backend);
      }
    default:
      return yield* Effect.fail(
        new PersistenceError2({
          message: `Unknown strategy type: ${strategyType}`,
          operation: "createStrategy"
        })
      );
  }
});
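/*
 * 'auto' selection, restated (follows directly from the branches above):
 *   supportsDelta && supportsSnapshot -> HybridPersistence
 *   supportsDelta only                -> DeltaPersistence
 *   otherwise                         -> FullStatePersistence
 */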
const createStateOperation = (delta, resultingState, shouldSnapshot = false) => ({
  delta,
  resultingState,
  shouldSnapshot
});
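/*
 * Usage sketch (illustrative; field values are hypothetical, the delta shape
 * follows the operations handled by the strategies above):
 *
 * const op = createStateOperation(
 *   { stateKey: "my-crawl", sequence: 42, operation: { type: "mark_visited", fingerprint: "fp-a" } },
 *   currentState,
 *   true // force a snapshot for this operation
 * );
 * // yield* resumability.persistOperation(op);
 */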
const ResumabilityConfigs = {
  /**
   * Create a file-based configuration.
   *
   * @param baseDir - Directory to store state files
   * @param strategy - Persistence strategy (defaults to 'auto')
   * @returns ResumabilityConfig
   */
  file: (baseDir, strategy = "auto") => ({
    strategy,
    backend: new (require("./backends/FileStorageBackend.js")).FileStorageBackend(
      baseDir
    )
  }),
  /**
   * Create a Redis-based configuration.
   *
   * @param redisClient - Redis client instance
   * @param strategy - Persistence strategy (defaults to 'hybrid')
   * @param keyPrefix - Redis key prefix (defaults to 'spider')
   * @returns ResumabilityConfig
   */
  redis: (redisClient, strategy = "hybrid", keyPrefix = "spider") => ({
    strategy,
    backend: new (require("./backends/RedisStorageBackend.js")).RedisStorageBackend(
      redisClient,
      keyPrefix
    )
  }),
  /**
   * Create a PostgreSQL-based configuration.
   *
   * @param dbClient - Database client instance
   * @param strategy - Persistence strategy (defaults to 'hybrid')
   * @param config - PostgreSQL configuration
   * @returns ResumabilityConfig
   */
  postgres: (dbClient, strategy = "hybrid", config) => ({
    strategy,
    backend: new (require("./backends/PostgresStorageBackend.js")).PostgresStorageBackend(
      dbClient,
      config
    )
  })
};
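/*
 * Config sketch (illustrative; constructing redisClient is out of scope
 * here): the three factories differ only in the backend they construct.
 *
 * const config = ResumabilityConfigs.redis(redisClient, "hybrid", "spider");
 * // yield* ResumabilityService.fromConfig(config);
 */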
class FileStorageBackend {
  constructor(baseDir) {
    this.baseDir = baseDir;
  }
  capabilities = {
    supportsDelta: true,
    supportsSnapshot: true,
    supportsStreaming: false,
    supportsConcurrency: false,
    // File system isn't great for concurrent access
    latency: "low"
  };
  name = "FileStorageBackend";
  initialize = () => {
    const self = this;
    return Effect.gen(function* () {
      yield* Effect.tryPromise({
        try: () => fs$1.mkdir(self.baseDir, { recursive: true }),
        catch: (error) => new PersistenceError2({
          message: `Failed to initialize file storage: ${error}`,
          cause: error,
          operation: "initialize"
        })
      });
      yield* Effect.tryPromise({
        try: () => fs$1.mkdir(path.join(self.baseDir, "sessions"), { recursive: true }),
        catch: (error) => new PersistenceError2({
          message: `Failed to initialize file storage: ${error}`,
          cause: error,
          operation: "initialize"
        })
      });
    });
  };
  cleanup = () => Effect.succeed(void 0); // No cleanup needed for file backend
  // Full state operations
  saveState = (key, state) => {
    const self = this;
    return Effect.gen(function* () {
      const sessionDir = self.getSessionDir(key);
      const statePath = path.join(sessionDir, "state.json");
      yield* Effect.tryPromise({
        try: () => fs$1.mkdir(sessionDir, { recursive: true }),
        catch: (error) => new PersistenceError2({
          message: `Failed to create session directory: ${error}`,
          cause: error,
          operation: "saveState"
        })
      });
      const encoded = Schema.encodeSync(SpiderState)(state);
      yield* Effect.tryPromise({
        try: () => fs$1.writeFile(statePath, JSON.stringify(encoded, null, 2), "utf8"),
        catch: (error) => new PersistenceError2({
          message: `Failed to save state: ${error}`,
          cause: error,
          operation: "saveState"
        })
      });
    });
  };
  loadState = (key) => {
    const self = this;
    return Effect.gen(function* () {
      const sessionDir = self.getSessionDir(key);
      const statePath = path.join(sessionDir, "state.json");
      const result = yield* Effect.tryPromise(
        () => fs$1.readFile(statePath, "utf8")
      ).pipe(
        Effect.catchAll((error) => {
          if (error.code === "ENOENT") {
            return Effect.succeed(null);
          }
          return Effect.fail(
            new PersistenceError2({
              message: `Failed to load state: ${error}`,
              cause: error,
              operation: "loadState"
            })
          );
        })
      );
      if (result === null) {
        return null;
      }
      try {
        const parsed = JSON.parse(result);
        const decoded = Schema.decodeUnknownSync(SpiderState)(parsed);
        return decoded;
      } catch (error) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Failed to parse state: ${error}`,
            cause: error,
            operation: "loadState"
          })
        );
      }
    });
  };
  deleteState = (key) => {
    const self = this;
    return Effect.gen(function* () {
      const sessionDir = self.getSessionDir(key);
      yield* Effect.tryPromise({
        try: () => fs$1.rm(sessionDir, { recursive: true, force: true }),
        catch: (error) => new PersistenceError2({
          message: `Failed to delete state: ${error}`,
          cause: error,
          operation: "deleteState"
        })
      });
    });
  };
  // Delta operations
  saveDelta = (delta) => {
    const self = this;
    return Effect.gen(function* () {
      const sessionDir = path.join(self.baseDir, "sessions", delta.stateKey);
      const deltasDir = path.join(sessionDir, "deltas");
      const deltaPath = path.join(
        deltasDir,
        `${delta.sequence.toString().padStart(6, "0")}.json`
      );
      yield* Effect.tryPromise({
        try: () => fs$1.mkdir(deltasDir, { recursive: true }),
        catch: (error) => new PersistenceError2({
          message: `Failed to create deltas directory: ${error}`,
          cause: error,
          operation: "saveDelta"
        })
      });
      const encoded = Schema.encodeSync(StateDelta)(delta);
      yield* Effect.tryPromise({
        try: () => fs$1.writeFile(deltaPath, JSON.stringify(encoded, null, 2), "utf8"),
        catch: (error) => new PersistenceError2({
          message: `Failed to save delta: ${error}`,
          cause: error,
          operation: "saveDelta"
        })
      });
    });
  };
  saveDeltas = (deltas) => {
    const self = this;
    return Effect.gen(function* () {
      for (const delta of deltas) {
        yield* self.saveDelta(delta);
      }
    });
  };
  loadDeltas = (key, fromSequence = 0) => {
    const self = this;
    return Effect.gen(function* () {
      const deltasDir = path.join(self.getSessionDir(key), "deltas");
      const files = yield* Effect.tryPromise(() => fs$1.readdir(deltasDir)).pipe(
        Effect.catchAll((error) => {
          if (error.code === "ENOENT") {
            return Effect.succeed([]);
          }
          return Effect.fail(
            new PersistenceError2({
              message: `Failed to read deltas directory: ${error}`,
              cause: error,
              operation: "loadDeltas"
            })
          );
        })
      );
      if (files.length === 0) {
        return [];
      }
      const deltaFiles = files.filter((f) => f.endsWith(".json")).map((f) => ({
        file: f,
        sequence: parseInt(f.replace(".json", ""), 10)
      })).filter(({ sequence }) => sequence >= fromSequence).sort((a, b) => a.sequence - b.sequence);
      const deltas = [];
      for (const { file } of deltaFiles) {
        const content = yield* Effect.tryPromise({
          try: () => fs$1.readFile(path.join(deltasDir, file), "utf8"),
          catch: (error) => new PersistenceError2({
            message: `Failed to read delta file ${file}: ${error}`,
            cause: error,
            operation: "loadDeltas"
          })
        });
        try {
          const parsed = JSON.parse(content);
          const decoded = Schema.decodeUnknownSync(StateDelta)(parsed);
          deltas.push(decoded);
        } catch (error) {
          return yield* Effect.fail(
            new PersistenceError2({
              message: `Failed to parse delta file ${file}: ${error}`,
              cause: error,
              operation: "loadDeltas"
            })
          );
        }
      }
      return deltas;
    });
  };
  // Snapshot operations
  saveSnapshot = (key, state, sequence) => {
    const self = this;
    return Effect.gen(function* () {
      const sessionDir = self.getSessionDir(key);
      const snapshotPath = path.join(sessionDir, "snapshot.json");
      yield* Effect.tryPromise({
        try: () => fs$1.mkdir(sessionDir, { recursive: true }),
        catch: (error) => new PersistenceError2({
          message: `Failed to create session directory: ${error}`,
          cause: error,
          operation: "saveSnapshot"
        })
      });
      const snapshotData = {
        state: Schema.encodeSync(SpiderState)(state),
        sequence,
        timestamp: (/* @__PURE__ */ new Date()).toISOString()
      };
      yield* Effect.tryPromise({
        try: () => fs$1.writeFile(
          snapshotPath,
          JSON.stringify(snapshotData, null, 2),
          "utf8"
        ),
        catch: (error) => new PersistenceError2({
          message: `Failed to save snapshot: ${error}`,
          cause: error,
          operation: "saveSnapshot"
        })
      });
    });
  };
  loadLatestSnapshot = (key) => {
    const self = this;
    return Effect.gen(function* () {
      const sessionDir = self.getSessionDir(key);
      const snapshotPath = path.join(sessionDir, "snapshot.json");
      const content = yield* Effect.tryPromise(
        () => fs$1.readFile(snapshotPath, "utf8")
      ).pipe(
        Effect.catchAll((error) => {
          if (error.code === "ENOENT") {
            return Effect.succeed(null);
          }
          return Effect.fail(
            new PersistenceError2({
              message: `Failed to load snapshot: ${error}`,
              cause: error,
              operation: "loadLatestSnapshot"
            })
          );
        })
      );
      if (content === null) {
        return null;
      }
      try {
        const parsed = JSON.parse(content);
        const state = Schema.decodeUnknownSync(SpiderState)(parsed.state);
        return {
          state,
          sequence: Number(parsed.sequence)
        };
      } catch (error) {
        return yield* Effect.fail(
          new PersistenceError2({
            message: `Failed to parse snapshot: ${error}`,
            cause: error,
            operation: "loadLatestSnapshot"
          })
        );
      }
    });
  };
  // Cleanup operations
  compactDeltas = (key, beforeSequence) => {
    const self = this;
    return Effect.gen(function* () {
      const deltasDir = path.join(self.getSessionDir(key), "deltas");
      const files = yield* Effect.tryPromise(() => fs$1.readdir(deltasDir)).pipe(
        Effect.catchAll((error) => {
          if (error.code === "ENOENT") {
            return Effect.succeed([]);
          }
          return Effect.fail(
            new PersistenceError2({
              message: `Failed to read deltas directory: ${error}`,
              cause: error,
              operation: "compactDeltas"
            })
          );
        })
      );
      if (files.length === 0) {
        return;
      }
      const deltaFiles = files.filter((f) => f.endsWith(".json")).map((f) => ({
        file: f,
        sequence: parseInt(f.replace(".json", ""), 10)
      })).filter(({ sequence }) => sequence < beforeSequence);
      for (const { file } of deltaFiles) {
        yield* Effect.tryPromise({
          try: () => fs$1.unlink(path.join(deltasDir, file)),
          catch: (error) => new PersistenceError2({
            message: `Failed to delete delta file ${file}: ${error}`,
            cause: error,
            operation: "compactDeltas"
          })
        });
      }
    });
  };
  listSessions = () => {
    const self = this;
    return Effect.gen(function* () {
      const sessionsDir = path.join(self.baseDir, "sessions");
      const dirs = yield* Effect.tryPromise(() => fs$1.readdir(sessionsDir)).pipe(
        Effect.catchAll((error) => {
          if (error.code === "ENOENT") {
            return Effect.succeed([]);
          }
          return Effect.fail(
            new PersistenceError2({
              message: `Failed to read sessions directory: ${error}`,
              cause: error,
              operation: "listSessions"
            })
          );
        })
      );
      if (dirs.length === 0) {
        return [];
      }
      const sessions = [];
      for (const dir of dirs) {
        const sessionDir = path.join(sessionsDir, dir);
        const statePath = path.join(sessionDir, "state.json");
        const content = yield* Effect.tryPromise(
          () => fs$1.readFile(statePath, "utf8")
        ).pipe(Effect.catchAll(() => Effect.succeed(null)));
        if (content === null) {
          continue;
        }
        try {
          const parsed = JSON.parse(content);
          Schema.decodeUnknownSync(SpiderState)(parsed);
          sessions.push({ id: dir, name: dir, timestamp: /* @__PURE__ */ new Date() });
        } catch {
          continue;
        }
      }
      return sessions;
    });
  };
  getSessionDir = (key) => {
    return path.join(this.baseDir, "sessions", key.id);
  };
}
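/*
 * On-disk layout implied by the code above (<key.id> is the state key's id;
 * saveDelta keys the directory by delta.stateKey):
 *   <baseDir>/sessions/<key.id>/state.json          full state (saveState)
 *   <baseDir>/sessions/<key.id>/snapshot.json       latest snapshot + sequence
 *   <baseDir>/sessions/<key.id>/deltas/000042.json  one file per delta; names
 *     are zero-padded to six digits so lexicographic order tracks sequence
 */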
class CookieManager extends Context.Tag("CookieManager")() {
}
const makeCookieManager = () => Effect.gen(function* () {
  const jar = new CookieJar();
  const jarRef = yield* Ref.make(jar);
  return {
    setCookie: (cookieString, url) => Effect.gen(function* () {
      const jar2 = yield* Ref.get(jarRef);
      yield* Effect.tryPromise({
        try: () => new Promise((resolve, reject) => {
          jar2.setCookie(cookieString, url, (err) => {
            if (err) reject(err);
            else resolve();
          });
        }),
        catch: (error) => new Error(`Failed to set cookie: ${error}`)
      });
    }),
    getCookies: (url) => Effect.gen(function* () {
      const jar2 = yield* Ref.get(jarRef);
      const cookies = yield* Effect.tryPromise({
        try: () => new Promise((resolve, reject) => {
          jar2.getCookies(url, (err, cookies2) => {
            if (err) reject(err);
            else resolve(cookies2 || []);
          });
        }),
        catch: () => new Error(`Failed to get cookies for ${url}`)
      });
      return cookies.map((cookie) => cookie.toString());
    }).pipe(Effect.orElseSucceed(() => [])),
    getCookieHeader: (url) => Effect.gen(function* () {
      const jar2 = yield* Ref.get(jarRef);
      const cookieHeader = yield* Effect.tryPromise({
        try: () => new Promise((resolve, reject) => {
          jar2.getCookieString(url, (err, cookies) => {
            if (err) reject(err);
            else resolve(cookies || null);
          });
        }),
        catch: () => null
      });
      return cookieHeader;
    }).pipe(Effect.orElseSucceed(() => null)),
    clearCookies: () => Effect.gen(function* () {
      const newJar = new CookieJar();
      yield* Ref.set(jarRef, newJar);
    }),
    serialize: () => Effect.gen(function* () {
      const jar2 = yield* Ref.get(jarRef);
      const serialized = yield* Effect.tryPromise({
        try: () => new Promise((resolve, reject) => {
          jar2.serialize((err, serializedObject) => {
            if (err) reject(err);
            else resolve(serializedObject);
          });
        }),
        catch: () => new Error("Failed to serialize cookies")
      });
      return JSON.stringify(serialized);
    }).pipe(Effect.orElseSucceed(() => "{}")),
    deserialize: (data) => Effect.gen(function* () {
      try {
        const parsed = JSON.parse(data);
        const newJar = CookieJar.deserialize(parsed);
        yield* Effect.tryPromise({
          try: () => Promise.resolve(newJar),
          catch: () => new Error("Failed to deserialize cookie jar")
        }).pipe(Effect.flatMap((jar2) => Ref.set(jarRef, jar2)));
      } catch (error) {
        yield* Effect.fail(new Error(`Invalid cookie data: ${error}`));
      }
    })
  };
});
const CookieManagerLive = Layer.effect(
  CookieManager,
  makeCookieManager()
);
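/*
 * Usage sketch (illustrative; cookie and URL values hypothetical): the jar is
 * held in a Ref, and read paths fall back to safe defaults via
 * Effect.orElseSucceed.
 *
 * const program = Effect.gen(function* () {
 *   const cookies = yield* CookieManager;
 *   yield* cookies.setCookie("sid=abc123; Path=/", "https://example.com");
 *   const header = yield* cookies.getCookieHeader("https://example.com");
 *   // header is "sid=abc123", or null if nothing matched
 * }).pipe(Effect.provide(CookieManagerLive));
 */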
class EnhancedHttpClient extends Context.Tag("EnhancedHttpClient")() {
}
const makeEnhancedHttpClient = Effect.gen(function* () {
  const logger = yield* SpiderLogger;
  const cookieManager = yield* CookieManager;
  const makeRequest = (url, options = {}) => Effect.gen(function* () {
    const startMs = Date.now();
    const domain = new URL(url).hostname;
    const cookieHeader = yield* cookieManager.getCookieHeader(url);
    const headers = {
      "User-Agent": "Mozilla/5.0 (compatible; Spider/1.0)",
      ...options.headers
    };
    if (cookieHeader && !headers["Cookie"]) {
      headers["Cookie"] = cookieHeader;
    }
    if (options.method === "POST" && options.body && !headers["Content-Type"]) {
      if (typeof options.body === "string") {
        try {
          JSON.parse(options.body);
          headers["Content-Type"] = "application/json";
        } catch {
          headers["Content-Type"] = "application/x-www-form-urlencoded";
        }
      } else if (options.body instanceof FormData) {
        // intentionally left unset so fetch can supply the multipart boundary
      } else if (options.body instanceof URLSearchParams) {
        headers["Content-Type"] = "application/x-www-form-urlencoded";
      }
    }
    const controller = new AbortController();
    const timeoutMs = options.timeout || 3e4;
    const timeoutId = setTimeout(() => {
      const duration = Date.now() - startMs;
      Effect.runSync(
        logger.logEdgeCase(domain, "http_request_abort", {
          url,
          method: options.method || "GET",
          durationMs: duration,
          reason: "timeout",
          timeoutMs
        })
      );
      controller.abort();
    }, timeoutMs);
    const response = yield* Effect.tryPromise({
      try: async () => {
        const resp = await fetch(url, {
          method: options.method || "GET",
          headers,
          body: options.body,
          signal: controller.signal,
          redirect: options.followRedirects === false ? "manual" : "follow",
          credentials: options.credentials || "same-origin"
        });
        clearTimeout(timeoutId);
        return resp;
      },
      catch: (error) => {
        clearTimeout(timeoutId);
        return NetworkError.fromCause(url, error);
      }
    });
    const body = yield* Effect.tryPromise({
      try: () => response.text(),
      catch: (error) => ResponseError.fromCause(url, error)
    });
    const setCookieHeaders = response.headers.getSetCookie ? response.headers.getSetCookie() : response.headers.get("set-cookie")?.split(", ") || [];
    for (const cookieString of setCookieHeaders) {
      if (cookieString) {
        yield* cookieManager.setCookie(cookieString, url).pipe(Effect.catchAll(() => Effect.void));
      }
    }
    const responseHeaders = {};
    response.headers.forEach((value, key) => {
      responseHeaders[key] = value;
    });
    return {
      url: response.url,
      status: response.status,
      statusText: response.statusText,
      headers: responseHeaders,
      body,
      cookies: setCookieHeaders
    };
  });
  return {
    get: (url, options) => makeRequest(url, { ...options, method: "GET" }),
    post: (url, data, options) => Effect.gen(function* () {
      let body;
      if (data) {
        if (typeof data === "string" || data instanceof FormData || data instanceof URLSearchParams) {
          body = data;
        } else {
          body = JSON.stringify(data);
        }
      }
      return yield* makeRequest(url, { ...options, method: "POST", body });
    }),
    request: makeRequest,
    submitForm: (url, formData, options) => Effect.gen(function* () {
      const params = new URLSearchParams();
      for (const [key, value] of Object.entries(formData)) {
        params.append(key, value);
      }
      return yield* makeRequest(url, {
        ...options,
        method: "POST",
        body: params,
        headers: {
          "Content-Type": "application/x-www-form-urlencoded",
          ...options?.headers
        }
      });
    })
  };
});
const EnhancedHttpClientLive = Layer.effect(
  EnhancedHttpClient,
  makeEnhancedHttpClient
);
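/*
 * Usage sketch (illustrative; URL and form fields hypothetical): Set-Cookie
 * headers on every response are fed back into the shared CookieManager, so a
 * login carries over to later requests.
 *
 * const program = Effect.gen(function* () {
 *   const http = yield* EnhancedHttpClient;
 *   const result = yield* http.submitForm("https://example.com/login", {
 *     username: "user",
 *     password: "pass"
 *   });
 *   // result.status, result.body, result.cookies
 * });
 * // EnhancedHttpClientLive additionally requires SpiderLogger and
 * // CookieManager in its environment.
 */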
class SessionStore extends Context.Tag("SessionStore")() {
}
const makeSessionStore = Effect.gen(function* () {
  const cookieManager = yield* CookieManager;
  const sessions = yield* Ref.make(/* @__PURE__ */ new Map());
  const currentSessionId = yield* Ref.make(
    Option.none()
  );
  const generateSessionId = () => `session_${Date.now()}_${Math.random().toString(36).substring(2, 9)}`;
  return {
    createSession: (id) => Effect.gen(function* () {
      const sessionId = id || generateSessionId();
      const cookiesString = yield* cookieManager.serialize();
      const session = {
        id: sessionId,
        cookies: cookiesString,
        tokens: /* @__PURE__ */ new Map(),
        createdAt: /* @__PURE__ */ new Date(),
        lastUsedAt: /* @__PURE__ */ new Date(),
        expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1e3) // 24 hours
      };
      const sessionsMap = yield* Ref.get(sessions);
      sessionsMap.set(sessionId, session);
      yield* Ref.set(sessions, sessionsMap);
      yield* Ref.set(currentSessionId, Option.some(sessionId));
      return session;
    }),
    getCurrentSession: () => Effect.gen(function* () {
      const sessionId = yield* Ref.get(currentSessionId);
      if (Option.isNone(sessionId)) {
        return Option.none();
      }
      const sessionsMap = yield* Ref.get(sessions);
      const session = sessionsMap.get(sessionId.value);
      if (!session) {
        return Option.none();
      }
      session.lastUsedAt = /* @__PURE__ */ new Date();
      sessionsMap.set(sessionId.value, session);
      yield* Ref.set(sessions, sessionsMap);
      return Option.some(session);
    }),
    loadSession: (id) => Effect.gen(function* () {
      const sessionsMap = yield* Ref.get(sessions);
      const session = sessionsMap.get(id);
      if (!session) {
        return yield* Effect.fail(new Error(`Session ${id} not found`));
      }
      if (session.expiresAt && session.expiresAt < /* @__PURE__ */ new Date()) {
        return yield* Effect.fail(new Error(`Session ${id} has expired`));
      }
      yield* cookieManager.deserialize(session.cookies);
      yield* Ref.set(currentSessionId, Option.some(id));
      session.lastUsedAt = /* @__PURE__ */ new Date();
      sessionsMap.set(id, session);
      yield* Ref.set(sessions, sessionsMap);
    }),
    saveSession: () => Effect.gen(function* () {
      const sessionId = yield* Ref.get(currentSessionId);
      if (Option.isNone(sessionId)) {
        const newSession = yield* Effect.sync(() => generateSessionId());
        yield* Ref.set(currentSessionId, Option.some(newSession));
        const session2 = yield* Effect.succeed({
          id: newSession,
          cookies: yield* cookieManager.serialize(),
          tokens: /* @__PURE__ */ new Map(),
          createdAt: /* @__PURE__ */ new Date(),
          lastUsedAt: /* @__PURE__ */ new Date(),
          expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1e3)
        });
        const sessionsMap2 = yield* Ref.get(sessions);
        sessionsMap2.set(newSession, session2);
        yield* Ref.set(sessions, sessionsMap2);
        return newSession;
      }
      const sessionsMap = yield* Ref.get(sessions);
      const session = sessionsMap.get(sessionId.value);
      if (!session) {
        return yield* Effect.fail(new Error("No active session to save"));
      }
      session.cookies = yield* cookieManager.serialize();
      session.lastUsedAt = /* @__PURE__ */ new Date();
      sessionsMap.set(sessionId.value, session);
      yield* Ref.set(sessions, sessionsMap);
      return sessionId.value;
    }),
    clearSession: () => Effect.gen(function* () {
      const sessionId = yield* Ref.get(currentSessionId);
      if (Option.isSome(sessionId)) {
        const sessionsMap = yield* Ref.get(sessions);
        sessionsMap.delete(sessionId.value);
        yield* Ref.set(sessions, sessionsMap);
      }
      yield* Ref.set(currentSessionId, Option.none());
      yield* cookieManager.clearCookies();
    }),
    isSessionValid: () => Effect.gen(function* () {
      const session = yield* Effect.gen(function* () {
        const sessionId = yield* Ref.get(currentSessionId);
        if (Option.isNone(sessionId)) return null;
        const sessionsMap = yield* Ref.get(sessions);
        return sessionsMap.get(sessionId.value) || null;
      });
      if (!session) return false;
      if (session.expiresAt && session.expiresAt < /* @__PURE__ */ new Date()) {
        return false;
      }
      return true;
    }),
    updateSessionData: (data) => Effect.gen(function* () {
      const sessionId = yield* Ref.get(currentSessionId);
      if (Option.isNone(sessionId)) {
        return yield* Effect.fail(new Error("No active session"));
      }
      const sessionsMap = yield* Ref.get(sessions);
      const session = sessionsMap.get(sessionId.value);
      if (!session) {
        return yield* Effect.fail(new Error("Session not found"));
      }
      session.userData = { ...session.userData, ...data };
      session.lastUsedAt = /* @__PURE__ */ new Date();
      sessionsMap.set(sessionId.value, session);
      yield* Ref.set(sessions, sessionsMap);
    }),
    exportSession: () => Effect.gen(function* () {
      const sessionId = yield* Ref.get(currentSessionId);
      if (Option.isNone(sessionId)) {
        return yield* Effect.fail(new Error("No active session to export"));
      }
      const sessionsMap = yield* Ref.get(sessions);
      const session = sessionsMap.get(sessionId.value);
      if (!session) {
        return yield* Effect.fail(new Error("Session not found"));
      }
      const tokensArray = Array.from(session.tokens.entries());
      return JSON.stringify({
        ...session,
        tokens: tokensArray
      });
    }),
    importSession: (data) => Effect.gen(function* () {
      try {
        const parsed = JSON.parse(data);
        const session = {
          ...parsed,
          tokens: new Map(parsed.tokens || []),
          createdAt: new Date(parsed.createdAt),
          lastUsedAt: new Date(parsed.lastUsedAt),
          expiresAt: parsed.expiresAt ? new Date(parsed.expiresAt) : void 0
        };
        const sessionsMap = yield* Ref.get(sessions);
        sessionsMap.set(session.id, session);
        yield* Ref.set(sessions, sessionsMap);
        yield* Effect.gen(function* () {
          yield* cookieManager.deserialize(session.cookies);
          yield* Ref.set(currentSessionId, Option.some(session.id));
        });
      } catch (error) {
        yield* Effect.fail(new Error(`Invalid session data: ${error}`));
      }
    })
  };
});
const SessionStoreLive = Layer.effect(SessionStore, makeSessionStore);
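/*
 * Lifecycle sketch (illustrative; the session id is hypothetical): cookies
 * travel with the exported session via CookieManager.serialize().
 *
 * const program = Effect.gen(function* () {
 *   const store = yield* SessionStore;
 *   yield* store.createSession("checkout-bot");
 *   const exported = yield* store.exportSession(); // JSON string
 *   // ...later, possibly in a new process:
 *   yield* store.importSession(exported); // restores cookies as well
 * });
 */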
var TokenType = /* @__PURE__ */ ((TokenType2) => {
  TokenType2["CSRF"] = "csrf";
  TokenType2["API"] = "api";
  TokenType2["AUTH"] = "auth";
  TokenType2["REFRESH"] = "refresh";
  return TokenType2;
})(TokenType || {});
class StateManager extends Context.Tag("StateManager")() {
}
const makeStateManager = () => Effect.gen(function* () {
  const tokens = yield* Ref.make(/* @__PURE__ */ new Map());
  const localStorage = yield* Ref.make(/* @__PURE__ */ new Map());
  const sessionStorage = yield* Ref.make(/* @__PURE__ */ new Map());
  return {
    extractCSRFToken: (html) => Effect.gen(function* () {
      const $ = cheerio.load(html);
      const csrfSelectors = [
        'meta[name="csrf-token"]',
        'meta[name="_csrf"]',
        'meta[name="csrf_token"]',
        'meta[name="authenticity_token"]',
        'input[name="csrf_token"]',
        'input[name="_csrf"]',
        'input[name="authenticity_token"]',
        'input[name="__RequestVerificationToken"]'
      ];
      for (const selector of csrfSelectors) {
        const element = $(selector);
        if (element.length > 0) {
          const token = element.attr("content") || element.attr("value");
          if (token) {
            return token;
          }
        }
      }
      const scriptTags = $("script:not([src])");
      const scriptContent = scriptTags.map((_, el) => $(el).html()).get().join("\n");
      const patterns = [
        /window\.csrfToken\s*=\s*["']([^"']+)["']/,
        /csrf[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
        /_token["']?\s*[:=]\s*["']([^"']+)["']/,
        /authenticity_token["']?\s*[:=]\s*["']([^"']+)["']/,
        /X-CSRF-Token["']?\s*[:=]\s*["']([^"']+)["']/
      ];
      for (const pattern of patterns) {
        const match = scriptContent.match(pattern);
        if (match && match[1]) {
          return match[1];
        }
      }
      return yield* Effect.fail(new Error("CSRF token not found in HTML"));
    }),
    extractAPIToken: (scripts) => Effect.gen(function* () {
      const scriptContent = scripts.join("\n");
      const patterns = [
        /api[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i,
        /api[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
        /X-Secret-Token["']?\s*[:=]\s*["']([^"']+)["']/,
        /authorization["']?\s*[:=]\s*["']Bearer\s+([^"']+)["']/i,
        /access[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
        /secret[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i
      ];
      for (const pattern of patterns) {
        const match = scriptContent.match(pattern);
        if (match && match[1]) {
          return match[1];
        }
      }
      const windowPattern = /window\[["']([^"']*[Tt]oken[^"']*)["']\]\s*=\s*["']([^"']+)["']/g;
      let windowMatch;
      while ((windowMatch = windowPattern.exec(scriptContent)) !== null) {
        if (windowMatch[2]) {
          return windowMatch[2];
        }
      }
      return yield* Effect.fail(
        new Error("API token not found in scripts")
      );
    }),
    storeToken: (type, value, expiry) => Effect.gen(function* () {
      const token = {
        type,
        value,
        expiry
      };
      const tokensMap = yield* Ref.get(tokens);
      tokensMap.set(type, token);
      yield* Ref.set(tokens, tokensMap);
    }),
    getToken: (type) => Effect.gen(function* () {
      const tokensMap = yield* Ref.get(tokens);
      const token = tokensMap.get(type);
      if (!token) {
        return yield* Effect.fail(
          new Error(`Token of type ${type} not found`)
        );
      }
      if (token.expiry && token.expiry < /* @__PURE__ */ new Date()) {
        return yield* Effect.fail(
          new Error(`Token of type ${type} has expired`)
        );
      }
      return token.value;
    }),
    isTokenValid: (type) => Effect.gen(function* () {
      const tokensMap = yield* Ref.get(tokens);
      const token = tokensMap.get(type);
      if (!token) {
        return false;
      }
      if (token.expiry && token.expiry < /* @__PURE__ */ new Date()) {
        return false;
      }
      return true;
    }),
    setLocalStorage: (key, value) => Effect.gen(function* () {
      const storage = yield* Ref.get(localStorage);
      storage.set(key, value);
      yield* Ref.set(localStorage, storage);
    }),
    getLocalStorage: (key) => Effect.gen(function* () {
      const storage = yield* Ref.get(localStorage);
      const value = storage.get(key);
      if (!value) {
        return yield* Effect.fail(
          new Error(`Local storage key '${key}' not found`)
        );
      }
      return value;
    }),
    clearLocalStorage: () => Effect.gen(function* () {
      yield* Ref.set(localStorage, /* @__PURE__ */ new Map());
    }),
    setSessionStorage: (key, value) => Effect.gen(function* () {
      const storage = yield* Ref.get(sessionStorage);
      storage.set(key, value);
      yield* Ref.set(sessionStorage, storage);
    }),
    getSessionStorage: (key) => Effect.gen(function* () {
      const storage = yield* Ref.get(sessionStorage);
      const value = storage.get(key);
      if (!value) {
        return yield* Effect.fail(
          new Error(`Session storage key '${key}' not found`)
        );
      }
      return value;
    }),
    clearSessionStorage: () => Effect.gen(function* () {
      yield* Ref.set(sessionStorage, /* @__PURE__ */ new Map());
    }),
    clearState: () => Effect.gen(function* () {
      yield* Ref.set(tokens, /* @__PURE__ */ new Map());
      yield* Ref.set(localStorage, /* @__PURE__ */ new Map());
      yield* Ref.set(sessionStorage, /* @__PURE__ */ new Map());
    })
  };
});
const StateManagerLive = Layer.effect(StateManager, makeStateManager());
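/*
 * Usage sketch (illustrative; the HTML snippet is hypothetical but matches
 * the first selector above):
 *
 * const program = Effect.gen(function* () {
 *   const state = yield* StateManager;
 *   const token = yield* state.extractCSRFToken(
 *     '<meta name="csrf-token" content="abc123">'
 *   );
 *   // token === "abc123"; fails with "CSRF token not found in HTML"
 *   // when no selector or script pattern matches
 * }).pipe(Effect.provide(StateManagerLive));
 */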
|
|
4085
|
+
class TokenExtractor extends Context.Tag("TokenExtractor")() {
|
|
4086
|
+
}
|
|
4087
|
+
const makeTokenExtractor = Effect.gen(function* () {
|
|
4088
|
+
const stateManager = yield* StateManager;
|
|
4089
|
+
const httpClient = yield* EnhancedHttpClient;
|
|
4090
|
+
const logger = yield* SpiderLogger;
|
|
4091
|
+
const extractFromHTML = (html) => {
|
|
4092
|
+
const tokens = [];
|
|
4093
|
+
const $ = cheerio.load(html);
|
|
4094
|
+
const csrfSelectors = [
|
|
4095
|
+
{ selector: 'meta[name="csrf-token"]', attr: "content" },
|
|
4096
|
+
{ selector: 'meta[name="_csrf"]', attr: "content" },
|
|
4097
|
+
{ selector: 'meta[name="csrf_token"]', attr: "content" },
|
|
4098
|
+
{ selector: 'meta[name="authenticity_token"]', attr: "content" },
|
|
4099
|
+
{ selector: 'input[name="csrf_token"]', attr: "value" },
|
|
4100
|
+
{ selector: 'input[name="_csrf"]', attr: "value" },
|
|
4101
|
+
{ selector: 'input[name="authenticity_token"]', attr: "value" },
|
|
4102
|
+
{ selector: 'input[name="__RequestVerificationToken"]', attr: "value" }
|
|
4103
|
+
];
|
|
4104
|
+
for (const { selector, attr } of csrfSelectors) {
|
|
4105
|
+
const element = $(selector);
|
|
4106
|
+
if (element.length > 0) {
|
|
4107
|
+
const value = element.attr(attr);
|
|
4108
|
+
if (value) {
|
|
4109
|
+
tokens.push({
|
|
4110
|
+
type: TokenType.CSRF,
|
|
4111
|
+
value,
|
|
4112
|
+
source: "html",
|
|
4113
|
+
selector
|
|
4114
|
+
});
|
|
4115
|
+
}
|
|
4116
|
+
}
|
|
4117
|
+
}
|
|
4118
|
+
const apiSelectors = [
|
|
4119
|
+
{ selector: 'meta[name="api-key"]', attr: "content" },
|
|
4120
|
+
{ selector: 'meta[name="api_key"]', attr: "content" },
|
|
4121
|
+
{ selector: 'meta[name="api-token"]', attr: "content" },
|
|
4122
|
+
{ selector: 'meta[name="access-token"]', attr: "content" }
|
|
4123
|
+
];
|
|
4124
|
+
for (const { selector, attr } of apiSelectors) {
|
|
4125
|
+
const element = $(selector);
|
|
4126
|
+
if (element.length > 0) {
|
|
4127
|
+
const value = element.attr(attr);
|
|
4128
|
+
if (value) {
|
|
4129
|
+
tokens.push({
|
|
4130
|
+
type: TokenType.API,
|
|
4131
|
+
value,
|
|
4132
|
+
source: "html",
|
|
4133
|
+
selector
|
|
4134
|
+
});
|
|
4135
|
+
}
|
|
4136
|
+
}
|
|
4137
|
+
}
|
|
4138
|
+
return tokens;
|
|
4139
|
+
};
|
|
4140
|
+
  // Scan inline <script> blocks for CSRF-style and API-style token assignments.
  const extractFromScripts = (html) => {
    const tokens = [];
    const $ = cheerio.load(html);
    const scriptTags = $("script:not([src])");
    const scriptContent = scriptTags.map((_, el) => $(el).html()).get().join("\n");
    const csrfPatterns = [
      { pattern: /window\.csrfToken\s*=\s*["']([^"']+)["']/, name: "window.csrfToken" },
      { pattern: /csrf[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i, name: "csrf_token" },
      { pattern: /_token["']?\s*[:=]\s*["']([^"']+)["']/, name: "_token" },
      { pattern: /authenticity_token["']?\s*[:=]\s*["']([^"']+)["']/, name: "authenticity_token" },
      { pattern: /X-CSRF-Token["']?\s*[:=]\s*["']([^"']+)["']/, name: "X-CSRF-Token" }
    ];
    for (const { pattern, name } of csrfPatterns) {
      const match = scriptContent.match(pattern);
      if (match && match[1]) {
        tokens.push({ type: TokenType.CSRF, value: match[1], source: "script", pattern: name });
      }
    }
    const apiPatterns = [
      { pattern: /api[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i, name: "api_key" },
      { pattern: /api[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i, name: "api_token" },
      { pattern: /X-Secret-Token["']?\s*[:=]\s*["']([^"']+)["']/, name: "X-Secret-Token" },
      { pattern: /authorization["']?\s*[:=]\s*["']Bearer\s+([^"']+)["']/i, name: "authorization" },
      { pattern: /access[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i, name: "access_token" },
      { pattern: /secret[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i, name: "secret_key" }
    ];
    for (const { pattern, name } of apiPatterns) {
      const match = scriptContent.match(pattern);
      if (match && match[1]) {
        tokens.push({ type: TokenType.API, value: match[1], source: "script", pattern: name });
      }
    }
    // Catch-all for window["...token..."] = "..." style assignments.
    const windowPattern = /window\[["']([^"']*[Tt]oken[^"']*)["']\]\s*=\s*["']([^"']+)["']/g;
    let windowMatch;
    while ((windowMatch = windowPattern.exec(scriptContent)) !== null) {
      if (windowMatch[2]) {
        const keyLower = windowMatch[1].toLowerCase();
        const type = keyLower.includes("csrf") || keyLower.includes("authenticity") ? TokenType.CSRF : TokenType.API;
        tokens.push({ type, value: windowMatch[2], source: "script", pattern: `window['${windowMatch[1]}']` });
      }
    }
    return tokens;
  };
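  // --- Illustrative example (editor's sketch, not part of dist/index.js) ---
  // Shows what the inline-script patterns above match; the sample script text
  // and token value are invented.
  const exampleInlineScript = 'window.csrfToken = "demo-csrf-token";';
  const exampleCsrfMatch = exampleInlineScript.match(/window\.csrfToken\s*=\s*["']([^"']+)["']/);
  // exampleCsrfMatch[1] === "demo-csrf-token", so extractFromScripts would emit
  // { type: TokenType.CSRF, value: "demo-csrf-token", source: "script", pattern: "window.csrfToken" }.
  // --- end example ---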
  const extractFromHeaders = (headers) => {
    const tokens = [];
    const headerPatterns = [
      { header: "x-csrf-token", type: TokenType.CSRF },
      { header: "x-auth-token", type: TokenType.AUTH },
      { header: "x-api-key", type: TokenType.API },
      { header: "authorization", type: TokenType.AUTH },
      { header: "x-access-token", type: TokenType.AUTH }
    ];
    for (const { header, type } of headerPatterns) {
      const value = headers[header] || headers[header.toLowerCase()];
      if (value) {
        tokens.push({ type, value, source: "header", pattern: header });
      }
    }
    return tokens;
  };
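  // --- Illustrative example (editor's sketch, not part of dist/index.js) ---
  // A sample header map (values invented) and the tokens the loop above would
  // yield for it; note the authorization value is stored verbatim, including
  // the "Bearer " prefix.
  const exampleHeaders = { "x-csrf-token": "demo-csrf", authorization: "Bearer demo-auth" };
  const exampleHeaderTokens = extractFromHeaders(exampleHeaders);
  // exampleHeaderTokens: [{ type: TokenType.CSRF, value: "demo-csrf", source: "header", pattern: "x-csrf-token" },
  //                       { type: TokenType.AUTH, value: "Bearer demo-auth", source: "header", pattern: "authorization" }]
  // --- end example ---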
  const service = {
    // Extract tokens from body and headers, de-duplicate, persist each new
    // token with a 1 hour expiry, and log every find.
    extractTokensFromResponse: (response) => Effect.gen(function* () {
      const tokens = [];
      tokens.push(...extractFromHTML(response.body));
      tokens.push(...extractFromScripts(response.body));
      tokens.push(...extractFromHeaders(response.headers));
      const uniqueTokens = /* @__PURE__ */ new Map();
      for (const token of tokens) {
        const key = `${token.type}:${token.value}`;
        if (!uniqueTokens.has(key)) {
          uniqueTokens.set(key, token);
          yield* stateManager.storeToken(
            token.type,
            token.value,
            new Date(Date.now() + 36e5) // 1 hour expiry
          );
          yield* logger.logEdgeCase(new URL(response.url).hostname, "token_found", {
            type: token.type,
            source: token.source,
            pattern: token.pattern || token.selector
          });
        }
      }
      return Array.from(uniqueTokens.values());
    }),
    extractCSRFFromResponse: (response) => Effect.gen(function* () {
      const tokens = yield* Effect.succeed([
        ...extractFromHTML(response.body),
        ...extractFromScripts(response.body)
      ]);
      const csrfToken = tokens.find((t) => t.type === TokenType.CSRF);
      if (csrfToken) {
        yield* stateManager.storeToken(TokenType.CSRF, csrfToken.value, new Date(Date.now() + 36e5));
        return csrfToken.value;
      }
      return null;
    }),
    extractAPIFromResponse: (response) => Effect.gen(function* () {
      const tokens = yield* Effect.succeed([
        ...extractFromScripts(response.body),
        ...extractFromHeaders(response.headers)
      ]);
      const apiToken = tokens.find((t) => t.type === TokenType.API);
      if (apiToken) {
        yield* stateManager.storeToken(TokenType.API, apiToken.value, new Date(Date.now() + 36e5));
        return apiToken.value;
      }
      return null;
    }),
    // Attach stored CSRF/API tokens to a request, fetching a fresh CSRF token
    // from the site origin when the stored one is invalid, then check the
    // response for token rotation.
    authenticatedRequest: (url, options = {}) => Effect.gen(function* () {
      const headers = { ...options.customHeaders };
      if (options.requireCSRF) {
        const isValid = yield* stateManager.isTokenValid(TokenType.CSRF);
        if (!isValid) {
          const baseUrl = new URL(url).origin;
          const baseResponse = yield* httpClient.get(baseUrl);
          yield* Effect.succeed(extractFromHTML(baseResponse.body)).pipe(
            Effect.flatMap((tokens) => {
              const csrfToken2 = tokens.find((t) => t.type === TokenType.CSRF);
              if (csrfToken2) {
                return stateManager.storeToken(TokenType.CSRF, csrfToken2.value, new Date(Date.now() + 36e5));
              }
              return Effect.void;
            })
          );
        }
        const csrfToken = yield* stateManager.getToken(TokenType.CSRF).pipe(Effect.catchAll(() => Effect.succeed(null)));
        if (csrfToken) {
          headers["X-CSRF-Token"] = csrfToken;
          headers["X-Requested-With"] = "XMLHttpRequest";
        }
      }
      if (options.requireAPI) {
        const isValid = yield* stateManager.isTokenValid(TokenType.API);
        if (!isValid) {
          return yield* Effect.fail(new Error("API token not available or expired"));
        }
        const apiToken = yield* stateManager.getToken(TokenType.API);
        headers["Authorization"] = `Bearer ${apiToken}`;
        headers["X-API-Key"] = apiToken;
      }
      const response = yield* httpClient.request(url, { headers });
      if (options.requireCSRF) {
        const currentCSRF = yield* stateManager.getToken(TokenType.CSRF).pipe(Effect.catchAll(() => Effect.succeed("")));
        if (currentCSRF) {
          yield* service.detectTokenRotation(currentCSRF, response, TokenType.CSRF);
        }
      }
      if (options.requireAPI) {
        const currentAPI = yield* stateManager.getToken(TokenType.API).pipe(Effect.catchAll(() => Effect.succeed("")));
        if (currentAPI) {
          yield* service.detectTokenRotation(currentAPI, response, TokenType.API);
        }
      }
      return response;
    }),
    detectTokenRotation: (oldToken, response, type) => Effect.gen(function* () {
      const tokens = yield* Effect.succeed([
        ...extractFromHTML(response.body),
        ...extractFromScripts(response.body),
        ...extractFromHeaders(response.headers)
      ]);
      const newToken = tokens.find((t) => t.type === type && t.value !== oldToken);
      if (newToken) {
        yield* stateManager.storeToken(type, newToken.value, new Date(Date.now() + 36e5));
        yield* logger.logEdgeCase(new URL(response.url).hostname, "token_rotated", {
          type,
          oldToken: oldToken.substring(0, 8) + "...",
          newToken: newToken.value.substring(0, 8) + "..."
        });
        return true;
      }
      return false;
    }),
    refreshToken: (type, refreshUrl) => Effect.gen(function* () {
      if (!refreshUrl) {
        return yield* Effect.fail(new Error("No refresh URL provided"));
      }
      const response = yield* httpClient.get(refreshUrl);
      const tokens = yield* Effect.succeed([
        ...extractFromHTML(response.body),
        ...extractFromScripts(response.body),
        ...extractFromHeaders(response.headers)
      ]);
      const newToken = tokens.find((t) => t.type === type);
      if (!newToken) {
        return yield* Effect.fail(new Error(`Failed to refresh ${type} token`));
      }
      yield* stateManager.storeToken(type, newToken.value, new Date(Date.now() + 36e5));
      return newToken.value;
    })
  };
  return service;
});
const TokenExtractorLive = Layer.effect(
  TokenExtractor,
  makeTokenExtractor
);
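// --- Illustrative usage (editor's sketch, not part of dist/index.js) ---
// A minimal sketch of driving the TokenExtractor service, using the
// module-scope Effect and TokenExtractor bindings. The response shape
// ({ url, headers, body }) is inferred from how the service reads it, and the
// values are invented; running the program additionally requires
// TokenExtractorLive plus layers for its dependencies (EnhancedHttpClient,
// StateManager, SpiderLogger).
const exampleTokenProgram = Effect.gen(function* () {
  const extractor = yield* TokenExtractor;
  return yield* extractor.extractTokensFromResponse({
    url: "https://example.com/",
    headers: { "x-csrf-token": "demo-csrf" },
    body: '<meta name="api-key" content="demo-api-key">'
  });
});
// --- end example ---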
class WebScrapingEngine extends Context.Tag("WebScrapingEngine")() {
}
const makeWebScrapingEngine = Effect.gen(function* () {
  const httpClient = yield* EnhancedHttpClient;
  const cookieManager = yield* CookieManager;
  const sessionStore = yield* SessionStore;
  const tokenExtractor = yield* TokenExtractor;
  const stateManager = yield* StateManager;
  const logger = yield* SpiderLogger;
  yield* ScraperService; // required in the environment; the instance itself is unused here
  const service = {
    // Fetch the login page, pick up any CSRF token, submit the credential
    // form, then capture tokens and create a session.
    login: (credentials) => Effect.gen(function* () {
      const domain = new URL(credentials.loginUrl).hostname;
      yield* logger.logEdgeCase(domain, "login_start", {
        url: credentials.loginUrl,
        username: credentials.username
      });
      const loginPageResponse = yield* httpClient.get(credentials.loginUrl);
      const csrfToken = yield* tokenExtractor.extractCSRFFromResponse(loginPageResponse);
      const formData = {
        [credentials.usernameField || "username"]: credentials.username,
        [credentials.passwordField || "password"]: credentials.password,
        ...credentials.additionalFields
      };
      if (csrfToken) {
        const csrfFieldNames = ["csrf_token", "_csrf", "authenticity_token", "__RequestVerificationToken"];
        const csrfFieldName = csrfFieldNames.find(
          (name) => loginPageResponse.body.includes(`name="${name}"`)
        ) || "csrf_token";
        formData[csrfFieldName] = csrfToken;
        yield* logger.logEdgeCase(domain, "csrf_token_added", { field: csrfFieldName });
      }
      const loginResponse = yield* httpClient.submitForm(credentials.loginUrl, formData);
      const isAuthenticated = loginResponse.status === 200 || loginResponse.status === 302 || loginResponse.headers["location"] !== void 0;
      if (!isAuthenticated) {
        return yield* Effect.fail(new Error(`Login failed with status ${loginResponse.status}`));
      }
      yield* tokenExtractor.extractTokensFromResponse(loginResponse);
      const session = yield* sessionStore.createSession();
      yield* sessionStore.updateSessionData({
        authenticated: true,
        username: credentials.username,
        loginTime: /* @__PURE__ */ new Date()
      });
      const tokens = /* @__PURE__ */ new Map();
      for (const type of [TokenType.CSRF, TokenType.API, TokenType.AUTH]) {
        const token = yield* stateManager.getToken(type).pipe(Effect.catchAll(() => Effect.succeed(null)));
        if (token) {
          tokens.set(type, token);
        }
      }
      yield* logger.logEdgeCase(domain, "login_success", {
        sessionId: session.id,
        tokensFound: Array.from(tokens.keys())
      });
      return {
        id: session.id,
        authenticated: true,
        tokens,
        startTime: /* @__PURE__ */ new Date()
      };
    }),
    fetchAuthenticated: (url) => Effect.gen(function* () {
      const isValid = yield* sessionStore.isSessionValid();
      if (!isValid) {
        return yield* Effect.fail(new Error("No valid session. Please login first."));
      }
      return yield* httpClient.get(url);
    }),
    submitFormWithCSRF: (url, formData, csrfUrl) => Effect.gen(function* () {
      const domain = new URL(url).hostname;
      let csrfToken = null;
      const isValid = yield* stateManager.isTokenValid(TokenType.CSRF);
      if (!isValid && csrfUrl) {
        const csrfResponse = yield* httpClient.get(csrfUrl);
        csrfToken = yield* tokenExtractor.extractCSRFFromResponse(csrfResponse);
      } else if (isValid) {
        csrfToken = yield* stateManager.getToken(TokenType.CSRF).pipe(Effect.catchAll(() => Effect.succeed(null)));
      }
      if (!csrfToken && !csrfUrl) {
        const formPageResponse = yield* httpClient.get(url);
        csrfToken = yield* tokenExtractor.extractCSRFFromResponse(formPageResponse);
      }
      const enhancedFormData = { ...formData };
      if (csrfToken) {
        const csrfFieldNames = ["csrf_token", "_csrf", "authenticity_token", "__RequestVerificationToken"];
        const csrfFieldName = csrfFieldNames[0];
        enhancedFormData[csrfFieldName] = csrfToken;
        yield* logger.logEdgeCase(domain, "csrf_protected_form", { url, csrfField: csrfFieldName });
      }
      const response = yield* httpClient.submitForm(url, enhancedFormData);
      if (csrfToken) {
        yield* tokenExtractor.detectTokenRotation(csrfToken, response, TokenType.CSRF);
      }
      return response;
    }),
    makeAPIRequest: (url, method = "GET", data) => Effect.gen(function* () {
      const response = yield* tokenExtractor.authenticatedRequest(url, {
        requireAPI: true,
        customHeaders: {
          "Content-Type": "application/json",
          Accept: "application/json"
        }
      }).pipe(
        Effect.catchAll((error) => {
          if (method === "GET") {
            return httpClient.get(url);
          } else {
            return httpClient.post(url, data);
          }
        })
      );
      return response;
    }),
    createSession: (id) => Effect.gen(function* () {
      const session = yield* sessionStore.createSession(id);
      const tokens = /* @__PURE__ */ new Map();
      for (const type of [TokenType.CSRF, TokenType.API, TokenType.AUTH]) {
        const token = yield* stateManager.getToken(type).pipe(Effect.catchAll(() => Effect.succeed(null)));
        if (token) {
          tokens.set(type, token);
        }
      }
      return {
        id: session.id,
        authenticated: false,
        tokens,
        startTime: session.createdAt
      };
    }),
    loadSession: (id) => Effect.gen(function* () {
      yield* sessionStore.loadSession(id);
      const session = yield* sessionStore.getCurrentSession();
      if (session._tag === "None") {
        return yield* Effect.fail(new Error("Failed to load session"));
      }
      const tokens = /* @__PURE__ */ new Map();
      for (const type of [TokenType.CSRF, TokenType.API, TokenType.AUTH]) {
        const token = yield* stateManager.getToken(type).pipe(Effect.catchAll(() => Effect.succeed(null)));
        if (token) {
          tokens.set(type, token);
        }
      }
      return {
        id: session.value.id,
        authenticated: session.value.userData?.authenticated || false,
        tokens,
        startTime: session.value.createdAt
      };
    }),
    exportSession: () => sessionStore.exportSession(),
    importSession: (data) => sessionStore.importSession(data),
    clearAll: () => Effect.gen(function* () {
      yield* sessionStore.clearSession();
      yield* cookieManager.clearCookies();
      yield* stateManager.clearState();
    })
  };
  return service;
});
const WebScrapingEngineLive = Layer.effect(
  WebScrapingEngine,
  makeWebScrapingEngine
);
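// --- Illustrative usage (editor's sketch, not part of dist/index.js) ---
// Sketches the engine's login-then-fetch flow using the module-scope Effect
// and WebScrapingEngine bindings; the URL and credentials are invented, and
// running the program requires WebScrapingEngineLive plus the layers it
// depends on.
const exampleLoginProgram = Effect.gen(function* () {
  const engine = yield* WebScrapingEngine;
  yield* engine.login({
    loginUrl: "https://example.com/login",
    username: "demo-user",
    password: "demo-pass"
  });
  return yield* engine.fetchAuthenticated("https://example.com/account");
});
// --- end example ---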
export {
  ConfigurationError,
  CookieManager,
  CookieManagerLive,
  DEFAULT_HYBRID_CONFIG,
  DeltaPersistence,
  EnhancedHttpClient,
  EnhancedHttpClientLive,
  FileStorageBackend,
  FileSystemError,
  FullStatePersistence,
  HybridPersistence,
  LinkExtractionError,
  LinkExtractorService,
  LinkExtractorServiceLayer,
  LoggingMiddleware,
  MiddlewareError,
  MiddlewareManager,
  NetworkError,
  PageDataSchema,
  PersistenceError$1 as PersistenceError,
  PriorityRequest,
  RateLimitMiddleware,
  ResponseError,
  ResumabilityConfigs,
  PersistenceError2 as ResumabilityError,
  ResumabilityService,
  RobotsService,
  RobotsTxtError,
  ScraperService,
  SessionStore,
  SessionStoreLive,
  SpiderConfig,
  SpiderLoggerLive,
  SpiderLogger as SpiderLoggerTag,
  SpiderSchedulerService,
  SpiderService,
  SpiderState,
  SpiderStateKey,
  StateDelta,
  StateManager,
  StateManagerLive,
  StatsMiddleware,
  TokenExtractor,
  TokenExtractorLive,
  TokenType,
  UrlDeduplicatorService,
  UserAgentMiddleware,
  WebScrapingEngine,
  WebScrapingEngineLive,
  createStateOperation,
  makeCookieManager,
  makeEnhancedHttpClient,
  makeSessionStore,
  makeSpiderConfig,
  makeSpiderLogger,
  makeStateManager,
  makeTokenExtractor,
  makeWebScrapingEngine
};
//# sourceMappingURL=index.js.map
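// --- Illustrative consumer module (editor's sketch, its own file, not part
// of dist/index.js) ---
// A hedged sketch of wiring the layers exported above from application code.
// The composition shown is an assumption, not documented package setup, and
// the engine also requires a ScraperService layer that is not visible in
// this export list.
// import { Layer } from "effect";
// import {
//   WebScrapingEngineLive,
//   TokenExtractorLive,
//   EnhancedHttpClientLive,
//   StateManagerLive,
//   SessionStoreLive,
//   CookieManagerLive,
//   SpiderLoggerLive
// } from "@jambudipa/spider";
//
// const appLayer = WebScrapingEngineLive.pipe(
//   Layer.provide(Layer.mergeAll(
//     TokenExtractorLive, EnhancedHttpClientLive, StateManagerLive,
//     SessionStoreLive, CookieManagerLive, SpiderLoggerLive
//   ))
// );
// --- end example ---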