@jambudipa/spider 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +310 -332
- package/dist/index.js.map +1 -1
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -1
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +1 -1
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -1
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -1
- package/dist/lib/Resumability/types.d.ts +3 -3
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +7 -7
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -1
- package/dist/lib/Scraper/Scraper.service.d.ts +1 -1
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -1
- package/dist/lib/Spider/Spider.defaults.d.ts +24 -0
- package/dist/lib/Spider/Spider.defaults.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +0 -10
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -1
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -1
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +1 -2
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -1
- package/dist/lib/api-facades.d.ts +1 -1
- package/dist/lib/api-facades.d.ts.map +1 -1
- package/dist/lib/errors/effect-errors.d.ts +167 -34
- package/dist/lib/errors/effect-errors.d.ts.map +1 -1
- package/dist/lib/utils/JsonUtils.d.ts.map +1 -1
- package/dist/lib/utils/UrlUtils.d.ts.map +1 -1
- package/dist/lib/utils/url-deduplication.d.ts.map +1 -1
- package/package.json +7 -10
- package/dist/browser/BrowserManager.d.ts +0 -63
- package/dist/browser/BrowserManager.d.ts.map +0 -1
- package/dist/browser/PlaywrightAdapter.d.ts +0 -166
- package/dist/browser/PlaywrightAdapter.d.ts.map +0 -1
- package/dist/examples/01-basic-crawl-working.d.ts +0 -13
- package/dist/examples/01-basic-crawl-working.d.ts.map +0 -1
- package/dist/examples/02-multiple-urls-working.d.ts +0 -13
- package/dist/examples/02-multiple-urls-working.d.ts.map +0 -1
- package/dist/examples/03-url-filtering.d.ts +0 -13
- package/dist/examples/03-url-filtering.d.ts.map +0 -1
- package/dist/examples/04-robots-compliance.d.ts +0 -14
- package/dist/examples/04-robots-compliance.d.ts.map +0 -1
- package/dist/examples/05-link-extraction-selectors.d.ts +0 -14
- package/dist/examples/05-link-extraction-selectors.d.ts.map +0 -1
- package/dist/examples/06-custom-middleware.d.ts +0 -18
- package/dist/examples/06-custom-middleware.d.ts.map +0 -1
- package/dist/examples/07-resumability-demo.d.ts +0 -14
- package/dist/examples/07-resumability-demo.d.ts.map +0 -1
- package/dist/examples/08-worker-monitoring.d.ts +0 -15
- package/dist/examples/08-worker-monitoring.d.ts.map +0 -1
- package/dist/examples/09-error-handling-recovery.d.ts +0 -15
- package/dist/examples/09-error-handling-recovery.d.ts.map +0 -1
- package/dist/lib/errors.d.ts +0 -172
- package/dist/lib/errors.d.ts.map +0 -1
- package/dist/lib/utils/url-deduplication.test.d.ts +0 -5
- package/dist/lib/utils/url-deduplication.test.d.ts.map +0 -1
- package/dist/test/infrastructure/EffectTestUtils.d.ts +0 -167
- package/dist/test/infrastructure/EffectTestUtils.d.ts.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, Context, DateTime, Console, Duration, MutableHashMap, Queue,
|
|
1
|
+
import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct, Ref } from "effect";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
3
|
import * as fs from "fs";
|
|
4
4
|
import * as path from "path";
|
|
@@ -342,11 +342,12 @@ class UrlDeduplicatorService extends Effect.Service()(
|
|
|
342
342
|
if (normalizedPath === "") {
|
|
343
343
|
normalizedPath = "/";
|
|
344
344
|
}
|
|
345
|
-
|
|
346
|
-
|
|
345
|
+
const hash = "";
|
|
346
|
+
let port = parsed.port;
|
|
347
347
|
if (parsed.protocol === "http:" && parsed.port === "80" || parsed.protocol === "https:" && parsed.port === "443") {
|
|
348
|
-
|
|
348
|
+
port = "";
|
|
349
349
|
}
|
|
350
|
+
let search = parsed.search;
|
|
350
351
|
if (parsed.search) {
|
|
351
352
|
const params = new URLSearchParams(parsed.search);
|
|
352
353
|
const sortedParams = new URLSearchParams();
|
|
@@ -355,9 +356,12 @@ class UrlDeduplicatorService extends Effect.Service()(
|
|
|
355
356
|
sortedParams.append(key, value);
|
|
356
357
|
});
|
|
357
358
|
});
|
|
358
|
-
|
|
359
|
+
const sortedStr = sortedParams.toString();
|
|
360
|
+
search = sortedStr ? `?${sortedStr}` : "";
|
|
359
361
|
}
|
|
360
|
-
|
|
362
|
+
const auth = parsed.username ? `${parsed.username}${parsed.password ? ":" + parsed.password : ""}@` : "";
|
|
363
|
+
const portStr = port ? `:${port}` : "";
|
|
364
|
+
return `${parsed.protocol}//${auth}${parsed.hostname}${portStr}${normalizedPath}${search}${hash}`;
|
|
361
365
|
}),
|
|
362
366
|
// If URL parsing fails, return original
|
|
363
367
|
() => Effect.succeed(url)
|
|
@@ -425,15 +429,49 @@ const PageDataSchema = Schema.Struct({
|
|
|
425
429
|
Schema.Record({ key: Schema.String, value: Schema.Unknown })
|
|
426
430
|
)
|
|
427
431
|
});
|
|
428
|
-
|
|
429
|
-
|
|
432
|
+
class SpiderError extends Data.TaggedError("SpiderError") {
|
|
433
|
+
get message() {
|
|
434
|
+
const detailsStr = Option.fromNullable(this.details).pipe(
|
|
435
|
+
Option.map((d) => `: ${String(d)}`),
|
|
436
|
+
Option.getOrElse(() => "")
|
|
437
|
+
);
|
|
438
|
+
return `Spider operation '${this.operation}' failed${detailsStr}`;
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
class NetworkError extends Data.TaggedError("NetworkError") {
|
|
442
|
+
get message() {
|
|
443
|
+
const parts = pipe(
|
|
444
|
+
Chunk.make(`Network request to ${this.url} failed`),
|
|
445
|
+
(chunk) => this.statusCode ? Chunk.append(chunk, `with status ${this.statusCode}`) : chunk,
|
|
446
|
+
(chunk) => this.cause ? Chunk.append(chunk, `${this.cause}`) : chunk
|
|
447
|
+
);
|
|
448
|
+
return Chunk.toArray(parts).join(" ");
|
|
449
|
+
}
|
|
450
|
+
static fromResponse(url, response) {
|
|
430
451
|
return new NetworkError({
|
|
452
|
+
url,
|
|
453
|
+
statusCode: response.status,
|
|
454
|
+
method: "GET"
|
|
455
|
+
});
|
|
456
|
+
}
|
|
457
|
+
static fromCause(url, cause) {
|
|
458
|
+
return new NetworkError({ url, cause });
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
class TimeoutError extends Data.TaggedError("TimeoutError") {
|
|
462
|
+
get message() {
|
|
463
|
+
return `Operation '${this.operation}' timed out after ${this.timeoutMs}ms for ${this.url}`;
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
class RobotsTxtError extends Data.TaggedError("RobotsTxtError") {
|
|
467
|
+
static fromCause(url, cause) {
|
|
468
|
+
return new RobotsTxtError({
|
|
431
469
|
url,
|
|
432
470
|
cause,
|
|
433
|
-
message: `Failed to fetch
|
|
471
|
+
message: `Failed to fetch robots.txt: ${cause}`
|
|
434
472
|
});
|
|
435
473
|
}
|
|
436
|
-
}
|
|
474
|
+
}
|
|
437
475
|
class ResponseError extends Data.TaggedError("ResponseError") {
|
|
438
476
|
static fromCause(url, cause) {
|
|
439
477
|
return new ResponseError({
|
|
@@ -443,53 +481,89 @@ class ResponseError extends Data.TaggedError("ResponseError") {
|
|
|
443
481
|
});
|
|
444
482
|
}
|
|
445
483
|
}
|
|
446
|
-
class
|
|
447
|
-
|
|
448
|
-
return
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
484
|
+
class ParseError extends Data.TaggedError("ParseError") {
|
|
485
|
+
get message() {
|
|
486
|
+
return `Failed to parse ${this.expected}${this.input ? ` from input: ${this.input.substring(0, 100)}...` : ""}`;
|
|
487
|
+
}
|
|
488
|
+
static json(input, cause) {
|
|
489
|
+
return new ParseError({
|
|
490
|
+
input,
|
|
491
|
+
expected: "JSON",
|
|
492
|
+
cause
|
|
493
|
+
});
|
|
494
|
+
}
|
|
495
|
+
static html(input, cause) {
|
|
496
|
+
return new ParseError({
|
|
497
|
+
input,
|
|
498
|
+
expected: "HTML",
|
|
499
|
+
cause
|
|
500
|
+
});
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
class ValidationError extends Data.TaggedError("ValidationError") {
|
|
504
|
+
get message() {
|
|
505
|
+
return `Validation failed for field '${this.field}': ${this.constraint}`;
|
|
506
|
+
}
|
|
507
|
+
static url(url) {
|
|
508
|
+
return new ValidationError({
|
|
509
|
+
field: "url",
|
|
510
|
+
value: url,
|
|
511
|
+
constraint: "Invalid URL format"
|
|
452
512
|
});
|
|
453
513
|
}
|
|
454
514
|
}
|
|
455
515
|
class ConfigurationError extends Data.TaggedError("ConfigurationError") {
|
|
456
516
|
}
|
|
457
|
-
|
|
517
|
+
class ConfigError extends Data.TaggedError("ConfigError") {
|
|
518
|
+
get message() {
|
|
519
|
+
return `Configuration error for '${this.field}': ${this.reason}`;
|
|
520
|
+
}
|
|
521
|
+
static invalid(field, value, expected) {
|
|
522
|
+
return new ConfigError({
|
|
523
|
+
field,
|
|
524
|
+
value,
|
|
525
|
+
reason: `Expected ${expected}, got ${typeof value}`
|
|
526
|
+
});
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
class MiddlewareError extends Data.TaggedError("MiddlewareError") {
|
|
530
|
+
get message() {
|
|
531
|
+
return `Middleware '${this.middlewareName}' failed during ${this.phase} phase`;
|
|
532
|
+
}
|
|
458
533
|
static transform(middlewareName, cause) {
|
|
459
534
|
return new MiddlewareError({
|
|
460
535
|
phase: "transform",
|
|
461
536
|
middlewareName,
|
|
462
|
-
cause
|
|
463
|
-
message: `Middleware '${middlewareName}' failed during transform: ${cause}`
|
|
537
|
+
cause
|
|
464
538
|
});
|
|
465
539
|
}
|
|
466
540
|
static error(middlewareName, cause) {
|
|
467
541
|
return new MiddlewareError({
|
|
468
542
|
phase: "error",
|
|
469
543
|
middlewareName,
|
|
470
|
-
cause
|
|
471
|
-
message: `Middleware '${middlewareName}' failed during error handling: ${cause}`
|
|
544
|
+
cause
|
|
472
545
|
});
|
|
473
546
|
}
|
|
474
|
-
}
|
|
475
|
-
|
|
547
|
+
}
|
|
548
|
+
class FileSystemError extends Data.TaggedError("FileSystemError") {
|
|
549
|
+
get message() {
|
|
550
|
+
return `File system ${this.operation} operation failed for path: ${this.path}`;
|
|
551
|
+
}
|
|
476
552
|
static write(path2, cause) {
|
|
477
553
|
return new FileSystemError({
|
|
478
554
|
operation: "write",
|
|
479
555
|
path: path2,
|
|
480
|
-
cause
|
|
481
|
-
message: `Failed to write file ${path2}: ${cause}`
|
|
556
|
+
cause
|
|
482
557
|
});
|
|
483
558
|
}
|
|
484
559
|
static create(path2, cause) {
|
|
485
560
|
return new FileSystemError({
|
|
486
561
|
operation: "create",
|
|
487
562
|
path: path2,
|
|
488
|
-
cause
|
|
489
|
-
message: `Failed to create directory ${path2}: ${cause}`
|
|
563
|
+
cause
|
|
490
564
|
});
|
|
491
565
|
}
|
|
492
|
-
}
|
|
566
|
+
}
|
|
493
567
|
let PersistenceError$1 = class PersistenceError extends Data.TaggedError("PersistenceError") {
|
|
494
568
|
static save(cause, key) {
|
|
495
569
|
return new PersistenceError({
|
|
@@ -553,36 +627,32 @@ class AdapterNotInitialisedError extends Data.TaggedError("AdapterNotInitialised
|
|
|
553
627
|
});
|
|
554
628
|
}
|
|
555
629
|
}
|
|
556
|
-
|
|
630
|
+
class BrowserError extends Data.TaggedError("BrowserError") {
|
|
631
|
+
get message() {
|
|
632
|
+
return `Browser operation '${this.operation}' failed${this.browserId ? ` for browser ${this.browserId}` : ""}${this.cause ? `: ${this.cause}` : ""}`;
|
|
633
|
+
}
|
|
557
634
|
static launch(cause) {
|
|
558
|
-
return new BrowserError({
|
|
559
|
-
operation: "launch",
|
|
560
|
-
cause,
|
|
561
|
-
message: `Failed to launch browser: ${cause}`
|
|
562
|
-
});
|
|
635
|
+
return new BrowserError({ operation: "launch", cause });
|
|
563
636
|
}
|
|
564
637
|
static createContext(cause) {
|
|
565
|
-
return new BrowserError({
|
|
566
|
-
operation: "createContext",
|
|
567
|
-
cause,
|
|
568
|
-
message: `Failed to create browser context: ${cause}`
|
|
569
|
-
});
|
|
638
|
+
return new BrowserError({ operation: "createContext", cause });
|
|
570
639
|
}
|
|
571
640
|
static createPage(cause) {
|
|
572
|
-
return new BrowserError({
|
|
573
|
-
operation: "createPage",
|
|
574
|
-
cause,
|
|
575
|
-
message: `Failed to create page: ${cause}`
|
|
576
|
-
});
|
|
641
|
+
return new BrowserError({ operation: "createPage", cause });
|
|
577
642
|
}
|
|
578
643
|
static closeContext(cause) {
|
|
644
|
+
return new BrowserError({ operation: "closeContext", cause });
|
|
645
|
+
}
|
|
646
|
+
static notLaunched() {
|
|
579
647
|
return new BrowserError({
|
|
580
|
-
operation: "
|
|
581
|
-
cause
|
|
582
|
-
message: `Failed to close context: ${cause}`
|
|
648
|
+
operation: "access",
|
|
649
|
+
cause: "Browser not launched"
|
|
583
650
|
});
|
|
584
651
|
}
|
|
585
|
-
|
|
652
|
+
static launchFailed(cause) {
|
|
653
|
+
return new BrowserError({ operation: "launch", cause });
|
|
654
|
+
}
|
|
655
|
+
}
|
|
586
656
|
class BrowserCleanupError extends Data.TaggedError("BrowserCleanupError") {
|
|
587
657
|
static context(id, cause) {
|
|
588
658
|
return new BrowserCleanupError({
|
|
@@ -601,6 +671,64 @@ class BrowserCleanupError extends Data.TaggedError("BrowserCleanupError") {
|
|
|
601
671
|
});
|
|
602
672
|
}
|
|
603
673
|
}
|
|
674
|
+
class PageError extends Data.TaggedError("PageError") {
|
|
675
|
+
get message() {
|
|
676
|
+
return `Page operation '${this.operation}' failed for ${this.url}${this.selector ? ` with selector '${this.selector}'` : ""}`;
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
class StateError extends Data.TaggedError("StateError") {
|
|
680
|
+
get message() {
|
|
681
|
+
return `State ${this.operation} operation failed${this.stateKey ? ` for key '${this.stateKey}'` : ""}`;
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
let SessionError$1 = class SessionError extends Data.TaggedError("SessionError") {
|
|
685
|
+
get message() {
|
|
686
|
+
return `Session operation '${this.operation}' failed${this.sessionId ? ` for session ${this.sessionId}` : ""}`;
|
|
687
|
+
}
|
|
688
|
+
static noActiveSession() {
|
|
689
|
+
return new SessionError({
|
|
690
|
+
operation: "access",
|
|
691
|
+
cause: "No active session"
|
|
692
|
+
});
|
|
693
|
+
}
|
|
694
|
+
};
|
|
695
|
+
class CrawlError extends Data.TaggedError("CrawlError") {
|
|
696
|
+
get message() {
|
|
697
|
+
return `Failed to crawl ${this.url} at depth ${this.depth}: ${this.reason}`;
|
|
698
|
+
}
|
|
699
|
+
static maxDepthReached(url, depth) {
|
|
700
|
+
return new CrawlError({
|
|
701
|
+
url,
|
|
702
|
+
depth,
|
|
703
|
+
reason: "Maximum depth reached"
|
|
704
|
+
});
|
|
705
|
+
}
|
|
706
|
+
static robotsBlocked(url) {
|
|
707
|
+
return new CrawlError({
|
|
708
|
+
url,
|
|
709
|
+
depth: 0,
|
|
710
|
+
reason: "Blocked by robots.txt"
|
|
711
|
+
});
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
class QueueError extends Data.TaggedError("QueueError") {
|
|
715
|
+
get message() {
|
|
716
|
+
const sizeStr = Option.fromNullable(this.queueSize).pipe(
|
|
717
|
+
Option.map((size) => ` (queue size: ${size})`),
|
|
718
|
+
Option.getOrElse(() => "")
|
|
719
|
+
);
|
|
720
|
+
return `Queue ${this.operation} operation failed${sizeStr}`;
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
const isSpiderError = (error) => {
|
|
724
|
+
return error instanceof SpiderError;
|
|
725
|
+
};
|
|
726
|
+
const isNetworkError = (error) => {
|
|
727
|
+
return error instanceof NetworkError || error instanceof TimeoutError;
|
|
728
|
+
};
|
|
729
|
+
const isBrowserError = (error) => {
|
|
730
|
+
return error instanceof BrowserError || error instanceof PageError;
|
|
731
|
+
};
|
|
604
732
|
class SpiderLogger extends Context.Tag("SpiderLogger")() {
|
|
605
733
|
}
|
|
606
734
|
const SummarySchema = Schema.Record({
|
|
@@ -651,9 +779,9 @@ const makeSpiderLogger = (logDir = "./spider-logs") => {
|
|
|
651
779
|
}
|
|
652
780
|
return "{}";
|
|
653
781
|
};
|
|
654
|
-
const writeLogEvent = (event) => Effect.
|
|
782
|
+
const writeLogEvent = (event) => Effect.gen(function* () {
|
|
655
783
|
const logLine = stringifyForLog(event) + "\n";
|
|
656
|
-
fs.appendFileSync(logFilePath, logLine);
|
|
784
|
+
yield* Effect.sync(() => fs.appendFileSync(logFilePath, logLine));
|
|
657
785
|
const importantTypes = [
|
|
658
786
|
"domain_start",
|
|
659
787
|
"domain_complete",
|
|
@@ -663,9 +791,7 @@ const makeSpiderLogger = (logDir = "./spider-logs") => {
|
|
|
663
791
|
if (importantTypes.includes(event.type)) {
|
|
664
792
|
const prefix = `[${event.type}]`;
|
|
665
793
|
const domainInfo = event.domain ? ` [${event.domain}]` : "";
|
|
666
|
-
Console.log(`${prefix}${domainInfo} ${event.message}`)
|
|
667
|
-
Effect.runSync
|
|
668
|
-
);
|
|
794
|
+
yield* Console.log(`${prefix}${domainInfo} ${event.message}`);
|
|
669
795
|
}
|
|
670
796
|
});
|
|
671
797
|
const updateSummary = (update) => Effect.sync(() => {
|
|
@@ -973,7 +1099,7 @@ class ScraperService extends Effect.Service()(
|
|
|
973
1099
|
if (error instanceof Error && error.name === "AbortError") {
|
|
974
1100
|
return RequestAbortError.timeout(url, timeoutMs);
|
|
975
1101
|
}
|
|
976
|
-
return NetworkError
|
|
1102
|
+
return NetworkError.fromCause(url, error);
|
|
977
1103
|
}
|
|
978
1104
|
});
|
|
979
1105
|
const fetchWithTimeout = fetchEffect.pipe(
|
|
@@ -1367,11 +1493,11 @@ class SpiderSchedulerService extends Effect.Service()(
|
|
|
1367
1493
|
if (normalizedPath === "") {
|
|
1368
1494
|
normalizedPath = "/";
|
|
1369
1495
|
}
|
|
1370
|
-
|
|
1371
|
-
parsed.hash = "";
|
|
1496
|
+
let port = parsed.port;
|
|
1372
1497
|
if (parsed.protocol === "http:" && parsed.port === "80" || parsed.protocol === "https:" && parsed.port === "443") {
|
|
1373
|
-
|
|
1498
|
+
port = "";
|
|
1374
1499
|
}
|
|
1500
|
+
let search = parsed.search;
|
|
1375
1501
|
if (parsed.search) {
|
|
1376
1502
|
const params = new URLSearchParams(parsed.search);
|
|
1377
1503
|
const sortedParams = new URLSearchParams();
|
|
@@ -1380,9 +1506,12 @@ class SpiderSchedulerService extends Effect.Service()(
|
|
|
1380
1506
|
sortedParams.append(key, value);
|
|
1381
1507
|
});
|
|
1382
1508
|
});
|
|
1383
|
-
|
|
1509
|
+
const sortedStr = sortedParams.toString();
|
|
1510
|
+
search = sortedStr ? `?${sortedStr}` : "";
|
|
1384
1511
|
}
|
|
1385
|
-
|
|
1512
|
+
const auth = parsed.username ? `${parsed.username}${parsed.password ? ":" + parsed.password : ""}@` : "";
|
|
1513
|
+
const portStr = port ? `:${port}` : "";
|
|
1514
|
+
return `${parsed.protocol}//${auth}${parsed.hostname}${portStr}${normalizedPath}${search}`;
|
|
1386
1515
|
}
|
|
1387
1516
|
}
|
|
1388
1517
|
);
|
|
@@ -1516,154 +1645,6 @@ class SpiderSchedulerService extends Effect.Service()(
|
|
|
1516
1645
|
}
|
|
1517
1646
|
) {
|
|
1518
1647
|
}
|
|
1519
|
-
class SpiderError extends Data.TaggedError("SpiderError") {
|
|
1520
|
-
get message() {
|
|
1521
|
-
const detailsStr = Option.fromNullable(this.details).pipe(
|
|
1522
|
-
Option.map((d) => `: ${String(d)}`),
|
|
1523
|
-
Option.getOrElse(() => "")
|
|
1524
|
-
);
|
|
1525
|
-
return `Spider operation '${this.operation}' failed${detailsStr}`;
|
|
1526
|
-
}
|
|
1527
|
-
}
|
|
1528
|
-
class NetworkError2 extends Data.TaggedError("NetworkError") {
|
|
1529
|
-
get message() {
|
|
1530
|
-
return `Network request to ${this.url} failed${this.statusCode ? ` with status ${this.statusCode}` : ""}`;
|
|
1531
|
-
}
|
|
1532
|
-
static fromResponse(url, response) {
|
|
1533
|
-
return new NetworkError2({
|
|
1534
|
-
url,
|
|
1535
|
-
statusCode: response.status,
|
|
1536
|
-
method: "GET"
|
|
1537
|
-
});
|
|
1538
|
-
}
|
|
1539
|
-
static fromCause(url, cause) {
|
|
1540
|
-
return new NetworkError2({ url, cause });
|
|
1541
|
-
}
|
|
1542
|
-
}
|
|
1543
|
-
class TimeoutError extends Data.TaggedError("TimeoutError") {
|
|
1544
|
-
get message() {
|
|
1545
|
-
return `Operation '${this.operation}' timed out after ${this.timeoutMs}ms for ${this.url}`;
|
|
1546
|
-
}
|
|
1547
|
-
}
|
|
1548
|
-
class ParseError extends Data.TaggedError("ParseError") {
|
|
1549
|
-
get message() {
|
|
1550
|
-
return `Failed to parse ${this.expected}${this.input ? ` from input: ${this.input.substring(0, 100)}...` : ""}`;
|
|
1551
|
-
}
|
|
1552
|
-
static json(input, cause) {
|
|
1553
|
-
return new ParseError({
|
|
1554
|
-
input,
|
|
1555
|
-
expected: "JSON",
|
|
1556
|
-
cause
|
|
1557
|
-
});
|
|
1558
|
-
}
|
|
1559
|
-
static html(input, cause) {
|
|
1560
|
-
return new ParseError({
|
|
1561
|
-
input,
|
|
1562
|
-
expected: "HTML",
|
|
1563
|
-
cause
|
|
1564
|
-
});
|
|
1565
|
-
}
|
|
1566
|
-
}
|
|
1567
|
-
class ValidationError extends Data.TaggedError("ValidationError") {
|
|
1568
|
-
get message() {
|
|
1569
|
-
return `Validation failed for field '${this.field}': ${this.constraint}`;
|
|
1570
|
-
}
|
|
1571
|
-
static url(url) {
|
|
1572
|
-
return new ValidationError({
|
|
1573
|
-
field: "url",
|
|
1574
|
-
value: url,
|
|
1575
|
-
constraint: "Invalid URL format"
|
|
1576
|
-
});
|
|
1577
|
-
}
|
|
1578
|
-
}
|
|
1579
|
-
class BrowserError2 extends Data.TaggedError("BrowserError") {
|
|
1580
|
-
get message() {
|
|
1581
|
-
return `Browser operation '${this.operation}' failed${this.browserId ? ` for browser ${this.browserId}` : ""}`;
|
|
1582
|
-
}
|
|
1583
|
-
static notLaunched() {
|
|
1584
|
-
return new BrowserError2({
|
|
1585
|
-
operation: "access",
|
|
1586
|
-
cause: "Browser not launched"
|
|
1587
|
-
});
|
|
1588
|
-
}
|
|
1589
|
-
static launchFailed(cause) {
|
|
1590
|
-
return new BrowserError2({
|
|
1591
|
-
operation: "launch",
|
|
1592
|
-
cause
|
|
1593
|
-
});
|
|
1594
|
-
}
|
|
1595
|
-
}
|
|
1596
|
-
class PageError extends Data.TaggedError("PageError") {
|
|
1597
|
-
get message() {
|
|
1598
|
-
return `Page operation '${this.operation}' failed for ${this.url}${this.selector ? ` with selector '${this.selector}'` : ""}`;
|
|
1599
|
-
}
|
|
1600
|
-
}
|
|
1601
|
-
class StateError extends Data.TaggedError("StateError") {
|
|
1602
|
-
get message() {
|
|
1603
|
-
return `State ${this.operation} operation failed${this.stateKey ? ` for key '${this.stateKey}'` : ""}`;
|
|
1604
|
-
}
|
|
1605
|
-
}
|
|
1606
|
-
let SessionError$1 = class SessionError extends Data.TaggedError("SessionError") {
|
|
1607
|
-
get message() {
|
|
1608
|
-
return `Session operation '${this.operation}' failed${this.sessionId ? ` for session ${this.sessionId}` : ""}`;
|
|
1609
|
-
}
|
|
1610
|
-
static noActiveSession() {
|
|
1611
|
-
return new SessionError({
|
|
1612
|
-
operation: "access",
|
|
1613
|
-
cause: "No active session"
|
|
1614
|
-
});
|
|
1615
|
-
}
|
|
1616
|
-
};
|
|
1617
|
-
class FileSystemError2 extends Data.TaggedError("FileSystemError") {
|
|
1618
|
-
get message() {
|
|
1619
|
-
return `File system ${this.operation} operation failed for path: ${this.path}`;
|
|
1620
|
-
}
|
|
1621
|
-
}
|
|
1622
|
-
class CrawlError extends Data.TaggedError("CrawlError") {
|
|
1623
|
-
get message() {
|
|
1624
|
-
return `Failed to crawl ${this.url} at depth ${this.depth}: ${this.reason}`;
|
|
1625
|
-
}
|
|
1626
|
-
static maxDepthReached(url, depth) {
|
|
1627
|
-
return new CrawlError({
|
|
1628
|
-
url,
|
|
1629
|
-
depth,
|
|
1630
|
-
reason: "Maximum depth reached"
|
|
1631
|
-
});
|
|
1632
|
-
}
|
|
1633
|
-
static robotsBlocked(url) {
|
|
1634
|
-
return new CrawlError({
|
|
1635
|
-
url,
|
|
1636
|
-
depth: 0,
|
|
1637
|
-
reason: "Blocked by robots.txt"
|
|
1638
|
-
});
|
|
1639
|
-
}
|
|
1640
|
-
}
|
|
1641
|
-
class QueueError extends Data.TaggedError("QueueError") {
|
|
1642
|
-
get message() {
|
|
1643
|
-
const sizeStr = Option.fromNullable(this.queueSize).pipe(
|
|
1644
|
-
Option.map((size) => ` (queue size: ${size})`),
|
|
1645
|
-
Option.getOrElse(() => "")
|
|
1646
|
-
);
|
|
1647
|
-
return `Queue ${this.operation} operation failed${sizeStr}`;
|
|
1648
|
-
}
|
|
1649
|
-
}
|
|
1650
|
-
class ConfigError extends Data.TaggedError("ConfigError") {
|
|
1651
|
-
get message() {
|
|
1652
|
-
return `Configuration error for '${this.field}': ${this.reason}`;
|
|
1653
|
-
}
|
|
1654
|
-
static invalid(field, value, expected) {
|
|
1655
|
-
return new ConfigError({
|
|
1656
|
-
field,
|
|
1657
|
-
value,
|
|
1658
|
-
reason: `Expected ${expected}, got ${typeof value}`
|
|
1659
|
-
});
|
|
1660
|
-
}
|
|
1661
|
-
}
|
|
1662
|
-
class MiddlewareError2 extends Data.TaggedError("MiddlewareError") {
|
|
1663
|
-
get message() {
|
|
1664
|
-
return `Middleware '${this.middlewareName}' failed during ${this.phase} phase`;
|
|
1665
|
-
}
|
|
1666
|
-
}
|
|
1667
1648
|
const DEFAULT_DEDUPLICATION_STRATEGY = {
|
|
1668
1649
|
wwwHandling: "ignore",
|
|
1669
1650
|
protocolHandling: "prefer-https",
|
|
@@ -1677,9 +1658,7 @@ const parseUrl = (url) => Effect.try({
|
|
|
1677
1658
|
});
|
|
1678
1659
|
const normalizeUrl = (url, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
|
|
1679
1660
|
const parsed = yield* parseUrl(url);
|
|
1680
|
-
|
|
1681
|
-
parsed.protocol = "https:";
|
|
1682
|
-
}
|
|
1661
|
+
const protocol = strategy.protocolHandling === "prefer-https" ? "https:" : parsed.protocol;
|
|
1683
1662
|
let domain = parsed.hostname.toLowerCase();
|
|
1684
1663
|
const hasWww = domain.startsWith("www.");
|
|
1685
1664
|
const domainWithoutWww = hasWww ? domain.substring(4) : domain;
|
|
@@ -1687,104 +1666,105 @@ const normalizeUrl = (url, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.
|
|
|
1687
1666
|
case "ignore":
|
|
1688
1667
|
case "prefer-non-www":
|
|
1689
1668
|
domain = domainWithoutWww;
|
|
1690
|
-
parsed.hostname = domain;
|
|
1691
1669
|
break;
|
|
1692
1670
|
case "prefer-www":
|
|
1693
1671
|
if (!hasWww) {
|
|
1694
1672
|
domain = `www.${domain}`;
|
|
1695
|
-
parsed.hostname = domain;
|
|
1696
1673
|
}
|
|
1697
1674
|
break;
|
|
1698
1675
|
}
|
|
1676
|
+
let pathname = parsed.pathname;
|
|
1699
1677
|
if (strategy.trailingSlashHandling === "ignore") {
|
|
1700
|
-
|
|
1678
|
+
pathname = pathname.replace(/\/$/, "") || "/";
|
|
1701
1679
|
}
|
|
1680
|
+
let search = "";
|
|
1702
1681
|
if (strategy.queryParamHandling === "ignore") {
|
|
1703
|
-
|
|
1682
|
+
search = "";
|
|
1704
1683
|
} else if (strategy.queryParamHandling === "sort") {
|
|
1705
1684
|
const params = new URLSearchParams(parsed.search);
|
|
1706
1685
|
const sorted = Array.from(params.entries()).sort(([a], [b]) => a.localeCompare(b));
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1686
|
+
const sortedSearch = new URLSearchParams(sorted).toString();
|
|
1687
|
+
search = sortedSearch ? `?${sortedSearch}` : "";
|
|
1688
|
+
} else {
|
|
1689
|
+
search = parsed.search;
|
|
1711
1690
|
}
|
|
1691
|
+
const hash = strategy.fragmentHandling === "ignore" ? "" : parsed.hash;
|
|
1692
|
+
const auth = parsed.username ? `${parsed.username}${parsed.password ? ":" + parsed.password : ""}@` : "";
|
|
1693
|
+
const port = parsed.port ? `:${parsed.port}` : "";
|
|
1694
|
+
const normalized = `${protocol}//${auth}${domain}${port}${pathname}${search}${hash}`;
|
|
1712
1695
|
return {
|
|
1713
1696
|
original: url,
|
|
1714
|
-
normalized
|
|
1697
|
+
normalized,
|
|
1715
1698
|
domain: domainWithoutWww
|
|
1716
1699
|
};
|
|
1717
1700
|
});
|
|
1718
1701
|
const deduplicateUrls = (urls, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
|
|
1719
|
-
|
|
1720
|
-
const skipped =
|
|
1702
|
+
let domainMap = HashMap.empty();
|
|
1703
|
+
const skipped = [];
|
|
1721
1704
|
let invalidCount = 0;
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
}
|
|
1757
|
-
}
|
|
1758
|
-
})
|
|
1759
|
-
),
|
|
1760
|
-
Effect.catchAll(
|
|
1761
|
-
(error) => Effect.gen(function* () {
|
|
1762
|
-
invalidCount++;
|
|
1763
|
-
yield* Ref.update(skipped, (arr) => [
|
|
1764
|
-
...arr,
|
|
1765
|
-
{ url: urlObj.url, reason: `Invalid URL: ${error.message}` }
|
|
1766
|
-
]);
|
|
1767
|
-
yield* Effect.logWarning(`Invalid URL skipped: ${urlObj.url}`);
|
|
1768
|
-
})
|
|
1769
|
-
)
|
|
1770
|
-
)
|
|
1771
|
-
),
|
|
1772
|
-
{ concurrency: "unbounded" }
|
|
1773
|
-
);
|
|
1774
|
-
const finalMap = yield* Ref.get(domainMap);
|
|
1775
|
-
const finalSkipped = yield* Ref.get(skipped);
|
|
1776
|
-
const deduplicated = Array.from(HashMap.values(finalMap));
|
|
1705
|
+
for (const urlObj of urls) {
|
|
1706
|
+
const normalizeResult = yield* Effect.either(normalizeUrl(urlObj.url, strategy));
|
|
1707
|
+
if (normalizeResult._tag === "Left") {
|
|
1708
|
+
invalidCount++;
|
|
1709
|
+
skipped.push({ url: urlObj.url, reason: `Invalid URL: ${normalizeResult.left.message}` });
|
|
1710
|
+
yield* Effect.logWarning(`Invalid URL skipped: ${urlObj.url}`);
|
|
1711
|
+
continue;
|
|
1712
|
+
}
|
|
1713
|
+
const normalized = normalizeResult.right;
|
|
1714
|
+
const key = strategy.wwwHandling === "preserve" ? normalized.normalized : normalized.domain;
|
|
1715
|
+
const existingOption = HashMap.get(domainMap, key);
|
|
1716
|
+
if (Option.isNone(existingOption)) {
|
|
1717
|
+
domainMap = HashMap.set(domainMap, key, urlObj);
|
|
1718
|
+
} else {
|
|
1719
|
+
const existing = existingOption.value;
|
|
1720
|
+
let shouldReplace = false;
|
|
1721
|
+
if (strategy.wwwHandling === "prefer-www") {
|
|
1722
|
+
const existingHasWww = existing.url.includes("://www.");
|
|
1723
|
+
const newHasWww = urlObj.url.includes("://www.");
|
|
1724
|
+
shouldReplace = !existingHasWww && newHasWww;
|
|
1725
|
+
} else if (strategy.wwwHandling === "prefer-non-www") {
|
|
1726
|
+
const existingHasWww = existing.url.includes("://www.");
|
|
1727
|
+
const newHasWww = urlObj.url.includes("://www.");
|
|
1728
|
+
shouldReplace = existingHasWww && !newHasWww;
|
|
1729
|
+
}
|
|
1730
|
+
if (shouldReplace) {
|
|
1731
|
+
domainMap = HashMap.set(domainMap, key, urlObj);
|
|
1732
|
+
skipped.push({ url: existing.url, reason: `Replaced by preferred variant: ${urlObj.url}` });
|
|
1733
|
+
} else {
|
|
1734
|
+
skipped.push({ url: urlObj.url, reason: `Duplicate of: ${existing.url}` });
|
|
1735
|
+
}
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1738
|
+
const deduplicated = Array.from(HashMap.values(domainMap));
|
|
1777
1739
|
return {
|
|
1778
1740
|
deduplicated,
|
|
1779
|
-
skipped
|
|
1741
|
+
skipped,
|
|
1780
1742
|
stats: {
|
|
1781
1743
|
total: urls.length,
|
|
1782
1744
|
unique: deduplicated.length,
|
|
1783
|
-
duplicates:
|
|
1745
|
+
duplicates: skipped.filter((s) => s.reason.startsWith("Duplicate")).length,
|
|
1784
1746
|
invalid: invalidCount
|
|
1785
1747
|
}
|
|
1786
1748
|
};
|
|
1787
1749
|
});
|
|
1750
|
+
const SPIDER_DEFAULTS = Object.freeze({
|
|
1751
|
+
/** Threshold in ms after which a worker is considered stale (60s) */
|
|
1752
|
+
STALE_WORKER_THRESHOLD_MS: 6e4,
|
|
1753
|
+
/** Interval for health check monitoring */
|
|
1754
|
+
HEALTH_CHECK_INTERVAL: "15 seconds",
|
|
1755
|
+
/** Memory usage threshold in bytes (1GB) before logging warnings */
|
|
1756
|
+
MEMORY_THRESHOLD_BYTES: 1024 * 1024 * 1024,
|
|
1757
|
+
/** Queue size threshold before logging warnings */
|
|
1758
|
+
QUEUE_SIZE_THRESHOLD: 1e4,
|
|
1759
|
+
/** Timeout for task acquisition from queue */
|
|
1760
|
+
TASK_ACQUISITION_TIMEOUT: "10 seconds",
|
|
1761
|
+
/** Timeout for page fetch operations */
|
|
1762
|
+
FETCH_TIMEOUT: "45 seconds",
|
|
1763
|
+
/** Number of retry attempts for fetch operations */
|
|
1764
|
+
FETCH_RETRY_COUNT: 2,
|
|
1765
|
+
/** Interval for domain failure detection checks */
|
|
1766
|
+
FAILURE_DETECTOR_INTERVAL: "30 seconds"
|
|
1767
|
+
});
|
|
1788
1768
|
class SpiderService extends Effect.Service()(
|
|
1789
1769
|
"@jambudipa/spider",
|
|
1790
1770
|
{
|
|
@@ -1954,7 +1934,7 @@ class SpiderService extends Effect.Service()(
|
|
|
1954
1934
|
const workerHealthMonitor = Effect.gen(function* () {
|
|
1955
1935
|
const healthMap = MutableRef.get(workerHealthChecks);
|
|
1956
1936
|
const now = yield* DateTime.now;
|
|
1957
|
-
const staleThreshold =
|
|
1937
|
+
const staleThreshold = SPIDER_DEFAULTS.STALE_WORKER_THRESHOLD_MS;
|
|
1958
1938
|
let staleWorkersChunk = Chunk.empty();
|
|
1959
1939
|
for (const [workerId, lastCheck] of healthMap) {
|
|
1960
1940
|
const elapsed = DateTime.toEpochMillis(now) - DateTime.toEpochMillis(lastCheck);
|
|
@@ -1976,8 +1956,7 @@ class SpiderService extends Effect.Service()(
|
|
|
1976
1956
|
MutableRef.set(workerHealthChecks, updatedMap);
|
|
1977
1957
|
}
|
|
1978
1958
|
}).pipe(
|
|
1979
|
-
Effect.repeat(Schedule.fixed(
|
|
1980
|
-
// Check every 15 seconds
|
|
1959
|
+
Effect.repeat(Schedule.fixed(SPIDER_DEFAULTS.HEALTH_CHECK_INTERVAL))
|
|
1981
1960
|
);
|
|
1982
1961
|
const queueManager = {
|
|
1983
1962
|
// Atomic take: either returns task and increments active count, or detects completion
|
|
@@ -2063,7 +2042,7 @@ class SpiderService extends Effect.Service()(
|
|
|
2063
2042
|
yield* reportWorkerHealth(workerId);
|
|
2064
2043
|
const queueSize = yield* queueManager.size();
|
|
2065
2044
|
const memUsage = process.memoryUsage();
|
|
2066
|
-
if (memUsage.heapUsed >
|
|
2045
|
+
if (memUsage.heapUsed > SPIDER_DEFAULTS.MEMORY_THRESHOLD_BYTES) {
|
|
2067
2046
|
yield* logger.logEdgeCase(domain, "high_memory_usage", {
|
|
2068
2047
|
workerId,
|
|
2069
2048
|
heapUsed: Math.round(memUsage.heapUsed / 1024 / 1024) + "MB",
|
|
@@ -2071,7 +2050,7 @@ class SpiderService extends Effect.Service()(
|
|
|
2071
2050
|
queueSize
|
|
2072
2051
|
});
|
|
2073
2052
|
}
|
|
2074
|
-
if (queueSize >
|
|
2053
|
+
if (queueSize > SPIDER_DEFAULTS.QUEUE_SIZE_THRESHOLD) {
|
|
2075
2054
|
yield* logger.logEdgeCase(domain, "excessive_queue_size", {
|
|
2076
2055
|
workerId,
|
|
2077
2056
|
queueSize,
|
|
@@ -2087,7 +2066,7 @@ class SpiderService extends Effect.Service()(
|
|
|
2087
2066
|
}
|
|
2088
2067
|
);
|
|
2089
2068
|
const result = yield* queueManager.takeTaskOrComplete.pipe(
|
|
2090
|
-
Effect.timeout(
|
|
2069
|
+
Effect.timeout(SPIDER_DEFAULTS.TASK_ACQUISITION_TIMEOUT),
|
|
2091
2070
|
Effect.tap(
|
|
2092
2071
|
() => logger.logEdgeCase(domain, "task_acquisition_success", {
|
|
2093
2072
|
workerId,
|
|
@@ -2186,10 +2165,11 @@ class SpiderService extends Effect.Service()(
|
|
|
2186
2165
|
url: task.url,
|
|
2187
2166
|
message: "About to check shouldFollowUrl"
|
|
2188
2167
|
});
|
|
2168
|
+
const restrictToStartingDomainOption = restrictToStartingDomain ? Option.some(urlString) : Option.none();
|
|
2189
2169
|
const shouldFollow = yield* config.shouldFollowUrl(
|
|
2190
2170
|
task.url,
|
|
2191
2171
|
task.fromUrl,
|
|
2192
|
-
|
|
2172
|
+
Option.getOrUndefined(restrictToStartingDomainOption)
|
|
2193
2173
|
);
|
|
2194
2174
|
yield* logger.logEdgeCase(domain, "after_shouldFollowUrl", {
|
|
2195
2175
|
workerId,
|
|
@@ -2276,10 +2256,9 @@ class SpiderService extends Effect.Service()(
|
|
|
2276
2256
|
});
|
|
2277
2257
|
const pageData = yield* scraper.fetchAndParse(task.url, task.depth).pipe(
|
|
2278
2258
|
// Add overall timeout to prevent workers from hanging
|
|
2279
|
-
Effect.timeout(
|
|
2259
|
+
Effect.timeout(SPIDER_DEFAULTS.FETCH_TIMEOUT),
|
|
2280
2260
|
Effect.retry({
|
|
2281
|
-
times:
|
|
2282
|
-
// Reduced retries to prevent long hangs
|
|
2261
|
+
times: SPIDER_DEFAULTS.FETCH_RETRY_COUNT,
|
|
2283
2262
|
schedule: Schedule.exponential("1 second")
|
|
2284
2263
|
}),
|
|
2285
2264
|
Effect.catchAll(
|
|
@@ -2448,10 +2427,11 @@ class SpiderService extends Effect.Service()(
|
|
|
2448
2427
|
);
|
|
2449
2428
|
linksToProcess = resolvedLinks.filter(Option.isSome).map((opt) => opt.value);
|
|
2450
2429
|
for (const link of linksToProcess) {
|
|
2430
|
+
const linkRestrictOption = restrictToStartingDomain ? Option.some(urlString) : Option.none();
|
|
2451
2431
|
const linkShouldFollow = yield* config.shouldFollowUrl(
|
|
2452
2432
|
link,
|
|
2453
2433
|
task.url,
|
|
2454
|
-
|
|
2434
|
+
Option.getOrUndefined(linkRestrictOption)
|
|
2455
2435
|
);
|
|
2456
2436
|
if (!linkShouldFollow.follow) {
|
|
2457
2437
|
continue;
|
|
@@ -2619,7 +2599,7 @@ class SpiderService extends Effect.Service()(
|
|
|
2619
2599
|
let lastPageCount = 0;
|
|
2620
2600
|
let stuckIterations = 0;
|
|
2621
2601
|
while (!MutableRef.get(domainCompleted)) {
|
|
2622
|
-
yield* Effect.sleep(
|
|
2602
|
+
yield* Effect.sleep(SPIDER_DEFAULTS.FAILURE_DETECTOR_INTERVAL);
|
|
2623
2603
|
const pageCount = yield* localDeduplicator.size();
|
|
2624
2604
|
const queueSize = yield* queueManager.size();
|
|
2625
2605
|
const activeCount = MutableRef.get(activeWorkers);
|
|
@@ -2844,17 +2824,7 @@ class SpiderService extends Effect.Service()(
|
|
|
2844
2824
|
sessionId: stateKey.id,
|
|
2845
2825
|
urlsProcessed: 0
|
|
2846
2826
|
};
|
|
2847
|
-
})
|
|
2848
|
-
/**
|
|
2849
|
-
* Returns the list of URLs that have been visited during crawling.
|
|
2850
|
-
*
|
|
2851
|
-
* @returns Effect containing array of visited URLs
|
|
2852
|
-
*
|
|
2853
|
-
* @remarks
|
|
2854
|
-
* This is currently a placeholder implementation. In a future version,
|
|
2855
|
-
* this will return the actual list of visited URLs from the current session.
|
|
2856
|
-
*/
|
|
2857
|
-
getVisitedUrls: () => Effect.sync(() => [])
|
|
2827
|
+
})
|
|
2858
2828
|
};
|
|
2859
2829
|
return self;
|
|
2860
2830
|
}),
|
|
@@ -4502,34 +4472,25 @@ class PostgresStorageBackend {
|
|
|
4502
4472
|
return Effect.gen(function* () {
|
|
4503
4473
|
if (self.db.transaction) {
|
|
4504
4474
|
yield* Effect.tryPromise({
|
|
4475
|
+
// pg transaction callback requires a Promise, not Effect — .then() chains are intentional
|
|
4476
|
+
/* eslint-disable effect/no-promise-then-catch */
|
|
4505
4477
|
try: () => self.db.transaction(
|
|
4506
|
-
(tx) => (
|
|
4507
|
-
|
|
4508
|
-
|
|
4509
|
-
|
|
4510
|
-
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
4514
|
-
|
|
4515
|
-
|
|
4516
|
-
|
|
4517
|
-
|
|
4518
|
-
() => tx.query(
|
|
4519
|
-
`DELETE FROM ${self.getTableName("deltas")} WHERE session_id = $1`,
|
|
4520
|
-
[key.id]
|
|
4521
|
-
)
|
|
4522
|
-
);
|
|
4523
|
-
yield* Effect.promise(
|
|
4524
|
-
() => tx.query(
|
|
4525
|
-
`DELETE FROM ${self.getTableName("sessions")} WHERE id = $1`,
|
|
4526
|
-
[key.id]
|
|
4527
|
-
)
|
|
4528
|
-
);
|
|
4529
|
-
})
|
|
4478
|
+
(tx) => tx.query(
|
|
4479
|
+
`DELETE FROM ${self.getTableName("snapshots")} WHERE session_id = $1`,
|
|
4480
|
+
[key.id]
|
|
4481
|
+
).then(
|
|
4482
|
+
() => tx.query(
|
|
4483
|
+
`DELETE FROM ${self.getTableName("deltas")} WHERE session_id = $1`,
|
|
4484
|
+
[key.id]
|
|
4485
|
+
)
|
|
4486
|
+
).then(
|
|
4487
|
+
() => tx.query(
|
|
4488
|
+
`DELETE FROM ${self.getTableName("sessions")} WHERE id = $1`,
|
|
4489
|
+
[key.id]
|
|
4530
4490
|
)
|
|
4531
4491
|
)
|
|
4532
4492
|
),
|
|
4493
|
+
/* eslint-enable effect/no-promise-then-catch */
|
|
4533
4494
|
catch: (error) => new PersistenceError2({
|
|
4534
4495
|
message: `Failed to delete state from PostgreSQL: ${error}`,
|
|
4535
4496
|
cause: error,
|
|
@@ -5136,7 +5097,7 @@ class JsonSchemaValidationError extends Data.TaggedError("JsonSchemaValidationEr
|
|
|
5136
5097
|
return `JSON validation failed for schema "${this.schemaName}": ${this.cause}`;
|
|
5137
5098
|
}
|
|
5138
5099
|
}
|
|
5139
|
-
const isNonNullObject = (value) => typeof value === "object" && !Array.isArray(value) && Option.fromNullable(value)
|
|
5100
|
+
const isNonNullObject = (value) => typeof value === "object" && !Array.isArray(value) && Option.isSome(Option.fromNullable(value));
|
|
5140
5101
|
const applyReplacer = (value, replacer) => {
|
|
5141
5102
|
const transform = (key, val) => {
|
|
5142
5103
|
const replaced = replacer(key, val);
|
|
@@ -5559,7 +5520,7 @@ const makeEnhancedHttpClient = Effect.gen(function* () {
|
|
|
5559
5520
|
redirect: options.followRedirects === false ? "manual" : "follow",
|
|
5560
5521
|
credentials: options.credentials ?? "same-origin"
|
|
5561
5522
|
}),
|
|
5562
|
-
catch: (error) => new
|
|
5523
|
+
catch: (error) => new NetworkError({
|
|
5563
5524
|
url,
|
|
5564
5525
|
method: options.method ?? "GET",
|
|
5565
5526
|
cause: error
|
|
@@ -5593,7 +5554,7 @@ const makeEnhancedHttpClient = Effect.gen(function* () {
|
|
|
5593
5554
|
);
|
|
5594
5555
|
const response = yield* fetchWithTimeout;
|
|
5595
5556
|
if (!response.ok) {
|
|
5596
|
-
return yield* Effect.fail(new
|
|
5557
|
+
return yield* Effect.fail(new NetworkError({
|
|
5597
5558
|
url: response.url,
|
|
5598
5559
|
statusCode: response.status,
|
|
5599
5560
|
method: options.method ?? "GET",
|
|
@@ -5656,7 +5617,7 @@ const makeEnhancedHttpClient = Effect.gen(function* () {
|
|
|
5656
5617
|
Effect.retry({
|
|
5657
5618
|
schedule: retrySchedule,
|
|
5658
5619
|
while: (error) => {
|
|
5659
|
-
if (error instanceof
|
|
5620
|
+
if (error instanceof NetworkError) {
|
|
5660
5621
|
if (error.statusCode && error.statusCode >= 400 && error.statusCode < 500) {
|
|
5661
5622
|
return false;
|
|
5662
5623
|
}
|
|
@@ -6775,28 +6736,38 @@ const WebScrapingEngineLive = Layer.effect(
|
|
|
6775
6736
|
makeWebScrapingEngine
|
|
6776
6737
|
);
|
|
6777
6738
|
export {
|
|
6739
|
+
AdapterNotInitialisedError,
|
|
6740
|
+
BrowserCleanupError,
|
|
6741
|
+
BrowserError,
|
|
6742
|
+
ConfigError,
|
|
6778
6743
|
ConfigurationError,
|
|
6744
|
+
ContentTypeError,
|
|
6779
6745
|
CookieManager,
|
|
6780
6746
|
CookieManagerLive,
|
|
6747
|
+
CrawlError,
|
|
6781
6748
|
DEFAULT_HYBRID_CONFIG,
|
|
6782
6749
|
DeltaPersistence,
|
|
6783
6750
|
EnhancedHttpClient,
|
|
6784
6751
|
EnhancedHttpClientLive,
|
|
6785
6752
|
FileStorageBackend,
|
|
6786
|
-
FileSystemError
|
|
6753
|
+
FileSystemError,
|
|
6787
6754
|
FullStatePersistence,
|
|
6788
6755
|
HybridPersistence,
|
|
6789
6756
|
LinkExtractionError,
|
|
6790
6757
|
LinkExtractorService,
|
|
6791
6758
|
LinkExtractorServiceLayer,
|
|
6792
6759
|
LoggingMiddleware,
|
|
6793
|
-
MiddlewareError
|
|
6760
|
+
MiddlewareError,
|
|
6794
6761
|
MiddlewareManager,
|
|
6795
|
-
NetworkError
|
|
6762
|
+
NetworkError,
|
|
6796
6763
|
PageDataSchema,
|
|
6764
|
+
PageError,
|
|
6765
|
+
ParseError,
|
|
6797
6766
|
PersistenceError$1 as PersistenceError,
|
|
6798
6767
|
PriorityRequest,
|
|
6768
|
+
QueueError,
|
|
6799
6769
|
RateLimitMiddleware,
|
|
6770
|
+
RequestAbortError,
|
|
6800
6771
|
ResponseError,
|
|
6801
6772
|
ResumabilityConfigs,
|
|
6802
6773
|
PersistenceError2 as ResumabilityError,
|
|
@@ -6804,6 +6775,7 @@ export {
|
|
|
6804
6775
|
RobotsService,
|
|
6805
6776
|
RobotsTxtError,
|
|
6806
6777
|
ScraperService,
|
|
6778
|
+
SessionError$1 as SessionError,
|
|
6807
6779
|
SessionStore,
|
|
6808
6780
|
SessionStoreLive,
|
|
6809
6781
|
SpiderConfig,
|
|
@@ -6814,17 +6786,23 @@ export {
|
|
|
6814
6786
|
SpiderState,
|
|
6815
6787
|
SpiderStateKey,
|
|
6816
6788
|
StateDelta,
|
|
6789
|
+
StateError,
|
|
6817
6790
|
StateManager,
|
|
6818
6791
|
StateManagerLive,
|
|
6819
6792
|
StatsMiddleware,
|
|
6793
|
+
TimeoutError,
|
|
6820
6794
|
TokenExtractor,
|
|
6821
6795
|
TokenExtractorLive,
|
|
6822
6796
|
TokenType,
|
|
6823
6797
|
UrlDeduplicatorService,
|
|
6824
6798
|
UserAgentMiddleware,
|
|
6799
|
+
ValidationError,
|
|
6825
6800
|
WebScrapingEngine,
|
|
6826
6801
|
WebScrapingEngineLive,
|
|
6827
6802
|
createStateOperation,
|
|
6803
|
+
isBrowserError,
|
|
6804
|
+
isNetworkError,
|
|
6805
|
+
isSpiderError,
|
|
6828
6806
|
makeCookieManager,
|
|
6829
6807
|
makeEnhancedHttpClient,
|
|
6830
6808
|
makeSessionStore,
|