@jambudipa/spider 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +14 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +571 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct
|
|
1
|
+
import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, Ref, HashSet, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct } from "effect";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
3
|
import * as fs from "fs";
|
|
4
4
|
import * as path from "path";
|
|
@@ -1747,6 +1747,62 @@ const deduplicateUrls = (urls, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Eff
|
|
|
1747
1747
|
}
|
|
1748
1748
|
};
|
|
1749
1749
|
});
|
|
1750
|
+
/**
 * Builds a stateful URL deduplicator.
 *
 * The returned object keeps a set of normalized URLs plus running counters
 * in Refs, so each call to `createUrlDeduplicator` yields an independent
 * tracker.
 *
 * @param strategy - normalization strategy forwarded to `normalizeUrl`
 *   (defaults to `DEFAULT_DEDUPLICATION_STRATEGY`).
 * @returns an Effect producing `{ hasSeenUrl, markUrlSeen, getStats, reset }`.
 */
const createUrlDeduplicator = (strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
  // Single source of truth for the zeroed counters (used at creation and on reset).
  const emptyStats = () => ({ processed: 0, unique: 0, duplicates: 0 });
  const seenUrls = yield* Ref.make(HashSet.empty());
  const urlStats = yield* Ref.make(emptyStats());
  return {
    /**
     * Check if a URL has been seen (after normalization)
     */
    hasSeenUrl: (url) => Effect.gen(function* () {
      const normalized = yield* normalizeUrl(url, strategy);
      const seen = yield* Ref.get(seenUrls);
      return HashSet.has(seen, normalized.normalized);
    }),
    /**
     * Add a URL to the seen set.
     *
     * Resolves to `true` when the normalized URL was not seen before and
     * `false` when it is a duplicate; the statistics are updated either way.
     */
    markUrlSeen: (url) => Effect.gen(function* () {
      const normalized = yield* normalizeUrl(url, strategy);
      const key = normalized.normalized;
      // Check-and-insert in one atomic Ref.modify: a separate get followed by
      // set would let two fibers both observe the URL as unseen and both
      // report it as unique.
      const isNew = yield* Ref.modify(
        seenUrls,
        (seen) => HashSet.has(seen, key) ? [false, seen] : [true, HashSet.add(seen, key)]
      );
      yield* Ref.update(urlStats, (stats) => isNew ? {
        ...stats,
        processed: stats.processed + 1,
        unique: stats.unique + 1
      } : {
        ...stats,
        processed: stats.processed + 1,
        duplicates: stats.duplicates + 1
      });
      return isNew;
    }),
    /**
     * Get deduplication statistics
     */
    getStats: () => Ref.get(urlStats),
    /**
     * Reset the deduplicator: forget every seen URL and zero the counters.
     */
    reset: () => Effect.gen(function* () {
      yield* Ref.set(seenUrls, HashSet.empty());
      yield* Ref.set(urlStats, emptyStats());
    })
  };
});
|
|
1750
1806
|
const SPIDER_DEFAULTS = Object.freeze({
|
|
1751
1807
|
/** Threshold in ms after which a worker is considered stale (60s) */
|
|
1752
1808
|
STALE_WORKER_THRESHOLD_MS: 6e4,
|
|
@@ -5080,6 +5136,120 @@ const ResumabilityConfigs = {
|
|
|
5080
5136
|
backend: new PostgresStorageBackend(dbClient, config)
|
|
5081
5137
|
})
|
|
5082
5138
|
};
|
|
5139
|
+
/**
 * Tagged error ("FetchError") raised by the logging fetch wrapper.
 *
 * The `message` getter is derived from the instance's `url`, `reason` and
 * `durationMs` fields.
 */
class FetchError extends Data.TaggedError("FetchError") {
  get message() {
    const { url, reason, durationMs } = this;
    return `Fetch failed for ${url}: ${reason} after ${durationMs}ms`;
  }
}
|
|
5144
|
+
/**
 * Effect that builds a logging `fetch` wrapper bound to the `SpiderLogger`
 * service.
 *
 * The produced function `(url, options) => Effect` performs
 * `globalThis.fetch(url, options)` with a 30 second timeout and emits
 * `edge_case` log events for start, abort (timeout), failure and success.
 *
 * It fails with `FetchError`:
 *  - `reason: "network"` when `fetch` rejects (the rejection is kept in `cause`),
 *  - `reason: "timeout"` when no response arrives within the timeout.
 * In both cases `durationMs` is the elapsed time measured from the start of
 * the call.
 *
 * NOTE(review): `new URL(url)` throws on an unparsable URL; presumably that
 * surfaces as a defect of the returned Effect rather than a `FetchError` —
 * confirm before relying on it.
 */
const makeLoggingFetch = Effect.gen(function* () {
  const logger = yield* SpiderLogger;
  return (url, options) => Effect.gen(function* () {
    const startTime = yield* DateTime.now;
    const startMs = DateTime.toEpochMillis(startTime);
    const domain = new URL(url).hostname;
    // Milliseconds elapsed since this request started, sampled when run.
    const elapsedMs = Effect.map(
      DateTime.now,
      (now) => DateTime.toEpochMillis(now) - startMs
    );
    // Only the method and the header *names* are logged, never header values.
    const optionDetails = Option.fromNullable(options).pipe(
      Option.map((opts) => ({
        method: opts.method,
        headers: Object.keys(opts.headers ?? {})
      }))
    );
    yield* logger.logEvent({
      type: "edge_case",
      domain,
      url,
      message: "[FETCH_START] Starting fetch request",
      details: {
        case: "fetch_start",
        url,
        timestamp: DateTime.formatIso(startTime),
        options: Option.getOrUndefined(optionDetails)
      }
    });
    const fetchEffect = Effect.tryPromise({
      // Accepting the AbortSignal lets Effect cancel the in-flight request
      // when the timeout below interrupts this effect. A signal supplied by
      // the caller takes precedence so existing cancellation keeps working.
      try: (signal) => globalThis.fetch(url, {
        ...options,
        signal: options?.signal ?? signal
      }),
      catch: (error) => new FetchError({
        url,
        reason: "network",
        // Placeholder: the real duration is filled in by the error handler below.
        durationMs: 0,
        cause: error
      })
    });
    const timeoutDuration = Duration.seconds(30);
    const fetchWithTimeout = fetchEffect.pipe(
      Effect.timeoutOption(timeoutDuration),
      Effect.flatMap(
        (maybeResponse) => Option.match(maybeResponse, {
          onNone: () => Effect.gen(function* () {
            const durationMs2 = yield* elapsedMs;
            yield* logger.logEvent({
              type: "edge_case",
              domain,
              url,
              message: `[FETCH_ABORT] Aborting fetch after ${durationMs2}ms`,
              details: {
                case: "fetch_abort",
                url,
                durationMs: durationMs2,
                reason: "timeout"
              }
            });
            return yield* Effect.fail(
              new FetchError({
                url,
                reason: "timeout",
                durationMs: Number(durationMs2)
              })
            );
          }),
          onSome: (response2) => Effect.succeed(response2)
        })
      )
    );
    const response = yield* fetchWithTimeout.pipe(
      Effect.catchAll(
        (error) => Effect.gen(function* () {
          const durationMs2 = yield* elapsedMs;
          // Network errors are created before the elapsed time is known;
          // rebuild them here so callers and logs see the real duration
          // instead of the 0ms placeholder. Other errors pass through as-is.
          const failure = error.reason === "network" ? new FetchError({
            url: error.url,
            reason: error.reason,
            durationMs: Number(durationMs2),
            cause: error.cause
          }) : error;
          yield* logger.logEvent({
            type: "edge_case",
            domain,
            url,
            message: `[FETCH_ERROR] Failed after ${durationMs2}ms`,
            details: {
              case: "fetch_failed",
              url,
              durationMs: durationMs2,
              error: failure._tag,
              message: failure.message,
              isAborted: failure.reason === "timeout"
            }
          });
          return yield* Effect.fail(failure);
        })
      )
    );
    const durationMs = yield* elapsedMs;
    yield* logger.logEvent({
      type: "edge_case",
      domain,
      url,
      message: `[FETCH_SUCCESS] Got response in ${durationMs}ms`,
      details: {
        case: "fetch_success",
        url,
        durationMs,
        status: response.status,
        statusText: response.statusText,
        contentType: response.headers.get("content-type")
      }
    });
    return response;
  });
});
|
|
5252
|
+
// Context tag under the key "LoggingFetch".
// NOTE(review): presumably the service it identifies is the function produced by makeLoggingFetch — confirm at the provide site.
const LoggingFetch = Context.GenericTag("LoggingFetch");
|
|
5083
5253
|
class JsonParseError extends Data.TaggedError("JsonParseError") {
|
|
5084
5254
|
get message() {
|
|
5085
5255
|
const preview = this.input.length > 100 ? `${this.input.substring(0, 100)}...` : this.input;
|
|
@@ -6527,6 +6697,251 @@ const TokenExtractorLive = Layer.effect(
|
|
|
6527
6697
|
TokenExtractor,
|
|
6528
6698
|
makeTokenExtractor
|
|
6529
6699
|
);
|
|
6700
|
+
/**
 * Effect service ("@jambudipa.io/BrowserEngine") driving a Playwright
 * Chromium browser.
 *
 * State is held in Refs: at most one browser, one browser context and one
 * "current" page. The browser and context are created lazily on first use;
 * page-level operations fail with a `PageError` ("No active page") until
 * `createPage` has been called.
 */
class BrowserEngineService extends Effect.Service()(
  "@jambudipa.io/BrowserEngine",
  {
    effect: Effect.gen(function* () {
      const browserRef = yield* Ref.make(Option.none());
      const contextRef = yield* Ref.make(Option.none());
      const pageRef = yield* Ref.make(Option.none());
      const configRef = yield* Ref.make({
        headless: true,
        timeout: 3e4,
        viewport: { width: 1920, height: 1080 },
        userAgent: "Mozilla/5.0 (compatible; Spider/1.0)",
        locale: "en-GB"
      });

      // Error mappers. The page URL is read only when a failure actually occurs.
      const browserFailure = (operation) => (cause) => new BrowserError({ operation, cause });
      const pageFailure = (page, operation, extra = {}) => (cause) => new PageError({
        url: page.url(),
        operation,
        ...extra,
        cause
      });

      // Best-effort close: any rejection from `close()` is discarded.
      const closeQuietly = (resource) => Effect.ignore(
        Effect.tryPromise({
          try: () => resource.close(),
          catch: (error) => error
        })
      );
      // Closes whatever the given Ref currently holds, if anything.
      const closeIfPresent = (ref) => Effect.gen(function* () {
        const held = yield* Ref.get(ref);
        if (Option.isSome(held)) {
          yield* closeQuietly(held.value);
        }
      });

      // Returns the cached browser, launching Chromium on first use.
      const ensureBrowser = () => Effect.gen(function* () {
        const existing = yield* Ref.get(browserRef);
        if (Option.isSome(existing)) {
          return existing.value;
        }
        const { chromium } = yield* Effect.tryPromise({
          try: () => import("playwright"),
          catch: () => BrowserError.launchFailed("Playwright not installed")
        });
        const { headless, timeout } = yield* Ref.get(configRef);
        const launched = yield* Effect.tryPromise({
          try: () => chromium.launch({ headless, timeout }),
          catch: (error) => BrowserError.launchFailed(error)
        });
        yield* Ref.set(browserRef, Option.some(launched));
        return launched;
      });

      // Returns the cached browser context, creating it on first use.
      const ensureContext = () => Effect.gen(function* () {
        const existing = yield* Ref.get(contextRef);
        if (Option.isSome(existing)) {
          return existing.value;
        }
        const browser = yield* ensureBrowser();
        const { viewport, userAgent, locale } = yield* Ref.get(configRef);
        const created = yield* Effect.tryPromise({
          try: () => browser.newContext({ viewport, userAgent, locale }),
          catch: browserFailure("newContext")
        });
        yield* Ref.set(contextRef, Option.some(created));
        return created;
      });

      // Yields the current page or fails with a PageError when none exists.
      const getCurrentPage = () => Effect.flatMap(
        Ref.get(pageRef),
        Option.match({
          onNone: () => Effect.fail(new PageError({
            url: "unknown",
            operation: "getCurrentPage",
            cause: "No active page"
          })),
          onSome: (page) => Effect.succeed(page)
        })
      );

      return {
        /** Ensures the browser is running. */
        launch: () => Effect.gen(function* () {
          yield* ensureBrowser();
          yield* Effect.log("Browser launched successfully");
        }),
        /** Opens a new page in the shared context and makes it the current page. */
        createPage: () => Effect.gen(function* () {
          const context = yield* ensureContext();
          const page = yield* Effect.tryPromise({
            try: () => context.newPage(),
            catch: browserFailure("newPage")
          });
          yield* Ref.set(pageRef, Option.some(page));
          yield* Effect.log("New page created");
          return page;
        }),
        /** Navigates the current page to `url`, waiting for "networkidle". */
        navigateTo: (url) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          yield* Effect.tryPromise({
            try: () => page.goto(url, { waitUntil: "networkidle" }),
            // Reports the requested URL, not the page's current one.
            catch: (cause) => new PageError({ url, operation: "navigate", cause })
          });
          yield* Effect.logDebug(`Navigated to ${url}`);
        }),
        /** Waits for `selector`; `timeout` falls back to the configured timeout. */
        waitForSelector: (selector, timeout) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          const config = yield* Ref.get(configRef);
          yield* Effect.tryPromise({
            try: () => page.waitForSelector(selector, {
              timeout: timeout ?? config.timeout
            }),
            catch: pageFailure(page, "waitForSelector", { selector })
          });
        }),
        /** Clicks the element matching `selector` on the current page. */
        click: (selector) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          yield* Effect.tryPromise({
            try: () => page.click(selector),
            catch: pageFailure(page, "click", { selector })
          });
          yield* Effect.logDebug(`Clicked element: ${selector}`);
        }),
        /** Fills the element matching `selector` with `value` (value is not logged). */
        fill: (selector, value) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          yield* Effect.tryPromise({
            try: () => page.fill(selector, value),
            catch: pageFailure(page, "fill", { selector })
          });
          yield* Effect.logDebug(`Filled ${selector} with value`);
        }),
        /** Scrolls vertically by `distance` pixels; evaluation failures are ignored. */
        scroll: (distance) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          yield* Effect.ignore(
            Effect.tryPromise({
              try: () => page.evaluate((d) => {
                window.scrollBy(0, d);
              }, distance),
              catch: (error) => error
            })
          );
          yield* Effect.logDebug(`Scrolled ${distance}px`);
        }),
        /** Evaluates `script` in the current page and yields its result. */
        evaluate: (script) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          return yield* Effect.tryPromise({
            try: () => page.evaluate(script),
            catch: pageFailure(page, "evaluate")
          });
        }),
        /** Yields the current page's HTML via `page.content()`. */
        getHTML: () => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          return yield* Effect.tryPromise({
            try: () => page.content(),
            catch: pageFailure(page, "getHTML")
          });
        }),
        /** Takes a full-page screenshot, passing `path2` as Playwright's `path` option. */
        screenshot: (path2) => Effect.gen(function* () {
          const page = yield* getCurrentPage();
          const buffer = yield* Effect.tryPromise({
            try: () => page.screenshot({ path: path2, fullPage: true }),
            catch: pageFailure(page, "screenshot")
          });
          yield* Effect.log(`Screenshot taken${path2 ? ` and saved to ${path2}` : ""}`);
          return buffer;
        }),
        /** Closes and forgets the current page; does nothing when there is none. */
        closePage: () => Effect.gen(function* () {
          const current = yield* Ref.get(pageRef);
          if (Option.isNone(current)) {
            return;
          }
          yield* closeQuietly(current.value);
          yield* Ref.set(pageRef, Option.none());
          yield* Effect.log("Page closed");
        }),
        /** Closes page, context and browser (in that order) and clears all state. */
        close: () => Effect.gen(function* () {
          yield* closeIfPresent(pageRef);
          yield* closeIfPresent(contextRef);
          yield* closeIfPresent(browserRef);
          yield* Ref.set(pageRef, Option.none());
          yield* Ref.set(contextRef, Option.none());
          yield* Ref.set(browserRef, Option.none());
          yield* Effect.log("Browser engine closed");
        })
      };
    })
  }
) {
}
|
|
6935
|
+
// Alias for the `Default` layer of BrowserEngineService.
const BrowserEngineLive = BrowserEngineService.Default;
|
|
6936
|
+
// Returns the `Default` layer of BrowserEngineService.
// NOTE(review): `_config` is accepted but never read here, so the result is identical to BrowserEngineLive — confirm whether configuration is meant to be applied.
const BrowserEngineWithConfig = (_config) => BrowserEngineService.Default;
|
|
6937
|
+
/**
 * Runs `operation` with the BrowserEngineService instance and always calls
 * the engine's `close()` afterwards, whether the operation succeeds or not.
 *
 * @param operation - function receiving the engine and returning an Effect.
 * @returns an Effect yielding the operation's result; requires BrowserEngineService.
 */
const withBrowser = (operation) => Effect.flatMap(
  BrowserEngineService,
  (engine) => Effect.acquireUseRelease(
    // The engine already exists, so "acquire" just hands it over.
    Effect.succeed(engine),
    operation,
    (acquired) => acquired.close()
  )
);
|
|
6530
6945
|
// Tagged error with the tag "LoginError"; it declares no members of its own.
class LoginError extends Data.TaggedError("LoginError") {
|
|
6531
6946
|
}
|
|
6532
6947
|
class SessionNotValidError extends Data.TaggedError("SessionNotValidError") {
|
|
@@ -6735,20 +7150,153 @@ const WebScrapingEngineLive = Layer.effect(
|
|
|
6735
7150
|
WebScrapingEngine,
|
|
6736
7151
|
makeWebScrapingEngine
|
|
6737
7152
|
);
|
|
7153
|
+
/**
 * Effect service ("@jambudipa.io/WorkerHealthMonitor") that tracks the last
 * activity of each worker and reports workers that have been inactive for
 * longer than 60 000 ms.
 *
 * Worker records are kept in a Ref holding a HashMap keyed by worker id.
 */
class WorkerHealthMonitor extends Effect.Service()(
  "@jambudipa.io/WorkerHealthMonitor",
  {
    effect: Effect.gen(function* () {
      const logger = yield* SpiderLogger;
      const workers = yield* Ref.make(HashMap.empty());
      const stuckThresholdMs = 6e4;

      // Milliseconds between a recorded instant and a later reference instant.
      const millisBetween = (earlier, later) => DateTime.toEpochMillis(later) - DateTime.toEpochMillis(earlier);

      // Snapshot of every worker whose last activity is older than the threshold.
      const getStuckWorkers = Effect.gen(function* () {
        const now = DateTime.unsafeNow();
        const workerMap = yield* Ref.get(workers);
        return Array.from(workerMap, ([, status]) => status).filter(
          (status) => millisBetween(status.lastActivity, now) > stuckThresholdMs
        );
      });

      // One monitoring pass: log an edge case for each stuck worker.
      const reportStuckWorkers = Effect.gen(function* () {
        const stuck = yield* getStuckWorkers;
        for (const worker of stuck) {
          const inactiveMs = millisBetween(worker.lastActivity, DateTime.unsafeNow());
          yield* logger.logEdgeCase(
            worker.domain,
            "worker_stuck_detected",
            {
              workerId: worker.workerId,
              currentUrl: worker.currentUrl,
              lastActivity: DateTime.formatIso(worker.lastActivity),
              inactiveMs,
              fetchStartTime: worker.fetchStartTime == null ? "N/A" : DateTime.formatIso(worker.fetchStartTime)
            }
          );
        }
      });

      return {
        /**
         * Register a worker's activity.
         *
         * Creates the record on first sight, refreshes `domain` and
         * `lastActivity`, replaces `currentUrl` when `activity.url` is not
         * nullish, and stamps `fetchStartTime` when `activity.fetchStart` is truthy.
         */
        recordActivity: (workerId, domain, activity) => Effect.gen(function* () {
          const now = DateTime.unsafeNow();
          yield* Ref.update(workers, (map) => {
            const previous = Option.getOrElse(
              HashMap.get(map, workerId),
              () => ({ workerId, domain, lastActivity: now })
            );
            return HashMap.set(map, workerId, {
              ...previous,
              domain,
              lastActivity: now,
              currentUrl: activity.url ?? previous.currentUrl,
              fetchStartTime: activity.fetchStart ? now : previous.fetchStartTime
            });
          });
        }),
        /**
         * Remove a worker from monitoring
         */
        removeWorker: (workerId) => Ref.update(workers, (map) => HashMap.remove(map, workerId)),
        /**
         * Get stuck workers
         */
        getStuckWorkers,
        /**
         * Monitor workers and log stuck ones; the check repeats on a fixed
         * 30 second schedule.
         */
        startMonitoring: reportStuckWorkers.pipe(
          Effect.repeat(Schedule.fixed(Duration.seconds(30))),
          Effect.asVoid
        )
      };
    })
  }
) {
}
|
|
7253
|
+
/**
 * Decodes `data` as JSON through `Schema.parseJson(Schema.Unknown)`.
 *
 * @param data - value to decode.
 * @param onError - maps the schema decoding error into the caller's error type.
 * @returns an Effect yielding the parsed value or failing with `onError(error)`.
 */
const safeJsonParse = (data, onError) => {
  const decodeJson = Schema.decodeUnknown(Schema.parseJson(Schema.Unknown));
  return Effect.mapError(decodeJson(data), onError);
};
|
|
7256
|
+
/**
 * Wraps a possibly-nullish value in an Option, inside an Effect.
 *
 * When `logContext` is truthy and the value is nullish, a debug line
 * mentioning `logContext` is logged before the `None` is returned.
 *
 * @param value - value to wrap.
 * @param logContext - optional label included in the debug log.
 * @returns an Effect yielding `Option.fromNullable(value)`.
 */
const toOption = (value, logContext) => {
  const wrapped = Option.fromNullable(value);
  const shouldLog = Boolean(logContext) && Option.isNone(wrapped);
  if (!shouldLog) {
    return Effect.succeed(wrapped);
  }
  return Effect.as(
    Effect.logDebug(`[Migration] Null value encountered: ${logContext}`),
    wrapped
  );
};
|
|
7265
|
+
/**
 * Lifts a promise-returning thunk into an Effect.
 *
 * @param promise - thunk producing the promise; it is handed to
 *   `Effect.tryPromise` unchanged.
 * @param onError - maps a rejection (or thrown value) into the failure value.
 * @returns the Effect created by `Effect.tryPromise`.
 */
const fromPromise = (promise, onError) => {
  const spec = { try: promise, catch: onError };
  return Effect.tryPromise(spec);
};
|
|
7269
|
+
/**
 * Runs the `cleanup` thunk of every resource and collects each outcome.
 *
 * Each entry is `{ id, cleanup, onError }`; a failing cleanup is converted
 * with `onError(id, error)`. `Effect.all` is run in "either" mode.
 *
 * @param resources - array of cleanup descriptors.
 * @returns an Effect yielding one result per resource, in input order.
 */
const cleanupResources = (resources) => {
  const toCleanupEffect = ({ id, cleanup, onError }) => Effect.tryPromise({
    try: cleanup,
    catch: (error) => onError(id, error)
  });
  const cleanups = resources.map((resource) => toCleanupEffect(resource));
  return Effect.all(cleanups, { mode: "either" });
};
|
|
7278
|
+
/**
 * Positional-argument wrapper around `Option.match`.
 *
 * @param option - the Option to inspect.
 * @param onNone - called when the Option is empty.
 * @param onSome - called with the contained value otherwise.
 * @returns whatever the selected handler returns.
 */
const matchOption = (option, onNone, onSome) => {
  const handlers = { onNone, onSome };
  return Option.match(option, handlers);
};
|
|
6738
7279
|
export {
|
|
7280
|
+
APITokenNotFoundError,
|
|
6739
7281
|
AdapterNotInitialisedError,
|
|
6740
7282
|
BrowserCleanupError,
|
|
7283
|
+
BrowserEngineLive,
|
|
7284
|
+
BrowserEngineService,
|
|
7285
|
+
BrowserEngineWithConfig,
|
|
6741
7286
|
BrowserError,
|
|
7287
|
+
CSRFTokenNotFoundError,
|
|
6742
7288
|
ConfigError,
|
|
6743
7289
|
ConfigurationError,
|
|
6744
7290
|
ContentTypeError,
|
|
6745
7291
|
CookieManager,
|
|
6746
7292
|
CookieManagerLive,
|
|
6747
7293
|
CrawlError,
|
|
7294
|
+
DEFAULT_DEDUPLICATION_STRATEGY,
|
|
6748
7295
|
DEFAULT_HYBRID_CONFIG,
|
|
6749
7296
|
DeltaPersistence,
|
|
6750
7297
|
EnhancedHttpClient,
|
|
6751
7298
|
EnhancedHttpClientLive,
|
|
7299
|
+
FetchError,
|
|
6752
7300
|
FileStorageBackend,
|
|
6753
7301
|
FileSystemError,
|
|
6754
7302
|
FullStatePersistence,
|
|
@@ -6756,7 +7304,9 @@ export {
|
|
|
6756
7304
|
LinkExtractionError,
|
|
6757
7305
|
LinkExtractorService,
|
|
6758
7306
|
LinkExtractorServiceLayer,
|
|
7307
|
+
LoggingFetch,
|
|
6759
7308
|
LoggingMiddleware,
|
|
7309
|
+
LoginError,
|
|
6760
7310
|
MiddlewareError,
|
|
6761
7311
|
MiddlewareManager,
|
|
6762
7312
|
NetworkError,
|
|
@@ -6764,6 +7314,7 @@ export {
|
|
|
6764
7314
|
PageError,
|
|
6765
7315
|
ParseError,
|
|
6766
7316
|
PersistenceError$1 as PersistenceError,
|
|
7317
|
+
PostgresStorageBackend,
|
|
6767
7318
|
PriorityRequest,
|
|
6768
7319
|
QueueError,
|
|
6769
7320
|
RateLimitMiddleware,
|
|
@@ -6774,8 +7325,11 @@ export {
|
|
|
6774
7325
|
ResumabilityService,
|
|
6775
7326
|
RobotsService,
|
|
6776
7327
|
RobotsTxtError,
|
|
7328
|
+
SPIDER_DEFAULTS,
|
|
6777
7329
|
ScraperService,
|
|
6778
7330
|
SessionError$1 as SessionError,
|
|
7331
|
+
SessionLoadError,
|
|
7332
|
+
SessionNotValidError,
|
|
6779
7333
|
SessionStore,
|
|
6780
7334
|
SessionStoreLive,
|
|
6781
7335
|
SpiderConfig,
|
|
@@ -6790,26 +7344,41 @@ export {
|
|
|
6790
7344
|
StateManager,
|
|
6791
7345
|
StateManagerLive,
|
|
6792
7346
|
StatsMiddleware,
|
|
7347
|
+
StorageKeyNotFoundError,
|
|
6793
7348
|
TimeoutError,
|
|
7349
|
+
TokenExpiredError,
|
|
6794
7350
|
TokenExtractor,
|
|
6795
7351
|
TokenExtractorLive,
|
|
7352
|
+
TokenNotFoundError,
|
|
6796
7353
|
TokenType,
|
|
6797
7354
|
UrlDeduplicatorService,
|
|
6798
7355
|
UserAgentMiddleware,
|
|
6799
7356
|
ValidationError,
|
|
6800
7357
|
WebScrapingEngine,
|
|
6801
7358
|
WebScrapingEngineLive,
|
|
7359
|
+
WorkerHealthMonitor,
|
|
7360
|
+
cleanupResources,
|
|
6802
7361
|
createStateOperation,
|
|
7362
|
+
createUrlDeduplicator,
|
|
7363
|
+
deduplicateUrls,
|
|
7364
|
+
fromPromise,
|
|
6803
7365
|
isBrowserError,
|
|
6804
7366
|
isNetworkError,
|
|
6805
7367
|
isSpiderError,
|
|
6806
7368
|
makeCookieManager,
|
|
6807
7369
|
makeEnhancedHttpClient,
|
|
7370
|
+
makeLoggingFetch,
|
|
6808
7371
|
makeSessionStore,
|
|
6809
7372
|
makeSpiderConfig,
|
|
6810
7373
|
makeSpiderLogger,
|
|
6811
7374
|
makeStateManager,
|
|
6812
7375
|
makeTokenExtractor,
|
|
6813
|
-
makeWebScrapingEngine
|
|
7376
|
+
makeWebScrapingEngine,
|
|
7377
|
+
matchOption,
|
|
7378
|
+
normalizeUrl,
|
|
7379
|
+
parseUrl,
|
|
7380
|
+
safeJsonParse,
|
|
7381
|
+
toOption,
|
|
7382
|
+
withBrowser
|
|
6814
7383
|
};
|
|
6815
7384
|
//# sourceMappingURL=index.js.map
|