@nikx/dory-worker 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +37 -0
- package/.gitattributes +2 -0
- package/README.md +2 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +59 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +17 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +29 -0
- package/dist/config.js.map +1 -0
- package/dist/db.d.ts +14 -0
- package/dist/db.d.ts.map +1 -0
- package/dist/db.js +69 -0
- package/dist/db.js.map +1 -0
- package/dist/docker.d.ts +22 -0
- package/dist/docker.d.ts.map +1 -0
- package/dist/docker.js +78 -0
- package/dist/docker.js.map +1 -0
- package/dist/logger.d.ts +8 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +35 -0
- package/dist/logger.js.map +1 -0
- package/dist/processor.d.ts +20 -0
- package/dist/processor.d.ts.map +1 -0
- package/dist/processor.js +116 -0
- package/dist/processor.js.map +1 -0
- package/dist/storage.d.ts +7 -0
- package/dist/storage.d.ts.map +1 -0
- package/dist/storage.js +37 -0
- package/dist/storage.js.map +1 -0
- package/dist/types.d.ts +61 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/worker.d.ts +7 -0
- package/dist/worker.d.ts.map +1 -0
- package/dist/worker.js +56 -0
- package/dist/worker.js.map +1 -0
- package/package.json +36 -0
- package/src/cli.ts +66 -0
- package/src/config.ts +64 -0
- package/src/docker.ts +108 -0
- package/src/logger.ts +34 -0
- package/src/processor.ts +160 -0
- package/src/worker.ts +64 -0
- package/tsconfig.json +18 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"storage.js","sourceRoot":"","sources":["../src/storage.ts"],"names":[],"mappings":";;AAQA,oDAkCC;AA1CD,mDAAgD;AAEhD,qCAAkC;AAElC;;;GAGG;AACI,KAAK,UAAU,oBAAoB,CACxC,MAAoB,EACpB,KAAa;IAEb,MAAM,QAAQ,GAAG,GAAG,KAAK,8CAA8C,CAAC;IAExE,IAAI,CAAC;QACH,IAAI,MAAM,CAAC,mBAAmB,EAAE,CAAC;YAC/B,6CAA6C;YAC7C,MAAM,IAAI,GAAG,MAAM,CAAC,mBAAmB,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;YAC5D,MAAM,GAAG,GAAG,GAAG,IAAI,0BAA0B,MAAM,CAAC,SAAS,MAAM,kBAAkB,CAAC,QAAQ,CAAC,YAAY,CAAC;YAC5G,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;YAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;gBACZ,eAAM,CAAC,IAAI,CACT,8CAA8C,KAAK,UAAU,GAAG,CAAC,MAAM,uBAAuB,CAC/F,CAAC;gBACF,OAAO,EAAE,CAAC;YACZ,CAAC;YACD,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAA4B,CAAC;QACvD,CAAC;QAED,4DAA4D;QAC5D,MAAM,OAAO,GAAG,IAAI,iBAAO,CAAC,EAAE,SAAS,EAAE,MAAM,CAAC,YAAY,EAAE,CAAC,CAAC;QAChE,MAAM,CAAC,QAAQ,CAAC,GAAG,MAAM,OAAO;aAC7B,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC;aACxB,IAAI,CAAC,QAAQ,CAAC;aACd,QAAQ,EAAE,CAAC;QACd,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAA4B,CAAC;IACpE,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,eAAM,CAAC,KAAK,CACV,qCAAqC,KAAK,KAAM,GAAa,CAAC,OAAO,EAAE,CACxE,CAAC;QACF,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import type { ObjectId } from "mongodb";
|
|
2
|
+
export interface Handler {
|
|
3
|
+
label: string;
|
|
4
|
+
code: string;
|
|
5
|
+
description?: string;
|
|
6
|
+
}
|
|
7
|
+
export interface ActorInput {
|
|
8
|
+
startUrls?: Array<string | {
|
|
9
|
+
url: string;
|
|
10
|
+
}>;
|
|
11
|
+
crawlerType?: string;
|
|
12
|
+
handlers?: Handler[];
|
|
13
|
+
maxPagesPerCrawl?: number;
|
|
14
|
+
maxConcurrency?: number;
|
|
15
|
+
actorMemoryMbytes?: number;
|
|
16
|
+
actorTimeoutSecs?: number;
|
|
17
|
+
tieredProxyUrls?: string[][];
|
|
18
|
+
webhookUrl?: string;
|
|
19
|
+
webhookRequestMethod?: string;
|
|
20
|
+
webhookPayloadSchema?: string;
|
|
21
|
+
webhookHeaders?: Array<{
|
|
22
|
+
key: string;
|
|
23
|
+
value: string;
|
|
24
|
+
}>;
|
|
25
|
+
uniqueKeySelector?: string[];
|
|
26
|
+
[key: string]: unknown;
|
|
27
|
+
}
|
|
28
|
+
export interface ActorDocument {
|
|
29
|
+
_id: ObjectId;
|
|
30
|
+
name: string;
|
|
31
|
+
crawlerType: "playwright" | "cheerio";
|
|
32
|
+
handlers: Handler[];
|
|
33
|
+
defaultInput: ActorInput;
|
|
34
|
+
[key: string]: unknown;
|
|
35
|
+
}
|
|
36
|
+
export type RunStatus = "queued" | "pending" | "running" | "completed" | "failed" | "cancelled";
|
|
37
|
+
export interface RunDocument {
|
|
38
|
+
_id: ObjectId;
|
|
39
|
+
actorId: ObjectId;
|
|
40
|
+
jobName: string;
|
|
41
|
+
executionId: string;
|
|
42
|
+
status: RunStatus;
|
|
43
|
+
containerId?: string;
|
|
44
|
+
startedAt?: Date;
|
|
45
|
+
finishedAt?: Date;
|
|
46
|
+
exitCode?: number;
|
|
47
|
+
error?: string;
|
|
48
|
+
errorCount?: number;
|
|
49
|
+
[key: string]: unknown;
|
|
50
|
+
}
|
|
51
|
+
/** Shape of the ACTOR_CONFIG JSON passed to the scraping container */
|
|
52
|
+
export interface ActorConfig {
|
|
53
|
+
jobId: string;
|
|
54
|
+
runId: string;
|
|
55
|
+
actorInstanceId: string;
|
|
56
|
+
userInput: ActorInput & {
|
|
57
|
+
crawlerType: string;
|
|
58
|
+
handlers: Handler[];
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAExC,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,UAAU;IACzB,SAAS,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG;QAAE,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC5C,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;IACrB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,eAAe,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAC7B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,cAAc,CAAC,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACvD,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC7B,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,WAAW,aAAa;IAC5B,GAAG,EAAE,QAAQ,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,YAAY,GAAG,SAAS,CAAC;IACtC,QAAQ,EAAE,OAAO,EAAE,CAAC;IACpB,YAAY,EAAE,UAAU,CAAC;IACzB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,MAAM,SAAS,GACjB,QAAQ,GACR,SAAS,GACT,SAAS,GACT,WAAW,GACX,QAAQ,GACR,WAAW,CAAC;AAEhB,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,QAAQ,CAAC;IACd,OAAO,EAAE,QAAQ,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,SAAS,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,IAAI,CAAC;IACjB,UAAU,CAAC,EAAE,IAAI,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,sEAAsE;AACtE,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,MAAM,CAAC;IACxB,SAAS,EAAE,UAAU,GAAG;QACtB,WAAW,EAAE,MAAM,CAAC;QACpB,QAAQ,EAAE,OAAO,EAAE,CAAC;KACrB,CAAC;CACH"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/dist/worker.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import { Worker } from "bullmq";
|
|
2
|
+
import IORedis from "ioredis";
|
|
3
|
+
import type { WorkerConfig } from "./config";
|
|
4
|
+
export declare const RUN_EXECUTION_QUEUE = "run-execution";
|
|
5
|
+
export declare function createRedisConnection(config: WorkerConfig): IORedis;
|
|
6
|
+
export declare function createWorker(config: WorkerConfig, redis: IORedis): Worker;
|
|
7
|
+
//# sourceMappingURL=worker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"worker.d.ts","sourceRoot":"","sources":["../src/worker.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAI7C,eAAO,MAAM,mBAAmB,kBAAkB,CAAC;AAEnD,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAenE;AAED,wBAAgB,YAAY,CAAC,MAAM,EAAE,YAAY,EAAE,KAAK,EAAE,OAAO,GAAG,MAAM,CAsCzE"}
|
package/dist/worker.js
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.RUN_EXECUTION_QUEUE = void 0;
|
|
7
|
+
exports.createRedisConnection = createRedisConnection;
|
|
8
|
+
exports.createWorker = createWorker;
|
|
9
|
+
const bullmq_1 = require("bullmq");
|
|
10
|
+
const ioredis_1 = __importDefault(require("ioredis"));
|
|
11
|
+
const processor_1 = require("./processor");
|
|
12
|
+
const logger_1 = require("./logger");
|
|
13
|
+
exports.RUN_EXECUTION_QUEUE = "run-execution";
|
|
14
|
+
function createRedisConnection(config) {
|
|
15
|
+
const conn = config.redisUrl
|
|
16
|
+
? new ioredis_1.default(config.redisUrl, { maxRetriesPerRequest: null })
|
|
17
|
+
: new ioredis_1.default({
|
|
18
|
+
host: config.redisHost,
|
|
19
|
+
port: config.redisPort,
|
|
20
|
+
password: config.redisPassword,
|
|
21
|
+
maxRetriesPerRequest: null,
|
|
22
|
+
});
|
|
23
|
+
conn.on("connect", () => logger_1.logger.info("Redis connected"));
|
|
24
|
+
conn.on("error", (err) => logger_1.logger.error(`Redis error: ${err.message}`));
|
|
25
|
+
conn.on("reconnecting", () => logger_1.logger.warn("Redis reconnecting..."));
|
|
26
|
+
return conn;
|
|
27
|
+
}
|
|
28
|
+
function createWorker(config, redis) {
|
|
29
|
+
const worker = new bullmq_1.Worker(exports.RUN_EXECUTION_QUEUE, (job) => (0, processor_1.processRunJob)(config, job), {
|
|
30
|
+
connection: redis,
|
|
31
|
+
concurrency: config.maxConcurrentRuns,
|
|
32
|
+
lockDuration: 300_000, // 5 min — containers can be long-running
|
|
33
|
+
stalledInterval: 30_000, // check for stalled jobs every 30 s
|
|
34
|
+
maxStalledCount: 2, // re-queue once, then fail
|
|
35
|
+
});
|
|
36
|
+
worker.on("completed", (job) => {
|
|
37
|
+
logger_1.logger.info(`Job ${job.id} completed`);
|
|
38
|
+
});
|
|
39
|
+
worker.on("failed", (job, err) => {
|
|
40
|
+
logger_1.logger.error(`Job ${job?.id ?? "unknown"} failed: ${err.message}`);
|
|
41
|
+
});
|
|
42
|
+
worker.on("stalled", (jobId) => {
|
|
43
|
+
logger_1.logger.warn(`Job ${jobId} stalled — BullMQ will re-queue or fail it`);
|
|
44
|
+
});
|
|
45
|
+
worker.on("error", (err) => {
|
|
46
|
+
// Suppress the well-known "Missing lock" race condition — BullMQ stalled-job
|
|
47
|
+
// detection will recover the job automatically.
|
|
48
|
+
if (err.message?.includes("Missing lock")) {
|
|
49
|
+
logger_1.logger.warn(`Lock expired before retry could be scheduled — job will be recovered. (${err.message})`);
|
|
50
|
+
return;
|
|
51
|
+
}
|
|
52
|
+
logger_1.logger.error(`Worker error: ${err.message}`);
|
|
53
|
+
});
|
|
54
|
+
return worker;
|
|
55
|
+
}
|
|
56
|
+
//# sourceMappingURL=worker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"worker.js","sourceRoot":"","sources":["../src/worker.ts"],"names":[],"mappings":";;;;;;AAQA,sDAeC;AAED,oCAsCC;AA/DD,mCAAgC;AAChC,sDAA8B;AAE9B,2CAA4C;AAC5C,qCAAkC;AAErB,QAAA,mBAAmB,GAAG,eAAe,CAAC;AAEnD,SAAgB,qBAAqB,CAAC,MAAoB;IACxD,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ;QAC1B,CAAC,CAAC,IAAI,iBAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,oBAAoB,EAAE,IAAI,EAAE,CAAC;QAC9D,CAAC,CAAC,IAAI,iBAAO,CAAC;YACV,IAAI,EAAE,MAAM,CAAC,SAAS;YACtB,IAAI,EAAE,MAAM,CAAC,SAAS;YACtB,QAAQ,EAAE,MAAM,CAAC,aAAa;YAC9B,oBAAoB,EAAE,IAAI;SAC3B,CAAC,CAAC;IAEP,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,eAAM,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,CAAC;IACzD,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAU,EAAE,EAAE,CAAC,eAAM,CAAC,KAAK,CAAC,gBAAgB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;IAC9E,IAAI,CAAC,EAAE,CAAC,cAAc,EAAE,GAAG,EAAE,CAAC,eAAM,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC,CAAC;IAEpE,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAgB,YAAY,CAAC,MAAoB,EAAE,KAAc;IAC/D,MAAM,MAAM,GAAG,IAAI,eAAM,CACvB,2BAAmB,EACnB,CAAC,GAAG,EAAE,EAAE,CAAC,IAAA,yBAAa,EAAC,MAAM,EAAE,GAAG,CAAC,EACnC;QACE,UAAU,EAAE,KAAK;QACjB,WAAW,EAAE,MAAM,CAAC,iBAAiB;QACrC,YAAY,EAAE,OAAO,EAAE,yCAAyC;QAChE,eAAe,EAAE,MAAM,EAAE,oCAAoC;QAC7D,eAAe,EAAE,CAAC,EAAE,2BAA2B;KAChD,CACF,CAAC;IAEF,MAAM,CAAC,EAAE,CAAC,WAAW,EAAE,CAAC,GAAG,EAAE,EAAE;QAC7B,eAAM,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,EAAE,YAAY,CAAC,CAAC;IACzC,CAAC,CAAC,CAAC;IAEH,MAAM,CAAC,EAAE,CAAC,QAAQ,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;QAC/B,eAAM,CAAC,KAAK,CAAC,OAAO,GAAG,EAAE,EAAE,IAAI,SAAS,YAAY,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IACrE,CAAC,CAAC,CAAC;IAEH,MAAM,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,KAAa,EAAE,EAAE;QACrC,eAAM,CAAC,IAAI,CAAC,OAAO,KAAK,4CAA4C,CAAC,CAAC;IACxE,CAAC,CAAC,CAAC;IAEH,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAU,EAAE,EAAE;QAChC,6EAA6E;QAC7E,gDAAgD;QAChD,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;YAC1C,eAAM,CAAC,IAAI,CACT,0EAA0E,GAAG,CAAC,OAAO,GAAG,CACzF,CAAC;YACF,OAAO;QACT,CAAC;QACD,eAAM,CAAC,KAAK,CAAC,iBAAiB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@nikx/dory-worker",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Standalone BullMQ worker for Dory – runs on any machine with Docker (including Raspberry Pi)",
|
|
5
|
+
"main": "dist/cli.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"dory-worker": "dist/cli.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"build": "tsc",
|
|
11
|
+
"start": "node dist/cli.js",
|
|
12
|
+
"dev": "npx ts-node --project tsconfig.json src/cli.ts",
|
|
13
|
+
"prepare": "npm run build"
|
|
14
|
+
},
|
|
15
|
+
"keywords": [
|
|
16
|
+
"dory",
|
|
17
|
+
"bullmq",
|
|
18
|
+
"worker",
|
|
19
|
+
"crawlee",
|
|
20
|
+
"scraping"
|
|
21
|
+
],
|
|
22
|
+
"author": "",
|
|
23
|
+
"license": "ISC",
|
|
24
|
+
"engines": {
|
|
25
|
+
"node": ">=20.0.0"
|
|
26
|
+
},
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"bullmq": "^5.70.1",
|
|
29
|
+
"ioredis": "^5.9.3"
|
|
30
|
+
},
|
|
31
|
+
"devDependencies": {
|
|
32
|
+
"@types/node": "^20.19.27",
|
|
33
|
+
"ts-node": "^10.9.2",
|
|
34
|
+
"typescript": "^5.9.3"
|
|
35
|
+
}
|
|
36
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!/usr/bin/env node
import { loadConfig } from "./config";
import { createRedisConnection, createWorker } from "./worker";
import { logger, setLogLevel } from "./logger";

/**
 * CLI entry point: load configuration, start the BullMQ worker, and install
 * graceful-shutdown handlers. In normal operation this promise never
 * resolves to "done" — the process exits via shutdown() or the fatal-error
 * handler at the bottom of the file.
 */
async function main(): Promise<void> {
  // ── Load & validate config ────────────────────────────────────────────
  let config;
  try {
    config = loadConfig();
  } catch (err) {
    // Plain console.error here: the logger level is not configured yet.
    console.error(
      `[dory-worker] Configuration error: ${(err as Error).message}`,
    );
    console.error("Set the required environment variables and try again.");
    process.exit(1); // typed `never`, so `config` is definitely assigned below
  }

  setLogLevel(config.logLevel);

  logger.info("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
  logger.info("  dory-worker starting");
  logger.info(`  Worker ID   : ${config.workerId}`);
  logger.info(`  Concurrency : ${config.maxConcurrentRuns}`);
  logger.info(`  API URL     : ${config.apiBaseUrl}`);
  const redisAddr =
    config.redisUrl ?? `${config.redisHost}:${config.redisPort}`;
  logger.info(`  Redis       : ${redisAddr}`);
  if (config.storageEmulatorHost) {
    logger.info(`  GCS emulator: ${config.storageEmulatorHost} (dev mode)`);
  }
  logger.info("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");

  // ── Start BullMQ worker ───────────────────────────────────────────────
  const redis = createRedisConnection(config);
  const worker = createWorker(config, redis);

  logger.info('Listening on queue "run-execution" — ready to process jobs');

  // ── Graceful shutdown ─────────────────────────────────────────────────
  let shuttingDown = false; // guards against a second SIGINT/SIGTERM mid-shutdown

  async function shutdown(signal: string): Promise<void> {
    if (shuttingDown) return;
    shuttingDown = true;

    logger.info(`${signal} received — shutting down gracefully...`);
    try {
      // Close the BullMQ worker first, then the Redis connection it uses.
      await worker.close();
      await redis.quit();
      logger.info("Worker shut down cleanly");
      process.exit(0);
    } catch (err) {
      logger.error(`Error during shutdown: ${(err as Error).message}`);
      process.exit(1);
    }
  }

  // shutdown() never rejects (it catches internally), so the floating
  // promise in these handlers is safe.
  process.on("SIGTERM", () => shutdown("SIGTERM"));
  process.on("SIGINT", () => shutdown("SIGINT"));
}

main().catch((err: unknown) => {
  console.error("[dory-worker] Fatal error:", err);
  process.exit(1);
});
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
 * Fully-resolved worker configuration, read once at startup from
 * environment variables by loadConfig() in this module.
 */
export interface WorkerConfig {
  // Redis — use REDIS_URL (e.g. Railway) or host/port separately
  redisUrl?: string;
  redisHost: string;
  redisPort: number;
  redisPassword?: string;

  // dory-api public URL — worker calls this for config + status updates,
  // and containers use it for their own status callbacks
  apiBaseUrl: string;

  // GCS pass-through — only needed so the worker can inject these env vars
  // into Docker containers (containers upload results directly to GCS).
  // Not needed if your docker image bakes these in via CI.
  gcsBucket?: string;
  gcpProjectId?: string;
  /** fake-gcs-server URL for local dev (e.g. http://localhost:9199) */
  storageEmulatorHost?: string;

  // Fallback docker image if the API doesn't return one in /config
  dockerImage?: string;

  // Worker behaviour
  maxConcurrentRuns: number;
  workerId: string;
  logLevel: string;
}
|
|
28
|
+
|
|
29
|
+
const REQUIRED_VARS = ["API_BASE_URL"] as const;
|
|
30
|
+
|
|
31
|
+
export function loadConfig(): WorkerConfig {
|
|
32
|
+
const missing = REQUIRED_VARS.filter((k) => !process.env[k]);
|
|
33
|
+
if (missing.length > 0) {
|
|
34
|
+
throw new Error(
|
|
35
|
+
`Missing required environment variables: ${missing.join(", ")}`,
|
|
36
|
+
);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// At least one Redis connection method must be provided
|
|
40
|
+
if (!process.env.REDIS_URL && !process.env.REDIS_HOST) {
|
|
41
|
+
throw new Error(
|
|
42
|
+
"Missing Redis config: set REDIS_URL or REDIS_HOST (+ REDIS_PORT)",
|
|
43
|
+
);
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
return {
|
|
47
|
+
redisUrl: process.env.REDIS_URL,
|
|
48
|
+
redisHost: process.env.REDIS_HOST ?? "localhost",
|
|
49
|
+
redisPort: parseInt(process.env.REDIS_PORT ?? "6379", 10),
|
|
50
|
+
redisPassword: process.env.REDIS_PASSWORD,
|
|
51
|
+
|
|
52
|
+
apiBaseUrl: process.env.API_BASE_URL!,
|
|
53
|
+
|
|
54
|
+
gcsBucket: process.env.GCS_BUCKET,
|
|
55
|
+
gcpProjectId: process.env.GCP_PROJECT_ID,
|
|
56
|
+
storageEmulatorHost: process.env.STORAGE_EMULATOR_HOST,
|
|
57
|
+
|
|
58
|
+
dockerImage: process.env.DOCKER_IMAGE,
|
|
59
|
+
|
|
60
|
+
maxConcurrentRuns: parseInt(process.env.MAX_CONCURRENT_RUNS ?? "2", 10),
|
|
61
|
+
workerId: process.env.WORKER_ID ?? `dory-worker-${process.pid}`,
|
|
62
|
+
logLevel: process.env.LOG_LEVEL ?? "info",
|
|
63
|
+
};
|
|
64
|
+
}
|
package/src/docker.ts
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { exec } from "child_process";
|
|
2
|
+
import { promisify } from "util";
|
|
3
|
+
import type { WorkerConfig } from "./config";
|
|
4
|
+
import { logger } from "./logger";
|
|
5
|
+
|
|
6
|
+
// Promisified child_process.exec — every docker CLI call goes through this.
const execAsync = promisify(exec);

/**
 * Envelope that is JSON-serialized, base64-encoded, and handed to the
 * scraping container via the ACTOR_CONFIG env var (see buildDockerEnvArgs).
 * Mirrors the `actorConfig` shape returned by GET /api/runs/:id/config.
 */
interface ActorConfigEnvelope {
  jobId: string;
  runId: string;
  actorInstanceId: string;
  userInput: Record<string, unknown>;
}
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Rewrite localhost URLs to host.docker.internal so containers can reach
|
|
17
|
+
* the host machine. Public URLs (Railway, GCS, etc.) are left unchanged.
|
|
18
|
+
*/
|
|
19
|
+
function toDockerHost(url: string): string {
|
|
20
|
+
return url
|
|
21
|
+
.replace("http://localhost", "http://host.docker.internal")
|
|
22
|
+
.replace("https://localhost", "https://host.docker.internal");
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function buildDockerEnvArgs(
|
|
26
|
+
config: WorkerConfig,
|
|
27
|
+
actorConfig: ActorConfigEnvelope,
|
|
28
|
+
memoryLimit: number,
|
|
29
|
+
): string[] {
|
|
30
|
+
const configBase64 = Buffer.from(JSON.stringify(actorConfig)).toString(
|
|
31
|
+
"base64",
|
|
32
|
+
);
|
|
33
|
+
|
|
34
|
+
const args = [
|
|
35
|
+
`-e ACTOR_CONFIG=${configBase64}`,
|
|
36
|
+
`-e API_BASE_URL=${toDockerHost(config.apiBaseUrl)}`,
|
|
37
|
+
`-e CRAWLEE_MEMORY_MBYTES=${memoryLimit}`,
|
|
38
|
+
];
|
|
39
|
+
|
|
40
|
+
// Pass through GCS config if set (containers need it to upload results)
|
|
41
|
+
if (config.gcsBucket) args.push(`-e GCS_BUCKET=${config.gcsBucket}`);
|
|
42
|
+
if (config.gcpProjectId)
|
|
43
|
+
args.push(`-e GCP_PROJECT_ID=${config.gcpProjectId}`);
|
|
44
|
+
if (config.storageEmulatorHost) {
|
|
45
|
+
args.push(
|
|
46
|
+
`-e STORAGE_EMULATOR_HOST=${toDockerHost(config.storageEmulatorHost)}`,
|
|
47
|
+
);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
return args;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Start the dory-core scraping container in detached mode.
|
|
55
|
+
* Returns the container ID.
|
|
56
|
+
*
|
|
57
|
+
* @param dockerImage Image to run — sourced from GET /api/runs/:id/config
|
|
58
|
+
* so the API controls what version runs everywhere.
|
|
59
|
+
*/
|
|
60
|
+
export async function startContainer(
|
|
61
|
+
config: WorkerConfig,
|
|
62
|
+
actorConfig: ActorConfigEnvelope,
|
|
63
|
+
memoryLimit: number,
|
|
64
|
+
dockerImage: string,
|
|
65
|
+
): Promise<string> {
|
|
66
|
+
const envArgs = buildDockerEnvArgs(config, actorConfig, memoryLimit).join(
|
|
67
|
+
" ",
|
|
68
|
+
);
|
|
69
|
+
const memoryStr = `${memoryLimit}m`;
|
|
70
|
+
|
|
71
|
+
const cmd = [
|
|
72
|
+
"docker run -d --rm",
|
|
73
|
+
`--memory=${memoryStr}`,
|
|
74
|
+
"--add-host=host.docker.internal:host-gateway",
|
|
75
|
+
envArgs,
|
|
76
|
+
dockerImage,
|
|
77
|
+
].join(" ");
|
|
78
|
+
|
|
79
|
+
logger.debug(`docker run: ${cmd}`);
|
|
80
|
+
const { stdout } = await execAsync(cmd);
|
|
81
|
+
return stdout.trim();
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/**
|
|
85
|
+
* Block until the container exits and return its exit code.
|
|
86
|
+
* If the container exceeds `timeoutSecs`, it is force-killed.
|
|
87
|
+
*/
|
|
88
|
+
export async function waitForContainer(
|
|
89
|
+
containerId: string,
|
|
90
|
+
timeoutSecs: number,
|
|
91
|
+
): Promise<number> {
|
|
92
|
+
try {
|
|
93
|
+
const { stdout } = await execAsync(`docker wait ${containerId}`, {
|
|
94
|
+
timeout: (timeoutSecs + 30) * 1_000,
|
|
95
|
+
});
|
|
96
|
+
return parseInt(stdout.trim(), 10);
|
|
97
|
+
} catch (err) {
|
|
98
|
+
logger.warn(
|
|
99
|
+
`docker wait timed out for ${containerId.slice(0, 12)}: ${(err as Error).message} — force-killing`,
|
|
100
|
+
);
|
|
101
|
+
try {
|
|
102
|
+
await execAsync(`docker kill ${containerId}`);
|
|
103
|
+
} catch {
|
|
104
|
+
// Container already gone — ignore
|
|
105
|
+
}
|
|
106
|
+
return 137; // SIGKILL exit code
|
|
107
|
+
}
|
|
108
|
+
}
|
package/src/logger.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
type LogLevel = "debug" | "info" | "warn" | "error";
|
|
2
|
+
|
|
3
|
+
const LEVEL_WEIGHT: Record<LogLevel, number> = {
|
|
4
|
+
debug: 0,
|
|
5
|
+
info: 1,
|
|
6
|
+
warn: 2,
|
|
7
|
+
error: 3,
|
|
8
|
+
};
|
|
9
|
+
|
|
10
|
+
let currentLevel: LogLevel = "info";
|
|
11
|
+
|
|
12
|
+
export function setLogLevel(level: string): void {
|
|
13
|
+
if (level in LEVEL_WEIGHT) {
|
|
14
|
+
currentLevel = level as LogLevel;
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
function emit(level: LogLevel, message: string): void {
|
|
19
|
+
if (LEVEL_WEIGHT[level] < LEVEL_WEIGHT[currentLevel]) return;
|
|
20
|
+
const ts = new Date().toISOString();
|
|
21
|
+
const line = `[${ts}] [${level.toUpperCase().padEnd(5)}] ${message}`;
|
|
22
|
+
if (level === "error") {
|
|
23
|
+
process.stderr.write(line + "\n");
|
|
24
|
+
} else {
|
|
25
|
+
process.stdout.write(line + "\n");
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export const logger = {
|
|
30
|
+
debug: (msg: string) => emit("debug", msg),
|
|
31
|
+
info: (msg: string) => emit("info", msg),
|
|
32
|
+
warn: (msg: string) => emit("warn", msg),
|
|
33
|
+
error: (msg: string) => emit("error", msg),
|
|
34
|
+
};
|
package/src/processor.ts
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import type { Job } from "bullmq";
|
|
2
|
+
import type { WorkerConfig } from "./config";
|
|
3
|
+
import { startContainer, waitForContainer } from "./docker";
|
|
4
|
+
import { logger } from "./logger";
|
|
5
|
+
|
|
6
|
+
/**
 * Response shape of GET /api/runs/:id/config — everything the worker needs
 * to launch one scraping container without direct DB/GCS access.
 */
interface RunConfigResponse {
  runId: string;
  /** Envelope passed to the container as base64-encoded ACTOR_CONFIG. */
  actorConfig: {
    jobId: string;
    runId: string;
    actorInstanceId: string;
    userInput: Record<string, unknown>;
  };
  /** Container memory cap (`docker run --memory=<n>m`, CRAWLEE_MEMORY_MBYTES). */
  memoryLimitMb: number;
  /** Run timeout; container is force-killed after this plus a 30 s grace. */
  actorTimeoutSecs: number;
  /** Image to run; falls back to the worker's DOCKER_IMAGE env var if empty. */
  dockerImage: string;
}
|
|
18
|
+
|
|
19
|
+
async function apiGet<T>(apiBaseUrl: string, path: string): Promise<T> {
|
|
20
|
+
const url = `${apiBaseUrl.replace(/\/+$/, "")}${path}`;
|
|
21
|
+
const res = await fetch(url);
|
|
22
|
+
if (!res.ok) {
|
|
23
|
+
const body = await res.text().catch(() => "");
|
|
24
|
+
throw new Error(`GET ${path} → HTTP ${res.status}: ${body}`);
|
|
25
|
+
}
|
|
26
|
+
return res.json() as Promise<T>;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
async function apiPatch(
|
|
30
|
+
apiBaseUrl: string,
|
|
31
|
+
path: string,
|
|
32
|
+
body: Record<string, unknown>,
|
|
33
|
+
): Promise<void> {
|
|
34
|
+
const url = `${apiBaseUrl.replace(/\/+$/, "")}${path}`;
|
|
35
|
+
const res = await fetch(url, {
|
|
36
|
+
method: "POST",
|
|
37
|
+
headers: { "Content-Type": "application/json" },
|
|
38
|
+
body: JSON.stringify(body),
|
|
39
|
+
});
|
|
40
|
+
if (!res.ok) {
|
|
41
|
+
const text = await res.text().catch(() => "");
|
|
42
|
+
logger.warn(`POST ${path} → HTTP ${res.status}: ${text}`);
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/**
 * Core job processor: runs one scraping container for one run.
 *
 * Flow:
 *  1. GET  /api/runs/:id/config — fetch actorConfig from dory-api (no DB/GCS access needed)
 *  2. POST /api/runs/:id/status — mark run as "running"
 *  3. docker run — start the scraping container
 *  4. Extend BullMQ lock every 2 min while waiting
 *  5. docker wait — block until container exits
 *  6. POST /api/runs/:id/status — exit-code fallback if HTTP callback never arrived
 *
 * @param config worker configuration (API base URL, fallback docker image, …)
 * @param job    BullMQ job whose data carries the run's ID
 * @returns the runId and the final status ("completed" | "failed")
 * @throws on missing runId, config-fetch failure, missing docker image, or
 *         container-start failure — all of which BullMQ treats as retryable
 */
export async function processRunJob(
  config: WorkerConfig,
  job: Job<{ runId: string }>,
): Promise<{ runId: string; status: string }> {
  const { runId } = job.data;

  if (!runId) throw new Error("Job data missing runId");

  logger.info(
    `[${runId}] Processing job ${job.id} (attempt ${job.attemptsMade + 1})`,
  );

  // ── Step 1: Fetch run config from API ──────────────────────────────────
  let runConfig: RunConfigResponse;
  try {
    runConfig = await apiGet<RunConfigResponse>(
      config.apiBaseUrl,
      `/api/runs/${runId}/config`,
    );
  } catch (err) {
    logger.error(
      `[${runId}] Failed to fetch run config: ${(err as Error).message}`,
    );
    throw err; // BullMQ will retry
  }

  // ── Step 2: Mark run as running ────────────────────────────────────────
  // NOTE: apiPatch rejections (network-level) propagate and fail the job.
  await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
    runId,
    status: "running",
  });

  // ── Step 3: Start container ────────────────────────────────────────────
  // Prefer the API-supplied image so the API controls the version fleet-wide.
  const dockerImage = runConfig.dockerImage || config.dockerImage;
  if (!dockerImage) {
    throw new Error(
      `No docker image: API did not return one and DOCKER_IMAGE env var is not set`,
    );
  }
  let containerId: string;
  try {
    containerId = await startContainer(
      config,
      runConfig.actorConfig,
      runConfig.memoryLimitMb,
      dockerImage,
    );
    logger.info(`[${runId}] Container started: ${containerId.slice(0, 12)}`);
  } catch (err) {
    logger.error(
      `[${runId}] Failed to start container: ${(err as Error).message}`,
    );
    await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
      runId,
      status: "failed",
      error: `Failed to start container: ${(err as Error).message}`,
    });
    throw err;
  }

  // ── Step 4: Extend BullMQ lock while container runs ─────────────────────
  // The worker is created with lockDuration 300 s; renewing by 300 s every
  // 120 s keeps the job from being flagged stalled during a long run.
  let lockTimer: ReturnType<typeof setInterval> | undefined;
  if (job.token) {
    lockTimer = setInterval(async () => {
      try {
        await job.extendLock(job.token!, 300_000);
      } catch (extErr) {
        logger.warn(
          `[${runId}] Failed to extend lock: ${(extErr as Error).message}`,
        );
      }
    }, 120_000);
  }

  // ── Step 5: Wait for container ────────────────────────────────────────
  // waitForContainer handles its own timeout/kill and reports 137 on failure.
  let exitCode = -1;
  try {
    exitCode = await waitForContainer(containerId, runConfig.actorTimeoutSecs);
    logger.info(
      `[${runId}] Container ${containerId.slice(0, 12)} exited with code ${exitCode}`,
    );
  } finally {
    // Always stop renewing the lock, even if waiting threw.
    if (lockTimer) clearInterval(lockTimer);
  }

  // ── Step 6: Exit-code fallback ────────────────────────────────────────
  // The container calls POST /api/runs/:id/status itself when it finishes.
  // We POST here only as a safety net if that callback never arrived.
  // The API's handleStatusUpdate ignores updates that move a run backward
  // (e.g. completed → running), so this is safe to always send.
  const finalStatus = exitCode === 0 ? "completed" : "failed";
  await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
    runId,
    status: finalStatus,
    exitCode,
    timestamp: new Date().toISOString(),
    ...(exitCode !== 0 && {
      error: `Container exited with code ${exitCode} (status callback not received)`,
    }),
  });

  logger.info(`[${runId}] Job ${job.id} done — status: ${finalStatus}`);
  return { runId, status: finalStatus };
}
|