@nikx/dory-worker 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +37 -0
- package/.gitattributes +2 -0
- package/README.md +2 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +59 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +17 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +29 -0
- package/dist/config.js.map +1 -0
- package/dist/db.d.ts +14 -0
- package/dist/db.d.ts.map +1 -0
- package/dist/db.js +69 -0
- package/dist/db.js.map +1 -0
- package/dist/docker.d.ts +22 -0
- package/dist/docker.d.ts.map +1 -0
- package/dist/docker.js +78 -0
- package/dist/docker.js.map +1 -0
- package/dist/logger.d.ts +8 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +35 -0
- package/dist/logger.js.map +1 -0
- package/dist/processor.d.ts +20 -0
- package/dist/processor.d.ts.map +1 -0
- package/dist/processor.js +116 -0
- package/dist/processor.js.map +1 -0
- package/dist/storage.d.ts +7 -0
- package/dist/storage.d.ts.map +1 -0
- package/dist/storage.js +37 -0
- package/dist/storage.js.map +1 -0
- package/dist/types.d.ts +61 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/worker.d.ts +7 -0
- package/dist/worker.d.ts.map +1 -0
- package/dist/worker.js +56 -0
- package/dist/worker.js.map +1 -0
- package/package.json +36 -0
- package/src/cli.ts +66 -0
- package/src/config.ts +64 -0
- package/src/docker.ts +108 -0
- package/src/logger.ts +34 -0
- package/src/processor.ts +160 -0
- package/src/worker.ts +64 -0
- package/tsconfig.json +18 -0
package/.env.example
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# ─── Required ────────────────────────────────────────────────────────────────
|
|
2
|
+
|
|
3
|
+
# Public URL of dory-api.
|
|
4
|
+
# Containers POST their status callbacks here, so it must be reachable
|
|
5
|
+
# from inside Docker containers running on this machine.
|
|
6
|
+
# Production example: https://your-api.railway.app
|
|
7
|
+
API_BASE_URL=http://localhost:4500
|
|
8
|
+
|
|
9
|
+
# ─── Redis ───────────────────────────────────────────────────────────────────
|
|
10
|
+
|
|
11
|
+
# Option A: full Redis URL (recommended for Railway / cloud Redis)
|
|
12
|
+
# REDIS_URL=redis://default:password@host.railway.internal:6379
|
|
13
|
+
|
|
14
|
+
# Option B: host + port + optional password
|
|
15
|
+
REDIS_HOST=localhost
|
|
16
|
+
REDIS_PORT=6379
|
|
17
|
+
# REDIS_PASSWORD=
|
|
18
|
+
|
|
19
|
+
# ─── Worker behaviour ────────────────────────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
# Max scraping containers to run in parallel on this machine
|
|
22
|
+
# Keep at 1-2 for Raspberry Pi, higher for more powerful machines
|
|
23
|
+
MAX_CONCURRENT_RUNS=2
|
|
24
|
+
|
|
25
|
+
# Optional identifier shown in logs (defaults to "dory-worker-{pid}")
|
|
26
|
+
# WORKER_ID=pi-worker-01
|
|
27
|
+
|
|
28
|
+
# Log level: debug | info | warn | error
|
|
29
|
+
LOG_LEVEL=info
|
|
30
|
+
|
|
31
|
+
# ─── Development only ────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
# Set this to use the fake-gcs-server emulator instead of real GCS
|
|
34
|
+
# STORAGE_EMULATOR_HOST=http://localhost:9199
|
|
35
|
+
|
|
36
|
+
# GCP service account key file path (production; alternative to GOOGLE_APPLICATION_CREDENTIALS)
|
|
37
|
+
# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json
|
package/.gitattributes
ADDED
package/README.md
ADDED
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#!/usr/bin/env node
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const { loadConfig } = require("./config");
const { createRedisConnection, createWorker } = require("./worker");
const { logger, setLogLevel } = require("./logger");
/**
 * CLI entry point: load and validate configuration, print a startup banner,
 * start the BullMQ worker, and wire graceful shutdown on SIGTERM/SIGINT.
 */
async function main() {
    // ── Load & validate config ────────────────────────────────────────────
    let config;
    try {
        config = loadConfig();
    }
    catch (err) {
        console.error(`[dory-worker] Configuration error: ${err.message}`);
        console.error("Set the required environment variables and try again.");
        process.exit(1);
    }
    setLogLevel(config.logLevel);
    const banner = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
    logger.info(banner);
    logger.info(" dory-worker starting");
    logger.info(` Worker ID : ${config.workerId}`);
    logger.info(` Concurrency : ${config.maxConcurrentRuns}`);
    logger.info(` API URL : ${config.apiBaseUrl}`);
    // Prefer the full URL when provided; otherwise show host:port.
    const redisAddr = config.redisUrl ?? `${config.redisHost}:${config.redisPort}`;
    logger.info(` Redis : ${redisAddr}`);
    if (config.storageEmulatorHost) {
        logger.info(` GCS emulator: ${config.storageEmulatorHost} (dev mode)`);
    }
    logger.info(banner);
    // ── Start BullMQ worker ───────────────────────────────────────────────
    const redis = createRedisConnection(config);
    const worker = createWorker(config, redis);
    logger.info('Listening on queue "run-execution" — ready to process jobs');
    // ── Graceful shutdown ─────────────────────────────────────────────────
    let shuttingDown = false;
    const shutdown = async (signal) => {
        // Ignore repeated signals while a shutdown is already in flight.
        if (shuttingDown) {
            return;
        }
        shuttingDown = true;
        logger.info(`${signal} received — shutting down gracefully...`);
        try {
            await worker.close();
            await redis.quit();
            logger.info("Worker shut down cleanly");
            process.exit(0);
        }
        catch (err) {
            logger.error(`Error during shutdown: ${err.message}`);
            process.exit(1);
        }
    };
    for (const sig of ["SIGTERM", "SIGINT"]) {
        process.on(sig, () => shutdown(sig));
    }
}
main().catch((err) => {
    console.error("[dory-worker] Fatal error:", err);
    process.exit(1);
});
|
|
59
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";;;AACA,qCAAsC;AACtC,qCAA+D;AAC/D,qCAA+C;AAE/C,KAAK,UAAU,IAAI;IACjB,yEAAyE;IACzE,IAAI,MAAM,CAAC;IACX,IAAI,CAAC;QACH,MAAM,GAAG,IAAA,mBAAU,GAAE,CAAC;IACxB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CACX,sCAAuC,GAAa,CAAC,OAAO,EAAE,CAC/D,CAAC;QACF,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;QACvE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAA,oBAAW,EAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IAE7B,eAAM,CAAC,IAAI,CAAC,6CAA6C,CAAC,CAAC;IAC3D,eAAM,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;IACtC,eAAM,CAAC,IAAI,CAAC,mBAAmB,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAClD,eAAM,CAAC,IAAI,CAAC,mBAAmB,MAAM,CAAC,iBAAiB,EAAE,CAAC,CAAC;IAC3D,eAAM,CAAC,IAAI,CAAC,mBAAmB,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;IACpD,MAAM,SAAS,GACb,MAAM,CAAC,QAAQ,IAAI,GAAG,MAAM,CAAC,SAAS,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;IAC/D,eAAM,CAAC,IAAI,CAAC,mBAAmB,SAAS,EAAE,CAAC,CAAC;IAC5C,IAAI,MAAM,CAAC,mBAAmB,EAAE,CAAC;QAC/B,eAAM,CAAC,IAAI,CAAC,mBAAmB,MAAM,CAAC,mBAAmB,aAAa,CAAC,CAAC;IAC1E,CAAC;IACD,eAAM,CAAC,IAAI,CAAC,6CAA6C,CAAC,CAAC;IAE3D,yEAAyE;IACzE,MAAM,KAAK,GAAG,IAAA,8BAAqB,EAAC,MAAM,CAAC,CAAC;IAC5C,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IAE3C,eAAM,CAAC,IAAI,CAAC,4DAA4D,CAAC,CAAC;IAE1E,yEAAyE;IACzE,IAAI,YAAY,GAAG,KAAK,CAAC;IAEzB,KAAK,UAAU,QAAQ,CAAC,MAAc;QACpC,IAAI,YAAY;YAAE,OAAO;QACzB,YAAY,GAAG,IAAI,CAAC;QAEpB,eAAM,CAAC,IAAI,CAAC,GAAG,MAAM,yCAAyC,CAAC,CAAC;QAChE,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,KAAK,EAAE,CAAC;YACrB,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,eAAM,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;YACxC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,eAAM,CAAC,KAAK,CAAC,0BAA2B,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC;YACjE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,CAAC,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;IACjD,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;AACjD,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,GAAY,EAAE,EAAE;IAC5B,OAAO,CAAC,KAAK,CAAC,4BAA4B,EAAE
,GAAG,CAAC,CAAC;IACjD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
|
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export interface WorkerConfig {
|
|
2
|
+
redisUrl?: string;
|
|
3
|
+
redisHost: string;
|
|
4
|
+
redisPort: number;
|
|
5
|
+
redisPassword?: string;
|
|
6
|
+
apiBaseUrl: string;
|
|
7
|
+
gcsBucket?: string;
|
|
8
|
+
gcpProjectId?: string;
|
|
9
|
+
/** fake-gcs-server URL for local dev (e.g. http://localhost:9199) */
|
|
10
|
+
storageEmulatorHost?: string;
|
|
11
|
+
dockerImage?: string;
|
|
12
|
+
maxConcurrentRuns: number;
|
|
13
|
+
workerId: string;
|
|
14
|
+
logLevel: string;
|
|
15
|
+
}
|
|
16
|
+
export declare function loadConfig(): WorkerConfig;
|
|
17
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,YAAY;IAE3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IAIvB,UAAU,EAAE,MAAM,CAAC;IAKnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,qEAAqE;IACrE,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAG7B,WAAW,CAAC,EAAE,MAAM,CAAC;IAGrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAID,wBAAgB,UAAU,IAAI,YAAY,CAiCzC"}
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.loadConfig = loadConfig;
|
|
4
|
+
const REQUIRED_VARS = ["API_BASE_URL"];
|
|
5
|
+
/**
 * Build the worker configuration from environment variables.
 *
 * Required: API_BASE_URL, plus at least one of REDIS_URL / REDIS_HOST.
 * Numeric variables (REDIS_PORT, MAX_CONCURRENT_RUNS) are validated so a
 * typo fails fast at startup instead of silently producing NaN at runtime
 * (the previous `parseInt(env ?? "default", 10)` returned NaN for empty or
 * non-numeric values).
 *
 * @returns {object} the resolved WorkerConfig
 * @throws {Error} when a required variable is missing or a numeric one is invalid
 */
function loadConfig() {
    // Read a positive-integer env var; unset/blank falls back, garbage throws.
    const intFromEnv = (name, fallback) => {
        const raw = (process.env[name] ?? "").trim();
        if (raw === "")
            return fallback;
        const value = Number.parseInt(raw, 10);
        if (!Number.isInteger(value) || value <= 0) {
            throw new Error(`Invalid ${name}: expected a positive integer, got "${raw}"`);
        }
        return value;
    };
    const missing = ["API_BASE_URL"].filter((k) => !process.env[k]);
    if (missing.length > 0) {
        throw new Error(`Missing required environment variables: ${missing.join(", ")}`);
    }
    // At least one Redis connection method must be provided
    if (!process.env.REDIS_URL && !process.env.REDIS_HOST) {
        throw new Error("Missing Redis config: set REDIS_URL or REDIS_HOST (+ REDIS_PORT)");
    }
    return {
        redisUrl: process.env.REDIS_URL,
        redisHost: process.env.REDIS_HOST ?? "localhost",
        redisPort: intFromEnv("REDIS_PORT", 6379),
        redisPassword: process.env.REDIS_PASSWORD,
        apiBaseUrl: process.env.API_BASE_URL,
        gcsBucket: process.env.GCS_BUCKET,
        gcpProjectId: process.env.GCP_PROJECT_ID,
        storageEmulatorHost: process.env.STORAGE_EMULATOR_HOST,
        dockerImage: process.env.DOCKER_IMAGE,
        maxConcurrentRuns: intFromEnv("MAX_CONCURRENT_RUNS", 2),
        workerId: process.env.WORKER_ID ?? `dory-worker-${process.pid}`,
        logLevel: process.env.LOG_LEVEL ?? "info",
    };
}
|
|
29
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AA8BA,gCAiCC;AAnCD,MAAM,aAAa,GAAG,CAAC,cAAc,CAAU,CAAC;AAEhD,SAAgB,UAAU;IACxB,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CACb,2CAA2C,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAChE,CAAC;IACJ,CAAC;IAED,wDAAwD;IACxD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,EAAE,CAAC;QACtD,MAAM,IAAI,KAAK,CACb,kEAAkE,CACnE,CAAC;IACJ,CAAC;IAED,OAAO;QACL,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,SAAS;QAC/B,SAAS,EAAE,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,WAAW;QAChD,SAAS,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,MAAM,EAAE,EAAE,CAAC;QACzD,aAAa,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;QAEzC,UAAU,EAAE,OAAO,CAAC,GAAG,CAAC,YAAa;QAErC,SAAS,EAAE,OAAO,CAAC,GAAG,CAAC,UAAU;QACjC,YAAY,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;QACxC,mBAAmB,EAAE,OAAO,CAAC,GAAG,CAAC,qBAAqB;QAEtD,WAAW,EAAE,OAAO,CAAC,GAAG,CAAC,YAAY;QAErC,iBAAiB,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,mBAAmB,IAAI,GAAG,EAAE,EAAE,CAAC;QACvE,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,eAAe,OAAO,CAAC,GAAG,EAAE;QAC/D,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,MAAM;KAC1C,CAAC;AACJ,CAAC"}
|
package/dist/db.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { WorkerConfig } from "./config";
|
|
2
|
+
import type { RunDocument, ActorDocument } from "./types";
|
|
3
|
+
export declare function connectDb(config: WorkerConfig): Promise<void>;
|
|
4
|
+
export declare function disconnectDb(): Promise<void>;
|
|
5
|
+
/**
|
|
6
|
+
* Atomically transition a run from "queued" → "running".
|
|
7
|
+
* Returns the run document as it was BEFORE the update (status="queued"),
|
|
8
|
+
* or null if another worker already claimed it.
|
|
9
|
+
*/
|
|
10
|
+
export declare function claimRun(runId: string): Promise<RunDocument | null>;
|
|
11
|
+
export declare function loadRun(runId: string): Promise<RunDocument | null>;
|
|
12
|
+
export declare function loadActor(actorId: string): Promise<ActorDocument | null>;
|
|
13
|
+
export declare function updateRun(runId: string, update: Partial<RunDocument>): Promise<void>;
|
|
14
|
+
//# sourceMappingURL=db.d.ts.map
|
package/dist/db.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"db.d.ts","sourceRoot":"","sources":["../src/db.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAC7C,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAa,MAAM,SAAS,CAAC;AAOrE,wBAAsB,SAAS,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAOnE;AAED,wBAAsB,YAAY,IAAI,OAAO,CAAC,IAAI,CAAC,CAKlD;AAED;;;;GAIG;AACH,wBAAsB,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,CAazE;AAED,wBAAsB,OAAO,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,CAMxE;AAED,wBAAsB,SAAS,CAC7B,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,CAM/B;AAED,wBAAsB,SAAS,CAC7B,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,OAAO,CAAC,WAAW,CAAC,GAC3B,OAAO,CAAC,IAAI,CAAC,CAOf"}
|
package/dist/db.js
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.connectDb = connectDb;
|
|
4
|
+
exports.disconnectDb = disconnectDb;
|
|
5
|
+
exports.claimRun = claimRun;
|
|
6
|
+
exports.loadRun = loadRun;
|
|
7
|
+
exports.loadActor = loadActor;
|
|
8
|
+
exports.updateRun = updateRun;
|
|
9
|
+
const mongodb_1 = require("mongodb");
|
|
10
|
+
const logger_1 = require("./logger");
|
|
11
|
+
let client = null;
|
|
12
|
+
let runsCol = null;
|
|
13
|
+
let actorsCol = null;
|
|
14
|
+
async function connectDb(config) {
|
|
15
|
+
client = new mongodb_1.MongoClient(config.mongoUri);
|
|
16
|
+
await client.connect();
|
|
17
|
+
const db = client.db(config.mongoDbName);
|
|
18
|
+
runsCol = db.collection("runs");
|
|
19
|
+
actorsCol = db.collection("actors");
|
|
20
|
+
logger_1.logger.info(`MongoDB connected (db: ${config.mongoDbName})`);
|
|
21
|
+
}
|
|
22
|
+
async function disconnectDb() {
|
|
23
|
+
if (client) {
|
|
24
|
+
await client.close();
|
|
25
|
+
logger_1.logger.info("MongoDB disconnected");
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Atomically transition a run from "queued" → "running".
|
|
30
|
+
* Returns the run document as it was BEFORE the update (status="queued"),
|
|
31
|
+
* or null if another worker already claimed it.
|
|
32
|
+
*/
|
|
33
|
+
async function claimRun(runId) {
|
|
34
|
+
if (!runsCol)
|
|
35
|
+
throw new Error("DB not connected");
|
|
36
|
+
if (!mongodb_1.ObjectId.isValid(runId))
|
|
37
|
+
return null;
|
|
38
|
+
const result = await runsCol.findOneAndUpdate({
|
|
39
|
+
_id: new mongodb_1.ObjectId(runId),
|
|
40
|
+
status: "queued",
|
|
41
|
+
}, { $set: { status: "running" } }, { returnDocument: "before" });
|
|
42
|
+
return result ?? null;
|
|
43
|
+
}
|
|
44
|
+
async function loadRun(runId) {
|
|
45
|
+
if (!runsCol)
|
|
46
|
+
throw new Error("DB not connected");
|
|
47
|
+
if (!mongodb_1.ObjectId.isValid(runId))
|
|
48
|
+
return null;
|
|
49
|
+
return runsCol.findOne({
|
|
50
|
+
_id: new mongodb_1.ObjectId(runId),
|
|
51
|
+
});
|
|
52
|
+
}
|
|
53
|
+
async function loadActor(actorId) {
|
|
54
|
+
if (!actorsCol)
|
|
55
|
+
throw new Error("DB not connected");
|
|
56
|
+
if (!mongodb_1.ObjectId.isValid(actorId))
|
|
57
|
+
return null;
|
|
58
|
+
return actorsCol.findOne({
|
|
59
|
+
_id: new mongodb_1.ObjectId(actorId),
|
|
60
|
+
});
|
|
61
|
+
}
|
|
62
|
+
async function updateRun(runId, update) {
|
|
63
|
+
if (!runsCol)
|
|
64
|
+
throw new Error("DB not connected");
|
|
65
|
+
if (!mongodb_1.ObjectId.isValid(runId))
|
|
66
|
+
return;
|
|
67
|
+
await runsCol.updateOne({ _id: new mongodb_1.ObjectId(runId) }, { $set: update });
|
|
68
|
+
}
|
|
69
|
+
//# sourceMappingURL=db.js.map
|
package/dist/db.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"db.js","sourceRoot":"","sources":["../src/db.ts"],"names":[],"mappings":";;AAUA,8BAOC;AAED,oCAKC;AAOD,4BAaC;AAED,0BAMC;AAED,8BAQC;AAED,8BAUC;AA1ED,qCAAgD;AAIhD,qCAAkC;AAElC,IAAI,MAAM,GAAuB,IAAI,CAAC;AACtC,IAAI,OAAO,GAAmC,IAAI,CAAC;AACnD,IAAI,SAAS,GAAqC,IAAI,CAAC;AAEhD,KAAK,UAAU,SAAS,CAAC,MAAoB;IAClD,MAAM,GAAG,IAAI,qBAAW,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IAC1C,MAAM,MAAM,CAAC,OAAO,EAAE,CAAC;IACvB,MAAM,EAAE,GAAG,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;IACzC,OAAO,GAAG,EAAE,CAAC,UAAU,CAAc,MAAM,CAAC,CAAC;IAC7C,SAAS,GAAG,EAAE,CAAC,UAAU,CAAgB,QAAQ,CAAC,CAAC;IACnD,eAAM,CAAC,IAAI,CAAC,0BAA0B,MAAM,CAAC,WAAW,GAAG,CAAC,CAAC;AAC/D,CAAC;AAEM,KAAK,UAAU,YAAY;IAChC,IAAI,MAAM,EAAE,CAAC;QACX,MAAM,MAAM,CAAC,KAAK,EAAE,CAAC;QACrB,eAAM,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;IACtC,CAAC;AACH,CAAC;AAED;;;;GAIG;AACI,KAAK,UAAU,QAAQ,CAAC,KAAa;IAC1C,IAAI,CAAC,OAAO;QAAE,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;IAClD,IAAI,CAAC,kBAAQ,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE1C,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,gBAAgB,CAC3C;QACE,GAAG,EAAE,IAAI,kBAAQ,CAAC,KAAK,CAAkC;QACzD,MAAM,EAAE,QAAQ;KACjB,EACD,EAAE,IAAI,EAAE,EAAE,MAAM,EAAE,SAAsB,EAAE,EAAE,EAC5C,EAAE,cAAc,EAAE,QAAQ,EAAE,CAC7B,CAAC;IACF,OAAO,MAAM,IAAI,IAAI,CAAC;AACxB,CAAC;AAEM,KAAK,UAAU,OAAO,CAAC,KAAa;IACzC,IAAI,CAAC,OAAO;QAAE,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;IAClD,IAAI,CAAC,kBAAQ,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAC1C,OAAO,OAAO,CAAC,OAAO,CAAC;QACrB,GAAG,EAAE,IAAI,kBAAQ,CAAC,KAAK,CAAkC;KAC1D,CAAC,CAAC;AACL,CAAC;AAEM,KAAK,UAAU,SAAS,CAC7B,OAAe;IAEf,IAAI,CAAC,SAAS;QAAE,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;IACpD,IAAI,CAAC,kBAAQ,CAAC,OAAO,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,OAAO,SAAS,CAAC,OAAO,CAAC;QACvB,GAAG,EAAE,IAAI,kBAAQ,CAAC,OAAO,CAAoC;KAC9D,CAAC,CAAC;AACL,CAAC;AAEM,KAAK,UAAU,SAAS,CAC7B,KAAa,EACb,MAA4B;IAE5B,IAAI,CAAC,OAAO;QAAE,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;IAClD,IAAI,CAAC,kBAAQ,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,OAAO;IACrC,MAAM,OAAO,CAAC,SAAS,CACrB,EAAE,GAAG,EAAE,IAAI,kBAAQ,CAAC,KAAK,CAAkC,EAAE,EAC7D,EAAE,IAAI,E
AAE,MAAM,EAAE,CACjB,CAAC;AACJ,CAAC"}
|
package/dist/docker.d.ts
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { WorkerConfig } from "./config";
|
|
2
|
+
interface ActorConfigEnvelope {
|
|
3
|
+
jobId: string;
|
|
4
|
+
runId: string;
|
|
5
|
+
actorInstanceId: string;
|
|
6
|
+
userInput: Record<string, unknown>;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Start the dory-core scraping container in detached mode.
|
|
10
|
+
* Returns the container ID.
|
|
11
|
+
*
|
|
12
|
+
* @param dockerImage Image to run — sourced from GET /api/runs/:id/config
|
|
13
|
+
* so the API controls what version runs everywhere.
|
|
14
|
+
*/
|
|
15
|
+
export declare function startContainer(config: WorkerConfig, actorConfig: ActorConfigEnvelope, memoryLimit: number, dockerImage: string): Promise<string>;
|
|
16
|
+
/**
|
|
17
|
+
* Block until the container exits and return its exit code.
|
|
18
|
+
* If the container exceeds `timeoutSecs`, it is force-killed.
|
|
19
|
+
*/
|
|
20
|
+
export declare function waitForContainer(containerId: string, timeoutSecs: number): Promise<number>;
|
|
21
|
+
export {};
|
|
22
|
+
//# sourceMappingURL=docker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docker.d.ts","sourceRoot":"","sources":["../src/docker.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAK7C,UAAU,mBAAmB;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,MAAM,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAwCD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAClC,MAAM,EAAE,YAAY,EACpB,WAAW,EAAE,mBAAmB,EAChC,WAAW,EAAE,MAAM,EACnB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,MAAM,CAAC,CAiBjB;AAED;;;GAGG;AACH,wBAAsB,gBAAgB,CACpC,WAAW,EAAE,MAAM,EACnB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,MAAM,CAAC,CAiBjB"}
|
package/dist/docker.js
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.startContainer = startContainer;
|
|
4
|
+
exports.waitForContainer = waitForContainer;
|
|
5
|
+
const child_process_1 = require("child_process");
|
|
6
|
+
const util_1 = require("util");
|
|
7
|
+
const logger_1 = require("./logger");
|
|
8
|
+
const execAsync = (0, util_1.promisify)(child_process_1.exec);
|
|
9
|
+
/**
|
|
10
|
+
* Rewrite localhost URLs to host.docker.internal so containers can reach
|
|
11
|
+
* the host machine. Public URLs (Railway, GCS, etc.) are left unchanged.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Rewrite localhost URLs to host.docker.internal so containers can reach
 * the host machine. Public URLs (Railway, GCS, etc.) are left unchanged.
 */
function toDockerHost(url) {
    let rewritten = url.replace("http://localhost", "http://host.docker.internal");
    rewritten = rewritten.replace("https://localhost", "https://host.docker.internal");
    return rewritten;
}
|
|
18
|
+
/**
 * Quote a value for safe interpolation into the shell command line passed
 * to exec(). Wraps the value in single quotes and escapes embedded single
 * quotes, so env values containing spaces, quotes, or shell metacharacters
 * cannot be word-split or interpreted by the shell.
 */
function shellQuote(value) {
    return `'${String(value).replace(/'/g, `'\\''`)}'`;
}
/**
 * Build the `-e KEY=value` arguments passed to `docker run`.
 * Values are shell-quoted — previously they were interpolated raw, so a
 * value with a space or metacharacter would corrupt (or inject into) the
 * command line that exec() hands to the shell.
 */
function buildDockerEnvArgs(config, actorConfig, memoryLimit) {
    const configBase64 = Buffer.from(JSON.stringify(actorConfig)).toString("base64");
    const args = [
        `-e ACTOR_CONFIG=${shellQuote(configBase64)}`,
        `-e API_BASE_URL=${shellQuote(toDockerHost(config.apiBaseUrl))}`,
        `-e CRAWLEE_MEMORY_MBYTES=${shellQuote(memoryLimit)}`,
    ];
    // Pass through GCS config if set (containers need it to upload results)
    if (config.gcsBucket)
        args.push(`-e GCS_BUCKET=${shellQuote(config.gcsBucket)}`);
    if (config.gcpProjectId)
        args.push(`-e GCP_PROJECT_ID=${shellQuote(config.gcpProjectId)}`);
    if (config.storageEmulatorHost) {
        args.push(`-e STORAGE_EMULATOR_HOST=${shellQuote(toDockerHost(config.storageEmulatorHost))}`);
    }
    return args;
}
/**
 * Start the dory-core scraping container in detached mode.
 * Returns the container ID.
 *
 * @param dockerImage Image to run — sourced from GET /api/runs/:id/config
 *                    so the API controls what version runs everywhere.
 */
async function startContainer(config, actorConfig, memoryLimit, dockerImage) {
    const envArgs = buildDockerEnvArgs(config, actorConfig, memoryLimit).join(" ");
    const memoryStr = `${memoryLimit}m`;
    const cmd = [
        "docker run -d --rm",
        `--memory=${memoryStr}`,
        "--add-host=host.docker.internal:host-gateway",
        envArgs,
        shellQuote(dockerImage), // image name comes from the API — quote it too
    ].join(" ");
    logger_1.logger.debug(`docker run: ${cmd}`);
    const { stdout } = await execAsync(cmd);
    return stdout.trim();
}
/**
 * Block until the container exits and return its exit code.
 * If the container exceeds `timeoutSecs`, it is force-killed and 137
 * (the SIGKILL exit code) is returned.
 */
async function waitForContainer(containerId, timeoutSecs) {
    try {
        // Give the container a 30s grace period beyond the run's own timeout.
        const { stdout } = await execAsync(`docker wait ${shellQuote(containerId)}`, {
            timeout: (timeoutSecs + 30) * 1_000,
        });
        return parseInt(stdout.trim(), 10);
    }
    catch (err) {
        logger_1.logger.warn(`docker wait timed out for ${containerId.slice(0, 12)}: ${err.message} — force-killing`);
        try {
            await execAsync(`docker kill ${shellQuote(containerId)}`);
        }
        catch {
            // Container already gone — ignore
        }
        return 137; // SIGKILL exit code
    }
}
|
|
78
|
+
//# sourceMappingURL=docker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docker.js","sourceRoot":"","sources":["../src/docker.ts"],"names":[],"mappings":";;AA2DA,wCAsBC;AAMD,4CAoBC;AA3GD,iDAAqC;AACrC,+BAAiC;AAEjC,qCAAkC;AAElC,MAAM,SAAS,GAAG,IAAA,gBAAS,EAAC,oBAAI,CAAC,CAAC;AASlC;;;GAGG;AACH,SAAS,YAAY,CAAC,GAAW;IAC/B,OAAO,GAAG;SACP,OAAO,CAAC,kBAAkB,EAAE,6BAA6B,CAAC;SAC1D,OAAO,CAAC,mBAAmB,EAAE,8BAA8B,CAAC,CAAC;AAClE,CAAC;AAED,SAAS,kBAAkB,CACzB,MAAoB,EACpB,WAAgC,EAChC,WAAmB;IAEnB,MAAM,YAAY,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CACpE,QAAQ,CACT,CAAC;IAEF,MAAM,IAAI,GAAG;QACX,mBAAmB,YAAY,EAAE;QACjC,mBAAmB,YAAY,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE;QACpD,4BAA4B,WAAW,EAAE;KAC1C,CAAC;IAEF,wEAAwE;IACxE,IAAI,MAAM,CAAC,SAAS;QAAE,IAAI,CAAC,IAAI,CAAC,iBAAiB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACrE,IAAI,MAAM,CAAC,YAAY;QACrB,IAAI,CAAC,IAAI,CAAC,qBAAqB,MAAM,CAAC,YAAY,EAAE,CAAC,CAAC;IACxD,IAAI,MAAM,CAAC,mBAAmB,EAAE,CAAC;QAC/B,IAAI,CAAC,IAAI,CACP,4BAA4B,YAAY,CAAC,MAAM,CAAC,mBAAmB,CAAC,EAAE,CACvE,CAAC;IACJ,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;GAMG;AACI,KAAK,UAAU,cAAc,CAClC,MAAoB,EACpB,WAAgC,EAChC,WAAmB,EACnB,WAAmB;IAEnB,MAAM,OAAO,GAAG,kBAAkB,CAAC,MAAM,EAAE,WAAW,EAAE,WAAW,CAAC,CAAC,IAAI,CACvE,GAAG,CACJ,CAAC;IACF,MAAM,SAAS,GAAG,GAAG,WAAW,GAAG,CAAC;IAEpC,MAAM,GAAG,GAAG;QACV,oBAAoB;QACpB,YAAY,SAAS,EAAE;QACvB,8CAA8C;QAC9C,OAAO;QACP,WAAW;KACZ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAEZ,eAAM,CAAC,KAAK,CAAC,eAAe,GAAG,EAAE,CAAC,CAAC;IACnC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,SAAS,CAAC,GAAG,CAAC,CAAC;IACxC,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;AACvB,CAAC;AAED;;;GAGG;AACI,KAAK,UAAU,gBAAgB,CACpC,WAAmB,EACnB,WAAmB;IAEnB,IAAI,CAAC;QACH,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,SAAS,CAAC,eAAe,WAAW,EAAE,EAAE;YAC/D,OAAO,EAAE,CAAC,WAAW,GAAG,EAAE,CAAC,GAAG,KAAK;SACpC,CAAC,CAAC;QACH,OAAO,QAAQ,CAAC,MAAM,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IACrC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,eAAM,CAAC,IAAI,CACT,6BAA6B,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,KAAM,GAAa,CAAC,OAAO,kBAAkB,CACnG,CAAC;QACF,IAAI,CAAC;YACH,MAAM,SAAS,CAAC,eAAe,WAAW,EAAE,CAAC,CAAC;QAChD,CAAC;QAAC,MAAM,CAAC;YA
CP,kCAAkC;QACpC,CAAC;QACD,OAAO,GAAG,CAAC,CAAC,oBAAoB;IAClC,CAAC;AACH,CAAC"}
|
package/dist/logger.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"logger.d.ts","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":"AAWA,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAI/C;AAaD,eAAO,MAAM,MAAM;iBACJ,MAAM;gBACP,MAAM;gBACN,MAAM;iBACL,MAAM;CACpB,CAAC"}
|
package/dist/logger.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.logger = void 0;
|
|
4
|
+
exports.setLogLevel = setLogLevel;
|
|
5
|
+
const LEVEL_WEIGHT = {
|
|
6
|
+
debug: 0,
|
|
7
|
+
info: 1,
|
|
8
|
+
warn: 2,
|
|
9
|
+
error: 3,
|
|
10
|
+
};
|
|
11
|
+
let currentLevel = "info";
|
|
12
|
+
function setLogLevel(level) {
|
|
13
|
+
if (level in LEVEL_WEIGHT) {
|
|
14
|
+
currentLevel = level;
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
function emit(level, message) {
|
|
18
|
+
if (LEVEL_WEIGHT[level] < LEVEL_WEIGHT[currentLevel])
|
|
19
|
+
return;
|
|
20
|
+
const ts = new Date().toISOString();
|
|
21
|
+
const line = `[${ts}] [${level.toUpperCase().padEnd(5)}] ${message}`;
|
|
22
|
+
if (level === "error") {
|
|
23
|
+
process.stderr.write(line + "\n");
|
|
24
|
+
}
|
|
25
|
+
else {
|
|
26
|
+
process.stdout.write(line + "\n");
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
exports.logger = {
|
|
30
|
+
debug: (msg) => emit("debug", msg),
|
|
31
|
+
info: (msg) => emit("info", msg),
|
|
32
|
+
warn: (msg) => emit("warn", msg),
|
|
33
|
+
error: (msg) => emit("error", msg),
|
|
34
|
+
};
|
|
35
|
+
//# sourceMappingURL=logger.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"logger.js","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":";;;AAWA,kCAIC;AAbD,MAAM,YAAY,GAA6B;IAC7C,KAAK,EAAE,CAAC;IACR,IAAI,EAAE,CAAC;IACP,IAAI,EAAE,CAAC;IACP,KAAK,EAAE,CAAC;CACT,CAAC;AAEF,IAAI,YAAY,GAAa,MAAM,CAAC;AAEpC,SAAgB,WAAW,CAAC,KAAa;IACvC,IAAI,KAAK,IAAI,YAAY,EAAE,CAAC;QAC1B,YAAY,GAAG,KAAiB,CAAC;IACnC,CAAC;AACH,CAAC;AAED,SAAS,IAAI,CAAC,KAAe,EAAE,OAAe;IAC5C,IAAI,YAAY,CAAC,KAAK,CAAC,GAAG,YAAY,CAAC,YAAY,CAAC;QAAE,OAAO;IAC7D,MAAM,EAAE,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,IAAI,GAAG,IAAI,EAAE,MAAM,KAAK,CAAC,WAAW,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,OAAO,EAAE,CAAC;IACrE,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;QACtB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;IACpC,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;IACpC,CAAC;AACH,CAAC;AAEY,QAAA,MAAM,GAAG;IACpB,KAAK,EAAE,CAAC,GAAW,EAAE,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC;IAC1C,IAAI,EAAE,CAAC,GAAW,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC;IACxC,IAAI,EAAE,CAAC,GAAW,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC;IACxC,KAAK,EAAE,CAAC,GAAW,EAAE,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC;CAC3C,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { Job } from "bullmq";
|
|
2
|
+
import type { WorkerConfig } from "./config";
|
|
3
|
+
/**
|
|
4
|
+
* Core job processor.
|
|
5
|
+
*
|
|
6
|
+
* Flow:
|
|
7
|
+
* 1. GET /api/runs/:id/config — fetch actorConfig from dory-api (no DB/GCS access needed)
|
|
8
|
+
* 2. POST /api/runs/:id/status — mark run as "running"
|
|
9
|
+
* 3. docker run — start the scraping container
|
|
10
|
+
* 4. Extend BullMQ lock every 2 min while waiting
|
|
11
|
+
* 5. docker wait — block until container exits
|
|
12
|
+
* 6. POST /api/runs/:id/status — exit-code fallback if HTTP callback never arrived
|
|
13
|
+
*/
|
|
14
|
+
export declare function processRunJob(config: WorkerConfig, job: Job<{
|
|
15
|
+
runId: string;
|
|
16
|
+
}>): Promise<{
|
|
17
|
+
runId: string;
|
|
18
|
+
status: string;
|
|
19
|
+
}>;
|
|
20
|
+
//# sourceMappingURL=processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processor.d.ts","sourceRoot":"","sources":["../src/processor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,QAAQ,CAAC;AAClC,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AA4C7C;;;;;;;;;;GAUG;AACH,wBAAsB,aAAa,CACjC,MAAM,EAAE,YAAY,EACpB,GAAG,EAAE,GAAG,CAAC;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,GAC1B,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC,CAoG5C"}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.processRunJob = processRunJob;
|
|
4
|
+
const docker_1 = require("./docker");
|
|
5
|
+
const logger_1 = require("./logger");
|
|
6
|
+
/**
 * GET a JSON document from dory-api.
 * Trailing slashes on the base URL are stripped before joining the path.
 * Throws on any non-2xx response, including the response body in the message.
 */
async function apiGet(apiBaseUrl, path) {
    const base = apiBaseUrl.replace(/\/+$/, "");
    const res = await fetch(`${base}${path}`);
    if (res.ok) {
        return res.json();
    }
    const body = await res.text().catch(() => "");
    throw new Error(`GET ${path} → HTTP ${res.status}: ${body}`);
}
|
|
15
|
+
/**
 * POST a JSON body to dory-api (named "apiPatch" historically; the HTTP
 * verb actually used is POST). Status reporting is best-effort: non-2xx
 * responses are logged as warnings, never thrown.
 */
async function apiPatch(apiBaseUrl, path, body) {
    const base = apiBaseUrl.replace(/\/+$/, "");
    const res = await fetch(`${base}${path}`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(body),
    });
    if (res.ok) {
        return;
    }
    const text = await res.text().catch(() => "");
    logger_1.logger.warn(`POST ${path} → HTTP ${res.status}: ${text}`);
}
|
|
27
|
+
/**
 * Core job processor.
 *
 * Flow:
 * 1. GET /api/runs/:id/config — fetch actorConfig from dory-api (no DB/GCS access needed)
 * 2. POST /api/runs/:id/status — mark run as "running"
 * 3. docker run — start the scraping container
 * 4. Extend BullMQ lock every 2 min while waiting
 * 5. docker wait — block until container exits
 * 6. POST /api/runs/:id/status — exit-code fallback if HTTP callback never arrived
 *
 * @param {object} config Worker configuration (apiBaseUrl, dockerImage, ...).
 * @param {object} job    BullMQ job whose data carries { runId }.
 * @returns {Promise<{runId: string, status: string}>} Final status summary.
 * @throws When the config fetch, container start, or container wait fails;
 *         BullMQ catches the rejection and schedules a retry.
 */
async function processRunJob(config, job) {
    const { runId } = job.data;
    if (!runId)
        throw new Error("Job data missing runId");
    logger_1.logger.info(`[${runId}] Processing job ${job.id} (attempt ${job.attemptsMade + 1})`);
    // ── Step 1: Fetch run config from API ──────────────────────────────────
    let runConfig;
    try {
        runConfig = await apiGet(config.apiBaseUrl, `/api/runs/${runId}/config`);
    }
    catch (err) {
        logger_1.logger.error(`[${runId}] Failed to fetch run config: ${err.message}`);
        throw err; // BullMQ will retry
    }
    // ── Step 2: Mark run as running ────────────────────────────────────────
    await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
        runId,
        status: "running",
    });
    // ── Step 3: Start container ────────────────────────────────────────────
    // Per-run image from the API wins over the worker-wide default.
    const dockerImage = runConfig.dockerImage || config.dockerImage;
    if (!dockerImage) {
        throw new Error(`No docker image: API did not return one and DOCKER_IMAGE env var is not set`);
    }
    let containerId;
    try {
        containerId = await (0, docker_1.startContainer)(config, runConfig.actorConfig, runConfig.memoryLimitMb, dockerImage);
        logger_1.logger.info(`[${runId}] Container started: ${containerId.slice(0, 12)}`);
    }
    catch (err) {
        logger_1.logger.error(`[${runId}] Failed to start container: ${err.message}`);
        await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
            runId,
            status: "failed",
            error: `Failed to start container: ${err.message}`,
        });
        throw err;
    }
    // ── Step 4: Extend BullMQ lock while container runs ─────────────────────
    // The container can outlive BullMQ's default lock duration; renew the
    // lock (5 min window) every 2 min so the job is not marked stalled.
    let lockTimer;
    if (job.token) {
        lockTimer = setInterval(async () => {
            try {
                await job.extendLock(job.token, 300_000);
            }
            catch (extErr) {
                logger_1.logger.warn(`[${runId}] Failed to extend lock: ${extErr.message}`);
            }
        }, 120_000);
    }
    // ── Step 5: Wait for container ────────────────────────────────────────
    let exitCode = -1;
    try {
        exitCode = await (0, docker_1.waitForContainer)(containerId, runConfig.actorTimeoutSecs);
        logger_1.logger.info(`[${runId}] Container ${containerId.slice(0, 12)} exited with code ${exitCode}`);
    }
    catch (err) {
        // FIX: previously a wait failure (e.g. actor timeout or a docker error)
        // propagated without ever reporting a terminal status, leaving the run
        // stuck in "running". Mirror the Step-3 failure handling: report
        // "failed", then rethrow so BullMQ can retry. The API ignores
        // backward-moving updates, so this is safe if a callback already landed.
        logger_1.logger.error(`[${runId}] Failed waiting for container: ${err.message}`);
        await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
            runId,
            status: "failed",
            error: `Failed waiting for container: ${err.message}`,
        });
        throw err;
    }
    finally {
        // Always stop renewing the lock, even when the wait throws.
        if (lockTimer)
            clearInterval(lockTimer);
    }
    // ── Step 6: Exit-code fallback ────────────────────────────────────────
    // The container calls POST /api/runs/:id/status itself when it finishes.
    // We POST here only as a safety net if that callback never arrived.
    // The API's handleStatusUpdate ignores updates that move a run backward
    // (e.g. completed → running), so this is safe to always send.
    const finalStatus = exitCode === 0 ? "completed" : "failed";
    await apiPatch(config.apiBaseUrl, `/api/runs/${runId}/status`, {
        runId,
        status: finalStatus,
        exitCode,
        timestamp: new Date().toISOString(),
        ...(exitCode !== 0 && {
            error: `Container exited with code ${exitCode} (status callback not received)`,
        }),
    });
    logger_1.logger.info(`[${runId}] Job ${job.id} done — status: ${finalStatus}`);
    return { runId, status: finalStatus };
}
|
|
116
|
+
//# sourceMappingURL=processor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processor.js","sourceRoot":"","sources":["../src/processor.ts"],"names":[],"mappings":";;AAwDA,sCAuGC;AA7JD,qCAA4D;AAC5D,qCAAkC;AAelC,KAAK,UAAU,MAAM,CAAI,UAAkB,EAAE,IAAY;IACvD,MAAM,GAAG,GAAG,GAAG,UAAU,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC;IACvD,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAC7B,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;QACZ,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QAC9C,MAAM,IAAI,KAAK,CAAC,OAAO,IAAI,WAAW,GAAG,CAAC,MAAM,KAAK,IAAI,EAAE,CAAC,CAAC;IAC/D,CAAC;IACD,OAAO,GAAG,CAAC,IAAI,EAAgB,CAAC;AAClC,CAAC;AAED,KAAK,UAAU,QAAQ,CACrB,UAAkB,EAClB,IAAY,EACZ,IAA6B;IAE7B,MAAM,GAAG,GAAG,GAAG,UAAU,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC;IACvD,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;QAC3B,MAAM,EAAE,MAAM;QACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;QAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;KAC3B,CAAC,CAAC;IACH,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;QACZ,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QAC9C,eAAM,CAAC,IAAI,CAAC,QAAQ,IAAI,WAAW,GAAG,CAAC,MAAM,KAAK,IAAI,EAAE,CAAC,CAAC;IAC5D,CAAC;AACH,CAAC;AAED;;;;;;;;;;GAUG;AACI,KAAK,UAAU,aAAa,CACjC,MAAoB,EACpB,GAA2B;IAE3B,MAAM,EAAE,KAAK,EAAE,GAAG,GAAG,CAAC,IAAI,CAAC;IAE3B,IAAI,CAAC,KAAK;QAAE,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,CAAC;IAEtD,eAAM,CAAC,IAAI,CACT,IAAI,KAAK,oBAAoB,GAAG,CAAC,EAAE,aAAa,GAAG,CAAC,YAAY,GAAG,CAAC,GAAG,CACxE,CAAC;IAEF,0EAA0E;IAC1E,IAAI,SAA4B,CAAC;IACjC,IAAI,CAAC;QACH,SAAS,GAAG,MAAM,MAAM,CACtB,MAAM,CAAC,UAAU,EACjB,aAAa,KAAK,SAAS,CAC5B,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,eAAM,CAAC,KAAK,CACV,IAAI,KAAK,iCAAkC,GAAa,CAAC,OAAO,EAAE,CACnE,CAAC;QACF,MAAM,GAAG,CAAC,CAAC,oBAAoB;IACjC,CAAC;IAED,0EAA0E;IAC1E,MAAM,QAAQ,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,KAAK,SAAS,EAAE;QAC7D,KAAK;QACL,MAAM,EAAE,SAAS;KAClB,CAAC,CAAC;IAEH,0EAA0E;IAC1E,MAAM,WAAW,GAAG,SAAS,CAAC,WAAW,IAAI,MAAM,CAAC,WAAW,CAAC;IAChE,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,MAAM,IAAI,KAAK,CACb,6EAA6E,CAC9E,CAAC;IACJ,CAAC;IACD,IAAI,WAAmB,CAAC;
IACxB,IAAI,CAAC;QACH,WAAW,GAAG,MAAM,IAAA,uBAAc,EAChC,MAAM,EACN,SAAS,CAAC,WAAW,EACrB,SAAS,CAAC,aAAa,EACvB,WAAW,CACZ,CAAC;QACF,eAAM,CAAC,IAAI,CAAC,IAAI,KAAK,wBAAwB,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;IAC3E,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,eAAM,CAAC,KAAK,CACV,IAAI,KAAK,gCAAiC,GAAa,CAAC,OAAO,EAAE,CAClE,CAAC;QACF,MAAM,QAAQ,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,KAAK,SAAS,EAAE;YAC7D,KAAK;YACL,MAAM,EAAE,QAAQ;YAChB,KAAK,EAAE,8BAA+B,GAAa,CAAC,OAAO,EAAE;SAC9D,CAAC,CAAC;QACH,MAAM,GAAG,CAAC;IACZ,CAAC;IAED,2EAA2E;IAC3E,IAAI,SAAqD,CAAC;IAC1D,IAAI,GAAG,CAAC,KAAK,EAAE,CAAC;QACd,SAAS,GAAG,WAAW,CAAC,KAAK,IAAI,EAAE;YACjC,IAAI,CAAC;gBACH,MAAM,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,KAAM,EAAE,OAAO,CAAC,CAAC;YAC5C,CAAC;YAAC,OAAO,MAAM,EAAE,CAAC;gBAChB,eAAM,CAAC,IAAI,CACT,IAAI,KAAK,4BAA6B,MAAgB,CAAC,OAAO,EAAE,CACjE,CAAC;YACJ,CAAC;QACH,CAAC,EAAE,OAAO,CAAC,CAAC;IACd,CAAC;IAED,yEAAyE;IACzE,IAAI,QAAQ,GAAG,CAAC,CAAC,CAAC;IAClB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,IAAA,yBAAgB,EAAC,WAAW,EAAE,SAAS,CAAC,gBAAgB,CAAC,CAAC;QAC3E,eAAM,CAAC,IAAI,CACT,IAAI,KAAK,eAAe,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,qBAAqB,QAAQ,EAAE,CAChF,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,IAAI,SAAS;YAAE,aAAa,CAAC,SAAS,CAAC,CAAC;IAC1C,CAAC;IAED,yEAAyE;IACzE,yEAAyE;IACzE,oEAAoE;IACpE,wEAAwE;IACxE,8DAA8D;IAC9D,MAAM,WAAW,GAAG,QAAQ,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC;IAC5D,MAAM,QAAQ,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,KAAK,SAAS,EAAE;QAC7D,KAAK;QACL,MAAM,EAAE,WAAW;QACnB,QAAQ;QACR,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,GAAG,CAAC,QAAQ,KAAK,CAAC,IAAI;YACpB,KAAK,EAAE,8BAA8B,QAAQ,iCAAiC;SAC/E,CAAC;KACH,CAAC,CAAC;IAEH,eAAM,CAAC,IAAI,CAAC,IAAI,KAAK,SAAS,GAAG,CAAC,EAAE,mBAAmB,WAAW,EAAE,CAAC,CAAC;IACtE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;AACxC,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { WorkerConfig } from "./config";
|
|
2
|
+
/**
 * Load the INPUT.json that dory-api wrote to GCS when the run was created.
 * Supports both real GCS (production) and the fake-gcs-server emulator (dev).
 *
 * @param config Worker configuration (GCS bucket, project id, optional emulator host).
 * @param runId  Run whose INPUT.json should be fetched.
 * @returns The parsed input object; an empty object when the file is missing
 *          or any storage error occurs (best-effort — errors are logged, not thrown).
 */
export declare function loadInputFromStorage(config: WorkerConfig, runId: string): Promise<Record<string, unknown>>;
|
|
7
|
+
//# sourceMappingURL=storage.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"storage.d.ts","sourceRoot":"","sources":["../src/storage.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAG7C;;;GAGG;AACH,wBAAsB,oBAAoB,CACxC,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,MAAM,GACZ,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CA+BlC"}
|
package/dist/storage.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.loadInputFromStorage = loadInputFromStorage;
|
|
4
|
+
const storage_1 = require("@google-cloud/storage");
|
|
5
|
+
const logger_1 = require("./logger");
|
|
6
|
+
/**
|
|
7
|
+
* Load the INPUT.json that dory-api wrote to GCS when the run was created.
|
|
8
|
+
* Supports both real GCS (production) and the fake-gcs-server emulator (dev).
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Load the INPUT.json that dory-api wrote to GCS when the run was created.
 * Supports both real GCS (production) and the fake-gcs-server emulator (dev).
 * Best-effort: every failure is logged and an empty input object is returned.
 *
 * @param {object} config Worker configuration (gcsBucket, gcpProjectId,
 *                        optional storageEmulatorHost).
 * @param {string} runId  Run whose INPUT.json should be fetched.
 * @returns {Promise<Record<string, unknown>>} Parsed input, or {} on failure.
 */
async function loadInputFromStorage(config, runId) {
    const objectPath = `${runId}/storage/key_value_stores/default/INPUT.json`;
    try {
        if (!config.storageEmulatorHost) {
            // Production: real GCS with application default credentials
            const client = new storage_1.Storage({ projectId: config.gcpProjectId });
            const [buf] = await client
                .bucket(config.gcsBucket)
                .file(objectPath)
                .download();
            return JSON.parse(buf.toString());
        }
        // Dev: download via fake-gcs-server HTTP API
        const base = config.storageEmulatorHost.replace(/\/+$/, "");
        const objectUrl = `${base}/download/storage/v1/b/${config.gcsBucket}/o/${encodeURIComponent(objectPath)}?alt=media`;
        const response = await fetch(objectUrl);
        if (response.ok) {
            return (await response.json());
        }
        logger_1.logger.warn(`GCS emulator: INPUT.json not found for run ${runId} (HTTP ${response.status}) — using empty input`);
        return {};
    }
    catch (err) {
        logger_1.logger.error(`Failed to load INPUT.json for run ${runId}: ${err.message}`);
        return {};
    }
}
|
|
37
|
+
//# sourceMappingURL=storage.js.map
|