@flink-app/flink 2.0.0-alpha.72 → 2.0.0-alpha.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/FlinkApp.ts CHANGED
@@ -17,7 +17,8 @@ import { FlinkAuthPlugin } from "./auth/FlinkAuthPlugin";
17
17
  import { FlinkContext } from "./FlinkContext";
18
18
  import { FlinkError, internalServerError, notFound, unauthorized } from "./FlinkErrors";
19
19
  import { FlinkRequest, Handler, HandlerFile, HttpMethod, QueryParamMetadata, RouteProps, ValidationMode } from "./FlinkHttpHandler";
20
- import { FlinkJobFile } from "./FlinkJob";
20
+ import { FlinkJobFile, FlinkJobProps } from "./FlinkJob";
21
+ import { LeaderElection, LeaderElectionOptions } from "./LeaderElection";
21
22
  import { log } from "./FlinkLog";
22
23
  import { FlinkLogFactory } from "./FlinkLogFactory";
23
24
  import { FlinkPlugin } from "./FlinkPlugin";
@@ -30,6 +31,7 @@ import { formatValidationErrors, getPathParams, isError } from "./utils";
30
31
 
31
32
  const initLog = FlinkLogFactory.createLogger("flink.init");
32
33
  const perfLog = FlinkLogFactory.createLogger("flink.perf");
34
+ const schedulerLog = FlinkLogFactory.createLogger("flink.scheduler");
33
35
 
34
36
  const ajv = new Ajv();
35
37
  addFormats(ajv);
@@ -179,23 +181,33 @@ export interface FlinkOptions {
179
181
  */
180
182
  enabled?: boolean;
181
183
 
182
- // TODO: Implement master auto assignment
183
- // /**
184
- // * If true, the master (the instance if flink app that will run jobs) will be
185
- // * automatically assigned to the first node that starts.
186
- // *
187
- // * Is persisted in database.
188
- // *
189
- // * Will throw and exception if true but no database is configured.
190
- // */
191
- // autoAssignMaster?: boolean;
192
-
193
- // /**
194
- // * Name of collection to be used for storing master assignment.
195
- // *
196
- // * Defaults to `flink-scheduling`
197
- // */
198
- // autoAssignCollection?: string;
184
+ /**
185
+ * Enable leader election for horizontally scaled deployments.
186
+ *
187
+ * When enabled, only one instance (the leader) will run scheduled jobs.
188
+ * If the leader goes down, another instance automatically takes over.
189
+ *
190
+ * Requires a database connection (`db` option) since leader election
191
+ * state is persisted in MongoDB. If no database is configured, a warning
192
+ * will be logged and jobs will run on all instances (no leader election).
193
+ *
194
+ * Set to `true` for default settings, or pass an options object to customize.
195
+ *
196
+ * @example
197
+ * ```ts
198
+ * // Use defaults (15s lease, 5s heartbeat)
199
+ * scheduling: { leaderElection: true }
200
+ *
201
+ * // Custom settings
202
+ * scheduling: {
203
+ * leaderElection: {
204
+ * leaseDurationMs: 30000,
205
+ * heartbeatIntervalMs: 10000,
206
+ * }
207
+ * }
208
+ * ```
209
+ */
210
+ leaderElection?: boolean | LeaderElectionOptions;
199
211
  };
200
212
 
201
213
  /**
@@ -337,6 +349,8 @@ export class FlinkApp<C extends FlinkContext> {
337
349
  private handlerRouteCache = new Map<string, string>();
338
350
 
339
351
  public scheduler?: ToadScheduler;
352
+ private allInstanceScheduler?: ToadScheduler;
353
+ private leaderElection?: LeaderElection;
340
354
 
341
355
  private accessLog: { enabled: boolean; format: string };
342
356
 
@@ -406,10 +420,10 @@ export class FlinkApp<C extends FlinkContext> {
406
420
  await this.initializeAgents();
407
421
  perfLog.debug(`Initialize agents took ${Date.now() - agentInitStartTime}ms`);
408
422
 
409
- if (this.isSchedulingEnabled) {
423
+ if (this.isSchedulingEnabled && !this.leaderElectionConfig) {
410
424
  this.scheduler = new ToadScheduler();
411
- } else {
412
- initLog.info("🚫 Scheduling is disabled");
425
+ } else if (!this.isSchedulingEnabled) {
426
+ schedulerLog.info("Scheduling is disabled");
413
427
  }
414
428
 
415
429
  if (!this.disableHttpServer) {
@@ -459,9 +473,13 @@ export class FlinkApp<C extends FlinkContext> {
459
473
  perfLog.debug(`Register handlers took ${Date.now() - handlersStartTime}ms`);
460
474
 
461
475
  if (this.isSchedulingEnabled) {
462
- const jobsStartTime = Date.now();
463
- await this.registerAutoRegisterableJobs();
464
- perfLog.debug(`Register jobs took ${Date.now() - jobsStartTime}ms`);
476
+ if (this.leaderElectionConfig) {
477
+ await this.startLeaderElection();
478
+ } else {
479
+ const jobsStartTime = Date.now();
480
+ await this.registerAutoRegisterableJobs();
481
+ perfLog.debug(`Register jobs took ${Date.now() - jobsStartTime}ms`);
482
+ }
465
483
  }
466
484
 
467
485
  // Register 404 with slight delay to allow all manually added routes to be added
@@ -495,10 +513,18 @@ export class FlinkApp<C extends FlinkContext> {
495
513
  async stop() {
496
514
  log.info("🛑 Stopping Flink app...");
497
515
 
516
+ if (this.leaderElection) {
517
+ await this.leaderElection.stop();
518
+ }
519
+
498
520
  if (this.scheduler) {
499
521
  await this.scheduler.stop();
500
522
  }
501
523
 
524
+ if (this.allInstanceScheduler) {
525
+ await this.allInstanceScheduler.stop();
526
+ }
527
+
502
528
  if (this.expressServer) {
503
529
  return new Promise<void>((resolve, reject) => {
504
530
  const int = setTimeout(() => {
@@ -1068,9 +1094,15 @@ export class FlinkApp<C extends FlinkContext> {
1068
1094
 
1069
1095
  log.debug(`Registering ${schemaCount} schemas with AJV (manifest version: ${schemaManifest.version || "1.0"})`);
1070
1096
 
1071
- for (const { handler, assumedHttpMethod, __file } of autoRegisteredHandlers.sort(
1072
- (a, b) => (a.handler.Route?.order || 0) - (b.handler.Route?.order || 0)
1073
- )) {
1097
+ for (const { handler, assumedHttpMethod, __file } of autoRegisteredHandlers.sort((a, b) => {
1098
+ const orderDiff = (a.handler.Route?.order || 0) - (b.handler.Route?.order || 0);
1099
+ if (orderDiff !== 0) return orderDiff;
1100
+ // Static segments must be registered before parameterized ones to avoid
1101
+ // Express matching e.g. GET /jobs/by-tags with the /jobs/:id route.
1102
+ const aHasParam = a.handler.Route?.path?.includes("/:") ? 1 : 0;
1103
+ const bHasParam = b.handler.Route?.path?.includes("/:") ? 1 : 0;
1104
+ return aHasParam - bHasParam;
1105
+ })) {
1074
1106
  if (!handler.Route) {
1075
1107
  log.error(`Missing Props in handler ${__file}`);
1076
1108
  continue;
@@ -1142,40 +1174,43 @@ export class FlinkApp<C extends FlinkContext> {
1142
1174
  }
1143
1175
  }
1144
1176
 
1145
- private async registerAutoRegisterableJobs() {
1177
+ private async registerAutoRegisterableJobs(filter?: (jobProps: FlinkJobProps) => boolean) {
1146
1178
  if (!this.scheduler) {
1147
1179
  throw new Error("Scheduler not initialized"); // should never happen
1148
1180
  }
1149
1181
 
1150
1182
  for (const { Job: jobProps, default: jobFn, __file } of autoRegisteredJobs) {
1183
+ if (filter && !filter(jobProps)) {
1184
+ continue;
1185
+ }
1151
1186
  if (jobProps.cron && jobProps.interval) {
1152
- log.error(`Cannot register job ${jobProps.id} - both cron and interval are set in ${__file}`);
1187
+ schedulerLog.error(`Cannot register job ${jobProps.id} - both cron and interval are set in ${__file}`);
1153
1188
  continue;
1154
1189
  }
1155
1190
 
1156
1191
  if (jobProps.cron && jobProps.afterDelay) {
1157
- log.error(`Cannot register job ${jobProps.id} - both cron and afterDelay are set in ${__file}`);
1192
+ schedulerLog.error(`Cannot register job ${jobProps.id} - both cron and afterDelay are set in ${__file}`);
1158
1193
  continue;
1159
1194
  }
1160
1195
 
1161
1196
  if (jobProps.interval && jobProps.afterDelay) {
1162
- log.error(`Cannot register job ${jobProps.id} - both interval and afterDelay are set in ${__file}`);
1197
+ schedulerLog.error(`Cannot register job ${jobProps.id} - both interval and afterDelay are set in ${__file}`);
1163
1198
  continue;
1164
1199
  }
1165
1200
 
1166
1201
  if (this.scheduler.existsById(jobProps.id)) {
1167
- log.error(`Job with id ${jobProps.id} is already registered, found duplicate in ${__file}`);
1202
+ schedulerLog.error(`Job with id ${jobProps.id} is already registered, found duplicate in ${__file}`);
1168
1203
  continue;
1169
1204
  }
1170
1205
 
1171
- log.debug(`Registering job ${jobProps.id}: ${JSON.stringify(jobProps)} from ${__file}`);
1206
+ schedulerLog.debug(`Registering job ${jobProps.id}: ${JSON.stringify(jobProps)} from ${__file}`);
1172
1207
 
1173
1208
  const task = new AsyncTask(
1174
1209
  jobProps.id,
1175
1210
  async () => {
1176
1211
  await jobFn({ ctx: this.ctx });
1177
1212
 
1178
- log.debug(`Job ${jobProps.id} completed`);
1213
+ schedulerLog.debug(`Job ${jobProps.id} completed`);
1179
1214
 
1180
1215
  if (jobProps.afterDelay) {
1181
1216
  // afterDelay runs only once, so we remove the job
@@ -1183,7 +1218,7 @@ export class FlinkApp<C extends FlinkContext> {
1183
1218
  }
1184
1219
  },
1185
1220
  (err) => {
1186
- log.error(`Job ${jobProps.id} threw unhandled exception ${err}`);
1221
+ schedulerLog.error(`Job ${jobProps.id} threw unhandled exception ${err}`);
1187
1222
  console.error(err);
1188
1223
  }
1189
1224
  );
@@ -1216,7 +1251,7 @@ export class FlinkApp<C extends FlinkContext> {
1216
1251
  try {
1217
1252
  await jobFn({ ctx: this.ctx });
1218
1253
  } catch (err) {
1219
- log.error(`Job ${jobProps.id} threw unhandled exception ${err}`);
1254
+ schedulerLog.error(`Job ${jobProps.id} threw unhandled exception ${err}`);
1220
1255
  console.error(err);
1221
1256
  }
1222
1257
  });
@@ -1235,7 +1270,7 @@ export class FlinkApp<C extends FlinkContext> {
1235
1270
  this.scheduler.addSimpleIntervalJob(job);
1236
1271
  }
1237
1272
  } else {
1238
- log.error(`Cannot register job ${jobProps.id} - no cron, interval or once set in ${__file}`);
1273
+ schedulerLog.error(`Cannot register job ${jobProps.id} - no cron, interval or once set in ${__file}`);
1239
1274
  continue;
1240
1275
  }
1241
1276
  }
@@ -1496,6 +1531,56 @@ export class FlinkApp<C extends FlinkContext> {
1496
1531
  return this.schedulingOptions?.enabled !== false;
1497
1532
  }
1498
1533
 
1534
/**
 * Normalized view of `scheduling.leaderElection`.
 *
 * Returns an empty options object when the option is `true`, the options
 * object itself when one was supplied, and `undefined` for any falsy value.
 */
private get leaderElectionConfig(): LeaderElectionOptions | undefined {
  const setting = this.schedulingOptions?.leaderElection;

  if (setting === true) {
    return {};
  }

  return setting || undefined;
}
1539
+
1540
/**
 * Starts leader election for scheduled jobs.
 *
 * - Without a database: logs a warning and registers every job on this
 *   instance (no coordination between instances).
 * - With a database: jobs flagged `runOnAllInstances` are registered right
 *   away on a dedicated scheduler. All other jobs are registered when this
 *   instance becomes leader and their scheduler is stopped when leadership
 *   is lost.
 */
private async startLeaderElection() {
  if (!this.db) {
    schedulerLog.warn(
      "Leader election is enabled but no database is configured. " +
        "Leader election requires a MongoDB connection to coordinate between instances. " +
        "Either add a database connection via the `db` option, or remove `scheduling.leaderElection` from your config. " +
        "Jobs will run on ALL instances without leader election."
    );
    // Fall back to running jobs on all instances
    this.scheduler = new ToadScheduler();
    await this.registerAutoRegisterableJobs();
    return;
  }

  // Register runOnAllInstances jobs immediately on a separate scheduler
  const hasAllInstanceJobs = autoRegisteredJobs.some((j) => j.Job.runOnAllInstances);
  if (hasAllInstanceJobs) {
    this.allInstanceScheduler = new ToadScheduler();
    // registerAutoRegisterableJobs() registers on `this.scheduler`, so point it
    // at the dedicated scheduler only for the duration of the registration.
    this.scheduler = this.allInstanceScheduler;
    try {
      await this.registerAutoRegisterableJobs((job) => !!job.runOnAllInstances);
    } finally {
      // Always undo the temporary alias, even when registration throws, so the
      // leader-only scheduler slot never points at the all-instance scheduler.
      this.scheduler = undefined;
    }
  }

  const opts = this.leaderElectionConfig;
  this.leaderElection = new LeaderElection(this.db, opts);

  await this.leaderElection.start(
    // onBecameLeader
    async () => {
      schedulerLog.info("This instance is now the leader - starting scheduled jobs");
      // Never leave an earlier leader scheduler running: its jobs would keep
      // firing next to the ones registered below.
      this.scheduler?.stop();
      this.scheduler = new ToadScheduler();
      await this.registerAutoRegisterableJobs((job) => !job.runOnAllInstances);
    },
    // onLostLeadership
    () => {
      schedulerLog.info("This instance lost leadership - stopping scheduled jobs");
      if (this.scheduler) {
        this.scheduler.stop();
        this.scheduler = undefined;
      }
    }
  );
}
1583
+
1499
1584
  private getMongoConnectionOptions() {
1500
1585
  if (!this.dbOpts) {
1501
1586
  throw new Error("No db configured");
package/src/FlinkJob.ts CHANGED
@@ -37,6 +37,17 @@ export type FlinkJobProps = {
37
37
  * retried after the next interval.
38
38
  */
39
39
  singleton?: boolean;
40
+
41
+ /**
42
+ * If true, this job will run on all instances regardless of leader election.
43
+ *
44
+ * By default, when leader election is enabled, jobs only run on the leader instance.
45
+ * Set this to true for jobs that should run on every instance, such as
46
+ * local cache cleanup or instance-specific health checks.
47
+ *
48
+ * Has no effect when leader election is not enabled.
49
+ */
50
+ runOnAllInstances?: boolean;
40
51
  };
41
52
 
42
53
  /**
package/src/FlinkRepo.ts CHANGED
@@ -51,7 +51,7 @@ export abstract class FlinkRepo<C extends FlinkContext, Model extends Document>
51
51
  return { ...model, _id: result.insertedId.toString() };
52
52
  }
53
53
 
54
- async updateOne(id: string | ObjectId, model: PartialModel<Model>): Promise<Model | null> {
54
+ async updateById(id: string | ObjectId, model: PartialModel<Model>): Promise<Model | null> {
55
55
  const oid = this.buildId(id);
56
56
 
57
57
  const { _id, ...modelWithoutId } = model;
@@ -66,6 +66,13 @@ export abstract class FlinkRepo<C extends FlinkContext, Model extends Document>
66
66
  return null;
67
67
  }
68
68
 
69
+ /**
+ * Backward-compatible alias: forwards `id` and `model` unchanged to
+ * `updateById` and returns its result.
+ *
70
+ * @deprecated Use `updateById` instead. This will be removed in a future major version.
71
+ */
72
+ async updateOne(id: string | ObjectId, model: PartialModel<Model>): Promise<Model | null> {
73
+ return this.updateById(id, model);
74
+ }
75
+
69
76
  async updateMany<U = PartialModel<Model>>(query: any, model: U): Promise<number> {
70
77
  const { _id, ...modelWithoutId } = model as any;
71
78
 
@@ -0,0 +1,203 @@
1
+ import { Collection, Db } from "mongodb";
2
+ import { v4 } from "uuid";
3
+ import { FlinkLogFactory } from "./FlinkLogFactory";
4
+
5
+ const log = FlinkLogFactory.createLogger("flink.scheduler");
6
+
7
+ export interface LeaderElectionOptions {
8
+ /**
9
+ * Duration in milliseconds before a leader's lease expires.
10
+ * If the leader fails to heartbeat within this time, another instance can take over.
+ * The lock document is also covered by a TTL index on `lastHeartbeat` that
+ * expires after twice this duration (rounded up to whole seconds).
+ * A falsy value (for example `0`) falls back to the default.
11
+ * @default 15000
12
+ */
13
+ leaseDurationMs?: number;
14
+
15
+ /**
16
+ * Interval in milliseconds between heartbeats sent by the leader.
17
+ * Should be significantly less than leaseDurationMs (typically 1/3).
+ * Instances that are not the leader retry claiming at twice this interval.
+ * A falsy value (for example `0`) falls back to the default.
18
+ * @default 5000
19
+ */
20
+ heartbeatIntervalMs?: number;
21
+
22
+ /**
23
+ * Name of the MongoDB collection used for leader election.
+ * An empty string falls back to the default.
24
+ * @default "_flink_leader"
25
+ */
26
+ collectionName?: string;
27
+ }
28
+
29
+ interface LeaderRecord {
30
+ _id: string;
31
+ instanceId: string;
32
+ lastHeartbeat: Date;
33
+ claimedAt: Date;
34
+ }
35
+
36
+ const LOCK_NAME = "job-scheduler";
37
+
38
/**
 * MongoDB-backed leader election based on a single lock document with a lease.
 *
 * Every instance periodically tries to upsert the lock document. The update
 * only matches when the instance already owns the lock (heartbeat) or when the
 * current owner's `lastHeartbeat` is older than the lease duration (takeover).
 * When another instance holds a valid lease the upsert fails with a duplicate
 * key error, which is treated as "not the leader".
 */
export class LeaderElection {
  private instanceId = v4();
  private _isLeader = false;
  private timer: ReturnType<typeof setTimeout> | null = null;
  private collection: Collection<LeaderRecord>;
  private leaseDurationMs: number;
  private heartbeatIntervalMs: number;
  private onBecameLeader?: () => void | Promise<void>;
  private onLostLeadership?: () => void | Promise<void>;
  private stopped = false;
  private transitioning = false;

  /**
   * @param db Database that holds the lock collection
   * @param opts Optional tuning; missing, non-finite or non-positive durations
   *             fall back to the defaults (15000 ms lease, 5000 ms heartbeat)
   */
  constructor(db: Db, opts?: LeaderElectionOptions) {
    const collectionName = opts?.collectionName || "_flink_leader";
    this.leaseDurationMs = LeaderElection.positiveOr(opts?.leaseDurationMs, 15000);
    this.heartbeatIntervalMs = LeaderElection.positiveOr(opts?.heartbeatIntervalMs, 5000);

    if (this.heartbeatIntervalMs >= this.leaseDurationMs) {
      // The lease would expire between two heartbeats, letting other instances steal it.
      log.warn(
        `heartbeatIntervalMs (${this.heartbeatIntervalMs}) should be less than leaseDurationMs (${this.leaseDurationMs})`
      );
    }

    this.collection = db.collection<LeaderRecord>(collectionName);
  }

  /** Whether this instance currently considers itself the leader. */
  get isLeader() {
    return this._isLeader;
  }

  /**
   * Start the leader election process.
   * @param onBecameLeader Called when this instance becomes the leader
   * @param onLostLeadership Called when this instance loses leadership
   * @throws Rethrows index creation errors other than an options conflict
   */
  async start(onBecameLeader: () => void | Promise<void>, onLostLeadership: () => void | Promise<void>) {
    this.onBecameLeader = onBecameLeader;
    this.onLostLeadership = onLostLeadership;
    this.stopped = false;
    // A repeated start() must not leave an earlier pending attempt scheduled.
    this.clearTimer();

    // Ensure TTL index exists for cleanup
    const ttlSeconds = Math.ceil((this.leaseDurationMs * 2) / 1000);
    try {
      await this.collection.createIndex({ lastHeartbeat: 1 }, { expireAfterSeconds: ttlSeconds });
    } catch (err: any) {
      if (err.codeName === "IndexOptionsConflict" || err.code === 85) {
        log.debug("TTL index options changed, recreating index");
        await this.collection.dropIndex("lastHeartbeat_1");
        await this.collection.createIndex({ lastHeartbeat: 1 }, { expireAfterSeconds: ttlSeconds });
      } else {
        throw err;
      }
    }

    log.info(`Leader election started (instance: ${this.instanceId.substring(0, 8)})`);

    // Run first election attempt immediately
    await this.tryClaimLeadership();
  }

  /**
   * Stop the leader election and release leadership if held.
   */
  async stop() {
    this.stopped = true;
    this.clearTimer();

    if (this._isLeader) {
      this._isLeader = false;
      await this.releaseLock();
    }
  }

  /** Returns `value` when it is a finite number greater than zero, else `fallback`. */
  private static positiveOr(value: number | undefined, fallback: number): number {
    return typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
  }

  /**
   * Extracts the owning instance id from a `findOneAndUpdate` result.
   * Accepts both result shapes: the document itself, or a metadata wrapper
   * that carries the document under `value`.
   */
  private static ownerOf(result: unknown): string | undefined {
    if (!result || typeof result !== "object") return undefined;

    const direct = (result as { instanceId?: unknown }).instanceId;
    if (typeof direct === "string") return direct;

    const wrapped = (result as { value?: { instanceId?: unknown } | null }).value;
    return typeof wrapped?.instanceId === "string" ? wrapped.instanceId : undefined;
  }

  /** Cancels the pending election attempt, if any. */
  private clearTimer() {
    if (this.timer) {
      clearTimeout(this.timer);
      this.timer = null;
    }
  }

  /** Deletes the lock document if (and only if) it is owned by this instance. */
  private async releaseLock() {
    try {
      await this.collection.deleteOne({
        _id: LOCK_NAME as any,
        instanceId: this.instanceId,
      });
      log.info("Leadership released on shutdown");
    } catch (err) {
      log.error(`Failed to release leadership on shutdown: ${err}`);
    }
  }

  /**
   * Records the new leadership state and runs the matching callback.
   * Callback errors are logged and never propagate into the election loop.
   */
  private async setLeader(isLeader: boolean) {
    const shortId = this.instanceId.substring(0, 8);
    if (isLeader) {
      log.info(`This instance became the leader (instance: ${shortId})`);
    } else {
      log.warn(`This instance lost leadership (instance: ${shortId})`);
    }

    this._isLeader = isLeader;
    this.transitioning = true;
    try {
      await (isLeader ? this.onBecameLeader : this.onLostLeadership)?.();
    } catch (err) {
      log.error(`Error in ${isLeader ? "onBecameLeader" : "onLostLeadership"} callback: ${err}`);
    } finally {
      this.transitioning = false;
    }
  }

  /**
   * One election round: heartbeat / claim attempt, state transition, and
   * scheduling of the next round.
   */
  private async tryClaimLeadership() {
    if (this.stopped || this.transitioning) return;

    const now = new Date();
    const leaseExpiry = new Date(now.getTime() - this.leaseDurationMs);
    const heartbeat = { instanceId: this.instanceId, lastHeartbeat: now };

    // While leading this is a plain heartbeat and `claimedAt` must be kept
    // (only set if the document had to be re-inserted). Otherwise this is a
    // fresh claim, so `claimedAt` is stamped even when an expired document of
    // a previous leader is taken over.
    const update = this._isLeader
      ? { $set: heartbeat, $setOnInsert: { claimedAt: now } }
      : { $set: { ...heartbeat, claimedAt: now } };

    let gotLock = false;
    try {
      const result = await this.collection.findOneAndUpdate(
        {
          _id: LOCK_NAME as any,
          $or: [{ instanceId: this.instanceId }, { lastHeartbeat: { $lt: leaseExpiry } }],
        },
        update,
        { upsert: true, returnDocument: "after" }
      );
      gotLock = LeaderElection.ownerOf(result) === this.instanceId;
    } catch (err: unknown) {
      const code = typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
      // 11000 = duplicate key: another instance holds a valid lease.
      // Any other error: assume we lost leadership to be safe.
      if (code !== 11000) {
        log.error(`Leader election error: ${err}`);
      }
      gotLock = false;
    }

    if (this.stopped) {
      // stop() ran while the request was in flight: do not fire callbacks, and
      // give the lock back if this late attempt (re-)acquired it.
      if (gotLock) {
        await this.releaseLock();
      }
      return;
    }

    if (gotLock && !this._isLeader) {
      await this.setLeader(true);
    } else if (!gotLock && this._isLeader) {
      await this.setLeader(false);
    }

    // Schedule next attempt: leaders heartbeat at the configured interval,
    // followers poll at half that rate.
    if (!this.stopped) {
      const nextInterval = this._isLeader ? this.heartbeatIntervalMs : this.heartbeatIntervalMs * 2;

      this.timer = setTimeout(() => {
        void this.tryClaimLeadership().catch((err) => log.error(`Leader election error: ${err}`));
      }, nextInterval);
    }
  }
}
package/src/index.ts CHANGED
@@ -10,6 +10,8 @@ export * from "./FlinkRequestContext";
10
10
  export * from "./FlinkErrors";
11
11
  export * from "./FlinkPlugin";
12
12
  export * from "./FlinkJob";
13
+ export { LeaderElection } from "./LeaderElection";
14
+ export type { LeaderElectionOptions } from "./LeaderElection";
13
15
  export * from "./auth/FlinkAuthUser";
14
16
  export * from "./auth/FlinkAuthPlugin";
15
17
  export * from "./ai/FlinkTool";