@mux-magic/tools 0.1.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,414 @@
1
+ import {
2
+ finalize,
3
+ ignoreElements,
4
+ mergeMap,
5
+ Observable,
6
+ type OperatorFunction,
7
+ Subject,
8
+ type Subscriber,
9
+ type Subscription,
10
+ tap,
11
+ } from "rxjs"
12
+
13
+ // The scheduler historically read the active job id from server-only
14
+ // AsyncLocalStorage. To keep @mux-magic/tools free of server imports,
15
+ // the provider is injected at init time. CLI passes nothing (constant
16
+ // null); server passes its real getActiveJobId from logCapture.
17
+ type GetActiveJobId = () => string | null | undefined
18
+
19
+ const nullJobIdProvider: GetActiveJobId = () => null
20
+
21
+ let getActiveJobId: GetActiveJobId = nullJobIdProvider
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Process-wide Task scheduler
25
+ //
26
+ // A "Task" is a unit of heavy work (per-file copy, ffmpeg invocation, etc.)
27
+ // that's part of a Job. Tasks compete for a fixed pool of `concurrency`
28
+ // slots under TWO coupled constraints:
29
+ // 1. inflight-global < MAX_THREADS (unchanged from original design)
30
+ // 2. inflight-for-job < job.claim (new — per-job quota)
31
+ //
32
+ // The per-job claim is registered before a job's tasks are enqueued via
33
+ // `registerJobClaim(jobId, claim)` and torn down via `unregisterJobClaim`
34
+ // once the job finishes. Tasks without a jobId (null) are gated only by
35
+ // the global cap.
36
+ //
37
+ // Wiring: a single inbox Subject carries `{ bridge$, jobId }` pairs through
38
+ // a custom scheduler operator. Each `runTask(work$)` pushes a gated inner
39
+ // Observable onto the inbox; when the operator grants a slot (both constraints
40
+ // satisfied), it subscribes to bridge$ which then starts work$ and forwards
41
+ // values to the caller. Slot is held for as long as the bridge$ subscription
42
+ // is alive, then released on complete/error or on caller unsubscribe.
43
+ //
44
+ // Fair scheduling: when the front of the queue can't be admitted (per-job
45
+ // cap full), the operator scans forward to find any task from a different
46
+ // job that can run — preventing one job's saturated claim from blocking
47
+ // other jobs' tasks.
48
+ //
49
+ // Composition rule: operators that already route through `runTask` /
50
+ // `runTasks` MUST NOT be nested inside another finite-concurrency
51
+ // `mergeMap(..., n)` operating over scheduled work. Use unbounded
52
+ // `mergeAll()` upstream and let the scheduler do the bounding.
53
+ // ---------------------------------------------------------------------------
54
+
55
+ type ScheduledTask = {
56
+ bridge$: Observable<never>
57
+ jobId: string | null
58
+ }
59
+
60
+ let concurrency: number | null = null
61
+ let inbox: Subject<ScheduledTask> | null = null
62
+
63
+ // Per-job thread claim registry — populated by registerJobClaim /
64
+ // unregisterJobClaim outside the scheduler operator so callers can
65
+ // register before enqueueing tasks.
66
+ const claimByJob = new Map<string, number>()
67
+
68
+ const ensureInbox = (): Subject<ScheduledTask> => {
69
+ if (inbox === null) {
70
+ throw new Error(
71
+ "Task scheduler not initialized. Call initTaskScheduler() at process startup.",
72
+ )
73
+ }
74
+
75
+ return inbox
76
+ }
77
+
78
// Custom scheduler operator: replaces the former `mergeAll(concurrency)`.
// Enforces the global cap AND the per-job claim on every admission decision.
//
// The pipeline carries no values (Observable<never>); the operator exists
// purely to subscribe to each task's bridge$ when a slot opens and to
// release the slot when that subscription terminates.
const buildScheduler =
  (
    maxConcurrency: number,
  ): ((
    source: Observable<ScheduledTask>,
  ) => Observable<never>) =>
  (source$) =>
    new Observable<never>((outerSub) => {
      // Slots held across all jobs (constraint 1: global cap).
      let inflight = 0
      // Slots held per job id (constraint 2: per-job claim).
      const inflightByJob = new Map<string, number>()
      // Tasks waiting for admission, in arrival order.
      const queue: ScheduledTask[] = []

      // True when BOTH constraints allow the task to start. A job with
      // no registered claim falls back to the global cap, i.e. it is
      // unconstrained at the per-job level.
      const canAdmit = ({
        jobId,
      }: ScheduledTask): boolean => {
        if (inflight >= maxConcurrency) return false
        if (jobId === null) return true
        const claim =
          claimByJob.get(jobId) ?? maxConcurrency
        return (inflightByJob.get(jobId) ?? 0) < claim
      }

      // Releases the slot held by a finished (or errored) bridge, then
      // immediately re-runs admission since freeing a slot may unblock
      // one or more queued tasks.
      const onComplete = (jobId: string | null): void => {
        inflight -= 1
        if (jobId !== null) {
          const count = (inflightByJob.get(jobId) ?? 0) - 1
          if (count <= 0) {
            // Delete rather than store 0 so the map only holds jobs
            // that actually have inflight work.
            inflightByJob.delete(jobId)
          } else {
            inflightByJob.set(jobId, count)
          }
        }
        admitFromQueue()
      }

      // Removes the task at `index` from the queue, claims a slot for it,
      // and starts it by subscribing to its bridge$. bridge$ emits no
      // values, so only complete/error need handling; either frees the
      // slot.
      // NOTE(review): the bridge subscription is not retained, so the
      // teardown below cannot cancel in-flight bridges — presumably the
      // runTask caller's own unsubscribe covers that path; confirm.
      const admit = (index: number): void => {
        const task = queue.splice(index, 1)[0]
        if (!task) return
        inflight += 1
        if (task.jobId !== null) {
          inflightByJob.set(
            task.jobId,
            (inflightByJob.get(task.jobId) ?? 0) + 1,
          )
        }
        task.bridge$.subscribe({
          complete: () => onComplete(task.jobId),
          error: () => onComplete(task.jobId),
        })
      }

      // Admission loop with fair scheduling: findIndex scans from the
      // front, so the oldest admissible task wins, but a task blocked
      // only by its own job's claim does not block younger tasks from
      // other jobs.
      const admitFromQueue = (): void => {
        // Loop: each admit() removes one task from the queue and may open
        // room for another (e.g. a per-job cap was the constraint, not the
        // global cap). Stop when no admissible task remains.
        while (true) {
          const index = queue.findIndex(canAdmit)
          if (index < 0) break
          admit(index)
        }
      }

      const subscription = source$.subscribe({
        next: (task) => {
          queue.push(task)
          admitFromQueue()
        },
        error: (error) => outerSub.error(error),
        complete: () => outerSub.complete(),
      })

      // NOTE(review): tasks still queued at teardown are dropped without
      // ever being subscribed; their runTask callers rely on their own
      // unsubscribe / isCancelled path — confirm this is intended for
      // the reset-in-tests case.
      return () => subscription.unsubscribe()
    })
153
+
154
+ // Registers a per-job thread-count claim. Call this before enqueueing
155
+ // any tasks for the job; the scheduler reads the claim at admission time.
156
+ // If the jobId already has a claim, the new value overwrites it.
157
+ export const registerJobClaim = (
158
+ jobId: string,
159
+ claim: number,
160
+ ): void => {
161
+ claimByJob.set(jobId, claim)
162
+ }
163
+
164
+ // Removes the per-job claim after the job finishes. Safe to call even
165
+ // if the job had no registered claim (no-op).
166
+ export const unregisterJobClaim = (jobId: string): void => {
167
+ claimByJob.delete(jobId)
168
+ }
169
+
170
+ // Init once at process startup. CLI passes 1 (sequential, equivalent to
171
+ // the historical concatMap behavior). API passes Number(MAX_THREADS) ||
172
+ // cpus().length. Idempotent on repeat calls with the same value; throws
173
+ // on conflicting re-init so a stray import path doesn't silently
174
+ // downgrade concurrency.
175
+ export const initTaskScheduler = (
176
+ newConcurrency: number,
177
+ options?: {
178
+ getActiveJobId?: GetActiveJobId
179
+ },
180
+ ): void => {
181
+ if (
182
+ concurrency !== null &&
183
+ concurrency === newConcurrency
184
+ ) {
185
+ if (options?.getActiveJobId) {
186
+ getActiveJobId = options.getActiveJobId
187
+ }
188
+
189
+ return
190
+ }
191
+
192
+ if (concurrency !== null) {
193
+ throw new Error(
194
+ `Task scheduler already initialized at concurrency=${concurrency}; refusing to re-init at ${newConcurrency}`,
195
+ )
196
+ }
197
+
198
+ concurrency = newConcurrency
199
+
200
+ if (options?.getActiveJobId) {
201
+ getActiveJobId = options.getActiveJobId
202
+ }
203
+
204
+ const newInbox = new Subject<ScheduledTask>()
205
+
206
+ newInbox.pipe(buildScheduler(newConcurrency)).subscribe()
207
+
208
+ inbox = newInbox
209
+ }
210
+
211
// Wraps work$ as a Task. The returned Observable is cold — subscribing
// enqueues the work; unsubscribing releases the slot (whether queued or
// running). Values from work$ mirror through to the caller. If work$
// errors, the caller sees the error.
//
// explicitJobId — pass a string or null in tests to bypass the async
// context lookup. Omit in production; the scheduler reads the current
// job id from the AsyncLocalStorage set by withJobContext() at subscribe
// time (inside the Observable factory), so the context is always live.
export const runTask = <T>(
  work$: Observable<T>,
  explicitJobId?: string | null,
): Observable<T> =>
  new Observable<T>((subscriber) => {
    // Resolve the job id lazily, per subscription, so the injected
    // provider sees the caller's live async context. `undefined` from
    // the provider is normalized to null (= no per-job quota).
    const jobId =
      explicitJobId !== undefined
        ? explicitJobId
        : (getActiveJobId() ?? null)
    // Throws if initTaskScheduler() was never called; the throw inside
    // this factory surfaces as an error to `subscriber`.
    const queue = ensureInbox()

    let isCancelled = false
    let bridgeSubscriber: Subscriber<never> | null = null
    let innerSubscription: Subscription | null = null

    // Gated inner Observable. The scheduler subscribes to this when
    // a slot opens; we then start work$ and forward values to the caller.
    // Slot stays held for as long as this Observable is "alive" — it
    // completes when work$ ends naturally OR when the caller unsubscribes.
    const bridge$ = new Observable<never>((bridgeSub) => {
      // Caller unsubscribed while we were still queued: free the slot
      // immediately without ever starting work$.
      if (isCancelled) {
        bridgeSub.complete()

        return
      }

      bridgeSubscriber = bridgeSub

      innerSubscription = work$.subscribe({
        next: (value) => {
          subscriber.next(value)
        },
        error: (error) => {
          subscriber.error(error)

          // Complete (not error) the bridge: the caller already saw the
          // error above; the scheduler only needs the slot released.
          bridgeSub.complete()
        },
        complete: () => {
          subscriber.complete()

          bridgeSub.complete()
        },
      })

      return () => {
        innerSubscription?.unsubscribe()
      }
    })

    queue.next({ bridge$, jobId })

    return () => {
      isCancelled = true

      // Cancel running work (no-op if work$ never started or already
      // terminated).
      innerSubscription?.unsubscribe()

      // If the bridge has already been picked up by the scheduler,
      // explicitly complete it so the slot is freed. If still queued,
      // bridgeSubscriber is null and the isCancelled flag short-circuits
      // when its slot eventually opens.
      bridgeSubscriber?.complete()
    }
  })
283
+
284
+ // Pipeable form. Each upstream emission becomes a Task. Equivalent to
285
+ // `mergeMap(value => runTask(project(value, index)))` with unbounded
286
+ // outer concurrency — the scheduler is the actual cap.
287
+ export const runTasks = <T, R>(
288
+ project: (value: T, index: number) => Observable<R>,
289
+ ): OperatorFunction<T, R> =>
290
+ mergeMap((value: T, index: number) =>
291
+ runTask(project(value, index)),
292
+ )
293
+
294
// Pipeable form preserving input order on output. Each upstream value
// is projected via mergeMap (parallel by default), but emissions are
// released downstream in input-index order — file 5 is held back until
// files 1-4 have emitted, even if file 5 finishes first.
//
// Does NOT route the projected work through `runTask`. Use this when
// the heavy work is already wrapped (e.g. the projector body uses
// `runTasks(...)` over a sub-stream), or when the iteration is plain
// orchestration that shouldn't compete for scheduler slots — e.g.
// iterating over a `groupBy`'s GroupedObservables when each group's
// inner per-file work is what actually does IO.
//
// Why "not via runTask": if both the outer iteration AND the inner
// per-element work occupy scheduler slots, MAX_THREADS outer slots
// can starve inner work (deadlock). Keep one layer scheduled.
//
// Memory: out-of-order results buffer in a Map keyed by index until
// the head-of-queue completes. For commands that emit thousands of
// large values per element, the buffer grows with the slowest-element
// lag — fine for the per-file summary use case (one or a few small
// values per element); revisit if a future caller streams large
// payloads.
export const mergeMapOrdered =
  <T, R>(
    project: (value: T, index: number) => Observable<R>,
  ): OperatorFunction<T, R> =>
  (source) =>
    new Observable<R>((subscriber) => {
      // Index of the next element whose results may be released.
      let nextEmitIndex = 0
      // Out-of-order results parked until their index reaches the head.
      const buffered = new Map<number, R[]>()
      // Indexes whose inner observable has terminated (finalize fires
      // on complete, error, and unsubscribe alike).
      const completed = new Set<number>()
      let isUpstreamComplete = false
      let inflightCount = 0

      // Releases buffered results downstream in input-index order. Walks
      // forward from `nextEmitIndex` while the next slot is marked
      // completed; stops at the first gap. Called on every inner
      // completion AND on upstream complete.
      const tryFlush = (): void => {
        while (completed.has(nextEmitIndex)) {
          const items = buffered.get(nextEmitIndex) ?? []
          items.forEach((item) => {
            subscriber.next(item)
          })
          buffered.delete(nextEmitIndex)
          completed.delete(nextEmitIndex)
          nextEmitIndex += 1
        }

        // Downstream completes only once upstream is done AND no inner
        // observable is still running.
        if (isUpstreamComplete && inflightCount === 0) {
          subscriber.complete()
        }
      }

      const upstreamSubscription = source
        .pipe(
          mergeMap((value: T, index: number) => {
            // Incremented before the projected observable is subscribed,
            // so a synchronously-completing inner cannot observe
            // inflightCount === 0 prematurely.
            inflightCount += 1

            return project(value, index).pipe(
              tap((result) => {
                const arr = buffered.get(index) ?? []
                arr.push(result)
                buffered.set(index, arr)
              }),
              finalize(() => {
                inflightCount -= 1
                completed.add(index)
                // NOTE(review): finalize also fires after an error has
                // propagated to `subscriber`; presumably the stopped
                // subscriber ignores this late flush — confirm against
                // the rxjs Subscriber contract.
                tryFlush()
              }),
              // Values were already captured by the tap above; suppress
              // them here so the outer mergeMap doesn't re-emit them
              // out of order.
              ignoreElements(),
            )
          }),
        )
        .subscribe({
          error: (error) => {
            subscriber.error(error)
          },
          complete: () => {
            isUpstreamComplete = true
            tryFlush()
          },
        })

      return () => {
        upstreamSubscription.unsubscribe()
      }
    })
385
+
386
+ // Pipeable form: each upstream value runs as a Task in parallel
387
+ // (capped by the scheduler), with emissions released in input-index
388
+ // order. Thin wrapper over `mergeMapOrdered` that wraps the projector
389
+ // in `runTask` for callers whose per-element work is the unit of
390
+ // scheduled IO/CPU (e.g. one network call + processing per file).
391
+ //
392
+ // Do NOT use this as the OUTER operator over a stream whose inner
393
+ // work also goes through the scheduler — both layers would compete
394
+ // for the same MAX_THREADS pool and risk deadlock. Use plain
395
+ // `mergeMapOrdered` for such orchestration and reserve the runTask
396
+ // wrapping for the deepest per-IO layer.
397
+ export const runTasksOrdered = <T, R>(
398
+ project: (value: T, index: number) => Observable<R>,
399
+ ): OperatorFunction<T, R> =>
400
+ mergeMapOrdered((value: T, index: number) =>
401
+ runTask(project(value, index)),
402
+ )
403
+
404
+ // Test-only — reset singleton between vitest runs so tests can re-init at
405
+ // a different concurrency.
406
+ export const __resetTaskSchedulerForTests = (): void => {
407
+ concurrency = null
408
+ claimByJob.clear()
409
+ getActiveJobId = nullJobIdProvider
410
+
411
+ inbox?.complete()
412
+
413
+ inbox = null
414
+ }