@omnimedia/omnitool 1.1.0-90 → 1.1.0-93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +132 -3
  2. package/package.json +1 -1
  3. package/s/demo/routines/timeline-setup.ts +32 -4
  4. package/s/driver/parts/compositor.ts +3 -0
  5. package/s/features/speech/transcribe/transcriber.ts +1 -1
  6. package/s/features/speech/transcribe/types.ts +8 -4
  7. package/s/timeline/index.ts +4 -0
  8. package/s/timeline/parts/captions.ts +90 -0
  9. package/s/timeline/parts/item.ts +23 -4
  10. package/s/timeline/renderers/export/parts/cursor.ts +20 -3
  11. package/s/timeline/renderers/parts/handy.ts +41 -13
  12. package/s/timeline/renderers/parts/samplers/visual/parts/defaults.ts +2 -1
  13. package/s/timeline/renderers/parts/samplers/visual/parts/sample.ts +19 -0
  14. package/s/timeline/sugar/helpers.ts +10 -0
  15. package/s/timeline/sugar/o.ts +55 -0
  16. package/x/demo/demo.bundle.min.js +103 -103
  17. package/x/demo/demo.bundle.min.js.map +4 -4
  18. package/x/demo/routines/timeline-setup.js +28 -3
  19. package/x/demo/routines/timeline-setup.js.map +1 -1
  20. package/x/driver/parts/compositor.js +3 -0
  21. package/x/driver/parts/compositor.js.map +1 -1
  22. package/x/features/speech/transcribe/transcriber.d.ts +1 -1
  23. package/x/features/speech/transcribe/transcriber.js +1 -1
  24. package/x/features/speech/transcribe/transcriber.js.map +1 -1
  25. package/x/features/speech/transcribe/types.d.ts +6 -4
  26. package/x/index.html +2 -2
  27. package/x/tests.bundle.min.js +107 -107
  28. package/x/tests.bundle.min.js.map +4 -4
  29. package/x/tests.html +1 -1
  30. package/x/timeline/index.d.ts +4 -0
  31. package/x/timeline/index.js +4 -0
  32. package/x/timeline/index.js.map +1 -1
  33. package/x/timeline/parts/captions.d.ts +40 -0
  34. package/x/timeline/parts/captions.js +55 -0
  35. package/x/timeline/parts/captions.js.map +1 -0
  36. package/x/timeline/parts/item.d.ts +21 -5
  37. package/x/timeline/parts/item.js +1 -0
  38. package/x/timeline/parts/item.js.map +1 -1
  39. package/x/timeline/renderers/export/parts/cursor.d.ts +1 -1
  40. package/x/timeline/renderers/export/parts/cursor.js +18 -3
  41. package/x/timeline/renderers/export/parts/cursor.js.map +1 -1
  42. package/x/timeline/renderers/parts/handy.d.ts +1 -0
  43. package/x/timeline/renderers/parts/handy.js +27 -13
  44. package/x/timeline/renderers/parts/handy.js.map +1 -1
  45. package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js +2 -1
  46. package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js.map +1 -1
  47. package/x/timeline/renderers/parts/samplers/visual/parts/sample.js +16 -0
  48. package/x/timeline/renderers/parts/samplers/visual/parts/sample.js.map +1 -1
  49. package/x/timeline/sugar/helpers.d.ts +3 -0
  50. package/x/timeline/sugar/helpers.js +3 -0
  51. package/x/timeline/sugar/helpers.js.map +1 -1
  52. package/x/timeline/sugar/o.d.ts +2 -0
  53. package/x/timeline/sugar/o.js +38 -0
  54. package/x/timeline/sugar/o.js.map +1 -1
package/README.md CHANGED
@@ -79,6 +79,133 @@ const timeline = timeline(
79
79
  )
80
80
  ```
81
81
 
82
+ ## 💬 Captions
83
+
84
+ Captions render transcript data as timed, styled text.
85
+ The transcript can come from anywhere, as long as it follows the structure.
86
+
87
+ ```ts
88
+ const transcript = {
89
+ text: "Hello world. This is a caption.",
90
+ chunks: [
91
+ {text: "Hello", timestamp: [0, 0.4]},
92
+ {text: "world.", timestamp: [0.4, 1.2]},
93
+ {text: "This", timestamp: [1.3, 1.6]},
94
+ {text: "is", timestamp: [1.6, 1.8]},
95
+ {text: "a", timestamp: [1.8, 1.9]},
96
+ {text: "caption.", timestamp: [1.9, 2.6]},
97
+ ],
98
+ }
99
+
100
+ const timeline = omni.timeline(o => {
101
+ const video = o.video(clip, {duration: 3000})
102
+ return o.captions(video, transcript)
103
+ })
104
+ ```
105
+
106
+ Use a caption preset to pick a built-in caption style:
107
+
108
+ ```ts
109
+ const timeline = omni.timeline(o => {
110
+ const video = o.video(clip, {duration: 3000})
111
+ return o.captions.presets.default(video, transcript)
112
+ })
113
+ ```
114
+
115
+ Or define your own caption styles:
116
+
117
+ ```ts
118
+ const timeline = omni.timeline(o => {
119
+ const video = o.video(clip, {duration: 3000})
120
+ return o.captions(video, transcript, {
121
+ styles: {
122
+ fontFamily: "Inter",
123
+ fontSize: 64,
124
+ fill: "#fff7d6",
125
+ stroke: {color: "#111111", width: 8},
126
+ align: "center",
127
+ },
128
+ })
129
+ })
130
+ ```
131
+
132
+ Use omnitool's built-in speech-to-text with the default model:
133
+
134
+ ```ts
135
+ import {makeTranscriber, defaultTranscriberSpec} from "@omnimedia/omnitool"
136
+
137
+ // uses onnx-community/whisper-tiny_timestamped
138
+ const transcriber = await makeTranscriber({
139
+ driver,
140
+ spec: defaultTranscriberSpec(),
141
+ workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
142
+ onLoading: loading => console.log("loading", loading),
143
+ })
144
+
145
+ const transcript = await transcriber.transcribe({
146
+ source: file,
147
+ language: "english",
148
+ onReport: report => console.log("report", report),
149
+ onTranscription: text => console.log("transcribing", text),
150
+ })
151
+
152
+ const timeline = omni.timeline(o => {
153
+ const video = o.video(clip)
154
+ return o.captions(video, transcript)
155
+ })
156
+ ```
157
+
158
+ Load a custom speech-to-text model:
159
+
160
+ ```ts
161
+ const transcriber = await makeTranscriber({
162
+ driver,
163
+ spec: {
164
+ model: "onnx-community/whisper-tiny_timestamped",
165
+ dtype: "q4",
166
+ device: "wasm",
167
+ chunkLength: 20,
168
+ strideLength: 3
169
+ },
170
+ workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
171
+ onLoading: loading => console.log("loading", loading),
172
+ })
173
+ ```
174
+
175
+ > [!IMPORTANT]
176
+ > Use a Transformers.js-compatible speech-to-text model, for example `onnx-community/*_timestamped`.
177
+ > The model must support word-level timestamps because captions use `return_timestamps: "word"`.
178
+ > `device` and `dtype` are passed to Transformers.js and depend on your runtime/model.
179
+ > Browsers commonly use `"wasm"` or `"webgpu"`: choose `"webgpu"` for speed, or `"wasm"` for broader device support.
180
+ > `workerUrl` depends on where you host the worker bundle.
181
+
182
+ `o.captions(video, transcript, options)` creates captions for a video or audio item.
183
+ `o.captions` uses the `captionPresets.default` preset.
184
+ Use `o.captions.presets` to choose from the available pre-styled captions.
185
+ Pass `styles` in options to override preset styles.
186
+ Transcript chunk timestamps are in seconds.
187
+
188
+ Update caption options after creation:
189
+
190
+ ```ts
191
+ const caption = o.captions.make(transcript, {maxChars: 42})
192
+ const style = o.textStyle({fill: "#00ff00"})
193
+ o.set(caption.id, {
194
+ maxChars: 32,
195
+ styleId: style.id,
196
+ })
197
+ ```
198
+
199
+ Caption options:
200
+ `styles` - text styles; these override the preset's styles.
201
+ `start` - transcript time where captions begin, in milliseconds.
202
+ `duration` - caption layer duration, in milliseconds.
203
+ `maxChars` - maximum characters in one generated caption line.
204
+ `maxDuration` - maximum duration of one generated caption line, in milliseconds.
205
+ `maxSilence` - maximum silence allowed inside one caption; longer pauses start a new caption, in milliseconds.
206
+
207
+ Import `captionPresets` to list the available caption looks.
208
+
82
209
  ## 🎛 Filters
83
210
 
84
211
  Filter application:
@@ -353,9 +480,11 @@ Timeline items:
353
480
  - 4 `Text`
354
481
  - 5 `Gap`
355
482
  - 6 `Spatial`
356
- - 7 `Transition`
357
- - 8 `TextStyle`
358
- - 9 `Filter`
483
+ - 7 `Animation`
484
+ - 8 `Transition`
485
+ - 9 `TextStyle`
486
+ - 10 `Filter`
487
+ - 11 `Caption`
359
488
 
360
489
  ## 🗺️ Roadmap
361
490
  - CLI commands:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@omnimedia/omnitool",
3
- "version": "1.1.0-90",
3
+ "version": "1.1.0-93",
4
4
  "description": "open source video processing tools",
5
5
  "license": "MIT",
6
6
  "author": "Przemysław Gałęzki",
@@ -1,10 +1,25 @@
1
1
 
2
2
  import {Driver} from "../../driver/driver.js"
3
- import {Datafile, Item, Omni} from "../../timeline/index.js"
3
+ import {Datafile, defaultTranscriberSpec, Item, makeTranscriber, Omni} from "../../timeline/index.js"
4
4
 
5
5
  export async function TimelineSchemaTest(driver: Driver, file: File) {
6
+ const transcriber = await makeTranscriber({
7
+ driver,
8
+ spec: defaultTranscriberSpec(),
9
+ workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
10
+ onLoading: loading => console.log("transcriber loading", loading),
11
+ })
12
+
6
13
  const omni = new Omni(driver)
7
14
  const {videoA} = await omni.load({videoA: Datafile.make(file)})
15
+
16
+ const transcript = await transcriber.transcribe({
17
+ source: file,
18
+ language: "english",
19
+ onReport: report => console.log("transcriber report", report),
20
+ onTranscription: text => console.log("transcribing", text),
21
+ })
22
+
8
23
  const timeline = omni.timeline(o => {
9
24
  const text = o.text("content", {duration: 3000})
10
25
  const fade = o.animate.opacity.make("easeIn", [
@@ -30,15 +45,28 @@ export async function TimelineSchemaTest(driver: Driver, file: File) {
30
45
  [3000, o.transform({position: [320, 0], scale: [1.15, 1.15], rotation: 0})],
31
46
  ])
32
47
 
33
- const video = o.video(videoA, {duration: 3000, start: 1000})
48
+ const video = o.video(videoA, {duration: 6000, start: 3000})
34
49
  o.set<Item.Text>(text.id, {styleId: style.id, spatialId: textSpatial.id, animationIds: [fade.id, textMotion.id]})
35
50
  o.set<Item.Video>(video.id, {spatialId: videoSpatial.id})
36
51
 
37
52
  return o.sequence(
38
53
  o.stack(
39
54
  text,
40
- video,
41
- o.audio(videoA, {duration: 1000})
55
+ o.captions(video, transcript, {
56
+ maxChars: 34,
57
+ maxDuration: 2800,
58
+ maxSilence: 450,
59
+ styles: {
60
+ fontFamily: "Arial",
61
+ fontSize: 64,
62
+ fill: "#ffffff",
63
+ align: "center",
64
+ wordWrap: true,
65
+ wordWrapWidth: 1280,
66
+ stroke: {color: "#000000", width: 6},
67
+ },
68
+ }),
69
+ o.audio(videoA, {start: 3000})
42
70
  ),
43
71
  o.gap(500),
44
72
  o.video(videoA, {duration: 7000, start: 5000})
@@ -110,6 +110,9 @@ export class Compositor {
110
110
  parent: Container,
111
111
  ) {
112
112
  const sprite = this.#findOrCreate<Text>(layer)!
113
+ sprite.text = layer.content
114
+ if (layer.style)
115
+ sprite.style = layer.style
113
116
  this.#applyTransform(sprite, layer.matrix)
114
117
  this.#applyAlpha(sprite, layer.alpha)
115
118
  this.#applyCrop(sprite, layer.crop)
@@ -40,7 +40,7 @@ export async function makeTranscriber({driver, spec, workerUrl, onLoading}: Tran
40
40
  detachCallbacks()
41
41
  return result
42
42
  }),
43
- dispose: thread.terminate()
43
+ dispose: () => thread.terminate()
44
44
  }
45
45
  }
46
46
 
@@ -36,12 +36,16 @@ export type TranscriberPipeOptions = {
36
36
 
37
37
  export type SpeechTime = [start: number, end: number]
38
38
 
39
+ export type TranscriptWord = {
40
+ text: string
41
+ timestamp: SpeechTime
42
+ }
43
+
44
+ export type TranscriptSegment = TranscriptWord
45
+
39
46
  export type Transcription = {
40
47
  text: string
41
- chunks: {
42
- text: string
43
- timestamp: SpeechTime
44
- }[]
48
+ chunks: TranscriptWord[]
45
49
  }
46
50
 
47
51
  export type TranscriberSpec = {
@@ -8,6 +8,10 @@ export * from "./parts/resource-pool.js"
8
8
  export * from "./parts/resource.js"
9
9
  export * from "./parts/filmstrip.js"
10
10
  export * from "./parts/animations/registry.js"
11
+ export {captionPresets} from "./parts/captions.js"
12
+ export * from "../features/speech/transcribe/default-spec.js"
13
+ export * from "../features/speech/transcribe/transcriber.js"
14
+ export * from "../features/speech/transcribe/types.js"
11
15
 
12
16
  export * from "./parts/waveform/waveform.js"
13
17
  export * from "./parts/waveform/parts/types.js"
@@ -0,0 +1,90 @@
1
+
2
+ import type {Item} from "./item.js"
3
+ import {TextStyleOptions} from "pixi.js"
4
+ import {TransformOptions, Vec2} from "../types.js"
5
+ import {Transcription, TranscriptSegment} from "../../features/speech/transcribe/types.js"
6
+
7
+ export type CaptionOptions = {
8
+ start?: number
9
+ duration?: number
10
+ styles?: TextStyleOptions
11
+ maxChars?: number
12
+ maxDuration?: number
13
+ maxSilence?: number
14
+ }
15
+
16
+ export const captionPresets = {
17
+ default: {
18
+ styles: {
19
+ fontFamily: "Arial",
20
+ fontSize: 56,
21
+ fill: "#ffffff",
22
+ align: "center",
23
+ wordWrap: true,
24
+ wordWrapWidth: 1440,
25
+ } satisfies TextStyleOptions,
26
+ transform: {
27
+ position: [240, 860] as Vec2
28
+ } satisfies TransformOptions,
29
+ }
30
+ }
31
+
32
+ export type CaptionPreset = (typeof captionPresets)[keyof typeof captionPresets]
33
+ export type CaptionSourceItem = Item.Video | Item.Audio
34
+ export type CaptionAction = {
35
+ (item: CaptionSourceItem, transcript: Transcription, options?: CaptionOptions): Item.Stack
36
+ make: (transcript: Transcription, options?: CaptionOptions) => Item.Caption
37
+ }
38
+
39
+ export type CaptionActions = CaptionAction & {
40
+ presets: {
41
+ [TName in keyof typeof captionPresets]: CaptionAction
42
+ }
43
+ }
44
+
45
+ const CAPTION_DEFAULTS = {
46
+ maxChars: 42,
47
+ maxDuration: 3500,
48
+ maxSilence: 750,
49
+ } satisfies CaptionOptions
50
+
51
+ export function segmentTranscript(transcript: Transcription, options?: CaptionOptions): TranscriptSegment[] {
52
+ const {maxChars, maxDuration, maxSilence} = {...CAPTION_DEFAULTS, ...options}
53
+ const segments: TranscriptSegment[] = []
54
+ let current: TranscriptSegment | null = null
55
+
56
+ for (const {timestamp: [t0, t1], text: rawText} of transcript.chunks) {
57
+ const [start, end] = [t0 * 1000, t1 * 1000]
58
+ const text = rawText.trim()
59
+
60
+ if (!Number.isFinite(start) || !Number.isFinite(end) || !text) continue
61
+
62
+ if (!current) {
63
+ current = {text, timestamp: [start, end]}
64
+ continue
65
+ }
66
+
67
+ const [currentStart, currentEnd]: [number, number] = current.timestamp
68
+ const nextText = `${current.text} ${text}`.trim()
69
+ const shouldBreak =
70
+ nextText.length > maxChars ||
71
+ end - currentStart > maxDuration ||
72
+ start - currentEnd > maxSilence
73
+
74
+ if (shouldBreak) {
75
+ segments.push(current)
76
+ current = {text, timestamp: [start, end]}
77
+ }
78
+ else {
79
+ current = {text: nextText, timestamp: [currentStart, end]}
80
+ }
81
+ }
82
+
83
+ if (current) segments.push(current)
84
+ return segments
85
+ }
86
+
87
+ export function captionDuration(transcript: Transcription, options?: CaptionOptions) {
88
+ const segments = segmentTranscript(transcript, options)
89
+ return Math.max(0, ...segments.map(segment => segment.timestamp[1]))
90
+ }
@@ -3,8 +3,9 @@ import {TextStyleOptions} from "pixi.js"
3
3
 
4
4
  import {Id, Hash} from "./basics.js"
5
5
  import {Ms} from "../../units/ms.js"
6
- import type {FilterParams, FilterType} from "./filters.js"
7
6
  import {Transform, VisualAnimations} from "../types.js"
7
+ import type {FilterParams, FilterType} from "./filters.js"
8
+ import type {Transcription} from "../../features/speech/transcribe/types.js"
8
9
 
9
10
  export type Crop = [top: number, right: number, bottom: number, left: number]
10
11
 
@@ -19,7 +20,8 @@ export enum Kind {
19
20
  Animation,
20
21
  Transition,
21
22
  TextStyle,
22
- Filter
23
+ Filter,
24
+ Caption
23
25
  }
24
26
 
25
27
  export enum Effect {
@@ -109,6 +111,22 @@ export namespace Item {
109
111
  filterIds?: Id[]
110
112
  }
111
113
 
114
+
115
+ export type Caption = {
116
+ id: Id
117
+ kind: Kind.Caption
118
+ transcript: Transcription
119
+ start: number
120
+ duration: number
121
+ maxChars?: number
122
+ maxDuration?: number
123
+ maxSilence?: number
124
+ spatialId?: Id
125
+ animationIds?: Id[]
126
+ styleId?: Id
127
+ filterIds?: Id[]
128
+ }
129
+
112
130
  export type Transition = {
113
131
  id: Id
114
132
  kind: Kind.Transition
@@ -122,6 +140,7 @@ export namespace Item {
122
140
  | Video
123
141
  | Audio
124
142
  | Text
143
+ | Caption
125
144
  | Gap
126
145
  | Transition
127
146
  | Spatial
@@ -133,8 +152,8 @@ export namespace Item {
133
152
 
134
153
  export type ContainerItem = Item.Sequence | Item.Stack
135
154
  export type NonContainerItem = Exclude<Item.Any, ContainerItem>
136
- export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text
137
- export type VisualAnimatableItem = Item.Video | Item.Text
155
+ export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text | Item.Caption
156
+ export type VisualAnimatableItem = Item.Video | Item.Text | Item.Caption
138
157
 
139
158
  export type PlayableItem = Item.Any & {
140
159
  start: Ms
@@ -25,7 +25,7 @@ abstract class BaseVisualSampler {
25
25
  protected timeline: TimelineFile
26
26
  ) {
27
27
  this.#sampler = createVisualSampler(this.resolveMedia, (item, time) => {
28
- const targetUs = toUs(time)
28
+ const targetUs = toUs(ms(item.start + time))
29
29
  let cursor = this.#videoCursors.get(item.id)
30
30
 
31
31
  if (!cursor) {
@@ -148,8 +148,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
148
148
  return this.sample(timecode)
149
149
  }
150
150
 
151
- protected createCursor(source: DecoderSource, _initialTargetUs: number, endUs: number): VideoFrameCursor {
152
- const startUs = 0
151
+ protected createCursor(source: DecoderSource, startUs: number, endUs: number): VideoFrameCursor {
153
152
  const windowUs = 1_000_000
154
153
  const prefetchThreshold = windowUs * 0.5
155
154
 
@@ -159,6 +158,9 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
159
158
  let input: Input | null = null
160
159
  let sink: VideoSampleSink | null = null
161
160
  let prefetchPromise: Promise<{frames: VideoFrame[], windowStart: number, windowEnd: number}> | null = null
161
+ let activeFetches = 0
162
+ let idle: Promise<void> = Promise.resolve()
163
+ let resolveIdle: (() => void) | null = null
162
164
  let canceled = false
163
165
 
164
166
  const clear = () => {
@@ -167,6 +169,18 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
167
169
  frames = []
168
170
  }
169
171
 
172
+ const startFetch = () => {
173
+ if (activeFetches++ === 0)
174
+ idle = new Promise<void>(resolve => resolveIdle = resolve)
175
+ }
176
+
177
+ const endFetch = () => {
178
+ if (--activeFetches === 0) {
179
+ resolveIdle?.()
180
+ resolveIdle = null
181
+ }
182
+ }
183
+
170
184
  const getSink = async () => {
171
185
  if (sink) return sink
172
186
 
@@ -184,6 +198,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
184
198
  }
185
199
 
186
200
  const fetchFrames = async (targetUs: number) => {
201
+ startFetch()
187
202
  const wEnd = Math.min(endUs, targetUs + 1)
188
203
  const wStart = Math.max(startUs, wEnd - windowUs)
189
204
  const newFrames: VideoFrame[] = []
@@ -196,6 +211,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
196
211
  }
197
212
  }
198
213
 
214
+ endFetch()
199
215
  return {frames: newFrames, windowStart: wStart, windowEnd: wEnd}
200
216
  }
201
217
 
@@ -262,6 +278,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
262
278
  if (prefetched)
263
279
  for (const f of prefetched.frames) f.close()
264
280
 
281
+ await idle
265
282
  clear()
266
283
  input?.dispose()
267
284
  input = null
@@ -16,6 +16,7 @@ type WalkAtCallbacks = {
16
16
  stack: (x: Item.Stack, localTime: Ms, ancestors: AncestorAt[]) => void
17
17
  video: (x: Item.Video, localTime: Ms, ancestors: AncestorAt[]) => void
18
18
  text: (x: Item.Text, localTime: Ms, ancestors: AncestorAt[]) => void
19
+ caption: (x: Item.Caption, localTime: Ms, ancestors: AncestorAt[]) => void
19
20
  audio: (x: Item.Audio, localTime: Ms, ancestors: AncestorAt[]) => void
20
21
  }
21
22
 
@@ -24,6 +25,7 @@ type WalkCallbacks = {
24
25
  stack?: (x: Item.Stack, matrix: Mat6, ancestors: AncestorAt[]) => void
25
26
  video?: (x: Item.Video, matrix: Mat6, ancestors: AncestorAt[]) => void
26
27
  text?: (x: Item.Text, matrix: Mat6, ancestors: AncestorAt[]) => void
28
+ caption?: (x: Item.Caption, matrix: Mat6, ancestors: AncestorAt[]) => void
27
29
  audio?: (x: Item.Audio) => void
28
30
  }
29
31
 
@@ -52,6 +54,7 @@ export function itemsAt(p: Props): At[] {
52
54
  stack: () => { },
53
55
  video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
54
56
  text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
57
+ caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
55
58
  audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
56
59
  })
57
60
 
@@ -72,6 +75,7 @@ export function itemsFrom(p: FromProps): At[] {
72
75
  stack: () => { },
73
76
  video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
74
77
  text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
78
+ caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
75
79
  audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
76
80
  })
77
81
 
@@ -176,6 +180,10 @@ export function walk(
176
180
  callbacks.text?.(item, currentMatrix, ancestors)
177
181
  break
178
182
 
183
+ case Kind.Caption:
184
+ callbacks.caption?.(item, currentMatrix, ancestors)
185
+ break
186
+
179
187
  case Kind.Audio:
180
188
  callbacks.audio?.(item)
181
189
  break
@@ -211,11 +219,12 @@ function walkAt(
211
219
 
212
220
  if (!child)
213
221
  continue
214
- if (!isPlayableItem(child)) {
222
+
223
+ const duration = computeItemDurationFromMap(child.id, items)
224
+ if (duration <= 0)
215
225
  continue
216
- }
217
226
 
218
- if (time >= offset && time < offset + child.duration) {
227
+ if (time >= offset && time < offset + duration) {
219
228
  const localTime = ms(time - offset)
220
229
  walkAt(
221
230
  childId,
@@ -227,7 +236,7 @@ function walkAt(
227
236
  break
228
237
  }
229
238
 
230
- offset = ms(offset + child.duration)
239
+ offset = ms(offset + duration)
231
240
  }
232
241
 
233
242
  break
@@ -241,6 +250,10 @@ function walkAt(
241
250
  callbacks.text(item, time, ancestors)
242
251
  break
243
252
 
253
+ case Kind.Caption:
254
+ callbacks.caption(item, time, ancestors)
255
+ break
256
+
244
257
  case Kind.Audio:
245
258
  callbacks.audio(item, time, ancestors)
246
259
  break
@@ -275,11 +288,12 @@ function walkFrom(
275
288
 
276
289
  if (!child)
277
290
  continue
278
- if (!isPlayableItem(child)) {
291
+
292
+ const duration = computeItemDurationFromMap(child.id, items)
293
+ if (duration <= 0)
279
294
  continue
280
- }
281
295
 
282
- const end = ms(offset + child.duration)
296
+ const end = ms(offset + duration)
283
297
  if (from >= end) {
284
298
  offset = end
285
299
  continue
@@ -308,6 +322,10 @@ function walkFrom(
308
322
  callbacks.text(item, from, ancestors)
309
323
  break
310
324
 
325
+ case Kind.Caption:
326
+ callbacks.caption(item, from, ancestors)
327
+ break
328
+
311
329
  case Kind.Audio:
312
330
  callbacks.audio(item, from, ancestors)
313
331
  break
@@ -318,14 +336,24 @@ export function computeItemDuration(
318
336
  id: number,
319
337
  timeline: TimelineFile
320
338
  ): Ms {
321
- const item = timeline.items.find(item => item.id === id)
339
+ return computeItemDurationFromMap(
340
+ id,
341
+ new Map(timeline.items.map(item => [item.id, item]))
342
+ )
343
+ }
344
+
345
+ function computeItemDurationFromMap(
346
+ id: number,
347
+ items: Map<Id, Item.Any>
348
+ ): Ms {
349
+ const item = items.get(id)
322
350
 
323
351
  if (!item) return ms(0)
324
352
 
325
353
  switch (item.kind) {
326
354
  case Kind.Sequence: {
327
355
  const children = item.childrenIds
328
- .map(childId => timeline.items.find(x => x.id === childId))
356
+ .map(childId => items.get(childId))
329
357
  .filter(Boolean) as Item.Any[]
330
358
 
331
359
  let total = ms(0)
@@ -338,8 +366,8 @@ export function computeItemDuration(
338
366
  const next = children[i + 1]
339
367
 
340
368
  if (prev && next && prev.kind !== Kind.Transition && next.kind !== Kind.Transition) {
341
- const prevDur = computeItemDuration(prev.id, timeline)
342
- const nextDur = computeItemDuration(next.id, timeline)
369
+ const prevDur = computeItemDurationFromMap(prev.id, items)
370
+ const nextDur = computeItemDurationFromMap(next.id, items)
343
371
  const overlap = Math.max(0, Math.min(child.duration, prevDur, nextDur))
344
372
 
345
373
  total = ms(total - overlap)
@@ -347,7 +375,7 @@ export function computeItemDuration(
347
375
  continue
348
376
  }
349
377
 
350
- total = ms(total + computeItemDuration(child.id, timeline))
378
+ total = ms(total + computeItemDurationFromMap(child.id, items))
351
379
  }
352
380
 
353
381
  return total
@@ -357,7 +385,7 @@ export function computeItemDuration(
357
385
  let longest = ms(0)
358
386
 
359
387
  for (const childId of item.childrenIds) {
360
- const duration = computeItemDuration(childId, timeline)
388
+ const duration = computeItemDurationFromMap(childId, items)
361
389
  if (duration > longest) {
362
390
  longest = duration
363
391
  }
@@ -8,7 +8,8 @@ export type VideoSampler = (item: Item.Video, time: Ms) => Promise<VideoFrame |
8
8
  export function createDefaultVideoSampler(sink: VideoSink): VideoSampler {
9
9
  return async (item, time) => {
10
10
  const s = await sink.getSink(item.mediaHash)
11
- const sample = await s?.getSample(time / 1000)
11
+ const mediaTime = item.start + time
12
+ const sample = await s?.getSample(mediaTime / 1000)
12
13
  const frame = sample?.toVideoFrame()
13
14
  sample?.close()
14
15
  return frame ?? undefined
@@ -3,6 +3,7 @@ import {SampleContext} from "./types.js"
3
3
  import {sampleSequence} from "./sequence.js"
4
4
  import {Ms} from "../../../../../../units/ms.js"
5
5
  import {Item, Kind} from "../../../../../parts/item.js"
6
+ import {segmentTranscript} from "../../../../../parts/captions.js"
6
7
  import {FilterSpec, Layer} from "../../../../../../driver/fns/schematic.js"
7
8
  import {AncestorAt, computeOpacity, computeWorldMatrix} from "../../../handy.js"
8
9
 
@@ -58,6 +59,24 @@ export async function sampleVisual(
58
59
  return [{id: item.id, kind: "text", content: item.content, style, matrix, alpha, crop, filters}]
59
60
  }
60
61
 
62
+ case Kind.Caption: {
63
+ if (time < 0 || time >= item.duration) return []
64
+
65
+ const transcriptTime = item.start + time
66
+ const segment = segmentTranscript(item.transcript, item).find(segment => {
67
+ const [start, end] = segment.timestamp
68
+ return transcriptTime >= start && transcriptTime < end
69
+ })
70
+ if (!segment)
71
+ return []
72
+
73
+ const style = item.styleId
74
+ ? (ctx.items.get(item.styleId) as Item.TextStyle)?.style
75
+ : undefined
76
+
77
+ return [{id: item.id, kind: "text", content: segment.text, style, matrix, alpha, crop, filters}]
78
+ }
79
+
61
80
  case Kind.Gap: {
62
81
  return [{id: item.id, kind: "gap"}]
63
82
  }