@omnimedia/omnitool 1.1.0-91 → 1.1.0-94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +133 -3
  2. package/package.json +1 -1
  3. package/s/demo/routines/timeline-setup.ts +32 -4
  4. package/s/driver/parts/compositor.ts +3 -0
  5. package/s/features/speech/transcribe/transcriber.ts +1 -1
  6. package/s/features/speech/transcribe/types.ts +8 -4
  7. package/s/timeline/index.ts +4 -0
  8. package/s/timeline/parts/captions.ts +90 -0
  9. package/s/timeline/parts/item.ts +25 -4
  10. package/s/timeline/renderers/export/parts/cursor.ts +2 -3
  11. package/s/timeline/renderers/parts/handy.ts +41 -13
  12. package/s/timeline/renderers/parts/samplers/visual/parts/defaults.ts +2 -1
  13. package/s/timeline/renderers/parts/samplers/visual/parts/sample.ts +19 -0
  14. package/s/timeline/sugar/helpers.ts +10 -0
  15. package/s/timeline/sugar/o.ts +56 -0
  16. package/x/demo/demo.bundle.min.js +102 -102
  17. package/x/demo/demo.bundle.min.js.map +4 -4
  18. package/x/demo/routines/timeline-setup.js +28 -3
  19. package/x/demo/routines/timeline-setup.js.map +1 -1
  20. package/x/driver/parts/compositor.js +3 -0
  21. package/x/driver/parts/compositor.js.map +1 -1
  22. package/x/features/speech/transcribe/transcriber.d.ts +1 -1
  23. package/x/features/speech/transcribe/transcriber.js +1 -1
  24. package/x/features/speech/transcribe/transcriber.js.map +1 -1
  25. package/x/features/speech/transcribe/types.d.ts +6 -4
  26. package/x/index.html +2 -2
  27. package/x/tests.bundle.min.js +105 -105
  28. package/x/tests.bundle.min.js.map +4 -4
  29. package/x/tests.html +1 -1
  30. package/x/timeline/index.d.ts +4 -0
  31. package/x/timeline/index.js +4 -0
  32. package/x/timeline/index.js.map +1 -1
  33. package/x/timeline/parts/captions.d.ts +40 -0
  34. package/x/timeline/parts/captions.js +55 -0
  35. package/x/timeline/parts/captions.js.map +1 -0
  36. package/x/timeline/parts/item.d.ts +23 -5
  37. package/x/timeline/parts/item.js +1 -0
  38. package/x/timeline/parts/item.js.map +1 -1
  39. package/x/timeline/renderers/export/parts/cursor.d.ts +1 -1
  40. package/x/timeline/renderers/export/parts/cursor.js +2 -3
  41. package/x/timeline/renderers/export/parts/cursor.js.map +1 -1
  42. package/x/timeline/renderers/parts/handy.d.ts +1 -0
  43. package/x/timeline/renderers/parts/handy.js +27 -13
  44. package/x/timeline/renderers/parts/handy.js.map +1 -1
  45. package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js +2 -1
  46. package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js.map +1 -1
  47. package/x/timeline/renderers/parts/samplers/visual/parts/sample.js +16 -0
  48. package/x/timeline/renderers/parts/samplers/visual/parts/sample.js.map +1 -1
  49. package/x/timeline/sugar/helpers.d.ts +3 -0
  50. package/x/timeline/sugar/helpers.js +3 -0
  51. package/x/timeline/sugar/helpers.js.map +1 -1
  52. package/x/timeline/sugar/o.d.ts +2 -0
  53. package/x/timeline/sugar/o.js +39 -0
  54. package/x/timeline/sugar/o.js.map +1 -1
package/README.md CHANGED
@@ -79,6 +79,134 @@ const timeline = timeline(
79
79
  )
80
80
  ```
81
81
 
82
+ ## 💬 Captions
83
+
84
+ Captions render transcript data as timed, styled text.
85
+ The transcript can come from anywhere, as long as it follows the structure.
86
+
87
+ ```ts
88
+ const transcript = {
89
+ text: "Hello world. This is a caption.",
90
+ chunks: [
91
+ {text: "Hello", timestamp: [0, 0.4]},
92
+ {text: "world.", timestamp: [0.4, 1.2]},
93
+ {text: "This", timestamp: [1.3, 1.6]},
94
+ {text: "is", timestamp: [1.6, 1.8]},
95
+ {text: "a", timestamp: [1.8, 1.9]},
96
+ {text: "caption.", timestamp: [1.9, 2.6]},
97
+ ],
98
+ }
99
+
100
+ const timeline = omni.timeline(o => {
101
+ const video = o.video(clip, {duration: 3000})
102
+ return o.captions(video, transcript)
103
+ })
104
+ ```
105
+
106
+ Use a caption preset to pick a built-in caption style:
107
+
108
+ ```ts
109
+ const timeline = omni.timeline(o => {
110
+ const video = o.video(clip, {duration: 3000})
111
+ return o.captions.presets.default(video, transcript)
112
+ })
113
+ ```
114
+
115
+ Or define your own caption styles:
116
+
117
+ ```ts
118
+ const timeline = omni.timeline(o => {
119
+ const video = o.video(clip, {duration: 3000})
120
+ return o.captions(video, transcript, {
121
+ styles: {
122
+ fontFamily: "Inter",
123
+ fontSize: 64,
124
+ fill: "#fff7d6",
125
+ stroke: {color: "#111111", width: 8},
126
+ align: "center",
127
+ },
128
+ })
129
+ })
130
+ ```
131
+
132
+ Use omnitool's built-in speech-to-text with the default model:
133
+
134
+ ```ts
135
+ import {makeTranscriber, defaultTranscriberSpec} from "@omnimedia/omnitool"
136
+
137
+ // uses onnx-community/whisper-tiny_timestamped
138
+ const transcriber = await makeTranscriber({
139
+ driver,
140
+ spec: defaultTranscriberSpec(),
141
+ workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
142
+ onLoading: loading => console.log("loading", loading),
143
+ })
144
+
145
+ const transcript = await transcriber.transcribe({
146
+ source: file,
147
+ language: "english",
148
+ onReport: report => console.log("report", report),
149
+ onTranscription: text => console.log("transcribing", text),
150
+ })
151
+
152
+ const timeline = omni.timeline(o => {
153
+ const video = o.video(clip)
154
+ return o.captions(video, transcript)
155
+ })
156
+ ```
157
+
158
+ Load a custom speech-to-text model:
159
+
160
+ ```ts
161
+ const transcriber = await makeTranscriber({
162
+ driver,
163
+ spec: {
164
+ model: "onnx-community/whisper-tiny_timestamped",
165
+ dtype: "q4",
166
+ device: "wasm",
167
+ chunkLength: 20,
168
+ strideLength: 3
169
+ },
170
+ workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
171
+ onLoading: loading => console.log("loading", loading),
172
+ })
173
+ ```
174
+
175
+ > [!IMPORTANT]
176
+ > Use a Transformers.js-compatible speech-to-text model, for example `onnx-community/*_timestamped`.
177
+ > The model must support word-level timestamps because captions use `return_timestamps: "word"`.
178
+ > `device` and `dtype` are passed to Transformers.js and depend on your runtime/model.
179
+ > Browsers commonly use `"wasm"` or `"webgpu"`: `"webgpu"` for speed, `"wasm"` for broader device support.
180
+ > `workerUrl` depends on where you host the worker bundle.
181
+
182
+ `o.captions(video, transcript, options)` creates captions for a video or audio item.
183
+ `o.captions` uses the `captionPresets.default` preset.
184
+ The generated caption id is stored on the source item as `captionId`.
185
+ Use `o.captions.presets` to choose from the available pre-styled captions.
186
+ Pass `styles` in options to override preset styles.
187
+ Transcript chunk timestamps are in seconds.
188
+
189
+ Update caption options after creation:
190
+
191
+ ```ts
192
+ const caption = o.captions.make(transcript, {maxChars: 42})
193
+ const style = o.textStyle({fill: "#00ff00"})
194
+ o.set(caption.id, {
195
+ maxChars: 32,
196
+ styleId: style.id,
197
+ })
198
+ ```
199
+
200
+ Caption options:
201
+ `styles` - text styles; these override the preset's styles.
202
+ `start` - transcript time where captions begin, in milliseconds.
203
+ `duration` - caption layer duration, in milliseconds.
204
+ `maxChars` - maximum characters in one generated caption line.
205
+ `maxDuration` - maximum duration of one generated caption line, in milliseconds.
206
+ `maxSilence` - maximum silence allowed inside one caption; longer pauses start a new caption, in milliseconds.
207
+
208
+ Import `captionPresets` to list the available caption presets.
209
+
82
210
  ## 🎛 Filters
83
211
 
84
212
  Filter application:
@@ -353,9 +481,11 @@ Timeline items:
353
481
  - 4 `Text`
354
482
  - 5 `Gap`
355
483
  - 6 `Spatial`
356
- - 7 `Transition`
357
- - 8 `TextStyle`
358
- - 9 `Filter`
484
+ - 7 `Animation`
485
+ - 8 `Transition`
486
+ - 9 `TextStyle`
487
+ - 10 `Filter`
488
+ - 11 `Caption`
359
489
 
360
490
  ## 🗺️ Roadmap
361
491
  - CLI commands:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@omnimedia/omnitool",
3
- "version": "1.1.0-91",
3
+ "version": "1.1.0-94",
4
4
  "description": "open source video processing tools",
5
5
  "license": "MIT",
6
6
  "author": "Przemysław Gałęzki",
@@ -1,10 +1,25 @@
1
1
 
2
2
  import {Driver} from "../../driver/driver.js"
3
- import {Datafile, Item, Omni} from "../../timeline/index.js"
3
+ import {Datafile, defaultTranscriberSpec, Item, makeTranscriber, Omni} from "../../timeline/index.js"
4
4
 
5
5
  export async function TimelineSchemaTest(driver: Driver, file: File) {
6
+ const transcriber = await makeTranscriber({
7
+ driver,
8
+ spec: defaultTranscriberSpec(),
9
+ workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
10
+ onLoading: loading => console.log("transcriber loading", loading),
11
+ })
12
+
6
13
  const omni = new Omni(driver)
7
14
  const {videoA} = await omni.load({videoA: Datafile.make(file)})
15
+
16
+ const transcript = await transcriber.transcribe({
17
+ source: file,
18
+ language: "english",
19
+ onReport: report => console.log("transcriber report", report),
20
+ onTranscription: text => console.log("transcribing", text),
21
+ })
22
+
8
23
  const timeline = omni.timeline(o => {
9
24
  const text = o.text("content", {duration: 3000})
10
25
  const fade = o.animate.opacity.make("easeIn", [
@@ -30,15 +45,28 @@ export async function TimelineSchemaTest(driver: Driver, file: File) {
30
45
  [3000, o.transform({position: [320, 0], scale: [1.15, 1.15], rotation: 0})],
31
46
  ])
32
47
 
33
- const video = o.video(videoA, {duration: 3000, start: 1000})
48
+ const video = o.video(videoA, {duration: 6000, start: 3000})
34
49
  o.set<Item.Text>(text.id, {styleId: style.id, spatialId: textSpatial.id, animationIds: [fade.id, textMotion.id]})
35
50
  o.set<Item.Video>(video.id, {spatialId: videoSpatial.id})
36
51
 
37
52
  return o.sequence(
38
53
  o.stack(
39
54
  text,
40
- video,
41
- o.audio(videoA, {duration: 1000})
55
+ o.captions(video, transcript, {
56
+ maxChars: 34,
57
+ maxDuration: 2800,
58
+ maxSilence: 450,
59
+ styles: {
60
+ fontFamily: "Arial",
61
+ fontSize: 64,
62
+ fill: "#ffffff",
63
+ align: "center",
64
+ wordWrap: true,
65
+ wordWrapWidth: 1280,
66
+ stroke: {color: "#000000", width: 6},
67
+ },
68
+ }),
69
+ o.audio(videoA, {start: 3000})
42
70
  ),
43
71
  o.gap(500),
44
72
  o.video(videoA, {duration: 7000, start: 5000})
@@ -110,6 +110,9 @@ export class Compositor {
110
110
  parent: Container,
111
111
  ) {
112
112
  const sprite = this.#findOrCreate<Text>(layer)!
113
+ sprite.text = layer.content
114
+ if (layer.style)
115
+ sprite.style = layer.style
113
116
  this.#applyTransform(sprite, layer.matrix)
114
117
  this.#applyAlpha(sprite, layer.alpha)
115
118
  this.#applyCrop(sprite, layer.crop)
@@ -40,7 +40,7 @@ export async function makeTranscriber({driver, spec, workerUrl, onLoading}: Tran
40
40
  detachCallbacks()
41
41
  return result
42
42
  }),
43
- dispose: thread.terminate()
43
+ dispose: () => thread.terminate()
44
44
  }
45
45
  }
46
46
 
@@ -36,12 +36,16 @@ export type TranscriberPipeOptions = {
36
36
 
37
37
  export type SpeechTime = [start: number, end: number]
38
38
 
39
+ export type TranscriptWord = {
40
+ text: string
41
+ timestamp: SpeechTime
42
+ }
43
+
44
+ export type TranscriptSegment = TranscriptWord
45
+
39
46
  export type Transcription = {
40
47
  text: string
41
- chunks: {
42
- text: string
43
- timestamp: SpeechTime
44
- }[]
48
+ chunks: TranscriptWord[]
45
49
  }
46
50
 
47
51
  export type TranscriberSpec = {
@@ -8,6 +8,10 @@ export * from "./parts/resource-pool.js"
8
8
  export * from "./parts/resource.js"
9
9
  export * from "./parts/filmstrip.js"
10
10
  export * from "./parts/animations/registry.js"
11
+ export {captionPresets} from "./parts/captions.js"
12
+ export * from "../features/speech/transcribe/default-spec.js"
13
+ export * from "../features/speech/transcribe/transcriber.js"
14
+ export * from "../features/speech/transcribe/types.js"
11
15
 
12
16
  export * from "./parts/waveform/waveform.js"
13
17
  export * from "./parts/waveform/parts/types.js"
@@ -0,0 +1,90 @@
1
+
2
+ import type {Item} from "./item.js"
3
+ import {TextStyleOptions} from "pixi.js"
4
+ import {TransformOptions, Vec2} from "../types.js"
5
+ import {Transcription, TranscriptSegment} from "../../features/speech/transcribe/types.js"
6
+
7
+ export type CaptionOptions = {
8
+ start?: number
9
+ duration?: number
10
+ styles?: TextStyleOptions
11
+ maxChars?: number
12
+ maxDuration?: number
13
+ maxSilence?: number
14
+ }
15
+
16
+ export const captionPresets = {
17
+ default: {
18
+ styles: {
19
+ fontFamily: "Arial",
20
+ fontSize: 56,
21
+ fill: "#ffffff",
22
+ align: "center",
23
+ wordWrap: true,
24
+ wordWrapWidth: 1440,
25
+ } satisfies TextStyleOptions,
26
+ transform: {
27
+ position: [240, 860] as Vec2
28
+ } satisfies TransformOptions,
29
+ }
30
+ }
31
+
32
+ export type CaptionPreset = (typeof captionPresets)[keyof typeof captionPresets]
33
+ export type CaptionSourceItem = Item.Video | Item.Audio
34
+ export type CaptionAction = {
35
+ (item: CaptionSourceItem, transcript: Transcription, options?: CaptionOptions): Item.Stack
36
+ make: (transcript: Transcription, options?: CaptionOptions) => Item.Caption
37
+ }
38
+
39
+ export type CaptionActions = CaptionAction & {
40
+ presets: {
41
+ [TName in keyof typeof captionPresets]: CaptionAction
42
+ }
43
+ }
44
+
45
+ const CAPTION_DEFAULTS = {
46
+ maxChars: 42,
47
+ maxDuration: 3500,
48
+ maxSilence: 750,
49
+ } satisfies CaptionOptions
50
+
51
+ export function segmentTranscript(transcript: Transcription, options?: CaptionOptions): TranscriptSegment[] {
52
+ const {maxChars, maxDuration, maxSilence} = {...CAPTION_DEFAULTS, ...options}
53
+ const segments: TranscriptSegment[] = []
54
+ let current: TranscriptSegment | null = null
55
+
56
+ for (const {timestamp: [t0, t1], text: rawText} of transcript.chunks) {
57
+ const [start, end] = [t0 * 1000, t1 * 1000]
58
+ const text = rawText.trim()
59
+
60
+ if (!Number.isFinite(start) || !Number.isFinite(end) || !text) continue
61
+
62
+ if (!current) {
63
+ current = {text, timestamp: [start, end]}
64
+ continue
65
+ }
66
+
67
+ const [currentStart, currentEnd]: [number, number] = current.timestamp
68
+ const nextText = `${current.text} ${text}`.trim()
69
+ const shouldBreak =
70
+ nextText.length > maxChars ||
71
+ end - currentStart > maxDuration ||
72
+ start - currentEnd > maxSilence
73
+
74
+ if (shouldBreak) {
75
+ segments.push(current)
76
+ current = {text, timestamp: [start, end]}
77
+ }
78
+ else {
79
+ current = {text: nextText, timestamp: [currentStart, end]}
80
+ }
81
+ }
82
+
83
+ if (current) segments.push(current)
84
+ return segments
85
+ }
86
+
87
+ export function captionDuration(transcript: Transcription, options?: CaptionOptions) {
88
+ const segments = segmentTranscript(transcript, options)
89
+ return Math.max(0, ...segments.map(segment => segment.timestamp[1]))
90
+ }
@@ -3,8 +3,9 @@ import {TextStyleOptions} from "pixi.js"
3
3
 
4
4
  import {Id, Hash} from "./basics.js"
5
5
  import {Ms} from "../../units/ms.js"
6
- import type {FilterParams, FilterType} from "./filters.js"
7
6
  import {Transform, VisualAnimations} from "../types.js"
7
+ import type {FilterParams, FilterType} from "./filters.js"
8
+ import type {Transcription} from "../../features/speech/transcribe/types.js"
8
9
 
9
10
  export type Crop = [top: number, right: number, bottom: number, left: number]
10
11
 
@@ -19,7 +20,8 @@ export enum Kind {
19
20
  Animation,
20
21
  Transition,
21
22
  TextStyle,
22
- Filter
23
+ Filter,
24
+ Caption
23
25
  }
24
26
 
25
27
  export enum Effect {
@@ -87,6 +89,7 @@ export namespace Item {
87
89
  spatialId?: Id
88
90
  animationIds?: Id[]
89
91
  filterIds?: Id[]
92
+ captionId?: Id
90
93
  }
91
94
 
92
95
  export type Audio = {
@@ -96,6 +99,7 @@ export namespace Item {
96
99
  start: number
97
100
  duration: number
98
101
  gain?: number
102
+ captionId?: Id
99
103
  }
100
104
 
101
105
  export type Text = {
@@ -109,6 +113,22 @@ export namespace Item {
109
113
  filterIds?: Id[]
110
114
  }
111
115
 
116
+
117
+ export type Caption = {
118
+ id: Id
119
+ kind: Kind.Caption
120
+ transcript: Transcription
121
+ start: number
122
+ duration: number
123
+ maxChars?: number
124
+ maxDuration?: number
125
+ maxSilence?: number
126
+ spatialId?: Id
127
+ animationIds?: Id[]
128
+ styleId?: Id
129
+ filterIds?: Id[]
130
+ }
131
+
112
132
  export type Transition = {
113
133
  id: Id
114
134
  kind: Kind.Transition
@@ -122,6 +142,7 @@ export namespace Item {
122
142
  | Video
123
143
  | Audio
124
144
  | Text
145
+ | Caption
125
146
  | Gap
126
147
  | Transition
127
148
  | Spatial
@@ -133,8 +154,8 @@ export namespace Item {
133
154
 
134
155
  export type ContainerItem = Item.Sequence | Item.Stack
135
156
  export type NonContainerItem = Exclude<Item.Any, ContainerItem>
136
- export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text
137
- export type VisualAnimatableItem = Item.Video | Item.Text
157
+ export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text | Item.Caption
158
+ export type VisualAnimatableItem = Item.Video | Item.Text | Item.Caption
138
159
 
139
160
  export type PlayableItem = Item.Any & {
140
161
  start: Ms
@@ -25,7 +25,7 @@ abstract class BaseVisualSampler {
25
25
  protected timeline: TimelineFile
26
26
  ) {
27
27
  this.#sampler = createVisualSampler(this.resolveMedia, (item, time) => {
28
- const targetUs = toUs(time)
28
+ const targetUs = toUs(ms(item.start + time))
29
29
  let cursor = this.#videoCursors.get(item.id)
30
30
 
31
31
  if (!cursor) {
@@ -148,8 +148,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
148
148
  return this.sample(timecode)
149
149
  }
150
150
 
151
- protected createCursor(source: DecoderSource, _initialTargetUs: number, endUs: number): VideoFrameCursor {
152
- const startUs = 0
151
+ protected createCursor(source: DecoderSource, startUs: number, endUs: number): VideoFrameCursor {
153
152
  const windowUs = 1_000_000
154
153
  const prefetchThreshold = windowUs * 0.5
155
154
 
@@ -16,6 +16,7 @@ type WalkAtCallbacks = {
16
16
  stack: (x: Item.Stack, localTime: Ms, ancestors: AncestorAt[]) => void
17
17
  video: (x: Item.Video, localTime: Ms, ancestors: AncestorAt[]) => void
18
18
  text: (x: Item.Text, localTime: Ms, ancestors: AncestorAt[]) => void
19
+ caption: (x: Item.Caption, localTime: Ms, ancestors: AncestorAt[]) => void
19
20
  audio: (x: Item.Audio, localTime: Ms, ancestors: AncestorAt[]) => void
20
21
  }
21
22
 
@@ -24,6 +25,7 @@ type WalkCallbacks = {
24
25
  stack?: (x: Item.Stack, matrix: Mat6, ancestors: AncestorAt[]) => void
25
26
  video?: (x: Item.Video, matrix: Mat6, ancestors: AncestorAt[]) => void
26
27
  text?: (x: Item.Text, matrix: Mat6, ancestors: AncestorAt[]) => void
28
+ caption?: (x: Item.Caption, matrix: Mat6, ancestors: AncestorAt[]) => void
27
29
  audio?: (x: Item.Audio) => void
28
30
  }
29
31
 
@@ -52,6 +54,7 @@ export function itemsAt(p: Props): At[] {
52
54
  stack: () => { },
53
55
  video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
54
56
  text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
57
+ caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
55
58
  audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
56
59
  })
57
60
 
@@ -72,6 +75,7 @@ export function itemsFrom(p: FromProps): At[] {
72
75
  stack: () => { },
73
76
  video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
74
77
  text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
78
+ caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
75
79
  audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
76
80
  })
77
81
 
@@ -176,6 +180,10 @@ export function walk(
176
180
  callbacks.text?.(item, currentMatrix, ancestors)
177
181
  break
178
182
 
183
+ case Kind.Caption:
184
+ callbacks.caption?.(item, currentMatrix, ancestors)
185
+ break
186
+
179
187
  case Kind.Audio:
180
188
  callbacks.audio?.(item)
181
189
  break
@@ -211,11 +219,12 @@ function walkAt(
211
219
 
212
220
  if (!child)
213
221
  continue
214
- if (!isPlayableItem(child)) {
222
+
223
+ const duration = computeItemDurationFromMap(child.id, items)
224
+ if (duration <= 0)
215
225
  continue
216
- }
217
226
 
218
- if (time >= offset && time < offset + child.duration) {
227
+ if (time >= offset && time < offset + duration) {
219
228
  const localTime = ms(time - offset)
220
229
  walkAt(
221
230
  childId,
@@ -227,7 +236,7 @@ function walkAt(
227
236
  break
228
237
  }
229
238
 
230
- offset = ms(offset + child.duration)
239
+ offset = ms(offset + duration)
231
240
  }
232
241
 
233
242
  break
@@ -241,6 +250,10 @@ function walkAt(
241
250
  callbacks.text(item, time, ancestors)
242
251
  break
243
252
 
253
+ case Kind.Caption:
254
+ callbacks.caption(item, time, ancestors)
255
+ break
256
+
244
257
  case Kind.Audio:
245
258
  callbacks.audio(item, time, ancestors)
246
259
  break
@@ -275,11 +288,12 @@ function walkFrom(
275
288
 
276
289
  if (!child)
277
290
  continue
278
- if (!isPlayableItem(child)) {
291
+
292
+ const duration = computeItemDurationFromMap(child.id, items)
293
+ if (duration <= 0)
279
294
  continue
280
- }
281
295
 
282
- const end = ms(offset + child.duration)
296
+ const end = ms(offset + duration)
283
297
  if (from >= end) {
284
298
  offset = end
285
299
  continue
@@ -308,6 +322,10 @@ function walkFrom(
308
322
  callbacks.text(item, from, ancestors)
309
323
  break
310
324
 
325
+ case Kind.Caption:
326
+ callbacks.caption(item, from, ancestors)
327
+ break
328
+
311
329
  case Kind.Audio:
312
330
  callbacks.audio(item, from, ancestors)
313
331
  break
@@ -318,14 +336,24 @@ export function computeItemDuration(
318
336
  id: number,
319
337
  timeline: TimelineFile
320
338
  ): Ms {
321
- const item = timeline.items.find(item => item.id === id)
339
+ return computeItemDurationFromMap(
340
+ id,
341
+ new Map(timeline.items.map(item => [item.id, item]))
342
+ )
343
+ }
344
+
345
+ function computeItemDurationFromMap(
346
+ id: number,
347
+ items: Map<Id, Item.Any>
348
+ ): Ms {
349
+ const item = items.get(id)
322
350
 
323
351
  if (!item) return ms(0)
324
352
 
325
353
  switch (item.kind) {
326
354
  case Kind.Sequence: {
327
355
  const children = item.childrenIds
328
- .map(childId => timeline.items.find(x => x.id === childId))
356
+ .map(childId => items.get(childId))
329
357
  .filter(Boolean) as Item.Any[]
330
358
 
331
359
  let total = ms(0)
@@ -338,8 +366,8 @@ export function computeItemDuration(
338
366
  const next = children[i + 1]
339
367
 
340
368
  if (prev && next && prev.kind !== Kind.Transition && next.kind !== Kind.Transition) {
341
- const prevDur = computeItemDuration(prev.id, timeline)
342
- const nextDur = computeItemDuration(next.id, timeline)
369
+ const prevDur = computeItemDurationFromMap(prev.id, items)
370
+ const nextDur = computeItemDurationFromMap(next.id, items)
343
371
  const overlap = Math.max(0, Math.min(child.duration, prevDur, nextDur))
344
372
 
345
373
  total = ms(total - overlap)
@@ -347,7 +375,7 @@ export function computeItemDuration(
347
375
  continue
348
376
  }
349
377
 
350
- total = ms(total + computeItemDuration(child.id, timeline))
378
+ total = ms(total + computeItemDurationFromMap(child.id, items))
351
379
  }
352
380
 
353
381
  return total
@@ -357,7 +385,7 @@ export function computeItemDuration(
357
385
  let longest = ms(0)
358
386
 
359
387
  for (const childId of item.childrenIds) {
360
- const duration = computeItemDuration(childId, timeline)
388
+ const duration = computeItemDurationFromMap(childId, items)
361
389
  if (duration > longest) {
362
390
  longest = duration
363
391
  }
@@ -8,7 +8,8 @@ export type VideoSampler = (item: Item.Video, time: Ms) => Promise<VideoFrame |
8
8
  export function createDefaultVideoSampler(sink: VideoSink): VideoSampler {
9
9
  return async (item, time) => {
10
10
  const s = await sink.getSink(item.mediaHash)
11
- const sample = await s?.getSample(time / 1000)
11
+ const mediaTime = item.start + time
12
+ const sample = await s?.getSample(mediaTime / 1000)
12
13
  const frame = sample?.toVideoFrame()
13
14
  sample?.close()
14
15
  return frame ?? undefined
@@ -3,6 +3,7 @@ import {SampleContext} from "./types.js"
3
3
  import {sampleSequence} from "./sequence.js"
4
4
  import {Ms} from "../../../../../../units/ms.js"
5
5
  import {Item, Kind} from "../../../../../parts/item.js"
6
+ import {segmentTranscript} from "../../../../../parts/captions.js"
6
7
  import {FilterSpec, Layer} from "../../../../../../driver/fns/schematic.js"
7
8
  import {AncestorAt, computeOpacity, computeWorldMatrix} from "../../../handy.js"
8
9
 
@@ -58,6 +59,24 @@ export async function sampleVisual(
58
59
  return [{id: item.id, kind: "text", content: item.content, style, matrix, alpha, crop, filters}]
59
60
  }
60
61
 
62
+ case Kind.Caption: {
63
+ if (time < 0 || time >= item.duration) return []
64
+
65
+ const transcriptTime = item.start + time
66
+ const segment = segmentTranscript(item.transcript, item).find(segment => {
67
+ const [start, end] = segment.timestamp
68
+ return transcriptTime >= start && transcriptTime < end
69
+ })
70
+ if (!segment)
71
+ return []
72
+
73
+ const style = item.styleId
74
+ ? (ctx.items.get(item.styleId) as Item.TextStyle)?.style
75
+ : undefined
76
+
77
+ return [{id: item.id, kind: "text", content: segment.text, style, matrix, alpha, crop, filters}]
78
+ }
79
+
61
80
  case Kind.Gap: {
62
81
  return [{id: item.id, kind: "gap"}]
63
82
  }
@@ -6,6 +6,8 @@ import {Media} from "../parts/media.js"
6
6
  import {TimelineFile} from "../parts/basics.js"
7
7
  import {FilterAction} from "../parts/filters.js"
8
8
  import {filters, FilterParams, FilterType} from "../parts/filters.js"
9
+ import {CaptionOptions, CaptionSourceItem} from "../parts/captions.js"
10
+ import {Transcription} from "../../features/speech/transcribe/types.js"
9
11
  import {AnimationPreset, PresetOptions} from "../parts/animations/types.js"
10
12
  import {Crop, FilterableItem, Item, VisualAnimatableItem} from "../parts/item.js"
11
13
  import {animationPresets, visualAnimations} from "../parts/animations/registry.js"
@@ -75,6 +77,14 @@ export function text(
75
77
  return o => o.text(content, options)
76
78
  }
77
79
 
80
+ export function captions(
81
+ item: Build<CaptionSourceItem>,
82
+ transcript: Transcription,
83
+ options?: CaptionOptions
84
+ ): Build<Item.Stack> {
85
+ return o => o.captions(item(o), transcript, options)
86
+ }
87
+
78
88
  export function gap(duration: number): Build<Item.Gap> {
79
89
  return o => o.gap(duration)
80
90
  }