npm - @omnimedia/omnitool - Versions diffs - 1.1.0-90 → 1.1.0-93 - Mend

@omnimedia/omnitool 1.1.0-90 → 1.1.0-93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

package/README.md +132 -3
package/package.json +1 -1
package/s/demo/routines/timeline-setup.ts +32 -4
package/s/driver/parts/compositor.ts +3 -0
package/s/features/speech/transcribe/transcriber.ts +1 -1
package/s/features/speech/transcribe/types.ts +8 -4
package/s/timeline/index.ts +4 -0
package/s/timeline/parts/captions.ts +90 -0
package/s/timeline/parts/item.ts +23 -4
package/s/timeline/renderers/export/parts/cursor.ts +20 -3
package/s/timeline/renderers/parts/handy.ts +41 -13
package/s/timeline/renderers/parts/samplers/visual/parts/defaults.ts +2 -1
package/s/timeline/renderers/parts/samplers/visual/parts/sample.ts +19 -0
package/s/timeline/sugar/helpers.ts +10 -0
package/s/timeline/sugar/o.ts +55 -0
package/x/demo/demo.bundle.min.js +103 -103
package/x/demo/demo.bundle.min.js.map +4 -4
package/x/demo/routines/timeline-setup.js +28 -3
package/x/demo/routines/timeline-setup.js.map +1 -1
package/x/driver/parts/compositor.js +3 -0
package/x/driver/parts/compositor.js.map +1 -1
package/x/features/speech/transcribe/transcriber.d.ts +1 -1
package/x/features/speech/transcribe/transcriber.js +1 -1
package/x/features/speech/transcribe/transcriber.js.map +1 -1
package/x/features/speech/transcribe/types.d.ts +6 -4
package/x/index.html +2 -2
package/x/tests.bundle.min.js +107 -107
package/x/tests.bundle.min.js.map +4 -4
package/x/tests.html +1 -1
package/x/timeline/index.d.ts +4 -0
package/x/timeline/index.js +4 -0
package/x/timeline/index.js.map +1 -1
package/x/timeline/parts/captions.d.ts +40 -0
package/x/timeline/parts/captions.js +55 -0
package/x/timeline/parts/captions.js.map +1 -0
package/x/timeline/parts/item.d.ts +21 -5
package/x/timeline/parts/item.js +1 -0
package/x/timeline/parts/item.js.map +1 -1
package/x/timeline/renderers/export/parts/cursor.d.ts +1 -1
package/x/timeline/renderers/export/parts/cursor.js +18 -3
package/x/timeline/renderers/export/parts/cursor.js.map +1 -1
package/x/timeline/renderers/parts/handy.d.ts +1 -0
package/x/timeline/renderers/parts/handy.js +27 -13
package/x/timeline/renderers/parts/handy.js.map +1 -1
package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js +2 -1
package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js.map +1 -1
package/x/timeline/renderers/parts/samplers/visual/parts/sample.js +16 -0
package/x/timeline/renderers/parts/samplers/visual/parts/sample.js.map +1 -1
package/x/timeline/sugar/helpers.d.ts +3 -0
package/x/timeline/sugar/helpers.js +3 -0
package/x/timeline/sugar/helpers.js.map +1 -1
package/x/timeline/sugar/o.d.ts +2 -0
package/x/timeline/sugar/o.js +38 -0
package/x/timeline/sugar/o.js.map +1 -1

package/README.md CHANGED Viewed

@@ -79,6 +79,133 @@ const timeline = timeline(
 )
 ```
+## 💬 Captions
+Captions render transcript data as timed, styled text.
+The transcript can come from anywhere, as long as it follows the structure.
+```ts
+const transcript = {
+	text: "Hello world. This is a caption.",
+	chunks: [
+		{text: "Hello", timestamp: [0, 0.4]},
+		{text: "world.", timestamp: [0.4, 1.2]},
+		{text: "This", timestamp: [1.3, 1.6]},
+		{text: "is", timestamp: [1.6, 1.8]},
+		{text: "a", timestamp: [1.8, 1.9]},
+		{text: "caption.", timestamp: [1.9, 2.6]},
+	],
+}
+const timeline = omni.timeline(o => {
+	const video = o.video(clip, {duration: 3000})
+	return o.captions(video, transcript)
+})
+```
+Use a caption preset to pick a built-in caption style:
+```ts
+const timeline = omni.timeline(o => {
+	const video = o.video(clip, {duration: 3000})
+	return o.captions.presets.default(video, transcript)
+})
+```
+or do your own styled captions:
+```ts
+const timeline = omni.timeline(o => {
+	const video = o.video(clip, {duration: 3000})
+	return o.captions(video, transcript, {
+		styles: {
+			fontFamily: "Inter",
+			fontSize: 64,
+			fill: "#fff7d6",
+			stroke: {color: "#111111", width: 8},
+			align: "center",
+		},
+	})
+})
+```
+Use omnitool's built-in speech-to-text with the default model:
+```ts
+import {makeTranscriber, defaultTranscriberSpec} from "@omnimedia/omnitool"
+// uses onnx-community/whisper-tiny_timestamped
+const transcriber = await makeTranscriber({
+	driver,
+	spec: defaultTranscriberSpec(),
+	workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
+	onLoading: loading => console.log("loading", loading),
+})
+const transcript = await transcriber.transcribe({
+	source: file,
+	language: "english",
+	onReport: report => console.log("report", report),
+	onTranscription: text => console.log("transcribing", text),
+})
+const timeline = omni.timeline(o => {
+	const video = o.video(clip)
+	return o.captions(video, transcript)
+})
+```
+Load a custom speech-to-text model:
+```ts
+const transcriber = await makeTranscriber({
+	driver,
+	spec: {
+	  model: "onnx-community/whisper-tiny_timestamped",
+	  dtype: "q4",
+	  device: "wasm",
+	  chunkLength: 20,
+	  strideLength: 3
+	},
+	workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
+	onLoading: loading => console.log("loading", loading),
+})
+```
+> [!IMPORTANT]
+> Use a Transformers.js-compatible speech-to-text model, for example `onnx-community/*_timestamped`.
+> The model must support word-level timestamps because captions use `return_timestamps: "word"`.
+> `device` and `dtype` are passed to Transformers.js and depend on your runtime/model.
+> Browser usage commonly uses `"wasm"` or `"webgpu"`: `"webgpu"` for speed, `"wasm"` for broader device support.
+> `workerUrl` depends on where you host the worker bundle.
+`o.captions(video, transcript, options)` creates captions for a video or audio item.
+`o.captions` uses the `captionPresets.default` preset.
+Use `o.captions.presets` to choose from the available pre-styled captions.
+Pass `styles` in options to override preset styles.
+Transcript chunk timestamps are in seconds.
+Update caption options after creation:
+```ts
+const caption = o.captions.make(transcript, {maxChars: 42})
+const style = o.textStyle({fill: "#00ff00"})
+o.set(caption.id, {
+	maxChars: 32,
+	styleId: style.id,
+})
+```
+Caption options:
+`styles` - text styles; they override the preset's styles.
+`start` - transcript time where captions begin, in milliseconds.
+`duration` - caption layer duration, in milliseconds.
+`maxChars` - maximum characters in one generated caption line.
+`maxDuration` - maximum duration of one generated caption line, in milliseconds.
+`maxSilence` - maximum silence allowed inside one caption; longer pauses start a new caption, in milliseconds.
+Import `captionPresets` to list the available caption looks.
 ## 🎛 Filters
 Filter application:
@@ -353,9 +480,11 @@ Timeline items:
 - 4 `Text`
 - 5 `Gap`
 - 6 `Spatial`
-- 7 `Transition`
-- 8 `TextStyle`
-- 9 `Filter`
+- 7 `Animation`
+- 8 `Transition`
+- 9 `TextStyle`
+- 10 `Filter`
+- 11 `Caption`
 ## 🗺️ Roadmap
 - CLI commands:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@omnimedia/omnitool",
-	"version": "1.1.0-90",
+	"version": "1.1.0-93",
 	"description": "open source video processing tools",
 	"license": "MIT",
 	"author": "Przemysław Gałęzki",

package/s/demo/routines/timeline-setup.ts CHANGED Viewed

@@ -1,10 +1,25 @@
 import {Driver} from "../../driver/driver.js"
-import {Datafile, Item, Omni} from "../../timeline/index.js"
+import {Datafile, defaultTranscriberSpec, Item, makeTranscriber, Omni} from "../../timeline/index.js"
 export async function TimelineSchemaTest(driver: Driver, file: File) {
+	const transcriber = await makeTranscriber({
+		driver,
+		spec: defaultTranscriberSpec(),
+		workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
+		onLoading: loading => console.log("transcriber loading", loading),
+	})
 	const omni = new Omni(driver)
 	const {videoA} = await omni.load({videoA: Datafile.make(file)})
+	const transcript = await transcriber.transcribe({
+		source: file,
+		language: "english",
+		onReport: report => console.log("transcriber report", report),
+		onTranscription: text => console.log("transcribing", text),
+	})
 	const timeline = omni.timeline(o => {
 		const text = o.text("content", {duration: 3000})
 		const fade = o.animate.opacity.make("easeIn", [
@@ -30,15 +45,28 @@ export async function TimelineSchemaTest(driver: Driver, file: File) {
 				[3000, o.transform({position: [320, 0], scale: [1.15, 1.15], rotation: 0})],
 		])
-		const video = o.video(videoA, {duration: 3000, start: 1000})
+		const video = o.video(videoA, {duration: 6000, start: 3000})
 		o.set<Item.Text>(text.id, {styleId: style.id, spatialId: textSpatial.id, animationIds: [fade.id, textMotion.id]})
 		o.set<Item.Video>(video.id, {spatialId: videoSpatial.id})
 		return o.sequence(
 			o.stack(
 				text,
-				video,
-				o.audio(videoA, {duration: 1000})
+				o.captions(video, transcript, {
+					maxChars: 34,
+					maxDuration: 2800,
+					maxSilence: 450,
+					styles: {
+						fontFamily: "Arial",
+						fontSize: 64,
+						fill: "#ffffff",
+						align: "center",
+						wordWrap: true,
+						wordWrapWidth: 1280,
+						stroke: {color: "#000000", width: 6},
+					},
+				}),
+				o.audio(videoA, {start: 3000})
 			),
 			o.gap(500),
 			o.video(videoA, {duration: 7000, start: 5000})

package/s/driver/parts/compositor.ts CHANGED Viewed

@@ -110,6 +110,9 @@ export class Compositor {
 		parent: Container,
 	) {
 		const sprite = this.#findOrCreate<Text>(layer)!
+		sprite.text = layer.content
+		if (layer.style)
+			sprite.style = layer.style
 		this.#applyTransform(sprite, layer.matrix)
 		this.#applyAlpha(sprite, layer.alpha)
 		this.#applyCrop(sprite, layer.crop)

package/s/features/speech/transcribe/transcriber.ts CHANGED Viewed

@@ -40,7 +40,7 @@ export async function makeTranscriber({driver, spec, workerUrl, onLoading}: Tran
 			detachCallbacks()
 			return result
 		}),
-		dispose: thread.terminate()
+		dispose: () => thread.terminate()
 	}
 }

package/s/features/speech/transcribe/types.ts CHANGED Viewed

@@ -36,12 +36,16 @@ export type TranscriberPipeOptions = {
 export type SpeechTime = [start: number, end: number]
+export type TranscriptWord = { // one timed piece of transcript text
+	text: string
+	timestamp: SpeechTime // [start, end] pair
+}
+export type TranscriptSegment = TranscriptWord // alias: a segment has the same shape as a word
 export type Transcription = {
 	text: string
-	chunks: {
-		text: string
-		timestamp: SpeechTime
-	}[]
+	chunks: TranscriptWord[]
 }
 export type TranscriberSpec = {

package/s/timeline/index.ts CHANGED Viewed

@@ -8,6 +8,10 @@ export * from "./parts/resource-pool.js"
 export * from "./parts/resource.js"
 export * from "./parts/filmstrip.js"
 export * from "./parts/animations/registry.js"
+export {captionPresets} from "./parts/captions.js"
+export * from "../features/speech/transcribe/default-spec.js"
+export * from "../features/speech/transcribe/transcriber.js"
+export * from "../features/speech/transcribe/types.js"
 export * from "./parts/waveform/waveform.js"
 export * from "./parts/waveform/parts/types.js"

package/s/timeline/parts/captions.ts ADDED Viewed

@@ -0,0 +1,90 @@
+import type {Item} from "./item.js"
+import {TextStyleOptions} from "pixi.js"
+import {TransformOptions, Vec2} from "../types.js"
+import {Transcription, TranscriptSegment} from "../../features/speech/transcribe/types.js"
+export type CaptionOptions = { // options for creating and segmenting captions
+	start?: number // transcript time where captions begin, in milliseconds
+	duration?: number // caption layer duration, in milliseconds
+	styles?: TextStyleOptions // text styles; these override the preset's styles
+	maxChars?: number // maximum characters in one generated caption line
+	maxDuration?: number // maximum duration of one generated caption line, in milliseconds
+	maxSilence?: number // longest pause allowed inside one caption, in milliseconds
+}
+export const captionPresets = { // built-in caption looks, keyed by preset name
+	default: {
+		styles: {
+			fontFamily: "Arial",
+			fontSize: 56,
+			fill: "#ffffff",
+			align: "center",
+			wordWrap: true,
+			wordWrapWidth: 1440,
+		} satisfies TextStyleOptions,
+		transform: {
+			position: [240, 860] as Vec2 // NOTE(review): presumably a pixel position on the canvas — confirm
+		} satisfies TransformOptions,
+	}
+}
+export type CaptionPreset = (typeof captionPresets)[keyof typeof captionPresets] // shape of any entry in captionPresets
+export type CaptionSourceItem = Item.Video | Item.Audio // item kinds accepted as the first argument of a CaptionAction
+export type CaptionAction = {
+	(item: CaptionSourceItem, transcript: Transcription, options?: CaptionOptions): Item.Stack // call form: returns a stack item
+	make: (transcript: Transcription, options?: CaptionOptions) => Item.Caption // returns only the caption item
+}
+export type CaptionActions = CaptionAction & {
+	presets: { // one CaptionAction per key of captionPresets
+		[TName in keyof typeof captionPresets]: CaptionAction
+	}
+}
+const CAPTION_DEFAULTS = { // fallback segmentation limits used by segmentTranscript
+	maxChars: 42, // characters per caption line
+	maxDuration: 3500, // milliseconds
+	maxSilence: 750, // milliseconds
+} satisfies CaptionOptions
+/**
+ * Groups the chunks of a transcript into caption-sized segments.
+ *
+ * Chunk timestamps are multiplied by 1000 (seconds to milliseconds), so the
+ * returned segments carry millisecond timestamps. Chunks whose trimmed text is
+ * empty, or whose timestamps are not finite numbers, are skipped.
+ *
+ * A new segment is started when appending the next chunk would exceed
+ * `maxChars`, stretch the segment past `maxDuration`, or follow a pause
+ * longer than `maxSilence`.
+ *
+ * @param transcript - transcript whose `chunks` are segmented.
+ * @param options - optional limits; a missing or `undefined` limit falls back to `CAPTION_DEFAULTS`.
+ * @returns segments in the order their first chunk appears in `transcript.chunks`.
+ */
+export function segmentTranscript(transcript: Transcription, options?: CaptionOptions): TranscriptSegment[] {
+	// Resolve each limit individually. Spreading `options` over the defaults would let
+	// an explicitly-undefined property erase its default, and every comparison against
+	// `undefined` is false, which would silently disable that limit.
+	const maxChars = options?.maxChars ?? CAPTION_DEFAULTS.maxChars
+	const maxDuration = options?.maxDuration ?? CAPTION_DEFAULTS.maxDuration
+	const maxSilence = options?.maxSilence ?? CAPTION_DEFAULTS.maxSilence
+	const segments: TranscriptSegment[] = []
+	let current: TranscriptSegment | null = null
+	for (const {timestamp: [t0, t1], text: rawText} of transcript.chunks) {
+		// transcript chunk times are in seconds; segments are emitted in milliseconds
+		const [start, end] = [t0 * 1000, t1 * 1000]
+		const text = rawText.trim()
+		if (!Number.isFinite(start) || !Number.isFinite(end) || !text) continue
+		if (!current) {
+			current = {text, timestamp: [start, end]}
+			continue
+		}
+		const [currentStart, currentEnd]: [number, number] = current.timestamp
+		const nextText = `${current.text} ${text}`.trim()
+		const shouldBreak =
+			nextText.length > maxChars ||
+			end - currentStart > maxDuration ||
+			start - currentEnd > maxSilence
+		if (shouldBreak) {
+			// close the running segment and start a new one with this chunk
+			segments.push(current)
+			current = {text, timestamp: [start, end]}
+		}
+		else {
+			// extend the running segment to include this chunk
+			current = {text: nextText, timestamp: [currentStart, end]}
+		}
+	}
+	if (current) segments.push(current)
+	return segments
+}
+/**
+ * Returns the latest segment end time produced by `segmentTranscript` for the
+ * given transcript and options, or 0 when no segments are produced.
+ *
+ * @param transcript - transcript to measure.
+ * @param options - segmentation options, passed through to `segmentTranscript`.
+ * @returns the largest segment end timestamp, never less than 0.
+ */
+export function captionDuration(transcript: Transcription, options?: CaptionOptions): number {
+	// Fold rather than `Math.max(0, ...ends)`: spreading a very large array into
+	// call arguments can exceed the engine's argument-count limit.
+	return segmentTranscript(transcript, options)
+		.reduce((latest, segment) => Math.max(latest, segment.timestamp[1]), 0)
+}

package/s/timeline/parts/item.ts CHANGED Viewed

@@ -3,8 +3,9 @@ import {TextStyleOptions} from "pixi.js"
 import {Id, Hash} from "./basics.js"
 import {Ms} from "../../units/ms.js"
-import type {FilterParams, FilterType} from "./filters.js"
 import {Transform, VisualAnimations} from "../types.js"
+import type {FilterParams, FilterType} from "./filters.js"
+import type {Transcription} from "../../features/speech/transcribe/types.js"
 export type Crop = [top: number, right: number, bottom: number, left: number]
@@ -19,7 +20,8 @@ export enum Kind {
 	Animation,
 	Transition,
 	TextStyle,
-	Filter
+	Filter,
+	Caption
 }
 export enum Effect {
@@ -109,6 +111,22 @@ export namespace Item {
 		filterIds?: Id[]
 	}
+	export type Caption = { // timeline item that shows transcript text as timed captions
+		id: Id
+		kind: Kind.Caption
+		transcript: Transcription // source transcript; the visual sampler segments it into caption lines
+		start: number // added to the item-local time before matching transcript segments
+		duration: number // item-local times outside [0, duration) render nothing
+		maxChars?: number // segmentation limits: the sampler passes this item as the options of segmentTranscript
+		maxDuration?: number
+		maxSilence?: number
+		spatialId?: Id
+		animationIds?: Id[]
+		styleId?: Id // id of a TextStyle item whose style is applied to the caption text
+		filterIds?: Id[]
+	}
 	export type Transition = {
 		id: Id
 		kind: Kind.Transition
@@ -122,6 +140,7 @@ export namespace Item {
 		| Video
 		| Audio
 		| Text
+		| Caption
 		| Gap
 		| Transition
 		| Spatial
@@ -133,8 +152,8 @@ export namespace Item {
 export type ContainerItem = Item.Sequence | Item.Stack
 export type NonContainerItem = Exclude<Item.Any, ContainerItem>
-export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text
-export type VisualAnimatableItem = Item.Video | Item.Text
+export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text | Item.Caption
+export type VisualAnimatableItem = Item.Video | Item.Text | Item.Caption
 export type PlayableItem = Item.Any & {
 	start: Ms

package/s/timeline/renderers/export/parts/cursor.ts CHANGED Viewed

@@ -25,7 +25,7 @@ abstract class BaseVisualSampler {
 		protected timeline: TimelineFile
 	) {
 		this.#sampler = createVisualSampler(this.resolveMedia, (item, time) => {
-			const targetUs = toUs(time)
+			const targetUs = toUs(ms(item.start + time))
 			let cursor = this.#videoCursors.get(item.id)
 			if (!cursor) {
@@ -148,8 +148,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
 		return this.sample(timecode)
 	}
-	protected createCursor(source: DecoderSource, _initialTargetUs: number, endUs: number): VideoFrameCursor {
-		const startUs = 0
+	protected createCursor(source: DecoderSource, startUs: number, endUs: number): VideoFrameCursor {
 		const windowUs = 1_000_000
 		const prefetchThreshold = windowUs * 0.5
@@ -159,6 +158,9 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
 		let input: Input | null = null
 		let sink: VideoSampleSink | null = null
 		let prefetchPromise: Promise<{frames: VideoFrame[], windowStart: number, windowEnd: number}> | null = null
+		let activeFetches = 0
+		let idle: Promise<void> = Promise.resolve()
+		let resolveIdle: (() => void) | null = null
 		let canceled = false
 		const clear = () => {
@@ -167,6 +169,18 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
 			frames = []
 		}
+		const startFetch = () => {
+			if (activeFetches++ === 0)
+				idle = new Promise<void>(resolve => resolveIdle = resolve)
+		}
+		const endFetch = () => {
+			if (--activeFetches === 0) {
+				resolveIdle?.()
+				resolveIdle = null
+			}
+		}
 		const getSink = async () => {
 			if (sink) return sink
@@ -184,6 +198,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
 		}
 		const fetchFrames = async (targetUs: number) => {
+			startFetch()
 			const wEnd = Math.min(endUs, targetUs + 1)
 			const wStart = Math.max(startUs, wEnd - windowUs)
 			const newFrames: VideoFrame[] = []
@@ -196,6 +211,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
 				}
 			}
+			endFetch()
 			return {frames: newFrames, windowStart: wStart, windowEnd: wEnd}
 		}
@@ -262,6 +278,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
 				if (prefetched)
 					for (const f of prefetched.frames) f.close()
+				await idle
 				clear()
 				input?.dispose()
 				input = null

package/s/timeline/renderers/parts/handy.ts CHANGED Viewed

@@ -16,6 +16,7 @@ type WalkAtCallbacks = {
 	stack: (x: Item.Stack, localTime: Ms, ancestors: AncestorAt[]) => void
 	video: (x: Item.Video, localTime: Ms, ancestors: AncestorAt[]) => void
 	text: (x: Item.Text, localTime: Ms, ancestors: AncestorAt[]) => void
+	caption: (x: Item.Caption, localTime: Ms, ancestors: AncestorAt[]) => void
 	audio: (x: Item.Audio, localTime: Ms, ancestors: AncestorAt[]) => void
 }
@@ -24,6 +25,7 @@ type WalkCallbacks = {
 	stack?: (x: Item.Stack, matrix: Mat6, ancestors: AncestorAt[]) => void
 	video?: (x: Item.Video, matrix: Mat6, ancestors: AncestorAt[]) => void
 	text?: (x: Item.Text, matrix: Mat6, ancestors: AncestorAt[]) => void
+	caption?: (x: Item.Caption, matrix: Mat6, ancestors: AncestorAt[]) => void
 	audio?: (x: Item.Audio) => void
 }
@@ -52,6 +54,7 @@ export function itemsAt(p: Props): At[] {
 		stack: () => { },
 		video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
 		text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
+		caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
 		audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
 	})
@@ -72,6 +75,7 @@ export function itemsFrom(p: FromProps): At[] {
 		stack: () => { },
 		video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
 		text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
+		caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
 		audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
 	})
@@ -176,6 +180,10 @@ export function walk(
 			callbacks.text?.(item, currentMatrix, ancestors)
 			break
+		case Kind.Caption:
+			callbacks.caption?.(item, currentMatrix, ancestors)
+			break
 		case Kind.Audio:
 			callbacks.audio?.(item)
 			break
@@ -211,11 +219,12 @@ function walkAt(
 				if (!child)
 					continue
-				if (!isPlayableItem(child)) {
+				const duration = computeItemDurationFromMap(child.id, items)
+				if (duration <= 0)
 					continue
-				}
-				if (time >= offset && time < offset + child.duration) {
+				if (time >= offset && time < offset + duration) {
 					const localTime = ms(time - offset)
 					walkAt(
 						childId,
@@ -227,7 +236,7 @@ function walkAt(
 					break
 				}
-				offset = ms(offset + child.duration)
+				offset = ms(offset + duration)
 			}
 			break
@@ -241,6 +250,10 @@ function walkAt(
 			callbacks.text(item, time, ancestors)
 			break
+		case Kind.Caption:
+			callbacks.caption(item, time, ancestors)
+			break
 		case Kind.Audio:
 			callbacks.audio(item, time, ancestors)
 			break
@@ -275,11 +288,12 @@ function walkFrom(
 				if (!child)
 					continue
-				if (!isPlayableItem(child)) {
+				const duration = computeItemDurationFromMap(child.id, items)
+				if (duration <= 0)
 					continue
-				}
-				const end = ms(offset + child.duration)
+				const end = ms(offset + duration)
 				if (from >= end) {
 					offset = end
 					continue
@@ -308,6 +322,10 @@ function walkFrom(
 			callbacks.text(item, from, ancestors)
 			break
+		case Kind.Caption:
+			callbacks.caption(item, from, ancestors)
+			break
 		case Kind.Audio:
 			callbacks.audio(item, from, ancestors)
 			break
@@ -318,14 +336,24 @@ export function computeItemDuration(
 	id: number,
 	timeline: TimelineFile
 ): Ms {
-	const item = timeline.items.find(item => item.id === id)
+	return computeItemDurationFromMap(
+		id,
+		new Map(timeline.items.map(item => [item.id, item]))
+	)
+}
+function computeItemDurationFromMap(
+	id: number,
+	items: Map<Id, Item.Any>
+): Ms {
+	const item = items.get(id)
 	if (!item) return ms(0)
 	switch (item.kind) {
 		case Kind.Sequence: {
 			const children = item.childrenIds
-				.map(childId => timeline.items.find(x => x.id === childId))
+				.map(childId => items.get(childId))
 				.filter(Boolean) as Item.Any[]
 			let total = ms(0)
@@ -338,8 +366,8 @@ export function computeItemDuration(
 					const next = children[i + 1]
 					if (prev && next && prev.kind !== Kind.Transition && next.kind !== Kind.Transition) {
-						const prevDur = computeItemDuration(prev.id, timeline)
-						const nextDur = computeItemDuration(next.id, timeline)
+						const prevDur = computeItemDurationFromMap(prev.id, items)
+						const nextDur = computeItemDurationFromMap(next.id, items)
 						const overlap = Math.max(0, Math.min(child.duration, prevDur, nextDur))
 						total = ms(total - overlap)
@@ -347,7 +375,7 @@ export function computeItemDuration(
 					continue
 				}
-				total = ms(total + computeItemDuration(child.id, timeline))
+				total = ms(total + computeItemDurationFromMap(child.id, items))
 			}
 			return total
@@ -357,7 +385,7 @@ export function computeItemDuration(
 			let longest = ms(0)
 			for (const childId of item.childrenIds) {
-				const duration = computeItemDuration(childId, timeline)
+				const duration = computeItemDurationFromMap(childId, items)
 				if (duration > longest) {
 					longest = duration
 				}

package/s/timeline/renderers/parts/samplers/visual/parts/defaults.ts CHANGED Viewed

@@ -8,7 +8,8 @@ export type VideoSampler = (item: Item.Video, time: Ms) => Promise<VideoFrame |
 export function createDefaultVideoSampler(sink: VideoSink): VideoSampler {
 	return async (item, time) => {
 		const s = await sink.getSink(item.mediaHash)
-		const sample = await s?.getSample(time / 1000)
+		const mediaTime = item.start + time
+		const sample = await s?.getSample(mediaTime / 1000)
 		const frame = sample?.toVideoFrame()
 		sample?.close()
 		return frame ?? undefined

package/s/timeline/renderers/parts/samplers/visual/parts/sample.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import {SampleContext} from "./types.js"
 import {sampleSequence} from "./sequence.js"
 import {Ms} from "../../../../../../units/ms.js"
 import {Item, Kind} from "../../../../../parts/item.js"
+import {segmentTranscript} from "../../../../../parts/captions.js"
 import {FilterSpec, Layer} from "../../../../../../driver/fns/schematic.js"
 import {AncestorAt, computeOpacity, computeWorldMatrix} from "../../../handy.js"
@@ -58,6 +59,24 @@ export async function sampleVisual(
 			return [{id: item.id, kind: "text", content: item.content, style, matrix, alpha, crop, filters}]
 		}
+		case Kind.Caption: {
+			if (time < 0 || time >= item.duration) return []
+			const transcriptTime = item.start + time
+			const segment = segmentTranscript(item.transcript, item).find(segment => {
+				const [start, end] = segment.timestamp
+				return transcriptTime >= start && transcriptTime < end
+			})
+			if (!segment)
+				return []
+			const style = item.styleId
+				? (ctx.items.get(item.styleId) as Item.TextStyle)?.style
+				: undefined
+			return [{id: item.id, kind: "text", content: segment.text, style, matrix, alpha, crop, filters}]
+		}
 		case Kind.Gap: {
 			return [{id: item.id, kind: "gap"}]
 		}