@omnimedia/omnitool 1.1.0-91 → 1.1.0-94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +133 -3
- package/package.json +1 -1
- package/s/demo/routines/timeline-setup.ts +32 -4
- package/s/driver/parts/compositor.ts +3 -0
- package/s/features/speech/transcribe/transcriber.ts +1 -1
- package/s/features/speech/transcribe/types.ts +8 -4
- package/s/timeline/index.ts +4 -0
- package/s/timeline/parts/captions.ts +90 -0
- package/s/timeline/parts/item.ts +25 -4
- package/s/timeline/renderers/export/parts/cursor.ts +2 -3
- package/s/timeline/renderers/parts/handy.ts +41 -13
- package/s/timeline/renderers/parts/samplers/visual/parts/defaults.ts +2 -1
- package/s/timeline/renderers/parts/samplers/visual/parts/sample.ts +19 -0
- package/s/timeline/sugar/helpers.ts +10 -0
- package/s/timeline/sugar/o.ts +56 -0
- package/x/demo/demo.bundle.min.js +102 -102
- package/x/demo/demo.bundle.min.js.map +4 -4
- package/x/demo/routines/timeline-setup.js +28 -3
- package/x/demo/routines/timeline-setup.js.map +1 -1
- package/x/driver/parts/compositor.js +3 -0
- package/x/driver/parts/compositor.js.map +1 -1
- package/x/features/speech/transcribe/transcriber.d.ts +1 -1
- package/x/features/speech/transcribe/transcriber.js +1 -1
- package/x/features/speech/transcribe/transcriber.js.map +1 -1
- package/x/features/speech/transcribe/types.d.ts +6 -4
- package/x/index.html +2 -2
- package/x/tests.bundle.min.js +105 -105
- package/x/tests.bundle.min.js.map +4 -4
- package/x/tests.html +1 -1
- package/x/timeline/index.d.ts +4 -0
- package/x/timeline/index.js +4 -0
- package/x/timeline/index.js.map +1 -1
- package/x/timeline/parts/captions.d.ts +40 -0
- package/x/timeline/parts/captions.js +55 -0
- package/x/timeline/parts/captions.js.map +1 -0
- package/x/timeline/parts/item.d.ts +23 -5
- package/x/timeline/parts/item.js +1 -0
- package/x/timeline/parts/item.js.map +1 -1
- package/x/timeline/renderers/export/parts/cursor.d.ts +1 -1
- package/x/timeline/renderers/export/parts/cursor.js +2 -3
- package/x/timeline/renderers/export/parts/cursor.js.map +1 -1
- package/x/timeline/renderers/parts/handy.d.ts +1 -0
- package/x/timeline/renderers/parts/handy.js +27 -13
- package/x/timeline/renderers/parts/handy.js.map +1 -1
- package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js +2 -1
- package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js.map +1 -1
- package/x/timeline/renderers/parts/samplers/visual/parts/sample.js +16 -0
- package/x/timeline/renderers/parts/samplers/visual/parts/sample.js.map +1 -1
- package/x/timeline/sugar/helpers.d.ts +3 -0
- package/x/timeline/sugar/helpers.js +3 -0
- package/x/timeline/sugar/helpers.js.map +1 -1
- package/x/timeline/sugar/o.d.ts +2 -0
- package/x/timeline/sugar/o.js +39 -0
- package/x/timeline/sugar/o.js.map +1 -1
package/README.md
CHANGED
|
@@ -79,6 +79,134 @@ const timeline = timeline(
|
|
|
79
79
|
)
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
+
## 💬 Captions
|
|
83
|
+
|
|
84
|
+
Captions render transcript data as timed, styled text.
|
|
85
|
+
The transcript can come from anywhere, as long as it follows the structure.
|
|
86
|
+
|
|
87
|
+
```ts
|
|
88
|
+
const transcript = {
|
|
89
|
+
text: "Hello world. This is a caption.",
|
|
90
|
+
chunks: [
|
|
91
|
+
{text: "Hello", timestamp: [0, 0.4]},
|
|
92
|
+
{text: "world.", timestamp: [0.4, 1.2]},
|
|
93
|
+
{text: "This", timestamp: [1.3, 1.6]},
|
|
94
|
+
{text: "is", timestamp: [1.6, 1.8]},
|
|
95
|
+
{text: "a", timestamp: [1.8, 1.9]},
|
|
96
|
+
{text: "caption.", timestamp: [1.9, 2.6]},
|
|
97
|
+
],
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
const timeline = omni.timeline(o => {
|
|
101
|
+
const video = o.video(clip, {duration: 3000})
|
|
102
|
+
return o.captions(video, transcript)
|
|
103
|
+
})
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Use a caption preset to pick a built-in caption style:
|
|
107
|
+
|
|
108
|
+
```ts
|
|
109
|
+
const timeline = omni.timeline(o => {
|
|
110
|
+
const video = o.video(clip, {duration: 3000})
|
|
111
|
+
return o.captions.presets.default(video, transcript)
|
|
112
|
+
})
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Or define your own caption styles:
|
|
116
|
+
|
|
117
|
+
```ts
|
|
118
|
+
const timeline = omni.timeline(o => {
|
|
119
|
+
const video = o.video(clip, {duration: 3000})
|
|
120
|
+
return o.captions(video, transcript, {
|
|
121
|
+
styles: {
|
|
122
|
+
fontFamily: "Inter",
|
|
123
|
+
fontSize: 64,
|
|
124
|
+
fill: "#fff7d6",
|
|
125
|
+
stroke: {color: "#111111", width: 8},
|
|
126
|
+
align: "center",
|
|
127
|
+
},
|
|
128
|
+
})
|
|
129
|
+
})
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Use omnitool's built-in speech-to-text with the default model:
|
|
133
|
+
|
|
134
|
+
```ts
|
|
135
|
+
import {makeTranscriber, defaultTranscriberSpec} from "@omnimedia/omnitool"
|
|
136
|
+
|
|
137
|
+
// uses onnx-community/whisper-tiny_timestamped
|
|
138
|
+
const transcriber = await makeTranscriber({
|
|
139
|
+
driver,
|
|
140
|
+
spec: defaultTranscriberSpec(),
|
|
141
|
+
workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
|
|
142
|
+
onLoading: loading => console.log("loading", loading),
|
|
143
|
+
})
|
|
144
|
+
|
|
145
|
+
const transcript = await transcriber.transcribe({
|
|
146
|
+
source: file,
|
|
147
|
+
language: "english",
|
|
148
|
+
onReport: report => console.log("report", report),
|
|
149
|
+
onTranscription: text => console.log("transcribing", text),
|
|
150
|
+
})
|
|
151
|
+
|
|
152
|
+
const timeline = omni.timeline(o => {
|
|
153
|
+
const video = o.video(clip)
|
|
154
|
+
return o.captions(video, transcript)
|
|
155
|
+
})
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Load a custom speech-to-text model:
|
|
159
|
+
|
|
160
|
+
```ts
|
|
161
|
+
const transcriber = await makeTranscriber({
|
|
162
|
+
driver,
|
|
163
|
+
spec: {
|
|
164
|
+
model: "onnx-community/whisper-tiny_timestamped",
|
|
165
|
+
dtype: "q4",
|
|
166
|
+
device: "wasm",
|
|
167
|
+
chunkLength: 20,
|
|
168
|
+
strideLength: 3
|
|
169
|
+
},
|
|
170
|
+
workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
|
|
171
|
+
onLoading: loading => console.log("loading", loading),
|
|
172
|
+
})
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
> [!IMPORTANT]
|
|
176
|
+
> Use a Transformers.js-compatible speech-to-text model, for example `onnx-community/*_timestamped`.
|
|
177
|
+
> The model must support word-level timestamps because captions use `return_timestamps: "word"`.
|
|
178
|
+
> `device` and `dtype` are passed to Transformers.js and depend on your runtime/model.
|
|
179
|
+
> Browser usage commonly uses `"wasm"` or `"webgpu"`: choose `"webgpu"` for speed, or `"wasm"` for broader device support.
|
|
180
|
+
> `workerUrl` depends on where you host the worker bundle.
|
|
181
|
+
|
|
182
|
+
`o.captions(video, transcript, options)` creates captions for a video or audio item.
|
|
183
|
+
`o.captions` uses the `captionPresets.default` preset.
|
|
184
|
+
The generated caption id is stored on the source item as `captionId`.
|
|
185
|
+
Use `o.captions.presets` to choose from the available pre-styled captions.
|
|
186
|
+
Pass `styles` in options to override preset styles.
|
|
187
|
+
Transcript chunk timestamps are in seconds.
|
|
188
|
+
|
|
189
|
+
Update caption options after creation:
|
|
190
|
+
|
|
191
|
+
```ts
|
|
192
|
+
const caption = o.captions.make(transcript, {maxChars: 42})
|
|
193
|
+
const style = o.textStyle({fill: "#00ff00"})
|
|
194
|
+
o.set(caption.id, {
|
|
195
|
+
maxChars: 32,
|
|
196
|
+
styleId: style.id,
|
|
197
|
+
})
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Caption options:
|
|
201
|
+
`styles` - text styles to apply; these override the preset's styles.
|
|
202
|
+
`start` - transcript time where captions begin, in milliseconds.
|
|
203
|
+
`duration` - caption layer duration, in milliseconds.
|
|
204
|
+
`maxChars` - maximum characters in one generated caption line.
|
|
205
|
+
`maxDuration` - maximum duration of one generated caption line, in milliseconds.
|
|
206
|
+
`maxSilence` - maximum silence allowed inside one caption; longer pauses start a new caption, in milliseconds.
|
|
207
|
+
|
|
208
|
+
Import `captionPresets` to list the available caption looks.
|
|
209
|
+
|
|
82
210
|
## 🎛 Filters
|
|
83
211
|
|
|
84
212
|
Filter application:
|
|
@@ -353,9 +481,11 @@ Timeline items:
|
|
|
353
481
|
- 4 `Text`
|
|
354
482
|
- 5 `Gap`
|
|
355
483
|
- 6 `Spatial`
|
|
356
|
-
- 7 `
|
|
357
|
-
- 8 `
|
|
358
|
-
- 9 `
|
|
484
|
+
- 7 `Animation`
|
|
485
|
+
- 8 `Transition`
|
|
486
|
+
- 9 `TextStyle`
|
|
487
|
+
- 10 `Filter`
|
|
488
|
+
- 11 `Caption`
|
|
359
489
|
|
|
360
490
|
## 🗺️ Roadmap
|
|
361
491
|
- CLI commands:
|
package/package.json
CHANGED
|
@@ -1,10 +1,25 @@
|
|
|
1
1
|
|
|
2
2
|
import {Driver} from "../../driver/driver.js"
|
|
3
|
-
import {Datafile, Item, Omni} from "../../timeline/index.js"
|
|
3
|
+
import {Datafile, defaultTranscriberSpec, Item, makeTranscriber, Omni} from "../../timeline/index.js"
|
|
4
4
|
|
|
5
5
|
export async function TimelineSchemaTest(driver: Driver, file: File) {
|
|
6
|
+
const transcriber = await makeTranscriber({
|
|
7
|
+
driver,
|
|
8
|
+
spec: defaultTranscriberSpec(),
|
|
9
|
+
workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
|
|
10
|
+
onLoading: loading => console.log("transcriber loading", loading),
|
|
11
|
+
})
|
|
12
|
+
|
|
6
13
|
const omni = new Omni(driver)
|
|
7
14
|
const {videoA} = await omni.load({videoA: Datafile.make(file)})
|
|
15
|
+
|
|
16
|
+
const transcript = await transcriber.transcribe({
|
|
17
|
+
source: file,
|
|
18
|
+
language: "english",
|
|
19
|
+
onReport: report => console.log("transcriber report", report),
|
|
20
|
+
onTranscription: text => console.log("transcribing", text),
|
|
21
|
+
})
|
|
22
|
+
|
|
8
23
|
const timeline = omni.timeline(o => {
|
|
9
24
|
const text = o.text("content", {duration: 3000})
|
|
10
25
|
const fade = o.animate.opacity.make("easeIn", [
|
|
@@ -30,15 +45,28 @@ export async function TimelineSchemaTest(driver: Driver, file: File) {
|
|
|
30
45
|
[3000, o.transform({position: [320, 0], scale: [1.15, 1.15], rotation: 0})],
|
|
31
46
|
])
|
|
32
47
|
|
|
33
|
-
const video = o.video(videoA, {duration:
|
|
48
|
+
const video = o.video(videoA, {duration: 6000, start: 3000})
|
|
34
49
|
o.set<Item.Text>(text.id, {styleId: style.id, spatialId: textSpatial.id, animationIds: [fade.id, textMotion.id]})
|
|
35
50
|
o.set<Item.Video>(video.id, {spatialId: videoSpatial.id})
|
|
36
51
|
|
|
37
52
|
return o.sequence(
|
|
38
53
|
o.stack(
|
|
39
54
|
text,
|
|
40
|
-
video,
|
|
41
|
-
|
|
55
|
+
o.captions(video, transcript, {
|
|
56
|
+
maxChars: 34,
|
|
57
|
+
maxDuration: 2800,
|
|
58
|
+
maxSilence: 450,
|
|
59
|
+
styles: {
|
|
60
|
+
fontFamily: "Arial",
|
|
61
|
+
fontSize: 64,
|
|
62
|
+
fill: "#ffffff",
|
|
63
|
+
align: "center",
|
|
64
|
+
wordWrap: true,
|
|
65
|
+
wordWrapWidth: 1280,
|
|
66
|
+
stroke: {color: "#000000", width: 6},
|
|
67
|
+
},
|
|
68
|
+
}),
|
|
69
|
+
o.audio(videoA, {start: 3000})
|
|
42
70
|
),
|
|
43
71
|
o.gap(500),
|
|
44
72
|
o.video(videoA, {duration: 7000, start: 5000})
|
|
@@ -110,6 +110,9 @@ export class Compositor {
|
|
|
110
110
|
parent: Container,
|
|
111
111
|
) {
|
|
112
112
|
const sprite = this.#findOrCreate<Text>(layer)!
|
|
113
|
+
sprite.text = layer.content
|
|
114
|
+
if (layer.style)
|
|
115
|
+
sprite.style = layer.style
|
|
113
116
|
this.#applyTransform(sprite, layer.matrix)
|
|
114
117
|
this.#applyAlpha(sprite, layer.alpha)
|
|
115
118
|
this.#applyCrop(sprite, layer.crop)
|
|
@@ -36,12 +36,16 @@ export type TranscriberPipeOptions = {
|
|
|
36
36
|
|
|
37
37
|
export type SpeechTime = [start: number, end: number]
|
|
38
38
|
|
|
39
|
+
export type TranscriptWord = {
|
|
40
|
+
text: string
|
|
41
|
+
timestamp: SpeechTime
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export type TranscriptSegment = TranscriptWord
|
|
45
|
+
|
|
39
46
|
export type Transcription = {
|
|
40
47
|
text: string
|
|
41
|
-
chunks:
|
|
42
|
-
text: string
|
|
43
|
-
timestamp: SpeechTime
|
|
44
|
-
}[]
|
|
48
|
+
chunks: TranscriptWord[]
|
|
45
49
|
}
|
|
46
50
|
|
|
47
51
|
export type TranscriberSpec = {
|
package/s/timeline/index.ts
CHANGED
|
@@ -8,6 +8,10 @@ export * from "./parts/resource-pool.js"
|
|
|
8
8
|
export * from "./parts/resource.js"
|
|
9
9
|
export * from "./parts/filmstrip.js"
|
|
10
10
|
export * from "./parts/animations/registry.js"
|
|
11
|
+
export {captionPresets} from "./parts/captions.js"
|
|
12
|
+
export * from "../features/speech/transcribe/default-spec.js"
|
|
13
|
+
export * from "../features/speech/transcribe/transcriber.js"
|
|
14
|
+
export * from "../features/speech/transcribe/types.js"
|
|
11
15
|
|
|
12
16
|
export * from "./parts/waveform/waveform.js"
|
|
13
17
|
export * from "./parts/waveform/parts/types.js"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
|
|
2
|
+
import type {Item} from "./item.js"
|
|
3
|
+
import {TextStyleOptions} from "pixi.js"
|
|
4
|
+
import {TransformOptions, Vec2} from "../types.js"
|
|
5
|
+
import {Transcription, TranscriptSegment} from "../../features/speech/transcribe/types.js"
|
|
6
|
+
|
|
7
|
+
/**
 * Optional settings accepted when creating captions.
 * All time-based values are in milliseconds (transcript chunk timestamps
 * themselves are in seconds and are converted during segmentation).
 */
export type CaptionOptions = {
|
|
8
|
+
start?: number // transcript time where captions begin, in milliseconds
|
|
9
|
+
duration?: number // caption layer duration, in milliseconds
|
|
10
|
+
styles?: TextStyleOptions // text styles; documented as overriding the preset's styles
|
|
11
|
+
maxChars?: number // maximum characters in one generated caption line
|
|
12
|
+
maxDuration?: number // maximum duration of one generated caption line, in milliseconds
|
|
13
|
+
maxSilence?: number // longest pause (ms) allowed inside one caption; longer pauses start a new caption
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/**
 * Built-in caption looks, keyed by preset name.
 * Each preset bundles text `styles` (checked against pixi's TextStyleOptions)
 * and a `transform` (checked against TransformOptions).
 */
export const captionPresets = {
|
|
17
|
+
default: {
|
|
18
|
+
styles: {
|
|
19
|
+
fontFamily: "Arial",
|
|
20
|
+
fontSize: 56,
|
|
21
|
+
fill: "#ffffff",
|
|
22
|
+
align: "center",
|
|
23
|
+
wordWrap: true,
|
|
24
|
+
wordWrapWidth: 1440,
|
|
25
|
+
} satisfies TextStyleOptions,
|
|
26
|
+
transform: {
|
|
27
|
+
// NOTE(review): presumably canvas pixel coordinates tuned for a 1920x1080 frame — confirm
position: [240, 860] as Vec2
|
|
28
|
+
} satisfies TransformOptions,
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/** Shape of any single entry in `captionPresets`. */
export type CaptionPreset = (typeof captionPresets)[keyof typeof captionPresets]
|
|
33
|
+
/** Timeline items that captions can be created for. */
export type CaptionSourceItem = Item.Video | Item.Audio
|
|
34
|
+
/**
 * Caption builder: callable with a source item and transcript to produce a stack,
 * and exposing `make` to create a standalone caption item from a transcript.
 */
export type CaptionAction = {
|
|
35
|
+
(item: CaptionSourceItem, transcript: Transcription, options?: CaptionOptions): Item.Stack
|
|
36
|
+
make: (transcript: Transcription, options?: CaptionOptions) => Item.Caption
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/** A `CaptionAction` that additionally exposes one `CaptionAction` per preset name under `presets`. */
export type CaptionActions = CaptionAction & {
|
|
40
|
+
presets: {
|
|
41
|
+
[TName in keyof typeof captionPresets]: CaptionAction
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/** Fallback segmentation limits used when the caller does not supply a value. */
const CAPTION_DEFAULTS = {
	maxChars: 42,
	maxDuration: 3500,
	maxSilence: 750,
} satisfies CaptionOptions

/**
 * Groups transcript chunks into caption segments.
 *
 * Chunk timestamps are multiplied by 1000 (seconds to milliseconds), so the
 * returned segments carry millisecond timestamps. Chunks are merged in order
 * until adding the next one would exceed `maxChars`, stretch the segment
 * beyond `maxDuration`, or follow a pause longer than `maxSilence`; at that
 * point the current segment is emitted and a new one starts.
 *
 * @param transcript transcript whose `chunks` are segmented
 * @param options optional limits; any limit that is missing or `undefined`
 *   falls back to `CAPTION_DEFAULTS`
 * @returns caption segments in transcript order, timestamps in milliseconds
 */
export function segmentTranscript(transcript: Transcription, options?: CaptionOptions): TranscriptSegment[] {
	// Resolve each limit with `??` instead of object spread: a spread lets an
	// explicitly-undefined property (e.g. an optional field on a caption item
	// passed as options) overwrite the default, silently disabling that limit.
	const maxChars = options?.maxChars ?? CAPTION_DEFAULTS.maxChars
	const maxDuration = options?.maxDuration ?? CAPTION_DEFAULTS.maxDuration
	const maxSilence = options?.maxSilence ?? CAPTION_DEFAULTS.maxSilence

	const segments: TranscriptSegment[] = []
	let current: TranscriptSegment | null = null

	for (const {timestamp: [t0, t1], text: rawText} of transcript.chunks) {
		const [start, end] = [t0 * 1000, t1 * 1000]
		const text = rawText.trim()

		// skip chunks with unusable timing or no visible text
		if (!Number.isFinite(start) || !Number.isFinite(end) || !text) continue

		if (!current) {
			current = {text, timestamp: [start, end]}
			continue
		}

		const [currentStart, currentEnd]: [number, number] = current.timestamp
		const nextText = `${current.text} ${text}`.trim()
		const shouldBreak =
			nextText.length > maxChars ||
			end - currentStart > maxDuration ||
			start - currentEnd > maxSilence

		if (shouldBreak) {
			segments.push(current)
			current = {text, timestamp: [start, end]}
		}
		else {
			current = {text: nextText, timestamp: [currentStart, end]}
		}
	}

	// flush the segment still being accumulated
	if (current) segments.push(current)
	return segments
}
|
|
86
|
+
|
|
87
|
+
/**
 * Computes the end time of the last caption segment for a transcript.
 *
 * @param transcript transcript to segment
 * @param options segmentation options forwarded to `segmentTranscript`
 * @returns the largest segment end timestamp (same unit as the segments
 *   produced by `segmentTranscript`), or 0 when there are no segments
 */
export function captionDuration(transcript: Transcription, options?: CaptionOptions) {
	// Fold instead of `Math.max(0, ...ends)`: spreading one argument per segment
	// can exceed the engine's argument limit on very long transcripts.
	return segmentTranscript(transcript, options)
		.reduce((latest, segment) => Math.max(latest, segment.timestamp[1]), 0)
}
|
package/s/timeline/parts/item.ts
CHANGED
|
@@ -3,8 +3,9 @@ import {TextStyleOptions} from "pixi.js"
|
|
|
3
3
|
|
|
4
4
|
import {Id, Hash} from "./basics.js"
|
|
5
5
|
import {Ms} from "../../units/ms.js"
|
|
6
|
-
import type {FilterParams, FilterType} from "./filters.js"
|
|
7
6
|
import {Transform, VisualAnimations} from "../types.js"
|
|
7
|
+
import type {FilterParams, FilterType} from "./filters.js"
|
|
8
|
+
import type {Transcription} from "../../features/speech/transcribe/types.js"
|
|
8
9
|
|
|
9
10
|
export type Crop = [top: number, right: number, bottom: number, left: number]
|
|
10
11
|
|
|
@@ -19,7 +20,8 @@ export enum Kind {
|
|
|
19
20
|
Animation,
|
|
20
21
|
Transition,
|
|
21
22
|
TextStyle,
|
|
22
|
-
Filter
|
|
23
|
+
Filter,
|
|
24
|
+
Caption
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
export enum Effect {
|
|
@@ -87,6 +89,7 @@ export namespace Item {
|
|
|
87
89
|
spatialId?: Id
|
|
88
90
|
animationIds?: Id[]
|
|
89
91
|
filterIds?: Id[]
|
|
92
|
+
captionId?: Id
|
|
90
93
|
}
|
|
91
94
|
|
|
92
95
|
export type Audio = {
|
|
@@ -96,6 +99,7 @@ export namespace Item {
|
|
|
96
99
|
start: number
|
|
97
100
|
duration: number
|
|
98
101
|
gain?: number
|
|
102
|
+
captionId?: Id
|
|
99
103
|
}
|
|
100
104
|
|
|
101
105
|
export type Text = {
|
|
@@ -109,6 +113,22 @@ export namespace Item {
|
|
|
109
113
|
filterIds?: Id[]
|
|
110
114
|
}
|
|
111
115
|
|
|
116
|
+
|
|
117
|
+
export type Caption = {
|
|
118
|
+
id: Id
|
|
119
|
+
kind: Kind.Caption
|
|
120
|
+
transcript: Transcription
|
|
121
|
+
start: number
|
|
122
|
+
duration: number
|
|
123
|
+
maxChars?: number
|
|
124
|
+
maxDuration?: number
|
|
125
|
+
maxSilence?: number
|
|
126
|
+
spatialId?: Id
|
|
127
|
+
animationIds?: Id[]
|
|
128
|
+
styleId?: Id
|
|
129
|
+
filterIds?: Id[]
|
|
130
|
+
}
|
|
131
|
+
|
|
112
132
|
export type Transition = {
|
|
113
133
|
id: Id
|
|
114
134
|
kind: Kind.Transition
|
|
@@ -122,6 +142,7 @@ export namespace Item {
|
|
|
122
142
|
| Video
|
|
123
143
|
| Audio
|
|
124
144
|
| Text
|
|
145
|
+
| Caption
|
|
125
146
|
| Gap
|
|
126
147
|
| Transition
|
|
127
148
|
| Spatial
|
|
@@ -133,8 +154,8 @@ export namespace Item {
|
|
|
133
154
|
|
|
134
155
|
export type ContainerItem = Item.Sequence | Item.Stack
|
|
135
156
|
export type NonContainerItem = Exclude<Item.Any, ContainerItem>
|
|
136
|
-
export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text
|
|
137
|
-
export type VisualAnimatableItem = Item.Video | Item.Text
|
|
157
|
+
export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text | Item.Caption
|
|
158
|
+
export type VisualAnimatableItem = Item.Video | Item.Text | Item.Caption
|
|
138
159
|
|
|
139
160
|
export type PlayableItem = Item.Any & {
|
|
140
161
|
start: Ms
|
|
@@ -25,7 +25,7 @@ abstract class BaseVisualSampler {
|
|
|
25
25
|
protected timeline: TimelineFile
|
|
26
26
|
) {
|
|
27
27
|
this.#sampler = createVisualSampler(this.resolveMedia, (item, time) => {
|
|
28
|
-
const targetUs = toUs(time)
|
|
28
|
+
const targetUs = toUs(ms(item.start + time))
|
|
29
29
|
let cursor = this.#videoCursors.get(item.id)
|
|
30
30
|
|
|
31
31
|
if (!cursor) {
|
|
@@ -148,8 +148,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
148
148
|
return this.sample(timecode)
|
|
149
149
|
}
|
|
150
150
|
|
|
151
|
-
protected createCursor(source: DecoderSource,
|
|
152
|
-
const startUs = 0
|
|
151
|
+
protected createCursor(source: DecoderSource, startUs: number, endUs: number): VideoFrameCursor {
|
|
153
152
|
const windowUs = 1_000_000
|
|
154
153
|
const prefetchThreshold = windowUs * 0.5
|
|
155
154
|
|
|
@@ -16,6 +16,7 @@ type WalkAtCallbacks = {
|
|
|
16
16
|
stack: (x: Item.Stack, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
17
17
|
video: (x: Item.Video, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
18
18
|
text: (x: Item.Text, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
19
|
+
caption: (x: Item.Caption, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
19
20
|
audio: (x: Item.Audio, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
20
21
|
}
|
|
21
22
|
|
|
@@ -24,6 +25,7 @@ type WalkCallbacks = {
|
|
|
24
25
|
stack?: (x: Item.Stack, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
25
26
|
video?: (x: Item.Video, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
26
27
|
text?: (x: Item.Text, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
28
|
+
caption?: (x: Item.Caption, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
27
29
|
audio?: (x: Item.Audio) => void
|
|
28
30
|
}
|
|
29
31
|
|
|
@@ -52,6 +54,7 @@ export function itemsAt(p: Props): At[] {
|
|
|
52
54
|
stack: () => { },
|
|
53
55
|
video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
54
56
|
text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
57
|
+
caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
55
58
|
audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
|
|
56
59
|
})
|
|
57
60
|
|
|
@@ -72,6 +75,7 @@ export function itemsFrom(p: FromProps): At[] {
|
|
|
72
75
|
stack: () => { },
|
|
73
76
|
video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
74
77
|
text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
78
|
+
caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
75
79
|
audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
|
|
76
80
|
})
|
|
77
81
|
|
|
@@ -176,6 +180,10 @@ export function walk(
|
|
|
176
180
|
callbacks.text?.(item, currentMatrix, ancestors)
|
|
177
181
|
break
|
|
178
182
|
|
|
183
|
+
case Kind.Caption:
|
|
184
|
+
callbacks.caption?.(item, currentMatrix, ancestors)
|
|
185
|
+
break
|
|
186
|
+
|
|
179
187
|
case Kind.Audio:
|
|
180
188
|
callbacks.audio?.(item)
|
|
181
189
|
break
|
|
@@ -211,11 +219,12 @@ function walkAt(
|
|
|
211
219
|
|
|
212
220
|
if (!child)
|
|
213
221
|
continue
|
|
214
|
-
|
|
222
|
+
|
|
223
|
+
const duration = computeItemDurationFromMap(child.id, items)
|
|
224
|
+
if (duration <= 0)
|
|
215
225
|
continue
|
|
216
|
-
}
|
|
217
226
|
|
|
218
|
-
if (time >= offset && time < offset +
|
|
227
|
+
if (time >= offset && time < offset + duration) {
|
|
219
228
|
const localTime = ms(time - offset)
|
|
220
229
|
walkAt(
|
|
221
230
|
childId,
|
|
@@ -227,7 +236,7 @@ function walkAt(
|
|
|
227
236
|
break
|
|
228
237
|
}
|
|
229
238
|
|
|
230
|
-
offset = ms(offset +
|
|
239
|
+
offset = ms(offset + duration)
|
|
231
240
|
}
|
|
232
241
|
|
|
233
242
|
break
|
|
@@ -241,6 +250,10 @@ function walkAt(
|
|
|
241
250
|
callbacks.text(item, time, ancestors)
|
|
242
251
|
break
|
|
243
252
|
|
|
253
|
+
case Kind.Caption:
|
|
254
|
+
callbacks.caption(item, time, ancestors)
|
|
255
|
+
break
|
|
256
|
+
|
|
244
257
|
case Kind.Audio:
|
|
245
258
|
callbacks.audio(item, time, ancestors)
|
|
246
259
|
break
|
|
@@ -275,11 +288,12 @@ function walkFrom(
|
|
|
275
288
|
|
|
276
289
|
if (!child)
|
|
277
290
|
continue
|
|
278
|
-
|
|
291
|
+
|
|
292
|
+
const duration = computeItemDurationFromMap(child.id, items)
|
|
293
|
+
if (duration <= 0)
|
|
279
294
|
continue
|
|
280
|
-
}
|
|
281
295
|
|
|
282
|
-
const end = ms(offset +
|
|
296
|
+
const end = ms(offset + duration)
|
|
283
297
|
if (from >= end) {
|
|
284
298
|
offset = end
|
|
285
299
|
continue
|
|
@@ -308,6 +322,10 @@ function walkFrom(
|
|
|
308
322
|
callbacks.text(item, from, ancestors)
|
|
309
323
|
break
|
|
310
324
|
|
|
325
|
+
case Kind.Caption:
|
|
326
|
+
callbacks.caption(item, from, ancestors)
|
|
327
|
+
break
|
|
328
|
+
|
|
311
329
|
case Kind.Audio:
|
|
312
330
|
callbacks.audio(item, from, ancestors)
|
|
313
331
|
break
|
|
@@ -318,14 +336,24 @@ export function computeItemDuration(
|
|
|
318
336
|
id: number,
|
|
319
337
|
timeline: TimelineFile
|
|
320
338
|
): Ms {
|
|
321
|
-
|
|
339
|
+
return computeItemDurationFromMap(
|
|
340
|
+
id,
|
|
341
|
+
new Map(timeline.items.map(item => [item.id, item]))
|
|
342
|
+
)
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
function computeItemDurationFromMap(
|
|
346
|
+
id: number,
|
|
347
|
+
items: Map<Id, Item.Any>
|
|
348
|
+
): Ms {
|
|
349
|
+
const item = items.get(id)
|
|
322
350
|
|
|
323
351
|
if (!item) return ms(0)
|
|
324
352
|
|
|
325
353
|
switch (item.kind) {
|
|
326
354
|
case Kind.Sequence: {
|
|
327
355
|
const children = item.childrenIds
|
|
328
|
-
.map(childId =>
|
|
356
|
+
.map(childId => items.get(childId))
|
|
329
357
|
.filter(Boolean) as Item.Any[]
|
|
330
358
|
|
|
331
359
|
let total = ms(0)
|
|
@@ -338,8 +366,8 @@ export function computeItemDuration(
|
|
|
338
366
|
const next = children[i + 1]
|
|
339
367
|
|
|
340
368
|
if (prev && next && prev.kind !== Kind.Transition && next.kind !== Kind.Transition) {
|
|
341
|
-
const prevDur =
|
|
342
|
-
const nextDur =
|
|
369
|
+
const prevDur = computeItemDurationFromMap(prev.id, items)
|
|
370
|
+
const nextDur = computeItemDurationFromMap(next.id, items)
|
|
343
371
|
const overlap = Math.max(0, Math.min(child.duration, prevDur, nextDur))
|
|
344
372
|
|
|
345
373
|
total = ms(total - overlap)
|
|
@@ -347,7 +375,7 @@ export function computeItemDuration(
|
|
|
347
375
|
continue
|
|
348
376
|
}
|
|
349
377
|
|
|
350
|
-
total = ms(total +
|
|
378
|
+
total = ms(total + computeItemDurationFromMap(child.id, items))
|
|
351
379
|
}
|
|
352
380
|
|
|
353
381
|
return total
|
|
@@ -357,7 +385,7 @@ export function computeItemDuration(
|
|
|
357
385
|
let longest = ms(0)
|
|
358
386
|
|
|
359
387
|
for (const childId of item.childrenIds) {
|
|
360
|
-
const duration =
|
|
388
|
+
const duration = computeItemDurationFromMap(childId, items)
|
|
361
389
|
if (duration > longest) {
|
|
362
390
|
longest = duration
|
|
363
391
|
}
|
|
@@ -8,7 +8,8 @@ export type VideoSampler = (item: Item.Video, time: Ms) => Promise<VideoFrame |
|
|
|
8
8
|
export function createDefaultVideoSampler(sink: VideoSink): VideoSampler {
|
|
9
9
|
return async (item, time) => {
|
|
10
10
|
const s = await sink.getSink(item.mediaHash)
|
|
11
|
-
const
|
|
11
|
+
const mediaTime = item.start + time
|
|
12
|
+
const sample = await s?.getSample(mediaTime / 1000)
|
|
12
13
|
const frame = sample?.toVideoFrame()
|
|
13
14
|
sample?.close()
|
|
14
15
|
return frame ?? undefined
|
|
@@ -3,6 +3,7 @@ import {SampleContext} from "./types.js"
|
|
|
3
3
|
import {sampleSequence} from "./sequence.js"
|
|
4
4
|
import {Ms} from "../../../../../../units/ms.js"
|
|
5
5
|
import {Item, Kind} from "../../../../../parts/item.js"
|
|
6
|
+
import {segmentTranscript} from "../../../../../parts/captions.js"
|
|
6
7
|
import {FilterSpec, Layer} from "../../../../../../driver/fns/schematic.js"
|
|
7
8
|
import {AncestorAt, computeOpacity, computeWorldMatrix} from "../../../handy.js"
|
|
8
9
|
|
|
@@ -58,6 +59,24 @@ export async function sampleVisual(
|
|
|
58
59
|
return [{id: item.id, kind: "text", content: item.content, style, matrix, alpha, crop, filters}]
|
|
59
60
|
}
|
|
60
61
|
|
|
62
|
+
case Kind.Caption: {
|
|
63
|
+
if (time < 0 || time >= item.duration) return []
|
|
64
|
+
|
|
65
|
+
const transcriptTime = item.start + time
|
|
66
|
+
const segment = segmentTranscript(item.transcript, item).find(segment => {
|
|
67
|
+
const [start, end] = segment.timestamp
|
|
68
|
+
return transcriptTime >= start && transcriptTime < end
|
|
69
|
+
})
|
|
70
|
+
if (!segment)
|
|
71
|
+
return []
|
|
72
|
+
|
|
73
|
+
const style = item.styleId
|
|
74
|
+
? (ctx.items.get(item.styleId) as Item.TextStyle)?.style
|
|
75
|
+
: undefined
|
|
76
|
+
|
|
77
|
+
return [{id: item.id, kind: "text", content: segment.text, style, matrix, alpha, crop, filters}]
|
|
78
|
+
}
|
|
79
|
+
|
|
61
80
|
case Kind.Gap: {
|
|
62
81
|
return [{id: item.id, kind: "gap"}]
|
|
63
82
|
}
|
|
@@ -6,6 +6,8 @@ import {Media} from "../parts/media.js"
|
|
|
6
6
|
import {TimelineFile} from "../parts/basics.js"
|
|
7
7
|
import {FilterAction} from "../parts/filters.js"
|
|
8
8
|
import {filters, FilterParams, FilterType} from "../parts/filters.js"
|
|
9
|
+
import {CaptionOptions, CaptionSourceItem} from "../parts/captions.js"
|
|
10
|
+
import {Transcription} from "../../features/speech/transcribe/types.js"
|
|
9
11
|
import {AnimationPreset, PresetOptions} from "../parts/animations/types.js"
|
|
10
12
|
import {Crop, FilterableItem, Item, VisualAnimatableItem} from "../parts/item.js"
|
|
11
13
|
import {animationPresets, visualAnimations} from "../parts/animations/registry.js"
|
|
@@ -75,6 +77,14 @@ export function text(
|
|
|
75
77
|
return o => o.text(content, options)
|
|
76
78
|
}
|
|
77
79
|
|
|
80
|
+
/**
 * Deferred-builder form of `o.captions`.
 * Returns a builder that, when given `o`, resolves the source `item` builder
 * with that same `o` and forwards the result, the transcript, and the options
 * to `o.captions`.
 */
export function captions(
|
|
81
|
+
item: Build<CaptionSourceItem>,
|
|
82
|
+
transcript: Transcription,
|
|
83
|
+
options?: CaptionOptions
|
|
84
|
+
): Build<Item.Stack> {
|
|
85
|
+
return o => o.captions(item(o), transcript, options)
|
|
86
|
+
}
|
|
87
|
+
|
|
78
88
|
export function gap(duration: number): Build<Item.Gap> {
|
|
79
89
|
return o => o.gap(duration)
|
|
80
90
|
}
|