@omnimedia/omnitool 1.1.0-90 → 1.1.0-93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +132 -3
- package/package.json +1 -1
- package/s/demo/routines/timeline-setup.ts +32 -4
- package/s/driver/parts/compositor.ts +3 -0
- package/s/features/speech/transcribe/transcriber.ts +1 -1
- package/s/features/speech/transcribe/types.ts +8 -4
- package/s/timeline/index.ts +4 -0
- package/s/timeline/parts/captions.ts +90 -0
- package/s/timeline/parts/item.ts +23 -4
- package/s/timeline/renderers/export/parts/cursor.ts +20 -3
- package/s/timeline/renderers/parts/handy.ts +41 -13
- package/s/timeline/renderers/parts/samplers/visual/parts/defaults.ts +2 -1
- package/s/timeline/renderers/parts/samplers/visual/parts/sample.ts +19 -0
- package/s/timeline/sugar/helpers.ts +10 -0
- package/s/timeline/sugar/o.ts +55 -0
- package/x/demo/demo.bundle.min.js +103 -103
- package/x/demo/demo.bundle.min.js.map +4 -4
- package/x/demo/routines/timeline-setup.js +28 -3
- package/x/demo/routines/timeline-setup.js.map +1 -1
- package/x/driver/parts/compositor.js +3 -0
- package/x/driver/parts/compositor.js.map +1 -1
- package/x/features/speech/transcribe/transcriber.d.ts +1 -1
- package/x/features/speech/transcribe/transcriber.js +1 -1
- package/x/features/speech/transcribe/transcriber.js.map +1 -1
- package/x/features/speech/transcribe/types.d.ts +6 -4
- package/x/index.html +2 -2
- package/x/tests.bundle.min.js +107 -107
- package/x/tests.bundle.min.js.map +4 -4
- package/x/tests.html +1 -1
- package/x/timeline/index.d.ts +4 -0
- package/x/timeline/index.js +4 -0
- package/x/timeline/index.js.map +1 -1
- package/x/timeline/parts/captions.d.ts +40 -0
- package/x/timeline/parts/captions.js +55 -0
- package/x/timeline/parts/captions.js.map +1 -0
- package/x/timeline/parts/item.d.ts +21 -5
- package/x/timeline/parts/item.js +1 -0
- package/x/timeline/parts/item.js.map +1 -1
- package/x/timeline/renderers/export/parts/cursor.d.ts +1 -1
- package/x/timeline/renderers/export/parts/cursor.js +18 -3
- package/x/timeline/renderers/export/parts/cursor.js.map +1 -1
- package/x/timeline/renderers/parts/handy.d.ts +1 -0
- package/x/timeline/renderers/parts/handy.js +27 -13
- package/x/timeline/renderers/parts/handy.js.map +1 -1
- package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js +2 -1
- package/x/timeline/renderers/parts/samplers/visual/parts/defaults.js.map +1 -1
- package/x/timeline/renderers/parts/samplers/visual/parts/sample.js +16 -0
- package/x/timeline/renderers/parts/samplers/visual/parts/sample.js.map +1 -1
- package/x/timeline/sugar/helpers.d.ts +3 -0
- package/x/timeline/sugar/helpers.js +3 -0
- package/x/timeline/sugar/helpers.js.map +1 -1
- package/x/timeline/sugar/o.d.ts +2 -0
- package/x/timeline/sugar/o.js +38 -0
- package/x/timeline/sugar/o.js.map +1 -1
package/README.md
CHANGED
|
@@ -79,6 +79,133 @@ const timeline = timeline(
|
|
|
79
79
|
)
|
|
80
80
|
```
|
|
81
81
|
|
|
82
|
+
## 💬 Captions
|
|
83
|
+
|
|
84
|
+
Captions render transcript data as timed, styled text.
|
|
85
|
+
The transcript can come from anywhere, as long as it follows the structure shown below.
|
|
86
|
+
|
|
87
|
+
```ts
|
|
88
|
+
const transcript = {
|
|
89
|
+
text: "Hello world. This is a caption.",
|
|
90
|
+
chunks: [
|
|
91
|
+
{text: "Hello", timestamp: [0, 0.4]},
|
|
92
|
+
{text: "world.", timestamp: [0.4, 1.2]},
|
|
93
|
+
{text: "This", timestamp: [1.3, 1.6]},
|
|
94
|
+
{text: "is", timestamp: [1.6, 1.8]},
|
|
95
|
+
{text: "a", timestamp: [1.8, 1.9]},
|
|
96
|
+
{text: "caption.", timestamp: [1.9, 2.6]},
|
|
97
|
+
],
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
const timeline = omni.timeline(o => {
|
|
101
|
+
const video = o.video(clip, {duration: 3000})
|
|
102
|
+
return o.captions(video, transcript)
|
|
103
|
+
})
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Use a caption preset to pick a built-in caption style:
|
|
107
|
+
|
|
108
|
+
```ts
|
|
109
|
+
const timeline = omni.timeline(o => {
|
|
110
|
+
const video = o.video(clip, {duration: 3000})
|
|
111
|
+
return o.captions.presets.default(video, transcript)
|
|
112
|
+
})
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Or define your own caption styles:
|
|
116
|
+
|
|
117
|
+
```ts
|
|
118
|
+
const timeline = omni.timeline(o => {
|
|
119
|
+
const video = o.video(clip, {duration: 3000})
|
|
120
|
+
return o.captions(video, transcript, {
|
|
121
|
+
styles: {
|
|
122
|
+
fontFamily: "Inter",
|
|
123
|
+
fontSize: 64,
|
|
124
|
+
fill: "#fff7d6",
|
|
125
|
+
stroke: {color: "#111111", width: 8},
|
|
126
|
+
align: "center",
|
|
127
|
+
},
|
|
128
|
+
})
|
|
129
|
+
})
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Use omnitool's built-in speech-to-text with the default model:
|
|
133
|
+
|
|
134
|
+
```ts
|
|
135
|
+
import {makeTranscriber, defaultTranscriberSpec} from "@omnimedia/omnitool"
|
|
136
|
+
|
|
137
|
+
// uses onnx-community/whisper-tiny_timestamped
|
|
138
|
+
const transcriber = await makeTranscriber({
|
|
139
|
+
driver,
|
|
140
|
+
spec: defaultTranscriberSpec(),
|
|
141
|
+
workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
|
|
142
|
+
onLoading: loading => console.log("loading", loading),
|
|
143
|
+
})
|
|
144
|
+
|
|
145
|
+
const transcript = await transcriber.transcribe({
|
|
146
|
+
source: file,
|
|
147
|
+
language: "english",
|
|
148
|
+
onReport: report => console.log("report", report),
|
|
149
|
+
onTranscription: text => console.log("transcribing", text),
|
|
150
|
+
})
|
|
151
|
+
|
|
152
|
+
const timeline = omni.timeline(o => {
|
|
153
|
+
const video = o.video(clip)
|
|
154
|
+
return o.captions(video, transcript)
|
|
155
|
+
})
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Load a custom speech-to-text model:
|
|
159
|
+
|
|
160
|
+
```ts
|
|
161
|
+
const transcriber = await makeTranscriber({
|
|
162
|
+
driver,
|
|
163
|
+
spec: {
|
|
164
|
+
model: "onnx-community/whisper-tiny_timestamped",
|
|
165
|
+
dtype: "q4",
|
|
166
|
+
device: "wasm",
|
|
167
|
+
chunkLength: 20,
|
|
168
|
+
strideLength: 3
|
|
169
|
+
},
|
|
170
|
+
workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
|
|
171
|
+
onLoading: loading => console.log("loading", loading),
|
|
172
|
+
})
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
> [!IMPORTANT]
|
|
176
|
+
> Use a Transformers.js-compatible speech-to-text model, for example `onnx-community/*_timestamped`.
|
|
177
|
+
> The model must support word-level timestamps because captions use `return_timestamps: "word"`.
|
|
178
|
+
> `device` and `dtype` are passed to Transformers.js and depend on your runtime/model.
|
|
179
|
+
> Browsers commonly use `"wasm"` or `"webgpu"`: choose `"webgpu"` for speed, or `"wasm"` for broader device support.
|
|
180
|
+
> `workerUrl` depends on where you host the worker bundle.
|
|
181
|
+
|
|
182
|
+
`o.captions(video, transcript, options)` creates captions for a video or audio item.
|
|
183
|
+
`o.captions` uses the `captionPresets.default` preset.
|
|
184
|
+
Use `o.captions.presets` to choose from the available pre-styled captions.
|
|
185
|
+
Pass `styles` in the options to override the preset's styles.
|
|
186
|
+
Transcript chunk timestamps are in seconds.
|
|
187
|
+
|
|
188
|
+
Update caption options after creation:
|
|
189
|
+
|
|
190
|
+
```ts
|
|
191
|
+
const caption = o.captions.make(transcript, {maxChars: 42})
|
|
192
|
+
const style = o.textStyle({fill: "#00ff00"})
|
|
193
|
+
o.set(caption.id, {
|
|
194
|
+
maxChars: 32,
|
|
195
|
+
styleId: style.id,
|
|
196
|
+
})
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Caption options:
|
|
200
|
+
`styles` - text styles for the captions; they override the preset's styles.
|
|
201
|
+
`start` - transcript time where captions begin, in milliseconds.
|
|
202
|
+
`duration` - caption layer duration, in milliseconds.
|
|
203
|
+
`maxChars` - maximum characters in one generated caption line.
|
|
204
|
+
`maxDuration` - maximum duration of one generated caption line, in milliseconds.
|
|
205
|
+
`maxSilence` - maximum silence allowed inside one caption; longer pauses start a new caption, in milliseconds.
|
|
206
|
+
|
|
207
|
+
Import `captionPresets` to list the available caption styles.
|
|
208
|
+
|
|
82
209
|
## 🎛 Filters
|
|
83
210
|
|
|
84
211
|
Filter application:
|
|
@@ -353,9 +480,11 @@ Timeline items:
|
|
|
353
480
|
- 4 `Text`
|
|
354
481
|
- 5 `Gap`
|
|
355
482
|
- 6 `Spatial`
|
|
356
|
-
- 7 `
|
|
357
|
-
- 8 `
|
|
358
|
-
- 9 `
|
|
483
|
+
- 7 `Animation`
|
|
484
|
+
- 8 `Transition`
|
|
485
|
+
- 9 `TextStyle`
|
|
486
|
+
- 10 `Filter`
|
|
487
|
+
- 11 `Caption`
|
|
359
488
|
|
|
360
489
|
## 🗺️ Roadmap
|
|
361
490
|
- CLI commands:
|
package/package.json
CHANGED
|
@@ -1,10 +1,25 @@
|
|
|
1
1
|
|
|
2
2
|
import {Driver} from "../../driver/driver.js"
|
|
3
|
-
import {Datafile, Item, Omni} from "../../timeline/index.js"
|
|
3
|
+
import {Datafile, defaultTranscriberSpec, Item, makeTranscriber, Omni} from "../../timeline/index.js"
|
|
4
4
|
|
|
5
5
|
export async function TimelineSchemaTest(driver: Driver, file: File) {
|
|
6
|
+
const transcriber = await makeTranscriber({
|
|
7
|
+
driver,
|
|
8
|
+
spec: defaultTranscriberSpec(),
|
|
9
|
+
workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
|
|
10
|
+
onLoading: loading => console.log("transcriber loading", loading),
|
|
11
|
+
})
|
|
12
|
+
|
|
6
13
|
const omni = new Omni(driver)
|
|
7
14
|
const {videoA} = await omni.load({videoA: Datafile.make(file)})
|
|
15
|
+
|
|
16
|
+
const transcript = await transcriber.transcribe({
|
|
17
|
+
source: file,
|
|
18
|
+
language: "english",
|
|
19
|
+
onReport: report => console.log("transcriber report", report),
|
|
20
|
+
onTranscription: text => console.log("transcribing", text),
|
|
21
|
+
})
|
|
22
|
+
|
|
8
23
|
const timeline = omni.timeline(o => {
|
|
9
24
|
const text = o.text("content", {duration: 3000})
|
|
10
25
|
const fade = o.animate.opacity.make("easeIn", [
|
|
@@ -30,15 +45,28 @@ export async function TimelineSchemaTest(driver: Driver, file: File) {
|
|
|
30
45
|
[3000, o.transform({position: [320, 0], scale: [1.15, 1.15], rotation: 0})],
|
|
31
46
|
])
|
|
32
47
|
|
|
33
|
-
const video = o.video(videoA, {duration:
|
|
48
|
+
const video = o.video(videoA, {duration: 6000, start: 3000})
|
|
34
49
|
o.set<Item.Text>(text.id, {styleId: style.id, spatialId: textSpatial.id, animationIds: [fade.id, textMotion.id]})
|
|
35
50
|
o.set<Item.Video>(video.id, {spatialId: videoSpatial.id})
|
|
36
51
|
|
|
37
52
|
return o.sequence(
|
|
38
53
|
o.stack(
|
|
39
54
|
text,
|
|
40
|
-
video,
|
|
41
|
-
|
|
55
|
+
o.captions(video, transcript, {
|
|
56
|
+
maxChars: 34,
|
|
57
|
+
maxDuration: 2800,
|
|
58
|
+
maxSilence: 450,
|
|
59
|
+
styles: {
|
|
60
|
+
fontFamily: "Arial",
|
|
61
|
+
fontSize: 64,
|
|
62
|
+
fill: "#ffffff",
|
|
63
|
+
align: "center",
|
|
64
|
+
wordWrap: true,
|
|
65
|
+
wordWrapWidth: 1280,
|
|
66
|
+
stroke: {color: "#000000", width: 6},
|
|
67
|
+
},
|
|
68
|
+
}),
|
|
69
|
+
o.audio(videoA, {start: 3000})
|
|
42
70
|
),
|
|
43
71
|
o.gap(500),
|
|
44
72
|
o.video(videoA, {duration: 7000, start: 5000})
|
|
@@ -110,6 +110,9 @@ export class Compositor {
|
|
|
110
110
|
parent: Container,
|
|
111
111
|
) {
|
|
112
112
|
const sprite = this.#findOrCreate<Text>(layer)!
|
|
113
|
+
sprite.text = layer.content
|
|
114
|
+
if (layer.style)
|
|
115
|
+
sprite.style = layer.style
|
|
113
116
|
this.#applyTransform(sprite, layer.matrix)
|
|
114
117
|
this.#applyAlpha(sprite, layer.alpha)
|
|
115
118
|
this.#applyCrop(sprite, layer.crop)
|
|
@@ -36,12 +36,16 @@ export type TranscriberPipeOptions = {
|
|
|
36
36
|
|
|
37
37
|
export type SpeechTime = [start: number, end: number]
|
|
38
38
|
|
|
39
|
+
export type TranscriptWord = {
|
|
40
|
+
text: string
|
|
41
|
+
timestamp: SpeechTime
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export type TranscriptSegment = TranscriptWord
|
|
45
|
+
|
|
39
46
|
export type Transcription = {
|
|
40
47
|
text: string
|
|
41
|
-
chunks:
|
|
42
|
-
text: string
|
|
43
|
-
timestamp: SpeechTime
|
|
44
|
-
}[]
|
|
48
|
+
chunks: TranscriptWord[]
|
|
45
49
|
}
|
|
46
50
|
|
|
47
51
|
export type TranscriberSpec = {
|
package/s/timeline/index.ts
CHANGED
|
@@ -8,6 +8,10 @@ export * from "./parts/resource-pool.js"
|
|
|
8
8
|
export * from "./parts/resource.js"
|
|
9
9
|
export * from "./parts/filmstrip.js"
|
|
10
10
|
export * from "./parts/animations/registry.js"
|
|
11
|
+
export {captionPresets} from "./parts/captions.js"
|
|
12
|
+
export * from "../features/speech/transcribe/default-spec.js"
|
|
13
|
+
export * from "../features/speech/transcribe/transcriber.js"
|
|
14
|
+
export * from "../features/speech/transcribe/types.js"
|
|
11
15
|
|
|
12
16
|
export * from "./parts/waveform/waveform.js"
|
|
13
17
|
export * from "./parts/waveform/parts/types.js"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
|
|
2
|
+
import type {Item} from "./item.js"
|
|
3
|
+
import {TextStyleOptions} from "pixi.js"
|
|
4
|
+
import {TransformOptions, Vec2} from "../types.js"
|
|
5
|
+
import {Transcription, TranscriptSegment} from "../../features/speech/transcribe/types.js"
|
|
6
|
+
|
|
7
|
+
export type CaptionOptions = {
|
|
8
|
+
start?: number
|
|
9
|
+
duration?: number
|
|
10
|
+
styles?: TextStyleOptions
|
|
11
|
+
maxChars?: number
|
|
12
|
+
maxDuration?: number
|
|
13
|
+
maxSilence?: number
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
export const captionPresets = {
|
|
17
|
+
default: {
|
|
18
|
+
styles: {
|
|
19
|
+
fontFamily: "Arial",
|
|
20
|
+
fontSize: 56,
|
|
21
|
+
fill: "#ffffff",
|
|
22
|
+
align: "center",
|
|
23
|
+
wordWrap: true,
|
|
24
|
+
wordWrapWidth: 1440,
|
|
25
|
+
} satisfies TextStyleOptions,
|
|
26
|
+
transform: {
|
|
27
|
+
position: [240, 860] as Vec2
|
|
28
|
+
} satisfies TransformOptions,
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export type CaptionPreset = (typeof captionPresets)[keyof typeof captionPresets]
|
|
33
|
+
export type CaptionSourceItem = Item.Video | Item.Audio
|
|
34
|
+
export type CaptionAction = {
|
|
35
|
+
(item: CaptionSourceItem, transcript: Transcription, options?: CaptionOptions): Item.Stack
|
|
36
|
+
make: (transcript: Transcription, options?: CaptionOptions) => Item.Caption
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
export type CaptionActions = CaptionAction & {
|
|
40
|
+
presets: {
|
|
41
|
+
[TName in keyof typeof captionPresets]: CaptionAction
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
const CAPTION_DEFAULTS = {
|
|
46
|
+
maxChars: 42,
|
|
47
|
+
maxDuration: 3500,
|
|
48
|
+
maxSilence: 750,
|
|
49
|
+
} satisfies CaptionOptions
|
|
50
|
+
|
|
51
|
+
/**
 * Groups transcript chunks into caption-sized segments.
 *
 * Chunk timestamps are scaled by 1000 before use, so the returned segments
 * carry timestamps on the same scale as `maxDuration` and `maxSilence`.
 * Chunks are merged in order; a new segment starts when adding the next chunk
 * would
 *  - make the joined text longer than `maxChars`,
 *  - make the segment span more than `maxDuration`, or
 *  - follow a pause longer than `maxSilence`.
 *
 * Chunks with empty text or unusable timestamps are skipped.
 *
 * @param transcript transcript whose `chunks` are segmented
 * @param options optional limits; missing or `undefined` limits fall back to `CAPTION_DEFAULTS`
 * @returns segments in transcript order, each with its joined text and `[start, end]` timestamp
 */
export function segmentTranscript(transcript: Transcription, options?: CaptionOptions): TranscriptSegment[] {
	// Resolve each limit with `??` instead of an object spread: an option that is
	// present but `undefined` would otherwise overwrite the default, and a
	// comparison against `undefined` is always false, silently disabling the limit.
	const maxChars = options?.maxChars ?? CAPTION_DEFAULTS.maxChars
	const maxDuration = options?.maxDuration ?? CAPTION_DEFAULTS.maxDuration
	const maxSilence = options?.maxSilence ?? CAPTION_DEFAULTS.maxSilence

	const segments: TranscriptSegment[] = []
	let current: TranscriptSegment | null = null

	for (const {timestamp: [t0, t1], text: rawText} of transcript.chunks) {
		const text = rawText.trim()

		// Only scale genuine numbers. `null * 1000` is 0, which would pass the
		// finiteness check and could pull a merged segment's end back to 0.
		// NOTE(review): the declared type is [number, number]; this guards data
		// that does not match it (e.g. a missing end time) — confirm against the
		// transcriber's actual output.
		const start = typeof t0 === "number" ? t0 * 1000 : NaN
		const end = typeof t1 === "number" ? t1 * 1000 : NaN

		if (!Number.isFinite(start) || !Number.isFinite(end) || !text) continue

		if (!current) {
			current = {text, timestamp: [start, end]}
			continue
		}

		const [currentStart, currentEnd]: [number, number] = current.timestamp
		const nextText = `${current.text} ${text}`.trim()
		const shouldBreak =
			nextText.length > maxChars ||
			end - currentStart > maxDuration ||
			start - currentEnd > maxSilence

		if (shouldBreak) {
			// Close the running segment and start a new one with this chunk.
			segments.push(current)
			current = {text, timestamp: [start, end]}
		}
		else {
			// Extend the running segment to cover this chunk.
			current = {text: nextText, timestamp: [currentStart, end]}
		}
	}

	// Flush the trailing segment, if any chunk was accepted.
	if (current) segments.push(current)
	return segments
}
|
|
86
|
+
|
|
87
|
+
/**
 * Returns the latest segment end time produced by `segmentTranscript`
 * for the given transcript and options, or 0 when there are no segments.
 *
 * @param transcript transcript to measure
 * @param options optional segmentation limits, forwarded to `segmentTranscript`
 * @returns the largest segment end timestamp, never below 0
 */
export function captionDuration(transcript: Transcription, options?: CaptionOptions): number {
	const segments = segmentTranscript(transcript, options)

	// Fold instead of `Math.max(0, ...ends)`: spreading one argument per segment
	// can exceed the engine's argument limit on very long transcripts.
	return segments.reduce(
		(latest, segment) => Math.max(latest, segment.timestamp[1]),
		0,
	)
}
|
package/s/timeline/parts/item.ts
CHANGED
|
@@ -3,8 +3,9 @@ import {TextStyleOptions} from "pixi.js"
|
|
|
3
3
|
|
|
4
4
|
import {Id, Hash} from "./basics.js"
|
|
5
5
|
import {Ms} from "../../units/ms.js"
|
|
6
|
-
import type {FilterParams, FilterType} from "./filters.js"
|
|
7
6
|
import {Transform, VisualAnimations} from "../types.js"
|
|
7
|
+
import type {FilterParams, FilterType} from "./filters.js"
|
|
8
|
+
import type {Transcription} from "../../features/speech/transcribe/types.js"
|
|
8
9
|
|
|
9
10
|
export type Crop = [top: number, right: number, bottom: number, left: number]
|
|
10
11
|
|
|
@@ -19,7 +20,8 @@ export enum Kind {
|
|
|
19
20
|
Animation,
|
|
20
21
|
Transition,
|
|
21
22
|
TextStyle,
|
|
22
|
-
Filter
|
|
23
|
+
Filter,
|
|
24
|
+
Caption
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
export enum Effect {
|
|
@@ -109,6 +111,22 @@ export namespace Item {
|
|
|
109
111
|
filterIds?: Id[]
|
|
110
112
|
}
|
|
111
113
|
|
|
114
|
+
|
|
115
|
+
export type Caption = {
|
|
116
|
+
id: Id
|
|
117
|
+
kind: Kind.Caption
|
|
118
|
+
transcript: Transcription
|
|
119
|
+
start: number
|
|
120
|
+
duration: number
|
|
121
|
+
maxChars?: number
|
|
122
|
+
maxDuration?: number
|
|
123
|
+
maxSilence?: number
|
|
124
|
+
spatialId?: Id
|
|
125
|
+
animationIds?: Id[]
|
|
126
|
+
styleId?: Id
|
|
127
|
+
filterIds?: Id[]
|
|
128
|
+
}
|
|
129
|
+
|
|
112
130
|
export type Transition = {
|
|
113
131
|
id: Id
|
|
114
132
|
kind: Kind.Transition
|
|
@@ -122,6 +140,7 @@ export namespace Item {
|
|
|
122
140
|
| Video
|
|
123
141
|
| Audio
|
|
124
142
|
| Text
|
|
143
|
+
| Caption
|
|
125
144
|
| Gap
|
|
126
145
|
| Transition
|
|
127
146
|
| Spatial
|
|
@@ -133,8 +152,8 @@ export namespace Item {
|
|
|
133
152
|
|
|
134
153
|
export type ContainerItem = Item.Sequence | Item.Stack
|
|
135
154
|
export type NonContainerItem = Exclude<Item.Any, ContainerItem>
|
|
136
|
-
export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text
|
|
137
|
-
export type VisualAnimatableItem = Item.Video | Item.Text
|
|
155
|
+
export type FilterableItem = Item.Sequence | Item.Stack | Item.Video | Item.Text | Item.Caption
|
|
156
|
+
export type VisualAnimatableItem = Item.Video | Item.Text | Item.Caption
|
|
138
157
|
|
|
139
158
|
export type PlayableItem = Item.Any & {
|
|
140
159
|
start: Ms
|
|
@@ -25,7 +25,7 @@ abstract class BaseVisualSampler {
|
|
|
25
25
|
protected timeline: TimelineFile
|
|
26
26
|
) {
|
|
27
27
|
this.#sampler = createVisualSampler(this.resolveMedia, (item, time) => {
|
|
28
|
-
const targetUs = toUs(time)
|
|
28
|
+
const targetUs = toUs(ms(item.start + time))
|
|
29
29
|
let cursor = this.#videoCursors.get(item.id)
|
|
30
30
|
|
|
31
31
|
if (!cursor) {
|
|
@@ -148,8 +148,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
148
148
|
return this.sample(timecode)
|
|
149
149
|
}
|
|
150
150
|
|
|
151
|
-
protected createCursor(source: DecoderSource,
|
|
152
|
-
const startUs = 0
|
|
151
|
+
protected createCursor(source: DecoderSource, startUs: number, endUs: number): VideoFrameCursor {
|
|
153
152
|
const windowUs = 1_000_000
|
|
154
153
|
const prefetchThreshold = windowUs * 0.5
|
|
155
154
|
|
|
@@ -159,6 +158,9 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
159
158
|
let input: Input | null = null
|
|
160
159
|
let sink: VideoSampleSink | null = null
|
|
161
160
|
let prefetchPromise: Promise<{frames: VideoFrame[], windowStart: number, windowEnd: number}> | null = null
|
|
161
|
+
let activeFetches = 0
|
|
162
|
+
let idle: Promise<void> = Promise.resolve()
|
|
163
|
+
let resolveIdle: (() => void) | null = null
|
|
162
164
|
let canceled = false
|
|
163
165
|
|
|
164
166
|
const clear = () => {
|
|
@@ -167,6 +169,18 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
167
169
|
frames = []
|
|
168
170
|
}
|
|
169
171
|
|
|
172
|
+
const startFetch = () => {
|
|
173
|
+
if (activeFetches++ === 0)
|
|
174
|
+
idle = new Promise<void>(resolve => resolveIdle = resolve)
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
const endFetch = () => {
|
|
178
|
+
if (--activeFetches === 0) {
|
|
179
|
+
resolveIdle?.()
|
|
180
|
+
resolveIdle = null
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
|
|
170
184
|
const getSink = async () => {
|
|
171
185
|
if (sink) return sink
|
|
172
186
|
|
|
@@ -184,6 +198,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
184
198
|
}
|
|
185
199
|
|
|
186
200
|
const fetchFrames = async (targetUs: number) => {
|
|
201
|
+
startFetch()
|
|
187
202
|
const wEnd = Math.min(endUs, targetUs + 1)
|
|
188
203
|
const wStart = Math.max(startUs, wEnd - windowUs)
|
|
189
204
|
const newFrames: VideoFrame[] = []
|
|
@@ -196,6 +211,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
196
211
|
}
|
|
197
212
|
}
|
|
198
213
|
|
|
214
|
+
endFetch()
|
|
199
215
|
return {frames: newFrames, windowStart: wStart, windowEnd: wEnd}
|
|
200
216
|
}
|
|
201
217
|
|
|
@@ -262,6 +278,7 @@ export class ReverseCursorVisualSampler extends BaseVisualSampler {
|
|
|
262
278
|
if (prefetched)
|
|
263
279
|
for (const f of prefetched.frames) f.close()
|
|
264
280
|
|
|
281
|
+
await idle
|
|
265
282
|
clear()
|
|
266
283
|
input?.dispose()
|
|
267
284
|
input = null
|
|
@@ -16,6 +16,7 @@ type WalkAtCallbacks = {
|
|
|
16
16
|
stack: (x: Item.Stack, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
17
17
|
video: (x: Item.Video, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
18
18
|
text: (x: Item.Text, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
19
|
+
caption: (x: Item.Caption, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
19
20
|
audio: (x: Item.Audio, localTime: Ms, ancestors: AncestorAt[]) => void
|
|
20
21
|
}
|
|
21
22
|
|
|
@@ -24,6 +25,7 @@ type WalkCallbacks = {
|
|
|
24
25
|
stack?: (x: Item.Stack, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
25
26
|
video?: (x: Item.Video, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
26
27
|
text?: (x: Item.Text, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
28
|
+
caption?: (x: Item.Caption, matrix: Mat6, ancestors: AncestorAt[]) => void
|
|
27
29
|
audio?: (x: Item.Audio) => void
|
|
28
30
|
}
|
|
29
31
|
|
|
@@ -52,6 +54,7 @@ export function itemsAt(p: Props): At[] {
|
|
|
52
54
|
stack: () => { },
|
|
53
55
|
video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
54
56
|
text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
57
|
+
caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
55
58
|
audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
|
|
56
59
|
})
|
|
57
60
|
|
|
@@ -72,6 +75,7 @@ export function itemsFrom(p: FromProps): At[] {
|
|
|
72
75
|
stack: () => { },
|
|
73
76
|
video: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
74
77
|
text: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
78
|
+
caption: (item, localTime, ancestors) => results.push({ item, localTime, ancestors }),
|
|
75
79
|
audio: (item, localTime, ancestors) => results.push({ item, localTime, ancestors })
|
|
76
80
|
})
|
|
77
81
|
|
|
@@ -176,6 +180,10 @@ export function walk(
|
|
|
176
180
|
callbacks.text?.(item, currentMatrix, ancestors)
|
|
177
181
|
break
|
|
178
182
|
|
|
183
|
+
case Kind.Caption:
|
|
184
|
+
callbacks.caption?.(item, currentMatrix, ancestors)
|
|
185
|
+
break
|
|
186
|
+
|
|
179
187
|
case Kind.Audio:
|
|
180
188
|
callbacks.audio?.(item)
|
|
181
189
|
break
|
|
@@ -211,11 +219,12 @@ function walkAt(
|
|
|
211
219
|
|
|
212
220
|
if (!child)
|
|
213
221
|
continue
|
|
214
|
-
|
|
222
|
+
|
|
223
|
+
const duration = computeItemDurationFromMap(child.id, items)
|
|
224
|
+
if (duration <= 0)
|
|
215
225
|
continue
|
|
216
|
-
}
|
|
217
226
|
|
|
218
|
-
if (time >= offset && time < offset +
|
|
227
|
+
if (time >= offset && time < offset + duration) {
|
|
219
228
|
const localTime = ms(time - offset)
|
|
220
229
|
walkAt(
|
|
221
230
|
childId,
|
|
@@ -227,7 +236,7 @@ function walkAt(
|
|
|
227
236
|
break
|
|
228
237
|
}
|
|
229
238
|
|
|
230
|
-
offset = ms(offset +
|
|
239
|
+
offset = ms(offset + duration)
|
|
231
240
|
}
|
|
232
241
|
|
|
233
242
|
break
|
|
@@ -241,6 +250,10 @@ function walkAt(
|
|
|
241
250
|
callbacks.text(item, time, ancestors)
|
|
242
251
|
break
|
|
243
252
|
|
|
253
|
+
case Kind.Caption:
|
|
254
|
+
callbacks.caption(item, time, ancestors)
|
|
255
|
+
break
|
|
256
|
+
|
|
244
257
|
case Kind.Audio:
|
|
245
258
|
callbacks.audio(item, time, ancestors)
|
|
246
259
|
break
|
|
@@ -275,11 +288,12 @@ function walkFrom(
|
|
|
275
288
|
|
|
276
289
|
if (!child)
|
|
277
290
|
continue
|
|
278
|
-
|
|
291
|
+
|
|
292
|
+
const duration = computeItemDurationFromMap(child.id, items)
|
|
293
|
+
if (duration <= 0)
|
|
279
294
|
continue
|
|
280
|
-
}
|
|
281
295
|
|
|
282
|
-
const end = ms(offset +
|
|
296
|
+
const end = ms(offset + duration)
|
|
283
297
|
if (from >= end) {
|
|
284
298
|
offset = end
|
|
285
299
|
continue
|
|
@@ -308,6 +322,10 @@ function walkFrom(
|
|
|
308
322
|
callbacks.text(item, from, ancestors)
|
|
309
323
|
break
|
|
310
324
|
|
|
325
|
+
case Kind.Caption:
|
|
326
|
+
callbacks.caption(item, from, ancestors)
|
|
327
|
+
break
|
|
328
|
+
|
|
311
329
|
case Kind.Audio:
|
|
312
330
|
callbacks.audio(item, from, ancestors)
|
|
313
331
|
break
|
|
@@ -318,14 +336,24 @@ export function computeItemDuration(
|
|
|
318
336
|
id: number,
|
|
319
337
|
timeline: TimelineFile
|
|
320
338
|
): Ms {
|
|
321
|
-
|
|
339
|
+
return computeItemDurationFromMap(
|
|
340
|
+
id,
|
|
341
|
+
new Map(timeline.items.map(item => [item.id, item]))
|
|
342
|
+
)
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
function computeItemDurationFromMap(
|
|
346
|
+
id: number,
|
|
347
|
+
items: Map<Id, Item.Any>
|
|
348
|
+
): Ms {
|
|
349
|
+
const item = items.get(id)
|
|
322
350
|
|
|
323
351
|
if (!item) return ms(0)
|
|
324
352
|
|
|
325
353
|
switch (item.kind) {
|
|
326
354
|
case Kind.Sequence: {
|
|
327
355
|
const children = item.childrenIds
|
|
328
|
-
.map(childId =>
|
|
356
|
+
.map(childId => items.get(childId))
|
|
329
357
|
.filter(Boolean) as Item.Any[]
|
|
330
358
|
|
|
331
359
|
let total = ms(0)
|
|
@@ -338,8 +366,8 @@ export function computeItemDuration(
|
|
|
338
366
|
const next = children[i + 1]
|
|
339
367
|
|
|
340
368
|
if (prev && next && prev.kind !== Kind.Transition && next.kind !== Kind.Transition) {
|
|
341
|
-
const prevDur =
|
|
342
|
-
const nextDur =
|
|
369
|
+
const prevDur = computeItemDurationFromMap(prev.id, items)
|
|
370
|
+
const nextDur = computeItemDurationFromMap(next.id, items)
|
|
343
371
|
const overlap = Math.max(0, Math.min(child.duration, prevDur, nextDur))
|
|
344
372
|
|
|
345
373
|
total = ms(total - overlap)
|
|
@@ -347,7 +375,7 @@ export function computeItemDuration(
|
|
|
347
375
|
continue
|
|
348
376
|
}
|
|
349
377
|
|
|
350
|
-
total = ms(total +
|
|
378
|
+
total = ms(total + computeItemDurationFromMap(child.id, items))
|
|
351
379
|
}
|
|
352
380
|
|
|
353
381
|
return total
|
|
@@ -357,7 +385,7 @@ export function computeItemDuration(
|
|
|
357
385
|
let longest = ms(0)
|
|
358
386
|
|
|
359
387
|
for (const childId of item.childrenIds) {
|
|
360
|
-
const duration =
|
|
388
|
+
const duration = computeItemDurationFromMap(childId, items)
|
|
361
389
|
if (duration > longest) {
|
|
362
390
|
longest = duration
|
|
363
391
|
}
|
|
@@ -8,7 +8,8 @@ export type VideoSampler = (item: Item.Video, time: Ms) => Promise<VideoFrame |
|
|
|
8
8
|
export function createDefaultVideoSampler(sink: VideoSink): VideoSampler {
|
|
9
9
|
return async (item, time) => {
|
|
10
10
|
const s = await sink.getSink(item.mediaHash)
|
|
11
|
-
const
|
|
11
|
+
const mediaTime = item.start + time
|
|
12
|
+
const sample = await s?.getSample(mediaTime / 1000)
|
|
12
13
|
const frame = sample?.toVideoFrame()
|
|
13
14
|
sample?.close()
|
|
14
15
|
return frame ?? undefined
|
|
@@ -3,6 +3,7 @@ import {SampleContext} from "./types.js"
|
|
|
3
3
|
import {sampleSequence} from "./sequence.js"
|
|
4
4
|
import {Ms} from "../../../../../../units/ms.js"
|
|
5
5
|
import {Item, Kind} from "../../../../../parts/item.js"
|
|
6
|
+
import {segmentTranscript} from "../../../../../parts/captions.js"
|
|
6
7
|
import {FilterSpec, Layer} from "../../../../../../driver/fns/schematic.js"
|
|
7
8
|
import {AncestorAt, computeOpacity, computeWorldMatrix} from "../../../handy.js"
|
|
8
9
|
|
|
@@ -58,6 +59,24 @@ export async function sampleVisual(
|
|
|
58
59
|
return [{id: item.id, kind: "text", content: item.content, style, matrix, alpha, crop, filters}]
|
|
59
60
|
}
|
|
60
61
|
|
|
62
|
+
case Kind.Caption: {
|
|
63
|
+
if (time < 0 || time >= item.duration) return []
|
|
64
|
+
|
|
65
|
+
const transcriptTime = item.start + time
|
|
66
|
+
const segment = segmentTranscript(item.transcript, item).find(segment => {
|
|
67
|
+
const [start, end] = segment.timestamp
|
|
68
|
+
return transcriptTime >= start && transcriptTime < end
|
|
69
|
+
})
|
|
70
|
+
if (!segment)
|
|
71
|
+
return []
|
|
72
|
+
|
|
73
|
+
const style = item.styleId
|
|
74
|
+
? (ctx.items.get(item.styleId) as Item.TextStyle)?.style
|
|
75
|
+
: undefined
|
|
76
|
+
|
|
77
|
+
return [{id: item.id, kind: "text", content: segment.text, style, matrix, alpha, crop, filters}]
|
|
78
|
+
}
|
|
79
|
+
|
|
61
80
|
case Kind.Gap: {
|
|
62
81
|
return [{id: item.id, kind: "gap"}]
|
|
63
82
|
}
|