@omnimedia/omnitool 1.1.0-3 → 1.1.0-5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +11 -9
- package/s/context.ts +1 -1
- package/s/demo/demo.bundle.ts +6 -2
- package/s/demo/routines/filmstrip-test.ts +2 -2
- package/s/demo/routines/transcriber-test.ts +34 -0
- package/s/demo/routines/transitions-test.ts +43 -0
- package/s/driver/fns/host.ts +7 -6
- package/s/driver/fns/schematic.ts +1 -1
- package/s/driver/fns/work.ts +116 -119
- package/s/driver/utils/load-decoder-source.ts +3 -4
- package/s/features/speech/transcribe/default-spec.ts +11 -0
- package/s/features/speech/transcribe/parts/load-pipe.ts +19 -0
- package/s/features/speech/transcribe/parts/prep-audio.ts +23 -0
- package/s/features/speech/transcribe/parts/transcribe.ts +70 -0
- package/s/features/speech/transcribe/transcriber.ts +46 -0
- package/s/features/speech/transcribe/types.ts +82 -0
- package/s/features/speech/transcribe/worker.bundle.ts +40 -0
- package/s/features/transition/parts/fragment.ts +24 -0
- package/s/features/transition/parts/types.ts +94 -0
- package/s/features/transition/parts/uniforms.ts +29 -0
- package/s/features/transition/parts/vertex.ts +31 -0
- package/s/features/transition/transition.ts +60 -0
- package/s/timeline/utils/checksum.ts +2 -1
- package/s/tools/common/loader.ts +26 -0
- package/s/tools/common/transformer-pipeline.ts +26 -0
- package/s/tools/speech-recognition/common/model.ts +26 -0
- package/s/tools/speech-recognition/whisper/fns/host.ts +25 -0
- package/s/tools/speech-recognition/whisper/fns/schematic.ts +23 -0
- package/s/tools/speech-recognition/whisper/fns/work.ts +91 -0
- package/s/tools/speech-recognition/whisper/parts/types.ts +38 -0
- package/s/tools/speech-recognition/whisper/parts/worker.bundle.ts +7 -0
- package/s/tools/speech-recognition/whisper/tool.ts +70 -0
- package/x/context.js +1 -1
- package/x/context.js.map +1 -1
- package/x/demo/demo.bundle.js +6 -2
- package/x/demo/demo.bundle.js.map +1 -1
- package/x/demo/demo.bundle.min.js +6 -6
- package/x/demo/demo.bundle.min.js.map +4 -4
- package/x/demo/routines/filmstrip-test.d.ts +1 -1
- package/x/demo/routines/filmstrip-test.js +2 -2
- package/x/demo/routines/filmstrip-test.js.map +1 -1
- package/x/demo/routines/transcriber-test.d.ts +4 -0
- package/x/demo/routines/transcriber-test.js +33 -0
- package/x/demo/routines/transcriber-test.js.map +1 -0
- package/x/demo/routines/transitions-test.d.ts +5 -0
- package/x/demo/routines/transitions-test.js +35 -0
- package/x/demo/routines/transitions-test.js.map +1 -0
- package/x/driver/driver.worker.bundle.min.js +80 -80
- package/x/driver/driver.worker.bundle.min.js.map +4 -4
- package/x/driver/fns/host.js +3 -3
- package/x/driver/fns/host.js.map +1 -1
- package/x/driver/fns/schematic.d.ts +1 -1
- package/x/driver/fns/work.js +8 -8
- package/x/driver/fns/work.js.map +1 -1
- package/x/driver/utils/load-decoder-source.d.ts +2 -1
- package/x/driver/utils/load-decoder-source.js +2 -3
- package/x/driver/utils/load-decoder-source.js.map +1 -1
- package/x/features/speech/transcribe/default-spec.d.ts +2 -0
- package/x/features/speech/transcribe/default-spec.js +8 -0
- package/x/features/speech/transcribe/default-spec.js.map +1 -0
- package/x/features/speech/transcribe/parts/load-pipe.d.ts +2 -0
- package/x/features/speech/transcribe/parts/load-pipe.js +13 -0
- package/x/features/speech/transcribe/parts/load-pipe.js.map +1 -0
- package/x/features/speech/transcribe/parts/prep-audio.d.ts +5 -0
- package/x/features/speech/transcribe/parts/prep-audio.js +21 -0
- package/x/features/speech/transcribe/parts/prep-audio.js.map +1 -0
- package/x/features/speech/transcribe/parts/transcribe.d.ts +5 -0
- package/x/features/speech/transcribe/parts/transcribe.js +56 -0
- package/x/features/speech/transcribe/parts/transcribe.js.map +1 -0
- package/x/features/speech/transcribe/transcriber.d.ts +5 -0
- package/x/features/speech/transcribe/transcriber.js +33 -0
- package/x/features/speech/transcribe/transcriber.js.map +1 -0
- package/x/features/speech/transcribe/types.d.ts +66 -0
- package/x/features/speech/transcribe/types.js +2 -0
- package/x/features/speech/transcribe/types.js.map +1 -0
- package/x/features/speech/transcribe/worker.bundle.d.ts +1 -0
- package/x/features/speech/transcribe/worker.bundle.js +33 -0
- package/x/features/speech/transcribe/worker.bundle.js.map +1 -0
- package/x/features/speech/transcribe/worker.bundle.min.js +2916 -0
- package/x/features/speech/transcribe/worker.bundle.min.js.map +7 -0
- package/x/features/transition/parts/fragment.d.ts +1 -0
- package/x/features/transition/parts/fragment.js +25 -0
- package/x/features/transition/parts/fragment.js.map +1 -0
- package/x/features/transition/parts/types.d.ts +23 -0
- package/x/features/transition/parts/types.js +2 -0
- package/x/features/transition/parts/types.js.map +1 -0
- package/x/features/transition/parts/uniforms.d.ts +31 -0
- package/x/features/transition/parts/uniforms.js +27 -0
- package/x/features/transition/parts/uniforms.js.map +1 -0
- package/x/features/transition/parts/vertex.d.ts +1 -0
- package/x/features/transition/parts/vertex.js +32 -0
- package/x/features/transition/parts/vertex.js.map +1 -0
- package/x/features/transition/transition.d.ts +5 -0
- package/x/features/transition/transition.js +50 -0
- package/x/features/transition/transition.js.map +1 -0
- package/x/index.html +2 -2
- package/x/timeline/utils/checksum.js +2 -1
- package/x/timeline/utils/checksum.js.map +1 -1
- package/x/tools/common/loader.d.ts +19 -0
- package/x/tools/common/loader.js +18 -0
- package/x/tools/common/loader.js.map +1 -0
- package/x/tools/common/transformer-pipeline.d.ts +8 -0
- package/x/tools/common/transformer-pipeline.js +24 -0
- package/x/tools/common/transformer-pipeline.js.map +1 -0
- package/x/tools/speech-recognition/common/model.d.ts +14 -0
- package/x/tools/speech-recognition/common/model.js +16 -0
- package/x/tools/speech-recognition/common/model.js.map +1 -0
- package/x/tools/speech-recognition/whisper/fns/host.d.ts +13 -0
- package/x/tools/speech-recognition/whisper/fns/host.js +19 -0
- package/x/tools/speech-recognition/whisper/fns/host.js.map +1 -0
- package/x/tools/speech-recognition/whisper/fns/schematic.d.ts +19 -0
- package/x/tools/speech-recognition/whisper/fns/schematic.js +2 -0
- package/x/tools/speech-recognition/whisper/fns/schematic.js.map +1 -0
- package/x/tools/speech-recognition/whisper/fns/work.d.ts +12 -0
- package/x/tools/speech-recognition/whisper/fns/work.js +74 -0
- package/x/tools/speech-recognition/whisper/fns/work.js.map +1 -0
- package/x/tools/speech-recognition/whisper/parts/types.d.ts +31 -0
- package/x/tools/speech-recognition/whisper/parts/types.js +2 -0
- package/x/tools/speech-recognition/whisper/parts/types.js.map +1 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.d.ts +1 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.js +4 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.js.map +1 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.min.js +8 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.min.js.map +7 -0
- package/x/tools/speech-recognition/whisper/tool.d.ts +12 -0
- package/x/tools/speech-recognition/whisper/tool.js +63 -0
- package/x/tools/speech-recognition/whisper/tool.js.map +1 -0
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
 	"name": "@omnimedia/omnitool",
-	"version": "1.1.0-3",
+	"version": "1.1.0-5",
 	"description": "open source video processing tools",
 	"license": "MIT",
 	"author": "Przemysław Gałęzki",
@@ -23,20 +23,22 @@
 		"test-debug": "node inspect x/tests.test.js"
 	},
 	"devDependencies": {
-		"@e280/science": "^0.0.
-		"@e280/scute": "^0.0.0-
-		"@types/node": "^24.0
+		"@e280/science": "^0.0.6",
+		"@e280/scute": "^0.0.0-7",
+		"@types/node": "^24.2.0",
 		"http-server": "^14.1.1",
 		"npm-run-all": "^4.1.5",
-		"typescript": "^5.
+		"typescript": "^5.9.2"
 	},
 	"dependencies": {
 		"@benev/slate": "^0.3.9",
-		"@e280/comrade": "^0.0.0-
-		"@e280/renraku": "^0.5.0-
-		"@e280/stz": "^0.0.0-
+		"@e280/comrade": "^0.0.0-23",
+		"@e280/renraku": "^0.5.0-29",
+		"@e280/stz": "^0.0.0-34",
+		"@huggingface/transformers": "^3.7.1",
 		"comrade": "^0.0.3",
-		"
+		"gl-transitions": "^1.43.0",
+		"mediabunny": "^1.4.4",
 		"mp4-muxer": "^5.2.1",
 		"pixi.js": "^8.10.1",
 		"wavesurfer.js": "^7.10.0",
package/s/context.ts
CHANGED
@@ -1,6 +1,6 @@
 import {Driver} from "./driver/driver.js"
 
-const workerUrl = new URL("../driver/driver.worker.bundle.js", import.meta.url)
+const workerUrl = new URL("../driver/driver.worker.bundle.min.js", import.meta.url)
 
 export const context = {
 	driver: Driver.setup({workerUrl})
package/s/demo/demo.bundle.ts
CHANGED
@@ -2,6 +2,7 @@
 import {context} from "../context.js"
 import {waveformTest} from "./routines/waveform-test.js"
 import {filmstripTest} from "./routines/filmstrip-test.js"
+import {transcriberTest} from "./routines/transcriber-test.js"
 import {setupTranscodeTest} from "./routines/transcode-test.js"
 
 const driver = await context.driver
@@ -14,6 +15,7 @@ fetchButton?.addEventListener("click", startDemoFetch)
 importButton?.addEventListener("click", startDemoImport)
 
 waveformTest()
+const transcriber = await transcriberTest(driver)
 
 // hello world test
 {
@@ -26,9 +28,11 @@ waveformTest()
 async function startDemoImport()
 {
 	const [fileHandle] = await window.showOpenFilePicker()
-	const
-
+	const file = await fileHandle.getFile()
+	const transcode = setupTranscodeTest(driver, file)
+	await filmstripTest(file)
 	run(transcode, fileHandle.name)
+	await transcriber.transcribe(file)
 }
 
 async function startDemoFetch()

package/s/demo/routines/filmstrip-test.ts
CHANGED
@@ -1,6 +1,6 @@
 import {Filmstrip} from "../../timeline/parts/filmstrip.js"
 
-export async function filmstripTest(fileHandle: FileSystemFileHandle) {
+export async function filmstripTest(file: File) {
 	const rangeSlider = document.querySelector(".range") as HTMLInputElement
 	const rangeView = document.querySelector(".range-view")!
 	const rangeSizeSlider = document.querySelector(".range-size")! as HTMLInputElement
@@ -10,7 +10,7 @@ export async function filmstripTest(fileHandle: FileSystemFileHandle) {
 	const FPS_10 = 1000/10 / 1000
 	let rangeSize = 0.5
 	const filmstrip = await Filmstrip.init(
-		fileHandle,
+		file,
 	{
 		onChange(tiles) {
 			// Sort by time (optional, for clean ordering)

package/s/demo/routines/transcriber-test.ts
ADDED
@@ -0,0 +1,34 @@
+import {Driver} from "../../driver/driver.js"
+import {makeTranscriber} from "../../features/speech/transcribe/transcriber.js"
+
+export async function transcriberTest(driver: Driver) {
+	const transcriber = await makeTranscriber({
+		driver,
+		spec: {
+			model: "onnx-community/whisper-tiny_timestamped",
+			device: "webgpu",
+			strideLength: 5,
+			chunkLength: 30,
+			dtype: "fp32"
+		},
+		workerUrl: new URL("/features/speech/transcribe/worker.bundle.min.js", import.meta.url),
+		onLoading({progress, total}) {
+			console.log(progress, total, "total")
+		},
+	})
+	return {
+		transcribe: async (file: File) => {
+			const result = await transcriber.transcribe({
+				source: file,
+				language: "english",
+				onReport(report) {
+					console.log("report", report)
+				},
+				onTranscription(transcription) {
+					console.log("transcript", transcription)
+				}
+			})
+			console.log(result, "transcript result")
+		}
+	}
+}

package/s/demo/routines/transitions-test.ts
ADDED
@@ -0,0 +1,43 @@
+import {Application, Sprite} from "pixi.js"
+
+import {Driver} from "../../driver/driver.js"
+import {DecoderSource} from "../../driver/fns/schematic.js"
+import {makeTransition} from "../../features/transition/transition.js"
+
+export async function setupTransitionsTest(driver: Driver, source: DecoderSource) {
+	const app = new Application()
+	await app.init({width: 300, height: 300, preference: "webgl"})
+	const sprite = new Sprite({width: 300, height: 300})
+
+	app.stage.addChild(sprite)
+
+	document.body.appendChild(app.canvas)
+	const transition = makeTransition({name: "circle", renderer: app.renderer})
+
+	async function run() {
+		const readables = driver.decode({
+			source,
+			async onFrame(frame) {
+				const texture = transition.render({
+					from: frame,
+					to: frame,
+					progress: 0.7,
+					width: app.canvas.width,
+					height: app.canvas.height
+				})
+				sprite.texture = texture
+				return frame
+			}
+		})
+
+		await driver.encode({
+			readables,
+			config: {
+				audio: {codec: "opus", bitrate: 128000},
+				video: {codec: "vp9", bitrate: 1000000}
+			}
+		})
+	}
+
+	return {run}
+}
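
Note: the demo above renders a fixed progress of 0.7, so every frame shows the "circle" transition frozen midway. In real use the progress value would ramp from 0 to 1 across the transition window; a minimal sketch of such a ramp (the helper and its microsecond timing values are invented for illustration, they are not part of this package):

// hypothetical helper: map a VideoFrame timestamp (µs) onto a 0..1 progress
// value for a transition starting at tStart and lasting durationUs
const progressAt = (timestamp: number, tStart: number, durationUs = 1_000_000) =>
	Math.min(Math.max((timestamp - tStart) / durationUs, 0), 1)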
package/s/driver/fns/host.ts
CHANGED
@@ -3,10 +3,11 @@ import {Comrade} from "@e280/comrade"
 import {Machina} from "../parts/machina.js"
 import {DriverSchematic} from "./schematic.js"
 
-export const setupDriverHost = (machina: Machina) =>
-
-
-
-
-	}))
+export const setupDriverHost = (machina: Machina) => (
+	Comrade.host<DriverSchematic>(_shell => ({
+		async world() {
+			machina.count++
+		},
+	}))
+)
 
package/s/driver/fns/work.ts
CHANGED
@@ -1,142 +1,139 @@
 import {Comrade} from "@e280/comrade"
-import {
-	Input, ALL_FORMATS, VideoSampleSink, Output, Mp4OutputFormat, VideoSampleSource, VideoSample,
-	AudioSampleSink, AudioSampleSource, AudioSample, StreamTarget, BlobSource, UrlSource
-} from "mediabunny"
 import {autoDetectRenderer, Container, Renderer, Sprite, Text, Texture, DOMAdapter, WebWorkerAdapter} from "pixi.js"
+import {Input, ALL_FORMATS, VideoSampleSink, Output, Mp4OutputFormat, VideoSampleSource, VideoSample, AudioSampleSink, AudioSampleSource, AudioSample, StreamTarget, BlobSource, UrlSource} from "mediabunny"
 
 import {Composition, DriverSchematic, Layer, Transform} from "./schematic.js"
 
 DOMAdapter.set(WebWorkerAdapter)
 
-export const setupDriverWork =
-
-
-
-
-
-
-
-
-
-
-
-
+export const setupDriverWork = (
+	Comrade.work<DriverSchematic>(shell => ({
+		async hello() {
+			await shell.host.world()
+		},
+
+		async decode({source, video, audio}) {
+			const loadSource = async () => {
+				if(source instanceof Blob) {
+					return new BlobSource(source)
+				} else {
+					return new UrlSource(source)
+				}
 			}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+			const input = new Input({
+				source: await loadSource(),
+				formats: ALL_FORMATS
+			})
+
+			const [videoTrack, audioTrack] = await Promise.all([
+				input.getPrimaryVideoTrack(),
+				input.getPrimaryAudioTrack()
+			])
+
+			const videoDecodable = await videoTrack?.canDecode()
+			const audioDecodable = await audioTrack?.canDecode()
+
+			const videoWriter = video.getWriter()
+			const audioWriter = audio.getWriter()
+
+			await Promise.all([
+				(async () => {
+					if (videoDecodable && videoTrack) {
+						const sink = new VideoSampleSink(videoTrack)
+						for await (const sample of sink.samples()) {
+							const frame = sample.toVideoFrame()
+							await videoWriter.write(frame)
+							sample.close()
+							frame.close()
+						}
+						await videoWriter.close()
+					}
+				})(),
+				(async () => {
+					if (audioDecodable && audioTrack) {
+						const sink = new AudioSampleSink(audioTrack)
+						for await (const sample of sink.samples()) {
+							const frame = sample.toAudioData()
+							await audioWriter.write(frame)
+							sample.close()
+							frame.close()
+						}
+						await audioWriter.close()
+					}
+				})()
+			])
+		},
+
+		async encode({readables, config, bridge}) {
+			const output = new Output({
+				format: new Mp4OutputFormat(),
+				target: new StreamTarget(bridge, {chunked: true})
+			})
+			const videoSource = new VideoSampleSource(config.video)
+			output.addVideoTrack(videoSource)
+			// since AudioSample is not transferable it fails to transfer encoder bitrate config
+			// so it needs to be hardcoded not set through constants eg QUALITY_LOW
+			const audioSource = new AudioSampleSource(config.audio)
+			output.addAudioTrack(audioSource)
+
+			await output.start()
+
+			const videoReader = readables.video.getReader()
+			const audioReader = readables.audio.getReader()
+
+			await Promise.all([
+				(async () => {
+					while (true) {
+						const {done, value} = await videoReader.read()
+						if (done) break
+						const sample = new VideoSample(value)
+						await videoSource.add(sample)
 						sample.close()
-						frame.close()
 					}
-
-
-
-
-
-
-
-						const frame = sample.toAudioData()
-						await audioWriter.write(frame)
+				})(),
+				(async () => {
+					while (true) {
+						const {done, value} = await audioReader.read()
+						if (done) break
+						const sample = new AudioSample(value)
+						await audioSource.add(sample)
 						sample.close()
-
+						value.close()
 					}
-
-
-				})()
-			])
-		},
-
-		async encode({readables, config, bridge}) {
-			const output = new Output({
-				format: new Mp4OutputFormat(),
-				target: new StreamTarget(bridge, {chunked: true})
-			})
-			const videoSource = new VideoSampleSource(config.video)
-			output.addVideoTrack(videoSource)
-			// since AudioSample is not transferable it fails to transfer encoder bitrate config
-			// so it needs to be hardcoded not set through constants eg QUALITY_LOW
-			const audioSource = new AudioSampleSource(config.audio)
-			output.addAudioTrack(audioSource)
-
-			await output.start()
-
-			const videoReader = readables.video.getReader()
-			const audioReader = readables.audio.getReader()
-
-			await Promise.all([
-				(async () => {
-					while (true) {
-						const {done, value} = await videoReader.read()
-						if (done) break
-						const sample = new VideoSample(value)
-						await videoSource.add(sample)
-						sample.close()
-					}
-				})(),
-				(async () => {
-					while (true) {
-						const {done, value} = await audioReader.read()
-						if (done) break
-						const sample = new AudioSample(value)
-						await audioSource.add(sample)
-						sample.close()
-						value.close()
-					}
-				})()
-			])
+				})()
+			])
 
-
-
+			await output.finalize()
+		},
 
-
-
-
+		async composite(composition) {
+			const {stage, renderer} = await renderPIXI(1920, 1080)
+			stage.removeChildren()
 
-
-
+			const {baseFrame, disposables} = await renderLayer(composition, stage)
+			renderer.render(stage)
 
-
-
-
-
-
-
+			// make sure browser support webgl/webgpu otherwise it might take much longer to construct frame
+			// if its very slow on eg edge try chrome
+			const frame = new VideoFrame(renderer.canvas, {
+				timestamp: baseFrame?.timestamp,
+				duration: baseFrame?.duration ?? undefined,
+			})
 
-
-
-
-			for (const disposable of disposables) {
-				disposable.destroy(true)
-			}
+			baseFrame?.close()
+			renderer.clear()
 
-
-
-
-	}))
+			for (const disposable of disposables) {
+				disposable.destroy(true)
+			}
 
+			shell.transfer = [frame]
+			return frame
+		}
+	}))
+)
 
+// TODO suspicious global, probably bad
 let pixi: {
 	renderer: Renderer
 	stage: Container
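
Note: the comment inside encode about QUALITY_LOW concerns crossing the worker boundary. The config object arrives via postMessage, so it must survive structured cloning, and mediabunny's quality preset constants are class instances that do not clone intact; callers therefore pass plain numeric bitrates. A host-side sketch mirroring the shape the transitions-test demo uses:

// plain clone-safe encoder config, literal numbers instead of QUALITY_* presets
// (driver and readables as in setupTransitionsTest above)
await driver.encode({
	readables,
	config: {
		audio: {codec: "opus", bitrate: 128000},
		video: {codec: "vp9", bitrate: 1000000},
	},
})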

package/s/driver/utils/load-decoder-source.ts
CHANGED
@@ -2,10 +2,9 @@ import {BlobSource, UrlSource} from "mediabunny"
 import {DecoderSource} from "../fns/schematic.js"
 
 // only streamable sources
-export async function loadDecoderSource(source: DecoderSource) {
-	if(source instanceof
-
-		return new BlobSource(file)
+export async function loadDecoderSource(source: DecoderSource): Promise<UrlSource | BlobSource> {
+	if(source instanceof Blob) {
+		return new BlobSource(source)
 	} else {
 		return new UrlSource(source)
 	}

package/s/features/speech/transcribe/parts/load-pipe.ts
ADDED
@@ -0,0 +1,19 @@
+
+import {pipeline} from "@huggingface/transformers"
+
+import {TranscriberPipeOptions} from "../types.js"
+
+export async function loadPipe(options: TranscriberPipeOptions) {
+	const {spec, onLoading} = options
+
+	const pipe = await pipeline("automatic-speech-recognition", spec.model, {
+		device: spec.device,
+		dtype: spec.dtype,
+		progress_callback: (data: any) => {
+			onLoading({total: data.total, progress: data.progress})
+		},
+	})
+
+	return pipe
+}
+
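
Note: transformers.js invokes progress_callback with per-file loading events, and only the progress and total fields are forwarded to onLoading here. A hedged sketch of a consumer; the guard reflects that some events (such as initial status messages) carry no numbers, which is an assumption about transformers.js behaviour rather than something this diff shows:

// log model-download progress; in transformers.js, progress is a 0-100 percentage
const onLoading = ({progress, total}: {progress?: number, total?: number}) => {
	if (typeof progress === "number")
		console.log(`loading model: ${progress.toFixed(1)}% of ${total ?? "?"} bytes`)
}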

package/s/features/speech/transcribe/parts/prep-audio.ts
ADDED
@@ -0,0 +1,23 @@
+
+import {Driver} from "../../../../driver/driver.js"
+
+export async function prepAudio(driver: Driver, source: Blob) {
+	const arrayBuffer = await source.arrayBuffer()
+	const audioCTX = new AudioContext({sampleRate: 16000})
+	const audioData = await audioCTX.decodeAudioData(arrayBuffer)
+	let audio: Float32Array
+	if (audioData.numberOfChannels === 2) {
+		const SCALING_FACTOR = Math.sqrt(2)
+		const left = audioData.getChannelData(0)
+		const right = audioData.getChannelData(1)
+		audio = new Float32Array(left.length)
+		for (let i = 0; i < audioData.length; ++i) {
+			audio[i] = (SCALING_FACTOR * (left[i] + right[i])) / 2
+		}
+	} else {
+		audio = audioData.getChannelData(0)
+	}
+	const duration = await driver.getAudioDuration(source)
+	return {audio, duration}
+}
+
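
Note: the stereo branch is an equal-power downmix: the two channels are averaged, then scaled by √2 (about +3 dB) so the mono result does not sound quieter than the source. Worked through on a single sample pair (the input numbers are invented for illustration):

// equal-power downmix of one stereo sample pair
const SCALING_FACTOR = Math.sqrt(2) // ≈ 1.4142
const left = 0.5
const right = 0.3
const mono = (SCALING_FACTOR * (left + right)) / 2 // ≈ 0.5657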

package/s/features/speech/transcribe/parts/transcribe.ts
ADDED
@@ -0,0 +1,70 @@
+
+import {WhisperTextStreamer} from "@huggingface/transformers"
+import {TranscribeOptions} from "../types.js"
+
+export async function transcribe(options: TranscribeOptions) {
+	const {pipe, spec, request, callbacks} = options
+
+	if (!pipe.processor.feature_extractor)
+		throw new Error("no feature_extractor")
+
+	const timePrecision = (
+		pipe.processor.feature_extractor?.config.chunk_length /
+		// @ts-ignore
+		pipe.model.config.max_source_positions
+	)
+
+	let chunkCount = 0
+	let startTime: number | null = null
+	let tokenCount = 0
+	let tokensPerSecond = 0
+
+	const chunkDuration = spec.chunkLength - spec.strideLength
+
+	const calculateProgress = () => {
+		const audioProgressSeconds = chunkCount * chunkDuration
+		return Math.min(audioProgressSeconds / request.duration, 1)
+	}
+
+	// TODO type error on pipe.tokenizer
+	const tokenizer = pipe.tokenizer as any
+
+	const streamer = new WhisperTextStreamer(tokenizer, {
+		time_precision: timePrecision,
+		token_callback_function: () => {
+			startTime ??= performance.now()
+			if (++tokenCount > 1) {
+				tokensPerSecond = (tokenCount / (performance.now() - startTime)) * 1000
+			}
+		},
+		callback_function: (textChunk: any) => {
+			// TODO
+			callbacks.onTranscription(textChunk)
+			callbacks.onReport({tokensPerSecond, progress: calculateProgress()})
+		},
+		on_finalize: () => {
+			startTime = null
+			tokenCount = 0
+			chunkCount++
+			callbacks.onReport({tokensPerSecond, progress: calculateProgress()})
+		},
+	})
+
+	const result = await pipe(new Float32Array(request.audio), {
+		top_k: 0,
+		do_sample: false,
+		chunk_length_s: spec.chunkLength,
+		stride_length_s: spec.strideLength,
+		language: request.language,
+		task: "transcribe",
+		return_timestamps: "word", // if using "word" the on_chunk_start & end is not called thus we cant retrieve timestamps, only after whole thing finishes
+		force_full_sequences: false,
+		streamer,
+	})
+
+	return {
+		text: result.text,
+		chunks: result.chunks
+	}
+}
+
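
Note: two formulas above are easy to misread. timePrecision converts token positions into seconds, and progress advances by chunkLength - strideLength seconds of audio per finalized chunk, because consecutive windows overlap. Worked through with the demo's spec; the max_source_positions value of 1500 is whisper-tiny's usual config, an assumption since it is not shown in this diff:

// seconds of audio per encoder position: chunk_length / max_source_positions
const timePrecision = 30 / 1500 // = 0.02 s

// chunkLength 30 and strideLength 5 mean each finalized chunk adds 25 s of
// new audio; after 3 chunks of a 100 s file: min(3 * 25 / 100, 1) = 0.75
const progress = Math.min((3 * (30 - 5)) / 100, 1)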

package/s/features/speech/transcribe/transcriber.ts
ADDED
@@ -0,0 +1,46 @@
+
+import {Comrade} from "@e280/comrade"
+import {coalesce, queue, sub} from "@e280/stz"
+
+import {prepAudio} from "./parts/prep-audio.js"
+import {TranscriberOptions, TranscriberSchematic, TranscriptionOptions, TranscriptionReport} from "./types.js"
+
+export async function makeTranscriber({driver, spec, workerUrl, onLoading}: TranscriberOptions) {
+	const onReport = sub<[report: TranscriptionReport]>()
+	const onTranscription = sub<[transcription: string]>()
+
+	const thread = await Comrade.thread<TranscriberSchematic>({
+		label: "OmnitoolSpeechTranscriber",
+		workerUrl,
+		setupHost: () => ({
+			loading: async loading => onLoading(loading),
+			deliverReport: async report => onReport.pub(report),
+			deliverTranscription: async transcription => onTranscription.pub(transcription),
+		}),
+	})
+
+	await thread.work.prepare(spec)
+
+	return {
+		transcribe: queue(async(info: TranscriptionOptions) => {
+			const {source, language} = info
+			const {audio, duration} = await prepAudio(driver, source)
+
+			const detachCallbacks = coalesce(
+				onReport(info.onReport),
+				onTranscription(info.onTranscription),
+			)
+
+			const result = await thread.work.transcribe({
+				duration,
+				language,
+				audio: audio.buffer,
+			})
+
+			detachCallbacks()
+			return result
+		}),
+		dispose: thread.terminate()
+	}
+}
+
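
Note: the work side of TranscriberSchematic lives in worker.bundle.ts (further down this diff's file list) and is not visible in this excerpt. Its shape can be inferred from the thread.work.prepare/transcribe calls and the setupHost handlers above; a hypothetical sketch following the Comrade.work pattern from driver/fns/work.ts, with the module name, state handling, and wiring assumed rather than confirmed by this diff:

// hypothetical work-side wiring for the transcriber (names assumed)
import {Comrade} from "@e280/comrade"
import {loadPipe} from "./parts/load-pipe.js"
import {transcribe} from "./parts/transcribe.js"
import {TranscriberSchematic} from "./types.js"

let pipe: any
let spec: any

export const setupTranscriberWork = Comrade.work<TranscriberSchematic>(shell => ({
	async prepare(newSpec) {
		spec = newSpec
		pipe = await loadPipe({spec, onLoading: loading => shell.host.loading(loading)})
	},
	async transcribe(request) {
		return transcribe({pipe, spec, request, callbacks: {
			onReport: report => void shell.host.deliverReport(report),
			onTranscription: text => void shell.host.deliverTranscription(text),
		}})
	},
}))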