@omnimedia/omnitool 1.1.0-1 → 1.1.0-4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +10 -9
- package/s/context.ts +1 -1
- package/s/demo/demo.bundle.ts +6 -2
- package/s/demo/routines/filmstrip-test.ts +2 -2
- package/s/demo/routines/transcriber-test.ts +34 -0
- package/s/driver/fns/host.ts +7 -6
- package/s/driver/fns/schematic.ts +1 -1
- package/s/driver/fns/work.ts +116 -119
- package/s/driver/utils/load-decoder-source.ts +3 -4
- package/s/features/speech/transcribe/default-spec.ts +11 -0
- package/s/features/speech/transcribe/parts/load-pipe.ts +19 -0
- package/s/features/speech/transcribe/parts/prep-audio.ts +23 -0
- package/s/features/speech/transcribe/parts/transcribe.ts +70 -0
- package/s/features/speech/transcribe/transcriber.ts +46 -0
- package/s/features/speech/transcribe/types.ts +82 -0
- package/s/features/speech/transcribe/worker.bundle.ts +40 -0
- package/s/timeline/utils/checksum.ts +2 -1
- package/s/tools/common/loader.ts +26 -0
- package/s/tools/common/transformer-pipeline.ts +26 -0
- package/s/tools/speech-recognition/common/model.ts +26 -0
- package/s/tools/speech-recognition/whisper/fns/host.ts +25 -0
- package/s/tools/speech-recognition/whisper/fns/schematic.ts +23 -0
- package/s/tools/speech-recognition/whisper/fns/work.ts +91 -0
- package/s/tools/speech-recognition/whisper/parts/types.ts +38 -0
- package/s/tools/speech-recognition/whisper/parts/worker.bundle.ts +7 -0
- package/s/tools/speech-recognition/whisper/tool.ts +70 -0
- package/x/context.js +1 -1
- package/x/context.js.map +1 -1
- package/x/demo/demo.bundle.js +6 -2
- package/x/demo/demo.bundle.js.map +1 -1
- package/x/demo/demo.bundle.min.js +6 -6
- package/x/demo/demo.bundle.min.js.map +4 -4
- package/x/demo/routines/filmstrip-test.d.ts +1 -1
- package/x/demo/routines/filmstrip-test.js +2 -2
- package/x/demo/routines/filmstrip-test.js.map +1 -1
- package/x/demo/routines/transcriber-test.d.ts +4 -0
- package/x/demo/routines/transcriber-test.js +33 -0
- package/x/demo/routines/transcriber-test.js.map +1 -0
- package/x/driver/driver.worker.bundle.min.js +80 -80
- package/x/driver/driver.worker.bundle.min.js.map +4 -4
- package/x/driver/fns/host.js +3 -3
- package/x/driver/fns/host.js.map +1 -1
- package/x/driver/fns/schematic.d.ts +1 -1
- package/x/driver/fns/work.js +8 -8
- package/x/driver/fns/work.js.map +1 -1
- package/x/driver/utils/load-decoder-source.d.ts +2 -1
- package/x/driver/utils/load-decoder-source.js +2 -3
- package/x/driver/utils/load-decoder-source.js.map +1 -1
- package/x/features/speech/transcribe/default-spec.d.ts +2 -0
- package/x/features/speech/transcribe/default-spec.js +8 -0
- package/x/features/speech/transcribe/default-spec.js.map +1 -0
- package/x/features/speech/transcribe/parts/load-pipe.d.ts +2 -0
- package/x/features/speech/transcribe/parts/load-pipe.js +13 -0
- package/x/features/speech/transcribe/parts/load-pipe.js.map +1 -0
- package/x/features/speech/transcribe/parts/prep-audio.d.ts +5 -0
- package/x/features/speech/transcribe/parts/prep-audio.js +21 -0
- package/x/features/speech/transcribe/parts/prep-audio.js.map +1 -0
- package/x/features/speech/transcribe/parts/transcribe.d.ts +5 -0
- package/x/features/speech/transcribe/parts/transcribe.js +56 -0
- package/x/features/speech/transcribe/parts/transcribe.js.map +1 -0
- package/x/features/speech/transcribe/transcriber.d.ts +5 -0
- package/x/features/speech/transcribe/transcriber.js +33 -0
- package/x/features/speech/transcribe/transcriber.js.map +1 -0
- package/x/features/speech/transcribe/types.d.ts +66 -0
- package/x/features/speech/transcribe/types.js +2 -0
- package/x/features/speech/transcribe/types.js.map +1 -0
- package/x/features/speech/transcribe/worker.bundle.d.ts +1 -0
- package/x/features/speech/transcribe/worker.bundle.js +33 -0
- package/x/features/speech/transcribe/worker.bundle.js.map +1 -0
- package/x/features/speech/transcribe/worker.bundle.min.js +2916 -0
- package/x/features/speech/transcribe/worker.bundle.min.js.map +7 -0
- package/x/index.html +2 -2
- package/x/timeline/utils/checksum.js +2 -1
- package/x/timeline/utils/checksum.js.map +1 -1
- package/x/tools/common/loader.d.ts +19 -0
- package/x/tools/common/loader.js +18 -0
- package/x/tools/common/loader.js.map +1 -0
- package/x/tools/common/transformer-pipeline.d.ts +8 -0
- package/x/tools/common/transformer-pipeline.js +24 -0
- package/x/tools/common/transformer-pipeline.js.map +1 -0
- package/x/tools/speech-recognition/common/model.d.ts +14 -0
- package/x/tools/speech-recognition/common/model.js +16 -0
- package/x/tools/speech-recognition/common/model.js.map +1 -0
- package/x/tools/speech-recognition/whisper/fns/host.d.ts +13 -0
- package/x/tools/speech-recognition/whisper/fns/host.js +19 -0
- package/x/tools/speech-recognition/whisper/fns/host.js.map +1 -0
- package/x/tools/speech-recognition/whisper/fns/schematic.d.ts +19 -0
- package/x/tools/speech-recognition/whisper/fns/schematic.js +2 -0
- package/x/tools/speech-recognition/whisper/fns/schematic.js.map +1 -0
- package/x/tools/speech-recognition/whisper/fns/work.d.ts +12 -0
- package/x/tools/speech-recognition/whisper/fns/work.js +74 -0
- package/x/tools/speech-recognition/whisper/fns/work.js.map +1 -0
- package/x/tools/speech-recognition/whisper/parts/types.d.ts +31 -0
- package/x/tools/speech-recognition/whisper/parts/types.js +2 -0
- package/x/tools/speech-recognition/whisper/parts/types.js.map +1 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.d.ts +1 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.js +4 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.js.map +1 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.min.js +8 -0
- package/x/tools/speech-recognition/whisper/parts/worker.bundle.min.js.map +7 -0
- package/x/tools/speech-recognition/whisper/tool.d.ts +12 -0
- package/x/tools/speech-recognition/whisper/tool.js +63 -0
- package/x/tools/speech-recognition/whisper/tool.js.map +1 -0
package/s/features/speech/transcribe/types.ts
ADDED
@@ -0,0 +1,82 @@
+
+import {AsSchematic} from "@e280/comrade"
+import {DataType, DeviceType, Pipeline} from "@huggingface/transformers"
+
+import {Driver} from "../../../driver/driver.js"
+
+export type TranscriberSchematic = AsSchematic<{
+	work: {
+		prepare(spec: TranscriberSpec): Promise<void>
+		transcribe(request: TranscriptionRequest): Promise<Transcription>
+	},
+
+	host: {
+		loading(load: Loading): Promise<void>
+		deliverReport(report: TranscriptionReport): Promise<void>
+		deliverTranscription(transcription: string): Promise<void>
+	}
+}>
+
+export type Loading = {
+	total: number
+	progress: number
+}
+
+export type TranscribeOptions = {
+	pipe: Pipeline
+	spec: TranscriberSpec
+	request: TranscriptionRequest
+	callbacks: TranscriptionCallbacks
+}
+
+export type TranscriberPipeOptions = {
+	spec: TranscriberSpec
+	onLoading: (loading: Loading) => void
+}
+
+export type SpeechTime = [start: number, end: number]
+
+export type Transcription = {
+	text: string
+	chunks: {
+		text: string
+		timestamp: SpeechTime
+	}[]
+}
+
+export type TranscriberSpec = {
+	model: string
+	dtype: DataType
+	device: DeviceType
+	chunkLength: number
+	strideLength: number
+}
+
+export type TranscriptionOptions = {
+	source: Blob
+	language: string | null
+} & TranscriptionCallbacks
+
+export type TranscriptionRequest = {
+	audio: ArrayBufferLike
+	language: string | null
+	duration: number
+}
+
+export type TranscriptionReport = {
+	progress: number
+	tokensPerSecond: number
+}
+
+export type TranscriptionCallbacks = {
+	onReport: (report: TranscriptionReport) => void
+	onTranscription: (transcription: string) => void
+}
+
+export type TranscriberOptions = {
+	driver: Driver
+	spec: TranscriberSpec
+	workerUrl: URL | string
+	onLoading: (loading: Loading) => void
+}
+
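The schematic above is the whole worker contract: `work` methods run inside the worker, while `host` methods are the callbacks it fires back at the main thread. For orientation, here is what a request built against these types looks like (a sketch with made-up values; the package's real defaults live in `default-spec.ts`, also added in this diff):

```ts
// TranscriptionRequest, restated from the hunk above
type TranscriptionRequest = {
	audio: ArrayBufferLike
	language: string | null
	duration: number
}

// hypothetical request: one second of silent 16kHz mono audio
const request: TranscriptionRequest = {
	audio: new Float32Array(16_000).buffer, // raw PCM samples
	language: null, // null asks whisper to auto-detect
	duration: 1, // seconds, used by the worker's progress estimate
}
```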
package/s/features/speech/transcribe/worker.bundle.ts
ADDED
@@ -0,0 +1,40 @@
+
+import {defer, once} from "@e280/stz"
+import {Comrade, Host} from "@e280/comrade"
+import {Pipeline} from "@huggingface/transformers"
+
+import {loadPipe} from "./parts/load-pipe.js"
+import {transcribe} from "./parts/transcribe.js"
+import {TranscriberSchematic, TranscriberSpec} from "./types.js"
+
+const deferred = defer<{pipe: Pipeline, spec: TranscriberSpec}>()
+
+const makePrepare = (host: Host<TranscriberSchematic>) => once(async(spec: TranscriberSpec) => {
+	deferred.resolve({
+		spec,
+		pipe: await loadPipe({
+			spec,
+			onLoading: loading => host.loading(loading),
+		}),
+	})
+})
+
+await Comrade.worker<TranscriberSchematic>(shell => {
+	const prepare = makePrepare(shell.host)
+	return {
+		prepare,
+		async transcribe(request) {
+			const {pipe, spec} = await deferred.promise
+			return transcribe({
+				pipe,
+				spec,
+				request,
+				callbacks: {
+					onReport: report => shell.host.deliverReport(report),
+					onTranscription: transcription => shell.host.deliverTranscription(transcription),
+				},
+			})
+		}
+	}
+})
+
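This bundle leans on two `@e280/stz` utilities to make `prepare` idempotent and to park `transcribe` calls until the pipeline exists. A minimal sketch of that gate pattern, assuming only the semantics visible above (`defer()` yields `{promise, resolve}`, `once()` memoizes its function):

```ts
// stand-ins for the @e280/stz helpers, matching how they're used above
const defer = <T>() => {
	let resolve!: (value: T) => void
	const promise = new Promise<T>(r => resolve = r)
	return {promise, resolve}
}

const once = <A extends unknown[], R>(fn: (...args: A) => R) => {
	let called = false
	let result: R
	return (...args: A) => {
		if (!called) {
			called = true
			result = fn(...args)
		}
		return result
	}
}

const gate = defer<string>()
const prepare = once(async (value: string) => gate.resolve(value))

// consumers may await the gate before prepare() has ever run; they all
// wake on the first call, and repeat calls are no-ops
gate.promise.then(value => console.log("ready:", value))
void prepare("pipeline loaded")
```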
package/s/timeline/utils/checksum.ts
CHANGED
@@ -10,7 +10,8 @@ export class Checksum {
 	) {}
 
 	static async make(data: Uint8Array) {
-		const bytes = new Uint8Array(await crypto.subtle.digest("SHA-256", data))
+		const data2 = new Uint8Array(data)
+		const bytes = new Uint8Array(await crypto.subtle.digest("SHA-256", data2))
 		const hash = Hex.fromBytes(bytes)
 		const nickname = Thumbprint.sigil.fromBytes(bytes)
 		return new this(data, bytes, hash, nickname)
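A guess at the motivation for this two-line change (the diff itself doesn't say): `crypto.subtle.digest` rejects a view backed by a `SharedArrayBuffer`, which bytes arriving from a worker can be, and `new Uint8Array(data)` copies them into a fresh non-shared buffer:

```ts
// digesting shared memory throws; digesting a copy works
const shared = new Uint8Array(new SharedArrayBuffer(4))
shared.set([1, 2, 3, 4])

// await crypto.subtle.digest("SHA-256", shared) // TypeError
const copy = new Uint8Array(shared) // plain ArrayBuffer backing
const digest = await crypto.subtle.digest("SHA-256", copy)
console.log(new Uint8Array(digest).length) // 32
```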
package/s/tools/common/loader.ts
ADDED
@@ -0,0 +1,26 @@
+import {pub, Pub} from "@e280/stz"
+import {ProgressItem} from "../speech-recognition/whisper/parts/types.js"
+
+export interface LoaderEvents {
+	onModelLoadProgress: Pub<ProgressItem[]>
+	onTpsUpdate: Pub<[number]>
+}
+
+export abstract class Loader {
+	tps = 0
+
+	static loaderEvents = {
+		onModelLoadProgress: pub<ProgressItem[]>(),
+		onTpsUpdate: pub<[number]>()
+	}
+
+	constructor(public readonly name: string, public model: string) {}
+
+	abstract init(): Promise<void>
+
+	abstract setModel(model: string): void
+
+	setTps(value: number) {
+		this.tps = value
+	}
+}
package/s/tools/common/transformer-pipeline.ts
ADDED
@@ -0,0 +1,26 @@
+//@ts-ignore
+import {pipeline} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.0/dist/transformers.min.js"
+
+import {ProgressCallback} from "../speech-recognition/whisper/parts/types.js"
+
+export class PipelineFactory {
+	instance: any = null
+	model: string | null = null
+
+	constructor(public task: string) {}
+
+	async createInstance(model: string, progressCallback?: ProgressCallback) {
+		this.model = model
+		return this.instance = await pipeline(this.task, this.model, {
+			dtype: {
+				encoder_model:
+					this.model === "onnx-community/whisper-large-v3-turbo"
+						? "fp16"
+						: "fp32",
+				decoder_model_merged: "q4",
+			},
+			device: "webgpu",
+			progress_callback: progressCallback,
+		})
+	}
+}
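How `work.ts` (further down) drives this factory, reduced to a standalone sketch; the import path is assumed from the file list, and progress handling is cut down to a log:

```ts
import {PipelineFactory} from "./transformer-pipeline.js"

const factory = new PipelineFactory("automatic-speech-recognition")

// loads the ASR pipeline on WebGPU; progress events arrive per weight
// file while the model downloads
const asr = await factory.createInstance(
	"onnx-community/whisper-tiny_timestamped",
	data => {
		if (data.progress)
			console.log(`${data.file}: ${Math.round(data.progress)}%`)
	},
)
```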
package/s/tools/speech-recognition/common/model.ts
ADDED
@@ -0,0 +1,26 @@
+import {pub} from "@e280/stz"
+
+import {Loader} from "../../common/loader.js"
+import {DecoderSource} from "../../../driver/fns/schematic.js"
+import {SpeechRecognizerModels, Word, WordGroup} from "../whisper/parts/types.js"
+
+export abstract class SpeechRecognizer extends Loader {
+	multilingual = true
+
+	static speechRecognizerEvents = {
+		onTranscriptionChunk: pub<Word[]>(),
+		onTranscribeProgress: pub<[number]>()
+	}
+
+	abstract transcribe(input: DecoderSource): Promise<WordGroup>
+
+	setMultilingual(value: boolean) {
+		this.multilingual = value
+	}
+
+	detectLanguage?(input: Blob | AudioBuffer): Promise<string>
+
+	setModel(value: SpeechRecognizerModels) {
+		this.model = value
+	}
+}
package/s/tools/speech-recognition/whisper/fns/host.ts
ADDED
@@ -0,0 +1,25 @@
+
+import {Comrade} from "@e280/comrade"
+import {ProgressItem} from "../parts/types.js"
+import {SpeechRecognizerHostEvents, WhisperSchematic} from "./schematic.js"
+
+export const setupWhisperHost = (events: SpeechRecognizerHostEvents) => (
+	Comrade.host<WhisperSchematic>(_shell => ({
+		async updateModelLoadProgress(item) {
+			events.onModelLoadProgress.pub(item)
+		},
+		async deliverTranscriptionChunk(chunk) {
+			events.onTranscriptionChunk.pub({
+				text: chunk.text,
+				timestamp: chunk.timestamp
+			})
+		},
+		async updateTps(value) {
+			events.onTpsUpdate.pub(value)
+		},
+		async updateTranscribeProgress(value) {
+			events.onTranscribeProgress(value)
+		}
+	}))
+)
+
package/s/tools/speech-recognition/whisper/fns/schematic.ts
ADDED
@@ -0,0 +1,23 @@
+import {Pub} from "@e280/stz"
+import {AsSchematic} from "@e280/comrade"
+
+import {LoaderEvents} from "../../../common/loader.js"
+import {ProgressItem, TranscriptionChunk, TranscriptionMessage, TranscriptionResult, Word} from "../parts/types.js"
+
+export type WhisperSchematic = AsSchematic<{
+	work: {
+		transcribe(input: TranscriptionMessage): Promise<TranscriptionResult | null>
+	},
+
+	host: {
+		updateModelLoadProgress(item: ProgressItem): Promise<void>
+		deliverTranscriptionChunk(chunk: TranscriptionChunk): Promise<void>
+		updateTps(value: number): Promise<void>
+		updateTranscribeProgress(value: number): Promise<void>
+	}
+}>
+
+export interface SpeechRecognizerHostEvents extends LoaderEvents {
+	onTranscriptionChunk: Pub<Word[]>
+	onTranscribeProgress: Pub<[number]>
+}
package/s/tools/speech-recognition/whisper/fns/work.ts
ADDED
@@ -0,0 +1,91 @@
+import {Comrade} from "@e280/comrade"
+//@ts-ignore
+import {pipeline, WhisperTextStreamer} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.0/dist/transformers.min.js"
+
+import {WhisperSchematic} from "./schematic.js"
+import {TranscriptionChunk} from "../parts/types.js"
+import {PipelineFactory} from "../../../common/transformer-pipeline.js"
+
+// TODO suspicious globals, probably bad
+const pipeline = new PipelineFactory("automatic-speech-recognition")
+let transcriber: any
+
+export const setupWhisperWork = Comrade.work<WhisperSchematic>(shell => ({
+	async transcribe({audio, model, language, duration}) {
+		const isDistil = model.startsWith("distil-whisper/")
+
+		if(!pipeline.model || pipeline.model !== model) {
+			pipeline.instance?.dispose()?.()
+			pipeline.instance = null
+			transcriber = await pipeline.createInstance(
+				model,
+				(data) => {
+					if(data.progress)
+						shell.host.updateModelLoadProgress({
+							id: data.file,
+							progress: data.progress
+						})
+				}
+			)
+		}
+
+		const timePrecision =
+			transcriber.processor.feature_extractor.config.chunk_length /
+			transcriber.model.config.max_source_positions
+
+		const chunkLength = isDistil ? 20 : 30
+		const strideLength = isDistil ? 3 : 5
+
+		let chunkCount = 0
+		let startTime: number | null = null
+		let tokenCount = 0
+		let tps = 0
+
+		const chunkDuration = chunkLength - strideLength
+
+		const estimateProgress = () => {
+			const audioProgressSeconds = chunkCount * chunkDuration
+			return Math.min(audioProgressSeconds / duration, 1)
+		}
+
+		const streamer = new WhisperTextStreamer(transcriber.tokenizer, {
+			time_precision: timePrecision,
+			token_callback_function: () => {
+				startTime ??= performance.now()
+				if (++tokenCount > 1) {
+					tps = (tokenCount / (performance.now() - startTime)) * 1000
+					shell.host.updateTps(tps)
+				}
+			},
+			callback_function: (textChunk: any) => {
+				shell.host.deliverTranscriptionChunk(textChunk)
+			},
+			on_finalize: () => {
+				startTime = null
+				tokenCount = 0
+				chunkCount++
+				const progress = estimateProgress()
+				shell.host.updateTranscribeProgress(progress)
+			},
+		})
+
+		const output = await transcriber(audio, {
+			top_k: 0,
+			do_sample: false,
+			chunk_length_s: chunkLength,
+			stride_length_s: strideLength,
+			language,
+			task: "transcribe",
+			return_timestamps: "word", // if using "word" the on_chunk_start & end is not called thus we cant retrieve timestamps, only after whole thing finishes
+			force_full_sequences: false,
+			streamer,
+		})
+
+		if (!output) return null
+
+		return {
+			tps,
+			...output,
+		}
+	}
+}))
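The progress estimate above is worth working through once: consecutive windows overlap by the stride, so each finalized chunk contributes `chunkLength - strideLength` seconds of new audio. At the non-distil settings (30s chunks, 5s stride), a 100-second file reports progress in steps of 25%:

```ts
// mirrors estimateProgress() from the hunk above
const chunkLength = 30
const strideLength = 5
const duration = 100 // seconds of input audio

const chunkDuration = chunkLength - strideLength // 25s of fresh audio per chunk
for (let chunkCount = 1; chunkCount <= 4; chunkCount++) {
	const progress = Math.min((chunkCount * chunkDuration) / duration, 1)
	console.log(`chunk ${chunkCount}: ${(progress * 100).toFixed(0)}%`)
}
// chunk 1: 25%  chunk 2: 50%  chunk 3: 75%  chunk 4: 100%
```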
package/s/tools/speech-recognition/whisper/parts/types.ts
ADDED
@@ -0,0 +1,38 @@
+export interface ProgressItem {
+	id: string
+	progress: number
+}
+
+export type Word = {
+	text: string
+	timestamp: [start: number, end: number]
+}
+
+export type WordGroup = Word[]
+export type Transcript = WordGroup[]
+
+export interface TranscriptionChunk {
+	text: string
+	offset: number
+	timestamp: [number, number]
+	finalised: boolean
+}
+
+export interface TranscriptionMessage {
+	audio: Float32Array
+	model: string
+	subtask: string | null
+	language: string | null
+	duration: number
+}
+
+export interface TranscriptionResult {
+	text: string
+	chunks: TranscriptionChunk[]
+	tps: number
+}
+
+export type ProgressCallback = (data: any) => void
+
+export type SpeechRecognizerModels = "onnx-community/whisper-tiny_timestamped"
+export type SpeechRecognizerSubtasks = "transcribe"
package/s/tools/speech-recognition/whisper/tool.ts
ADDED
@@ -0,0 +1,70 @@
+import {Comrade, Thread} from "@e280/comrade"
+
+import {WordGroup} from "./parts/types.js"
+import {context} from "../../../context.js"
+import {setupWhisperHost} from "./fns/host.js"
+import {SpeechRecognizer} from "../common/model.js"
+import {WhisperSchematic} from "./fns/schematic.js"
+
+export class Whisper extends SpeechRecognizer {
+	constructor(public thread: Thread<WhisperSchematic>) {
+		super('whisper', "onnx-community/whisper-tiny_timestamped")
+	}
+
+	static async setup() {
+		const thread = await Comrade.thread<WhisperSchematic>({
+			label: "OmnitoolDriver",
+			workerUrl: new URL("/tools/speech-recognition/whisper/parts/worker.bundle.min.js", import.meta.url),
+			setupHost: setupWhisperHost({
+				...this.loaderEvents,
+				...this.speechRecognizerEvents
+			})
+		})
+		return new this(thread)
+	}
+
+	async init() {
+		// there should be called loading of the model in worker instead when transcribe is called ..
+	}
+
+	async #transcribe(source: Blob, options?: {multilingual?: boolean, language?: string}) {
+		const arrayBuffer = await source.arrayBuffer()
+		const audioCTX = new AudioContext({sampleRate: 16000})
+		const audioData = await audioCTX.decodeAudioData(arrayBuffer)
+		let audio
+		if (audioData.numberOfChannels === 2) {
+			const SCALING_FACTOR = Math.sqrt(2)
+			const left = audioData.getChannelData(0)
+			const right = audioData.getChannelData(1)
+			audio = new Float32Array(left.length)
+			for (let i = 0; i < audioData.length; ++i) {
+				audio[i] = (SCALING_FACTOR * (left[i] + right[i])) / 2
+			}
+		} else {
+			audio = audioData.getChannelData(0)
+		}
+		const driver = await context.driver
+		const duration = await driver.getAudioDuration(source)
+		return await this.thread.work.transcribe({
+			audio,
+			duration,
+			model: this.model,
+			subtask: this.multilingual ? "transcribe" : null,
+			language:
+				this.multilingual && options?.language !== "auto"
+					? options?.language ?? "english"
+					: null
+		})
+	}
+
+	async transcribe(source: Blob): Promise<WordGroup> {
+		const result = await this.#transcribe(source)
+
+		const words = result?.chunks.map((chunk: any) => ({
+			text: chunk.text.trim(),
+			timestamp: chunk.timestamp,
+		})) as WordGroup
+
+		return words
+	}
+}
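The stereo branch in `#transcribe` uses the √2 idiom familiar from the transformers.js examples: a plain average of left and right halves the amplitude of correlated content, and scaling by √2 restores the RMS level of the mono mix. Extracted in isolation (our reading, not a documented contract of this package):

```ts
// downmix stereo to mono while roughly preserving loudness
function downmix(left: Float32Array, right: Float32Array): Float32Array {
	const SCALING_FACTOR = Math.sqrt(2)
	const mono = new Float32Array(left.length)
	for (let i = 0; i < mono.length; i++)
		mono[i] = (SCALING_FACTOR * (left[i] + right[i])) / 2
	return mono
}
```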
package/x/context.js
CHANGED
@@ -1,5 +1,5 @@
 import { Driver } from "./driver/driver.js";
-const workerUrl = new URL("../driver/driver.worker.bundle.js", import.meta.url);
+const workerUrl = new URL("../driver/driver.worker.bundle.min.js", import.meta.url);
 export const context = {
     driver: Driver.setup({ workerUrl })
 };
package/x/context.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"context.js","sourceRoot":"","sources":["../s/context.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,MAAM,EAAC,MAAM,oBAAoB,CAAA;AAEzC,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,
+{"version":3,"file":"context.js","sourceRoot":"","sources":["../s/context.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,MAAM,EAAC,MAAM,oBAAoB,CAAA;AAEzC,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,uCAAuC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;AAEnF,MAAM,CAAC,MAAM,OAAO,GAAG;IACtB,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,EAAC,SAAS,EAAC,CAAC;CACjC,CAAA"}
package/x/demo/demo.bundle.js
CHANGED
@@ -1,6 +1,7 @@
 import { context } from "../context.js";
 import { waveformTest } from "./routines/waveform-test.js";
 import { filmstripTest } from "./routines/filmstrip-test.js";
+import { transcriberTest } from "./routines/transcriber-test.js";
 import { setupTranscodeTest } from "./routines/transcode-test.js";
 const driver = await context.driver;
 const results = document.querySelector(".results");
@@ -9,6 +10,7 @@ const importButton = document.querySelector(".import");
 fetchButton?.addEventListener("click", startDemoFetch);
 importButton?.addEventListener("click", startDemoImport);
 waveformTest();
+const transcriber = await transcriberTest(driver);
 // hello world test
 {
     await driver.thread.work.hello();
@@ -20,9 +22,11 @@ waveformTest();
 // transcoding tests
 async function startDemoImport() {
     const [fileHandle] = await window.showOpenFilePicker();
-    const
-
+    const file = await fileHandle.getFile();
+    const transcode = setupTranscodeTest(driver, file);
+    await filmstripTest(file);
     run(transcode, fileHandle.name);
+    await transcriber.transcribe(file);
 }
 async function startDemoFetch() {
     // which videos to run tests on
package/x/demo/demo.bundle.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"demo.bundle.js","sourceRoot":"","sources":["../../s/demo/demo.bundle.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,OAAO,EAAC,MAAM,eAAe,CAAA;AACrC,OAAO,EAAC,YAAY,EAAC,MAAM,6BAA6B,CAAA;AACxD,OAAO,EAAC,aAAa,EAAC,MAAM,8BAA8B,CAAA;AAC1D,OAAO,EAAC,kBAAkB,EAAC,MAAM,8BAA8B,CAAA;AAE/D,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,MAAM,CAAA;AACnC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAE,CAAA;AAEnD,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAA;AACpD,MAAM,YAAY,GAAG,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAsB,CAAA;AAE3E,WAAW,EAAE,gBAAgB,CAAC,OAAO,EAAE,cAAc,CAAC,CAAA;AACtD,YAAY,EAAE,gBAAgB,CAAC,OAAO,EAAE,eAAe,CAAC,CAAA;AAExD,YAAY,EAAE,CAAA;
+{"version":3,"file":"demo.bundle.js","sourceRoot":"","sources":["../../s/demo/demo.bundle.ts"],"names":[],"mappings":"AACA,OAAO,EAAC,OAAO,EAAC,MAAM,eAAe,CAAA;AACrC,OAAO,EAAC,YAAY,EAAC,MAAM,6BAA6B,CAAA;AACxD,OAAO,EAAC,aAAa,EAAC,MAAM,8BAA8B,CAAA;AAC1D,OAAO,EAAC,eAAe,EAAC,MAAM,gCAAgC,CAAA;AAC9D,OAAO,EAAC,kBAAkB,EAAC,MAAM,8BAA8B,CAAA;AAE/D,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,MAAM,CAAA;AACnC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAE,CAAA;AAEnD,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAA;AACpD,MAAM,YAAY,GAAG,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAsB,CAAA;AAE3E,WAAW,EAAE,gBAAgB,CAAC,OAAO,EAAE,cAAc,CAAC,CAAA;AACtD,YAAY,EAAE,gBAAgB,CAAC,OAAO,EAAE,eAAe,CAAC,CAAA;AAExD,YAAY,EAAE,CAAA;AACd,MAAM,WAAW,GAAG,MAAM,eAAe,CAAC,MAAM,CAAC,CAAA;AAEjD,mBAAmB;AACnB,CAAC;IACA,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,CAAA;IAChC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,KAAK,CAAC;QAAE,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAA;;QACxD,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAA;AACrD,CAAC;AAED,oBAAoB;AACpB,KAAK,UAAU,eAAe;IAE7B,MAAM,CAAC,UAAU,CAAC,GAAG,MAAM,MAAM,CAAC,kBAAkB,EAAE,CAAA;IACtD,MAAM,IAAI,GAAG,MAAM,UAAU,CAAC,OAAO,EAAE,CAAA;IACvC,MAAM,SAAS,GAAG,kBAAkB,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;IAClD,MAAM,aAAa,CAAC,IAAI,CAAC,CAAA;IACzB,GAAG,CAAC,SAAS,EAAE,UAAU,CAAC,IAAI,CAAC,CAAA;IAC/B,MAAM,WAAW,CAAC,UAAU,CAAC,IAAI,CAAC,CAAA;AACnC,CAAC;AAED,KAAK,UAAU,cAAc;IAG5B,+BAA+B;IAC/B,MAAM,MAAM,GAAG;QACd,qBAAqB;KACrB,CAAA;IAED,gCAAgC;IAChC,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QAC1B,MAAM,SAAS,GAAG,kBAAkB,CAAC,MAAM,EAAE,qBAAqB,CAAC,CAAA;QACnE,GAAG,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;IACpB,CAAC;AACF,CAAC;AAED,KAAK,UAAU,GAAG,CAAC,SAAgD,EAAE,KAAa;IACjF,oBAAoB;IACpB,MAAM,GAAG,GAAG,QAAQ,CAAC,aAAa,CAAC,KAAK,CAAC,CAAA;IACzC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;IAEnB,kBAAkB;IAClB,MAAM,CAAC,GAAG,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAA;IACrC,CAAC,CAAC,WAAW,GAAG,KAAK,CAAA;IACrB,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;IAEb,wBAAwB;IACxB,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAA;IAE5B,eAAe;IACf,MAAM,SAAS,CAAC,GAAG,EAAE,CAAA;AACtB,CAAC"}