@fugood/bricks-project 2.24.0-beta.1 → 2.24.0-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/compile/action-name-map.ts +26 -14
- package/compile/index.ts +3 -1
- package/package.json +4 -3
- package/tools/mcp-server.ts +9 -4
- package/tools/postinstall.ts +47 -5
- package/types/generators/RealtimeTranscription.ts +9 -0
- package/types/generators/SpeechToTextGgml.ts +4 -95
- package/types/generators/VadOnnx.ts +201 -0
- package/types/generators/VadTraditional.ts +123 -0
- package/types/generators/index.ts +2 -0
- package/utils/event-props.ts +24 -0
|
package/compile/action-name-map.ts
CHANGED
|
@@ -601,6 +601,14 @@ export const templateActionNameMap = {
|
|
|
601
601
|
variables: 'GENERATOR_MCP_VARIABLES',
|
|
602
602
|
},
|
|
603
603
|
},
|
|
604
|
+
GENERATOR_TRADITIONAL_VAD: {
|
|
605
|
+
GENERATOR_TRADITIONAL_VAD_DETECT_FILE: {
|
|
606
|
+
fileUrl: 'GENERATOR_TRADITIONAL_VAD_FILE_URL',
|
|
607
|
+
},
|
|
608
|
+
GENERATOR_TRADITIONAL_VAD_DETECT_DATA: {
|
|
609
|
+
data: 'GENERATOR_TRADITIONAL_VAD_DATA',
|
|
610
|
+
},
|
|
611
|
+
},
|
|
604
612
|
GENERATOR_TTS: {
|
|
605
613
|
GENERATOR_TTS_GENERATE: {
|
|
606
614
|
text: 'GENERATOR_TTS_TEXT',
|
|
@@ -621,6 +629,24 @@ export const templateActionNameMap = {
|
|
|
621
629
|
audioUri: 'GENERATOR_ONNX_STT_AUDIO_URI',
|
|
622
630
|
},
|
|
623
631
|
},
|
|
632
|
+
GENERATOR_ONNX_VAD: {
|
|
633
|
+
GENERATOR_ONNX_VAD_DETECT_FILE: {
|
|
634
|
+
fileUrl: 'GENERATOR_ONNX_VAD_FILE_URL',
|
|
635
|
+
threshold: 'GENERATOR_ONNX_VAD_THRESHOLD',
|
|
636
|
+
minSpeechDurationMs: 'GENERATOR_ONNX_VAD_MIN_SPEECH_DURATION_MS',
|
|
637
|
+
minSilenceDurationMs: 'GENERATOR_ONNX_VAD_MIN_SILENCE_DURATION_MS',
|
|
638
|
+
maxSpeechDurationS: 'GENERATOR_ONNX_VAD_MAX_SPEECH_DURATION_S',
|
|
639
|
+
speechPadMs: 'GENERATOR_ONNX_VAD_SPEECH_PAD_MS',
|
|
640
|
+
},
|
|
641
|
+
GENERATOR_ONNX_VAD_DETECT_DATA: {
|
|
642
|
+
data: 'GENERATOR_ONNX_VAD_DATA',
|
|
643
|
+
threshold: 'GENERATOR_ONNX_VAD_THRESHOLD',
|
|
644
|
+
minSpeechDurationMs: 'GENERATOR_ONNX_VAD_MIN_SPEECH_DURATION_MS',
|
|
645
|
+
minSilenceDurationMs: 'GENERATOR_ONNX_VAD_MIN_SILENCE_DURATION_MS',
|
|
646
|
+
maxSpeechDurationS: 'GENERATOR_ONNX_VAD_MAX_SPEECH_DURATION_S',
|
|
647
|
+
speechPadMs: 'GENERATOR_ONNX_VAD_SPEECH_PAD_MS',
|
|
648
|
+
},
|
|
649
|
+
},
|
|
624
650
|
GENERATOR_SPEECH_INFERENCE: {
|
|
625
651
|
GENERATOR_SPEECH_INFERENCE_TRANSCRIBE_FILE: {
|
|
626
652
|
fileUrl: 'GENERATOR_SPEECH_INFERENCE_FILE_URL',
|
|
@@ -636,20 +662,6 @@ export const templateActionNameMap = {
|
|
|
636
662
|
language: 'GENERATOR_SPEECH_INFERENCE_LANGUAGE',
|
|
637
663
|
translate: 'GENERATOR_SPEECH_INFERENCE_TRANSLATE',
|
|
638
664
|
},
|
|
639
|
-
GENERATOR_SPEECH_INFERENCE_TRANSCRIBE_REALTIME: {
|
|
640
|
-
prompt: 'GENERATOR_SPEECH_INFERENCE_PROMPT',
|
|
641
|
-
beamSize: 'GENERATOR_SPEECH_INFERENCE_BEAM_SIZE',
|
|
642
|
-
language: 'GENERATOR_SPEECH_INFERENCE_LANGUAGE',
|
|
643
|
-
translate: 'GENERATOR_SPEECH_INFERENCE_TRANSLATE',
|
|
644
|
-
realtimeAudioSeconds: 'GENERATOR_SPEECH_INFERENCE_REALTIME_AUDIO_SECONDS',
|
|
645
|
-
realtimeAudioSliceSeconds: 'GENERATOR_SPEECH_INFERENCE_REALTIME_AUDIO_SLICE_SECONDS',
|
|
646
|
-
realtimeAudioMinSeconds: 'GENERATOR_SPEECH_INFERENCE_REALTIME_AUDIO_MIN_SECONDS',
|
|
647
|
-
realtimeSaveAudio: 'GENERATOR_SPEECH_INFERENCE_REALTIME_SAVE_AUDIO',
|
|
648
|
-
realtimeVadEnabled: 'GENERATOR_SPEECH_INFERENCE_REALTIME_VAD_ENABLED',
|
|
649
|
-
realtimeVadMs: 'GENERATOR_SPEECH_INFERENCE_REALTIME_VAD_MS',
|
|
650
|
-
realtimeVadThold: 'GENERATOR_SPEECH_INFERENCE_REALTIME_VAD_THOLD',
|
|
651
|
-
realtimeVadFreqThold: 'GENERATOR_SPEECH_INFERENCE_REALTIME_VAD_FREQ_THOLD',
|
|
652
|
-
},
|
|
653
665
|
},
|
|
654
666
|
GENERATOR_VAD_INFERENCE: {
|
|
655
667
|
GENERATOR_VAD_INFERENCE_DETECT_FILE: {
|
package/compile/index.ts
CHANGED
|
@@ -421,8 +421,9 @@ const compileAutomation = (automationMap: AutomationMap) =>
|
|
|
421
421
|
|
|
422
422
|
export const compile = async (app: Application) => {
|
|
423
423
|
await new Promise((resolve) => setImmediate(resolve, 0))
|
|
424
|
+
const timestamp = Date.now()
|
|
424
425
|
const config = {
|
|
425
|
-
title: app.name
|
|
426
|
+
title: `${app.name || 'Unknown'}(${timestamp})`,
|
|
426
427
|
subspace_map: app.subspaces.reduce((subspaceMap, subspace) => {
|
|
427
428
|
subspaceMap[subspace.id] = {
|
|
428
429
|
title: subspace.title,
|
|
@@ -969,6 +970,7 @@ export const compile = async (app: Application) => {
|
|
|
969
970
|
automation_map: app.automationMap
|
|
970
971
|
? compileAutomation(app.automationMap)
|
|
971
972
|
: app.metadata?.TEMP_automation_map || {},
|
|
973
|
+
update_timestamp: timestamp,
|
|
972
974
|
}
|
|
973
975
|
return config
|
|
974
976
|
}
|
package/package.json
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@fugood/bricks-project",
|
|
3
|
-
"version": "2.24.0-beta.1",
|
|
3
|
+
"version": "2.24.0-beta.3",
|
|
4
4
|
"main": "index.ts",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"build": "bun scripts/build.js"
|
|
7
7
|
},
|
|
8
8
|
"dependencies": {
|
|
9
|
-
"@fugood/bricks-cli": "^2.
|
|
9
|
+
"@fugood/bricks-cli": "^2.24.0-beta.2",
|
|
10
10
|
"@huggingface/gguf": "^0.3.2",
|
|
11
|
+
"@iarna/toml": "^3.0.0",
|
|
11
12
|
"@modelcontextprotocol/sdk": "^1.15.0",
|
|
12
13
|
"@toon-format/toon": "^2.1.0",
|
|
13
14
|
"@types/escodegen": "^0.0.10",
|
|
@@ -18,5 +19,5 @@
|
|
|
18
19
|
"lodash": "^4.17.4",
|
|
19
20
|
"uuid": "^8.3.1"
|
|
20
21
|
},
|
|
21
|
-
"gitHead": "
|
|
22
|
+
"gitHead": "7cf2a6850a7765fd801e1bbdbb571871bae92f62"
|
|
22
23
|
}
|
package/tools/mcp-server.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
|
|
2
2
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
|
|
3
3
|
import { z } from 'zod'
|
|
4
|
-
import {
|
|
4
|
+
import { $, JSON5 } from 'bun'
|
|
5
5
|
import * as TOON from '@toon-format/toon'
|
|
6
6
|
import Fuse from 'fuse.js'
|
|
7
7
|
import { gguf } from '@huggingface/gguf'
|
|
@@ -729,7 +729,7 @@ server.tool(
|
|
|
729
729
|
|
|
730
730
|
// Return JSON for huggingface_select to allow direct parsing by consumers
|
|
731
731
|
return {
|
|
732
|
-
content: [{ type: 'text', text:
|
|
732
|
+
content: [{ type: 'text', text: JSON5.stringify(result, null, 2) }],
|
|
733
733
|
}
|
|
734
734
|
}
|
|
735
735
|
|
|
@@ -842,7 +842,7 @@ server.tool(
|
|
|
842
842
|
|
|
843
843
|
// Return JSON for huggingface_select to allow direct parsing by consumers
|
|
844
844
|
return {
|
|
845
|
-
content: [{ type: 'text', text:
|
|
845
|
+
content: [{ type: 'text', text: JSON5.stringify(result, null, 2) }],
|
|
846
846
|
}
|
|
847
847
|
} else {
|
|
848
848
|
const result = {
|
|
@@ -875,7 +875,12 @@ server.tool(
|
|
|
875
875
|
|
|
876
876
|
// Return JSON for huggingface_select to allow direct parsing by consumers
|
|
877
877
|
return {
|
|
878
|
-
content: [
|
|
878
|
+
content: [
|
|
879
|
+
{
|
|
880
|
+
type: 'text',
|
|
881
|
+
text: `${JSON5.stringify(result, null, 2)} // Please use _hfRepoInfo undocumented field with @ts-ignore`,
|
|
882
|
+
},
|
|
883
|
+
],
|
|
879
884
|
}
|
|
880
885
|
}
|
|
881
886
|
} catch (err: any) {
|
package/tools/postinstall.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { $ } from 'bun'
|
|
2
2
|
import { stat, readFile, writeFile, readdir } from 'fs/promises'
|
|
3
|
+
import TOML from '@iarna/toml'
|
|
3
4
|
|
|
4
5
|
const cwd = process.cwd()
|
|
5
6
|
|
|
@@ -63,9 +64,7 @@ if (hasClaudeCode || hasAgentsMd) {
|
|
|
63
64
|
await handleMcpConfigOverride(mcpConfigPath)
|
|
64
65
|
}
|
|
65
66
|
|
|
66
|
-
|
|
67
|
-
// Install skills that don't already exist in the project
|
|
68
|
-
const skillsDir = `${cwd}/.claude/skills`
|
|
67
|
+
const setupSkills = async (skillsDir) => {
|
|
69
68
|
const packageSkillsDir = `${__dirname}/../skills`
|
|
70
69
|
|
|
71
70
|
if (await exists(packageSkillsDir)) {
|
|
@@ -81,10 +80,53 @@ if (hasClaudeCode) {
|
|
|
81
80
|
console.log(`Skill '${skill}' already exists, skipping`)
|
|
82
81
|
} else {
|
|
83
82
|
await $`cp -r ${packageSkillsDir}/${skill} ${targetSkillDir}`
|
|
84
|
-
console.log(`Installed skill '${skill}' to
|
|
83
|
+
console.log(`Installed skill '${skill}' to ${skillsDir}/`)
|
|
85
84
|
}
|
|
86
85
|
}),
|
|
87
86
|
)
|
|
88
87
|
}
|
|
89
|
-
// TODO: .codex/skills, .cursor/skills if needed
|
|
90
88
|
}
|
|
89
|
+
|
|
90
|
+
if (hasClaudeCode) {
|
|
91
|
+
// Install skills that don't already exist in the project
|
|
92
|
+
await setupSkills(`${cwd}/.claude/skills`)
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if (hasAgentsMd) {
|
|
96
|
+
// Handle codex skills
|
|
97
|
+
// Currently no signal file for codex skills, so we just check if AGENTS.md exists
|
|
98
|
+
await setupSkills(`${cwd}/.codex/skills`)
|
|
99
|
+
|
|
100
|
+
const defaultCodexMcpConfig = {
|
|
101
|
+
mcp_servers: {
|
|
102
|
+
'bricks-project': projectMcpServer,
|
|
103
|
+
},
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
const handleCodexMcpConfigOverride = async (mcpConfigPath: string) => {
|
|
107
|
+
let mcpConfig: { mcp_servers: Record<string, typeof projectMcpServer> } | null = null
|
|
108
|
+
if (await exists(mcpConfigPath)) {
|
|
109
|
+
const configStr = await readFile(mcpConfigPath, 'utf-8')
|
|
110
|
+
try {
|
|
111
|
+
mcpConfig = TOML.parse(configStr)
|
|
112
|
+
if (!mcpConfig?.mcp_servers) throw new Error('mcp_servers is not defined')
|
|
113
|
+
mcpConfig.mcp_servers['bricks-project'] = projectMcpServer
|
|
114
|
+
} catch (e) {
|
|
115
|
+
mcpConfig = defaultCodexMcpConfig
|
|
116
|
+
}
|
|
117
|
+
} else {
|
|
118
|
+
mcpConfig = defaultCodexMcpConfig
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
await writeFile(mcpConfigPath, `${TOML.stringify(mcpConfig, null, 2)}\n`)
|
|
122
|
+
|
|
123
|
+
console.log(`Updated ${mcpConfigPath}`)
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Setup MCP config (.codex/config.toml)
|
|
127
|
+
const codexConfigPath = `${cwd}/.codex/config.toml`
|
|
128
|
+
await handleCodexMcpConfigOverride(codexConfigPath)
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// TODO: .cursor/skills if needed
|
|
132
|
+
// TODO: User setting in application.json to avoid unnecessary skills/config setup
|
|
package/types/generators/RealtimeTranscription.ts
CHANGED
|
@@ -41,6 +41,8 @@ Default property:
|
|
|
41
41
|
"audioSliceSec": 30,
|
|
42
42
|
"audioMinSec": 1,
|
|
43
43
|
"maxSlicesInMemory": 5,
|
|
44
|
+
"transcribeProcessingPauseMs": 500,
|
|
45
|
+
"initTranscribeAfterMs": 500,
|
|
44
46
|
"vadStrategy": "use-preset",
|
|
45
47
|
"vadPreset": "default",
|
|
46
48
|
"initialPrompt": "",
|
|
@@ -71,6 +73,10 @@ Default property:
|
|
|
71
73
|
audioMinSec?: number | DataLink
|
|
72
74
|
/* Maximum number of slices to keep in memory */
|
|
73
75
|
maxSlicesInMemory?: number | DataLink
|
|
76
|
+
/* Transcribe processing interval in milliseconds */
|
|
77
|
+
transcribeProcessingPauseMs?: number | DataLink
|
|
78
|
+
/* Transcribe processing init after pause in milliseconds */
|
|
79
|
+
initTranscribeAfterMs?: number | DataLink
|
|
74
80
|
/* VAD Strategy */
|
|
75
81
|
vadStrategy?: 'use-preset' | 'use-generator-options' | DataLink
|
|
76
82
|
/* VAD preset configuration */
|
|
@@ -136,6 +142,8 @@ Default property:
|
|
|
136
142
|
results?: () => Data
|
|
137
143
|
/* Current transcription result text */
|
|
138
144
|
resultText?: () => Data
|
|
145
|
+
/* Last stabilized transcription result segment */
|
|
146
|
+
lastStabilizedSegment?: () => Data
|
|
139
147
|
/* Current statistics */
|
|
140
148
|
statistics?: () => Data
|
|
141
149
|
/* Latest transcribe event */
|
|
@@ -168,6 +176,7 @@ export type GeneratorRealtimeTranscription = Generator &
|
|
|
168
176
|
| 'isTranscribing'
|
|
169
177
|
| 'results'
|
|
170
178
|
| 'resultText'
|
|
179
|
+
| 'lastStabilizedSegment'
|
|
171
180
|
| 'statistics'
|
|
172
181
|
| 'lastTranscribeEvent'
|
|
173
182
|
| 'lastVadEvent'
|
|
package/types/generators/SpeechToTextGgml.ts
CHANGED
|
@@ -80,76 +80,9 @@ export type GeneratorSpeechInferenceActionTranscribeData = ActionWithParams & {
|
|
|
80
80
|
>
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
-
/*
|
|
84
|
-
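export type GeneratorSpeechInferenceActionTranscribeRealtime = ActionWithParams & {
|
|
85
|
-
__actionName: 'GENERATOR_SPEECH_INFERENCE_TRANSCRIBE_REALTIME'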
|
|
86
|
-
params?: Array<
|
|
87
|
-
| {
|
|
88
|
-
input: 'prompt'
|
|
89
|
-
value?: string | DataLink | EventProperty
|
|
90
|
-
mapping?: string
|
|
91
|
-
}
|
|
92
|
-
| {
|
|
93
|
-
input: 'beamSize'
|
|
94
|
-
value?: number | DataLink | EventProperty
|
|
95
|
-
mapping?: string
|
|
96
|
-
}
|
|
97
|
-
| {
|
|
98
|
-
input: 'language'
|
|
99
|
-
value?: string | DataLink | EventProperty
|
|
100
|
-
mapping?: string
|
|
101
|
-
}
|
|
102
|
-
| {
|
|
103
|
-
input: 'translate'
|
|
104
|
-
value?: boolean | DataLink | EventProperty
|
|
105
|
-
mapping?: string
|
|
106
|
-
}
|
|
107
|
-
| {
|
|
108
|
-
input: 'realtimeAudioSeconds'
|
|
109
|
-
value?: number | DataLink | EventProperty
|
|
110
|
-
mapping?: string
|
|
111
|
-
}
|
|
112
|
-
| {
|
|
113
|
-
input: 'realtimeAudioSliceSeconds'
|
|
114
|
-
value?: number | DataLink | EventProperty
|
|
115
|
-
mapping?: string
|
|
116
|
-
}
|
|
117
|
-
| {
|
|
118
|
-
input: 'realtimeAudioMinSeconds'
|
|
119
|
-
value?: number | DataLink | EventProperty
|
|
120
|
-
mapping?: string
|
|
121
|
-
}
|
|
122
|
-
| {
|
|
123
|
-
input: 'realtimeSaveAudio'
|
|
124
|
-
value?: boolean | DataLink | EventProperty
|
|
125
|
-
mapping?: string
|
|
126
|
-
}
|
|
127
|
-
| {
|
|
128
|
-
input: 'realtimeVadEnabled'
|
|
129
|
-
value?: boolean | DataLink | EventProperty
|
|
130
|
-
mapping?: string
|
|
131
|
-
}
|
|
132
|
-
| {
|
|
133
|
-
input: 'realtimeVadMs'
|
|
134
|
-
value?: number | DataLink | EventProperty
|
|
135
|
-
mapping?: string
|
|
136
|
-
}
|
|
137
|
-
| {
|
|
138
|
-
input: 'realtimeVadThold'
|
|
139
|
-
value?: number | DataLink | EventProperty
|
|
140
|
-
mapping?: string
|
|
141
|
-
}
|
|
142
|
-
| {
|
|
143
|
-
input: 'realtimeVadFreqThold'
|
|
144
|
-
value?: number | DataLink | EventProperty
|
|
145
|
-
mapping?: string
|
|
146
|
-
}
|
|
147
|
-
>
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
/* [Deprecated] Stop transcribing microphone audio source */
|
|
151
|
-
export type GeneratorSpeechInferenceActionTranscribeRealtimeStop = Action & {
|
|
152
|
-
__actionName: 'GENERATOR_SPEECH_INFERENCE_TRANSCRIBE_REALTIME_STOP'
|
|
83
|
+
/* Stop current transcription */
|
|
84
|
+
export type GeneratorSpeechInferenceActionTranscribeStop = Action & {
|
|
85
|
+
__actionName: 'GENERATOR_SPEECH_INFERENCE_TRANSCRIBE_STOP'
|
|
153
86
|
}
|
|
154
87
|
|
|
155
88
|
/* Clear downloaded files (model, audio) & current jobs */
|
|
@@ -172,14 +105,7 @@ Default property:
|
|
|
172
105
|
"modelUseCoreML": false,
|
|
173
106
|
"modelUseGPU": true,
|
|
174
107
|
"modelUseFlashAttn": false,
|
|
175
|
-
"inferLanguage": "Auto"
|
|
176
|
-
"inferRealtimeAudioSeconds": 30,
|
|
177
|
-
"inferRealtimeAudioSliceSeconds": 30,
|
|
178
|
-
"inferRealtimeSaveAudio": false,
|
|
179
|
-
"inferRealtimeVadEnabled": false,
|
|
180
|
-
"inferRealtimeVadMs": 2000,
|
|
181
|
-
"inferRealtimeVadThold": 0.6,
|
|
182
|
-
"inferRealtimeVadFreqThold": 100
|
|
108
|
+
"inferLanguage": "Auto"
|
|
183
109
|
}
|
|
184
110
|
*/
|
|
185
111
|
property?: {
|
|
@@ -381,23 +307,6 @@ Default property:
|
|
|
381
307
|
inferFileUrl?: string | DataLink
|
|
382
308
|
/* MD5 of file to be inferred */
|
|
383
309
|
inferFileMd5?: string | DataLink
|
|
384
|
-
/* Record duration in seconds. For performance, the value less than 30 seconds is recommended. */
|
|
385
|
-
inferRealtimeAudioSeconds?: number | DataLink
|
|
386
|
-
/* Optimize audio transcription performance by slicing audio samples when `Realtime Audio Seconds` > 30. */
|
|
387
|
-
inferRealtimeAudioSliceSeconds?: number | DataLink
|
|
388
|
-
/* Min duration of audio to start transcribe. Min: 0.5 seconds, Max: `Realtime Audio Slice Seconds`, Default: 1 second */
|
|
389
|
-
inferRealtimeAudioMinSeconds?: number | DataLink
|
|
390
|
-
/* Save recorded audio to file, the file path will be output to outlet. */
|
|
391
|
-
inferRealtimeSaveAudio?: boolean | DataLink
|
|
392
|
-
/* Start transcribe on recording when the audio volume is greater than the threshold by using VAD (Voice Activity Detection).
|
|
393
|
-
The first VAD will be triggered after 2 second of recording. */
|
|
394
|
-
inferRealtimeVadEnabled?: boolean | DataLink
|
|
395
|
-
/* The length of the collected audio is used for VAD. (ms) */
|
|
396
|
-
inferRealtimeVadMs?: number | DataLink
|
|
397
|
-
/* VAD threshold */
|
|
398
|
-
inferRealtimeVadThold?: number | DataLink
|
|
399
|
-
/* Frequency to apply High-pass filter in VAD */
|
|
400
|
-
inferRealtimeVadFreqThold?: number | DataLink
|
|
401
310
|
/* Buttress connection settings for remote inference */
|
|
402
311
|
buttressConnectionSettings?:
|
|
403
312
|
| DataLink
|
|
package/types/generators/VadOnnx.ts
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/* Auto generated by build script */
|
|
2
|
+
import type { SwitchCondInnerStateCurrentCanvas, SwitchCondData, SwitchDef } from '../switch'
|
|
3
|
+
import type { Data, DataLink } from '../data'
|
|
4
|
+
import type {
|
|
5
|
+
Brick,
|
|
6
|
+
Generator,
|
|
7
|
+
EventAction,
|
|
8
|
+
ActionWithDataParams,
|
|
9
|
+
ActionWithParams,
|
|
10
|
+
Action,
|
|
11
|
+
EventProperty,
|
|
12
|
+
} from '../common'
|
|
13
|
+
|
|
14
|
+
/* Load the model */
|
|
15
|
+
export type GeneratorVadInferenceOnnxActionLoadModel = Action & {
|
|
16
|
+
__actionName: 'GENERATOR_ONNX_VAD_LOAD_MODEL'
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/* Detect speech in audio file */
|
|
20
|
+
export type GeneratorVadInferenceOnnxActionDetectFile = ActionWithParams & {
|
|
21
|
+
__actionName: 'GENERATOR_ONNX_VAD_DETECT_FILE'
|
|
22
|
+
params?: Array<
|
|
23
|
+
| {
|
|
24
|
+
input: 'fileUrl'
|
|
25
|
+
value?: string | DataLink | EventProperty
|
|
26
|
+
mapping?: string
|
|
27
|
+
}
|
|
28
|
+
| {
|
|
29
|
+
input: 'threshold'
|
|
30
|
+
value?: number | DataLink | EventProperty
|
|
31
|
+
mapping?: string
|
|
32
|
+
}
|
|
33
|
+
| {
|
|
34
|
+
input: 'minSpeechDurationMs'
|
|
35
|
+
value?: number | DataLink | EventProperty
|
|
36
|
+
mapping?: string
|
|
37
|
+
}
|
|
38
|
+
| {
|
|
39
|
+
input: 'minSilenceDurationMs'
|
|
40
|
+
value?: number | DataLink | EventProperty
|
|
41
|
+
mapping?: string
|
|
42
|
+
}
|
|
43
|
+
| {
|
|
44
|
+
input: 'maxSpeechDurationS'
|
|
45
|
+
value?: number | DataLink | EventProperty
|
|
46
|
+
mapping?: string
|
|
47
|
+
}
|
|
48
|
+
| {
|
|
49
|
+
input: 'speechPadMs'
|
|
50
|
+
value?: number | DataLink | EventProperty
|
|
51
|
+
mapping?: string
|
|
52
|
+
}
|
|
53
|
+
>
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/* Detect speech in audio data */
|
|
57
|
+
export type GeneratorVadInferenceOnnxActionDetectData = ActionWithParams & {
|
|
58
|
+
__actionName: 'GENERATOR_ONNX_VAD_DETECT_DATA'
|
|
59
|
+
params?: Array<
|
|
60
|
+
| {
|
|
61
|
+
input: 'data'
|
|
62
|
+
value?: any | EventProperty
|
|
63
|
+
mapping?: string
|
|
64
|
+
}
|
|
65
|
+
| {
|
|
66
|
+
input: 'threshold'
|
|
67
|
+
value?: number | DataLink | EventProperty
|
|
68
|
+
mapping?: string
|
|
69
|
+
}
|
|
70
|
+
| {
|
|
71
|
+
input: 'minSpeechDurationMs'
|
|
72
|
+
value?: number | DataLink | EventProperty
|
|
73
|
+
mapping?: string
|
|
74
|
+
}
|
|
75
|
+
| {
|
|
76
|
+
input: 'minSilenceDurationMs'
|
|
77
|
+
value?: number | DataLink | EventProperty
|
|
78
|
+
mapping?: string
|
|
79
|
+
}
|
|
80
|
+
| {
|
|
81
|
+
input: 'maxSpeechDurationS'
|
|
82
|
+
value?: number | DataLink | EventProperty
|
|
83
|
+
mapping?: string
|
|
84
|
+
}
|
|
85
|
+
| {
|
|
86
|
+
input: 'speechPadMs'
|
|
87
|
+
value?: number | DataLink | EventProperty
|
|
88
|
+
mapping?: string
|
|
89
|
+
}
|
|
90
|
+
>
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/* Clean cache */
|
|
94
|
+
export type GeneratorVadInferenceOnnxActionCleanCache = Action & {
|
|
95
|
+
__actionName: 'GENERATOR_ONNX_VAD_CLEAN_CACHE'
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/* Release context */
|
|
99
|
+
export type GeneratorVadInferenceOnnxActionReleaseContext = Action & {
|
|
100
|
+
__actionName: 'GENERATOR_ONNX_VAD_RELEASE_CONTEXT'
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
interface GeneratorVadInferenceOnnxDef {
|
|
104
|
+
/*
|
|
105
|
+
Default property:
|
|
106
|
+
{
|
|
107
|
+
"modelType": "auto",
|
|
108
|
+
"detectThreshold": 0.5,
|
|
109
|
+
"detectMinSpeechDurationMs": 250,
|
|
110
|
+
"detectMinSilenceDurationMs": 100,
|
|
111
|
+
"detectMaxSpeechDurationS": 30,
|
|
112
|
+
"detectSpeechPadMs": 30,
|
|
113
|
+
"executionMode": "sequential"
|
|
114
|
+
}
|
|
115
|
+
*/
|
|
116
|
+
property?: {
|
|
117
|
+
/* Initialize the VAD context on generator initialization */
|
|
118
|
+
init?: boolean | DataLink
|
|
119
|
+
/* VAD model */
|
|
120
|
+
model?: string | DataLink
|
|
121
|
+
/* Model type (auto-detected from config.json) */
|
|
122
|
+
modelType?: string | DataLink
|
|
123
|
+
/* Quantize type */
|
|
124
|
+
quantizeType?:
|
|
125
|
+
| 'auto'
|
|
126
|
+
| 'none'
|
|
127
|
+
| 'fp16'
|
|
128
|
+
| 'q8'
|
|
129
|
+
| 'int8'
|
|
130
|
+
| 'uint8'
|
|
131
|
+
| 'q4'
|
|
132
|
+
| 'bnb4'
|
|
133
|
+
| 'q4f16'
|
|
134
|
+
| DataLink
|
|
135
|
+
/* Speech probability threshold (0.0-1.0) */
|
|
136
|
+
detectThreshold?: number | DataLink
|
|
137
|
+
/* Minimum speech duration in milliseconds */
|
|
138
|
+
detectMinSpeechDurationMs?: number | DataLink
|
|
139
|
+
/* Minimum silence duration in milliseconds */
|
|
140
|
+
detectMinSilenceDurationMs?: number | DataLink
|
|
141
|
+
/* Maximum speech duration in seconds */
|
|
142
|
+
detectMaxSpeechDurationS?: number | DataLink
|
|
143
|
+
/* Padding around speech segments in milliseconds */
|
|
144
|
+
detectSpeechPadMs?: number | DataLink
|
|
145
|
+
/* Executor candidates, descending order of priority
|
|
146
|
+
Default will be xnnpack, wasm, cpu */
|
|
147
|
+
executors?:
|
|
148
|
+
| Array<'qnn' | 'dml' | 'nnapi' | 'xnnpack' | 'coreml' | 'cpu' | 'wasm' | 'webgpu' | DataLink>
|
|
149
|
+
| DataLink
|
|
150
|
+
/* Execution mode
|
|
151
|
+
Usually when the model has many branches, setting this option to `parallel` will give you better performance. */
|
|
152
|
+
executionMode?: 'sequential' | 'parallel' | DataLink
|
|
153
|
+
/* QNN backend */
|
|
154
|
+
qnnBackend?: 'HTP' | 'HTA' | 'DSP' | 'GPU' | 'CPU' | DataLink
|
|
155
|
+
/* Enable FP16 for QNN HTP */
|
|
156
|
+
qnnHtpEnableFp16?: boolean | DataLink
|
|
157
|
+
/* Enable QNN debug */
|
|
158
|
+
qnnEnableDebug?: boolean | DataLink
|
|
159
|
+
}
|
|
160
|
+
events?: {
|
|
161
|
+
/* Event triggered when context state changes */
|
|
162
|
+
onContextStateChange?: Array<EventAction>
|
|
163
|
+
/* Event triggered when error occurs */
|
|
164
|
+
onError?: Array<EventAction>
|
|
165
|
+
/* Event triggered when got detection result */
|
|
166
|
+
onDetected?: Array<EventAction>
|
|
167
|
+
}
|
|
168
|
+
outlets?: {
|
|
169
|
+
/* Context state */
|
|
170
|
+
contextState?: () => Data
|
|
171
|
+
/* Is detecting */
|
|
172
|
+
isDetecting?: () => Data
|
|
173
|
+
/* Detection segments result */
|
|
174
|
+
detectionSegments?: () => Data
|
|
175
|
+
/* Detection details */
|
|
176
|
+
detectionDetails?: () => Data
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/* Local Voice Activity Detection (VAD) inference based on [transformers.js](https://huggingface.co/docs/transformers.js)
|
|
181
|
+
You can use any compatible VAD model from HuggingFace (Silero VAD, smart-turn, etc.) */
|
|
182
|
+
export type GeneratorVadInferenceOnnx = Generator &
|
|
183
|
+
GeneratorVadInferenceOnnxDef & {
|
|
184
|
+
templateKey: 'GENERATOR_ONNX_VAD'
|
|
185
|
+
switches: Array<
|
|
186
|
+
SwitchDef &
|
|
187
|
+
GeneratorVadInferenceOnnxDef & {
|
|
188
|
+
conds?: Array<{
|
|
189
|
+
method: '==' | '!=' | '>' | '<' | '>=' | '<='
|
|
190
|
+
cond:
|
|
191
|
+
| SwitchCondInnerStateCurrentCanvas
|
|
192
|
+
| SwitchCondData
|
|
193
|
+
| {
|
|
194
|
+
__typename: 'SwitchCondInnerStateOutlet'
|
|
195
|
+
outlet: 'contextState' | 'isDetecting' | 'detectionSegments' | 'detectionDetails'
|
|
196
|
+
value: any
|
|
197
|
+
}
|
|
198
|
+
}>
|
|
199
|
+
}
|
|
200
|
+
>
|
|
201
|
+
}
|
|
package/types/generators/VadTraditional.ts
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/* Auto generated by build script */
|
|
2
|
+
import type { SwitchCondInnerStateCurrentCanvas, SwitchCondData, SwitchDef } from '../switch'
|
|
3
|
+
import type { Data, DataLink } from '../data'
|
|
4
|
+
import type {
|
|
5
|
+
Brick,
|
|
6
|
+
Generator,
|
|
7
|
+
EventAction,
|
|
8
|
+
ActionWithDataParams,
|
|
9
|
+
ActionWithParams,
|
|
10
|
+
Action,
|
|
11
|
+
EventProperty,
|
|
12
|
+
} from '../common'
|
|
13
|
+
|
|
14
|
+
/* Detect speech in audio file */
|
|
15
|
+
export type GeneratorVadInferenceTraditionalActionDetectFile = ActionWithParams & {
|
|
16
|
+
__actionName: 'GENERATOR_TRADITIONAL_VAD_DETECT_FILE'
|
|
17
|
+
params?: Array<{
|
|
18
|
+
input: 'fileUrl'
|
|
19
|
+
value?: string | DataLink | EventProperty
|
|
20
|
+
mapping?: string
|
|
21
|
+
}>
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/* Detect speech in audio data stream */
|
|
25
|
+
export type GeneratorVadInferenceTraditionalActionDetectData = ActionWithParams & {
|
|
26
|
+
__actionName: 'GENERATOR_TRADITIONAL_VAD_DETECT_DATA'
|
|
27
|
+
params?: Array<{
|
|
28
|
+
input: 'data'
|
|
29
|
+
value?: any | EventProperty
|
|
30
|
+
mapping?: string
|
|
31
|
+
}>
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
interface GeneratorVadInferenceTraditionalDef {
|
|
35
|
+
/*
|
|
36
|
+
Default property:
|
|
37
|
+
{
|
|
38
|
+
"detectVocalFreqMin": 75,
|
|
39
|
+
"detectVocalFreqMax": 900,
|
|
40
|
+
"detectThreshold": 0.5,
|
|
41
|
+
"detectMinSpeechDurationMs": 250,
|
|
42
|
+
"detectMinSilenceDurationMs": 100,
|
|
43
|
+
"detectMaxSpeechDurationS": 30,
|
|
44
|
+
"detectSpeechPadMs": 30
|
|
45
|
+
}
|
|
46
|
+
*/
|
|
47
|
+
property?: {
|
|
48
|
+
/* Minimum vocal frequency in Hz */
|
|
49
|
+
detectVocalFreqMin?: number | DataLink
|
|
50
|
+
/* Maximum vocal frequency in Hz */
|
|
51
|
+
detectVocalFreqMax?: number | DataLink
|
|
52
|
+
/* Volume threshold in dB */
|
|
53
|
+
detectVolumeThreshold?: number | DataLink
|
|
54
|
+
/* Speech probability threshold (0.0-1.0) - maps to frequency clarity */
|
|
55
|
+
detectThreshold?: number | DataLink
|
|
56
|
+
/* Minimum speech duration in milliseconds */
|
|
57
|
+
detectMinSpeechDurationMs?: number | DataLink
|
|
58
|
+
/* Minimum silence duration in milliseconds */
|
|
59
|
+
detectMinSilenceDurationMs?: number | DataLink
|
|
60
|
+
/* Maximum speech duration in seconds */
|
|
61
|
+
detectMaxSpeechDurationS?: number | DataLink
|
|
62
|
+
/* Padding around speech segments in milliseconds */
|
|
63
|
+
detectSpeechPadMs?: number | DataLink
|
|
64
|
+
/* The file URL or path to be analyzed */
|
|
65
|
+
detectFileUrl?: string | DataLink
|
|
66
|
+
/* MD5 of file to be analyzed */
|
|
67
|
+
detectFileMd5?: string | DataLink
|
|
68
|
+
}
|
|
69
|
+
events?: {
|
|
70
|
+
/* Event triggered when context state changes */
|
|
71
|
+
onContextStateChange?: Array<EventAction>
|
|
72
|
+
/* Event triggered when detection result is available */
|
|
73
|
+
onDetected?: Array<EventAction>
|
|
74
|
+
/* Event triggered when error occurs */
|
|
75
|
+
onError?: Array<EventAction>
|
|
76
|
+
}
|
|
77
|
+
outlets?: {
|
|
78
|
+
/* Context state */
|
|
79
|
+
contextState?: () => Data
|
|
80
|
+
/* Is detecting */
|
|
81
|
+
isDetecting?: () => Data
|
|
82
|
+
/* Is speaking (real-time) */
|
|
83
|
+
isSpeaking?: () => Data
|
|
84
|
+
/* Detection segments result */
|
|
85
|
+
detectionSegments?: () => Data
|
|
86
|
+
/* Current volume in dB */
|
|
87
|
+
currentVolume?: () => Data
|
|
88
|
+
/* Current frequency clarity (0-1) */
|
|
89
|
+
currentClarity?: () => Data
|
|
90
|
+
/* Current detected frequency in Hz */
|
|
91
|
+
currentFrequency?: () => Data
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/* Traditional Voice Activity Detection (VAD) using pitch detection and RMS volume analysis
|
|
96
|
+
No model download required - pure algorithmic approach */
|
|
97
|
+
export type GeneratorVadInferenceTraditional = Generator &
|
|
98
|
+
GeneratorVadInferenceTraditionalDef & {
|
|
99
|
+
templateKey: 'GENERATOR_TRADITIONAL_VAD'
|
|
100
|
+
switches: Array<
|
|
101
|
+
SwitchDef &
|
|
102
|
+
GeneratorVadInferenceTraditionalDef & {
|
|
103
|
+
conds?: Array<{
|
|
104
|
+
method: '==' | '!=' | '>' | '<' | '>=' | '<='
|
|
105
|
+
cond:
|
|
106
|
+
| SwitchCondInnerStateCurrentCanvas
|
|
107
|
+
| SwitchCondData
|
|
108
|
+
| {
|
|
109
|
+
__typename: 'SwitchCondInnerStateOutlet'
|
|
110
|
+
outlet:
|
|
111
|
+
| 'contextState'
|
|
112
|
+
| 'isDetecting'
|
|
113
|
+
| 'isSpeaking'
|
|
114
|
+
| 'detectionSegments'
|
|
115
|
+
| 'currentVolume'
|
|
116
|
+
| 'currentClarity'
|
|
117
|
+
| 'currentFrequency'
|
|
118
|
+
value: any
|
|
119
|
+
}
|
|
120
|
+
}>
|
|
121
|
+
}
|
|
122
|
+
>
|
|
123
|
+
}
|
|
package/types/generators/index.ts
CHANGED
|
@@ -33,9 +33,11 @@ export * from './ThermalPrinter'
|
|
|
33
33
|
export * from './SqLite'
|
|
34
34
|
export * from './McpServer'
|
|
35
35
|
export * from './Mcp'
|
|
36
|
+
export * from './VadTraditional'
|
|
36
37
|
export * from './TextToSpeechOnnx'
|
|
37
38
|
export * from './LlmOnnx'
|
|
38
39
|
export * from './SpeechToTextOnnx'
|
|
40
|
+
export * from './VadOnnx'
|
|
39
41
|
export * from './SpeechToTextGgml'
|
|
40
42
|
export * from './VadGgml'
|
|
41
43
|
export * from './RealtimeTranscription'
|
package/utils/event-props.ts
CHANGED
|
@@ -733,6 +733,18 @@ export const templateEventPropsMap = {
|
|
|
733
733
|
'GENERATOR_MCP_ERROR_MESSAGE', // type: string
|
|
734
734
|
],
|
|
735
735
|
},
|
|
736
|
+
GENERATOR_TRADITIONAL_VAD: {
|
|
737
|
+
onContextStateChange: [
|
|
738
|
+
'GENERATOR_TRADITIONAL_VAD_CONTEXT_STATE', // type: string
|
|
739
|
+
],
|
|
740
|
+
onDetected: [
|
|
741
|
+
'GENERATOR_TRADITIONAL_VAD_DETECTION_SEGMENTS', // type: array
|
|
742
|
+
'GENERATOR_TRADITIONAL_VAD_DETECTION_TIME', // type: number
|
|
743
|
+
],
|
|
744
|
+
onError: [
|
|
745
|
+
'GENERATOR_TRADITIONAL_VAD_ERROR', // type: string
|
|
746
|
+
],
|
|
747
|
+
},
|
|
736
748
|
GENERATOR_TTS: {
|
|
737
749
|
onContextStateChange: [
|
|
738
750
|
'GENERATOR_TTS_CONTEXT_STATE', // type: string
|
|
@@ -767,6 +779,18 @@ export const templateEventPropsMap = {
|
|
|
767
779
|
'GENERATOR_ONNX_STT_ERROR', // type: string
|
|
768
780
|
],
|
|
769
781
|
},
|
|
782
|
+
GENERATOR_ONNX_VAD: {
|
|
783
|
+
onContextStateChange: [
|
|
784
|
+
'GENERATOR_ONNX_VAD_CONTEXT_STATE', // type: string
|
|
785
|
+
],
|
|
786
|
+
onError: [
|
|
787
|
+
'GENERATOR_ONNX_VAD_ERROR', // type: string
|
|
788
|
+
],
|
|
789
|
+
onDetected: [
|
|
790
|
+
'GENERATOR_ONNX_VAD_DETECTION_SEGMENTS', // type: array
|
|
791
|
+
'GENERATOR_ONNX_VAD_DETECTION_TIME', // type: number
|
|
792
|
+
],
|
|
793
|
+
},
|
|
770
794
|
GENERATOR_SPEECH_INFERENCE: {
|
|
771
795
|
onContextStateChange: [
|
|
772
796
|
'GENERATOR_SPEECH_INFERENCE_CONTEXT_STATE', // type: string
|