@storyteller-platform/ghost-story 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/bin.cjs +47 -47
- package/dist/cli/bin.js +3 -3
- package/dist/cli/config.d.cts +59 -9
- package/dist/cli/config.d.ts +59 -9
- package/dist/index.cjs +7 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +8 -0
- package/dist/recognition/WhisperServerSTT.cjs +46 -4
- package/dist/recognition/WhisperServerSTT.js +47 -4
- package/dist/utilities/WhisperTimeline.cjs +22 -8
- package/dist/utilities/WhisperTimeline.js +22 -8
- package/package.json +2 -2
package/dist/cli/bin.cjs
CHANGED
@@ -25,14 +25,14 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
 var import_zli = require("@robingenz/zli");
 var import_cli_progress = require("cli-progress");
 var import_fs_extra = require("fs-extra");
-var
+var import_v4 = require("zod/v4");
 var import_constants = require("../constants.cjs");
 var import_config = require("./config.cjs");
 var import_install = require("./install.cjs");
 var import_whisper_server = require("./whisper-server.cjs");
-const recognitionEngineSchema =
-const whisperModelSchema =
-const buildVariantSchema =
+const recognitionEngineSchema = import_v4.z.enum(import_constants.RECOGNITION_ENGINES);
+const whisperModelSchema = import_v4.z.enum(import_constants.WHISPER_MODELS);
+const buildVariantSchema = import_v4.z.enum(import_constants.BUILD_VARIANTS);
 function isInstallTarget(value) {
   return ["binary", "model", "vad", "all"].includes(value);
 }
@@ -44,19 +44,19 @@ Usage:
   ghost-story install model <model> - Install a whisper model
   ghost-story install vad - Install Silero VAD model
   ghost-story install all - Install binary, all models, and VAD`,
-  args:
-
-
+  args: import_v4.z.union([
+    import_v4.z.tuple([
+      import_v4.z.enum(["binary", "model", "vad", "all"]).describe("What to install: binary, model, vad, or all")
     ]),
-
-
-
+    import_v4.z.tuple([
+      import_v4.z.enum(["binary", "model", "vad", "all"]).describe("What to install: binary, model, vad, or all"),
+      import_v4.z.string().optional().describe("Variant (for binary) or model name (for model)")
     ])
   ]),
   options: (0, import_zli.defineOptions)(
-
-    force:
-    list:
+    import_v4.z.object({
+      force: import_v4.z.boolean().default(false).describe("Force installation even if platform doesn't match"),
+      list: import_v4.z.boolean().default(false).describe("List available variants or models")
     }),
     { f: "force", l: "list" }
   ),
@@ -156,8 +156,8 @@ const statusCommand = (0, import_zli.defineCommand)({
 });
 const transcribeCommand = (0, import_zli.defineCommand)({
   description: "Transcribe a single file with whisper.cpp",
-  args:
-
+  args: import_v4.z.tuple([import_v4.z.string().describe("Input audio file path")], {
+    error: (issue) => {
       if (issue.code === "too_small") {
         return {
           message: "Input audio file path is required"
@@ -169,23 +169,23 @@ const transcribeCommand = (0, import_zli.defineCommand)({
     }
   }),
   options: (0, import_zli.defineOptions)(
-
-    output:
-    language:
+    import_v4.z.object({
+      output: import_v4.z.string().optional().describe("Output file path for transcription (JSON)"),
+      language: import_v4.z.string().default("en-US").describe(
         "BCP 47 language tag representing the primary language of the audio (e.g. en-US)"
       ),
       engine: recognitionEngineSchema.default("whisper.cpp").describe("Speech-to-text engine"),
-    model:
-    threads:
-    processors:
-    noProgress:
-    noAutoInstall:
-    deepgramApiKey:
-    googleApiKey:
-    amazonRegion:
-    amazonBucketName:
-    amazonAccessKeyId:
-    amazonSecretAccessKey:
+      model: import_v4.z.string().default("tiny.en").describe("Transcription model"),
+      threads: import_v4.z.coerce.number().default(4).describe("Number of threads"),
+      processors: import_v4.z.coerce.number().default(1).describe("Number of processors"),
+      noProgress: import_v4.z.boolean().default(false).describe("Disable the progress bar"),
+      noAutoInstall: import_v4.z.boolean().default(false).describe("Don't auto-install missing binary/model"),
+      deepgramApiKey: import_v4.z.string().optional().describe("Deepgram API key"),
+      googleApiKey: import_v4.z.string().optional().describe("Google Cloud API key"),
+      amazonRegion: import_v4.z.string().optional().describe("AWS region code"),
+      amazonBucketName: import_v4.z.string().optional().describe("The AWS s3 bucket to upload the provided audio to"),
+      amazonAccessKeyId: import_v4.z.string().optional().describe("AWS access key ID"),
+      amazonSecretAccessKey: import_v4.z.string().optional().describe("AWS secret access key")
     }),
     { m: "model", p: "processors", t: "threads" }
   ),
@@ -248,18 +248,18 @@ Transcription written to ${outputPath}`);
 const serverCommand = (0, import_zli.defineCommand)({
   description: "Start a whisper.cpp transcription server",
   options: (0, import_zli.defineOptions)(
-
+    import_v4.z.object({
       model: whisperModelSchema.default("tiny.en").describe("Whisper model"),
-    port:
-    host:
-    threads:
-    processors:
-    noConvert:
-    noAutoInstall:
+      port: import_v4.z.coerce.number().default(8080).describe("Port to listen on"),
+      host: import_v4.z.string().default("0.0.0.0").describe("Host to bind to"),
+      threads: import_v4.z.coerce.number().default(4).describe("Number of threads"),
+      processors: import_v4.z.coerce.number().default(1).describe("Number of processors"),
+      noConvert: import_v4.z.boolean().default(false).describe("Disable automatic audio conversion"),
+      noAutoInstall: import_v4.z.boolean().default(false).describe("Don't auto-install missing binary/model"),
       variant: buildVariantSchema.optional().describe("Use specific binary variant"),
-    force:
-    vadModel:
-    vadThreshold:
+      force: import_v4.z.boolean().default(false).describe("Force running even if platform doesn't match"),
+      vadModel: import_v4.z.string().optional().describe("Path to VAD model for voice activity detection"),
+      vadThreshold: import_v4.z.coerce.number().optional().describe("VAD threshold probability (0.0-1.0)")
     }),
     { m: "model", p: "port", t: "threads", f: "force" }
   ),
@@ -293,8 +293,8 @@ const serverCommand = (0, import_zli.defineCommand)({
 });
 const vadCommand = (0, import_zli.defineCommand)({
   description: "Run voice activity detection on an audio file",
-  args:
-
+  args: import_v4.z.tuple([import_v4.z.string().describe("Input audio file path")], {
+    error: (issue) => {
       if (issue.code === "too_small") {
         return {
           message: "Input audio file path is required"
@@ -306,12 +306,12 @@ const vadCommand = (0, import_zli.defineCommand)({
     }
   }),
   options: (0, import_zli.defineOptions)(
-
-    output:
-    threshold:
-    minSpeechDuration:
-    minSilenceDuration:
-    speechPad:
+    import_v4.z.object({
+      output: import_v4.z.string().optional().describe("Output file path for VAD segments (JSON)"),
+      threshold: import_v4.z.coerce.number().default(0.5).describe("Speech detection threshold (0.0-1.0)"),
+      minSpeechDuration: import_v4.z.coerce.number().default(250).describe("Minimum speech duration in ms"),
+      minSilenceDuration: import_v4.z.coerce.number().default(100).describe("Minimum silence duration in ms"),
+      speechPad: import_v4.z.coerce.number().default(30).describe("Speech padding in ms")
    }),
    { o: "output" }
  ),
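
The bin.cjs changes above are a zod v3 -> v4 migration: the import moves to the "zod/v4" entry point, and custom messages move into v4's single error callback (bin.js below mirrors the same change in ESM form). A minimal TypeScript sketch of the pattern, assuming a zod 4.x install; it mirrors the diff and is not the package's bundled source:

import { z } from "zod/v4";

// v4 enums and tuples keep the v3 builder shape...
const installTarget = z
  .enum(["binary", "model", "vad", "all"])
  .describe("What to install: binary, model, vad, or all");

// ...but custom messages now go through one `error` callback instead of
// v3's errorMap / invalid_type_error / required_error parameters.
const args = z.tuple([z.string().describe("Input audio file path")], {
  error: (issue) => {
    if (issue.code === "too_small") {
      return { message: "Input audio file path is required" };
    }
    return undefined; // defer to zod's default message
  }
});

args.parse([]); // throws: "Input audio file path is required"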
package/dist/cli/bin.js
CHANGED
@@ -7,7 +7,7 @@ import {
 } from "@robingenz/zli";
 import { Presets, SingleBar } from "cli-progress";
 import { ensureDirSync } from "fs-extra";
-import { z } from "zod";
+import { z } from "zod/v4";
 import {
   BUILD_VARIANTS,
   RECOGNITION_ENGINES,
@@ -157,7 +157,7 @@ const statusCommand = defineCommand({
 const transcribeCommand = defineCommand({
   description: "Transcribe a single file with whisper.cpp",
   args: z.tuple([z.string().describe("Input audio file path")], {
-
+    error: (issue) => {
       if (issue.code === "too_small") {
         return {
           message: "Input audio file path is required"
@@ -294,7 +294,7 @@ const serverCommand = defineCommand({
 const vadCommand = defineCommand({
   description: "Run voice activity detection on an audio file",
   args: z.tuple([z.string().describe("Input audio file path")], {
-
+    error: (issue) => {
       if (issue.code === "too_small") {
         return {
           message: "Input audio file path is required"
package/dist/cli/config.d.cts
CHANGED
@@ -48,15 +48,65 @@ declare function resolveVariant(requestedVariant?: BuildVariant): BuildVariant;
 declare function isValidModel(model: string): model is WhisperModel;
 declare function isValidVariant(variant: string): variant is BuildVariant;
 declare const cliConfigSchema: z.ZodObject<{
-    lastUsedModel: z.ZodNullable<z.ZodEnum<
-
-
-
-
-
-
-
-
+    lastUsedModel: z.ZodNullable<z.ZodEnum<{
+        tiny: "tiny";
+        "tiny.en": "tiny.en";
+        "tiny-q5_1": "tiny-q5_1";
+        "tiny.en-q5_1": "tiny.en-q5_1";
+        "tiny-q8_0": "tiny-q8_0";
+        base: "base";
+        "base.en": "base.en";
+        "base-q5_1": "base-q5_1";
+        "base.en-q5_1": "base.en-q5_1";
+        "base-q8_0": "base-q8_0";
+        small: "small";
+        "small.en": "small.en";
+        "small-q5_1": "small-q5_1";
+        "small.en-q5_1": "small.en-q5_1";
+        "small-q8_0": "small-q8_0";
+        medium: "medium";
+        "medium.en": "medium.en";
+        "medium-q5_0": "medium-q5_0";
+        "medium.en-q5_0": "medium.en-q5_0";
+        "medium-q8_0": "medium-q8_0";
+        "large-v1": "large-v1";
+        "large-v2": "large-v2";
+        "large-v2-q5_0": "large-v2-q5_0";
+        "large-v2-q8_0": "large-v2-q8_0";
+        "large-v3": "large-v3";
+        "large-v3-q5_0": "large-v3-q5_0";
+        "large-v3-turbo": "large-v3-turbo";
+        "large-v3-turbo-q5_0": "large-v3-turbo-q5_0";
+        "large-v3-turbo-q8_0": "large-v3-turbo-q8_0";
+    }>>;
+    installedVariant: z.ZodNullable<z.ZodEnum<{
+        "darwin-arm64-coreml": "darwin-arm64-coreml";
+        "darwin-arm64-cpu": "darwin-arm64-cpu";
+        "darwin-x64-cpu": "darwin-x64-cpu";
+        "linux-x64-blas": "linux-x64-blas";
+        "linux-x64-cpu": "linux-x64-cpu";
+        "linux-x64-cuda-13.1.0": "linux-x64-cuda-13.1.0";
+        "linux-x64-cuda-12.9.0": "linux-x64-cuda-12.9.0";
+        "linux-x64-cuda-11.8.0": "linux-x64-cuda-11.8.0";
+        "linux-x64-sycl": "linux-x64-sycl";
+        "linux-x64-vulkan": "linux-x64-vulkan";
+        "linux-x64-rocm": "linux-x64-rocm";
+        "linux-x64-cuda-13.1.0-legacy": "linux-x64-cuda-13.1.0-legacy";
+        "linux-x64-cuda-12.9.0-legacy": "linux-x64-cuda-12.9.0-legacy";
+        "linux-x64-cuda-11.8.0-legacy": "linux-x64-cuda-11.8.0-legacy";
+        "linux-x64-sycl-legacy": "linux-x64-sycl-legacy";
+        "linux-x64-vulkan-legacy": "linux-x64-vulkan-legacy";
+        "linux-x64-rocm-legacy": "linux-x64-rocm-legacy";
+        "linux-x64-blas-legacy": "linux-x64-blas-legacy";
+        "linux-x64-cpu-legacy": "linux-x64-cpu-legacy";
+        "linux-arm64-cpu": "linux-arm64-cpu";
+        "windows-x64-cpu": "windows-x64-cpu";
+        "windows-x64-cuda-13.1.0": "windows-x64-cuda-13.1.0";
+        "windows-x64-cuda-12.9.0": "windows-x64-cuda-12.9.0";
+        "windows-x64-cuda-11.8.0": "windows-x64-cuda-11.8.0";
+        "windows-x64-vulkan": "windows-x64-vulkan";
+    }>>;
+}, z.z.core.$strip>;
 /**
  * Only to be used by the CLI, not the API/programmatic use.
  * Mostly to remember the last used model and variant.
package/dist/cli/config.d.ts
CHANGED
@@ -48,15 +48,65 @@ declare function resolveVariant(requestedVariant?: BuildVariant): BuildVariant;
 declare function isValidModel(model: string): model is WhisperModel;
 declare function isValidVariant(variant: string): variant is BuildVariant;
 declare const cliConfigSchema: z.ZodObject<{
-    lastUsedModel: z.ZodNullable<z.ZodEnum<
-
-
-
-
-
-
-
-
+    lastUsedModel: z.ZodNullable<z.ZodEnum<{
+        tiny: "tiny";
+        "tiny.en": "tiny.en";
+        "tiny-q5_1": "tiny-q5_1";
+        "tiny.en-q5_1": "tiny.en-q5_1";
+        "tiny-q8_0": "tiny-q8_0";
+        base: "base";
+        "base.en": "base.en";
+        "base-q5_1": "base-q5_1";
+        "base.en-q5_1": "base.en-q5_1";
+        "base-q8_0": "base-q8_0";
+        small: "small";
+        "small.en": "small.en";
+        "small-q5_1": "small-q5_1";
+        "small.en-q5_1": "small.en-q5_1";
+        "small-q8_0": "small-q8_0";
+        medium: "medium";
+        "medium.en": "medium.en";
+        "medium-q5_0": "medium-q5_0";
+        "medium.en-q5_0": "medium.en-q5_0";
+        "medium-q8_0": "medium-q8_0";
+        "large-v1": "large-v1";
+        "large-v2": "large-v2";
+        "large-v2-q5_0": "large-v2-q5_0";
+        "large-v2-q8_0": "large-v2-q8_0";
+        "large-v3": "large-v3";
+        "large-v3-q5_0": "large-v3-q5_0";
+        "large-v3-turbo": "large-v3-turbo";
+        "large-v3-turbo-q5_0": "large-v3-turbo-q5_0";
+        "large-v3-turbo-q8_0": "large-v3-turbo-q8_0";
+    }>>;
+    installedVariant: z.ZodNullable<z.ZodEnum<{
+        "darwin-arm64-coreml": "darwin-arm64-coreml";
+        "darwin-arm64-cpu": "darwin-arm64-cpu";
+        "darwin-x64-cpu": "darwin-x64-cpu";
+        "linux-x64-blas": "linux-x64-blas";
+        "linux-x64-cpu": "linux-x64-cpu";
+        "linux-x64-cuda-13.1.0": "linux-x64-cuda-13.1.0";
+        "linux-x64-cuda-12.9.0": "linux-x64-cuda-12.9.0";
+        "linux-x64-cuda-11.8.0": "linux-x64-cuda-11.8.0";
+        "linux-x64-sycl": "linux-x64-sycl";
+        "linux-x64-vulkan": "linux-x64-vulkan";
+        "linux-x64-rocm": "linux-x64-rocm";
+        "linux-x64-cuda-13.1.0-legacy": "linux-x64-cuda-13.1.0-legacy";
+        "linux-x64-cuda-12.9.0-legacy": "linux-x64-cuda-12.9.0-legacy";
+        "linux-x64-cuda-11.8.0-legacy": "linux-x64-cuda-11.8.0-legacy";
+        "linux-x64-sycl-legacy": "linux-x64-sycl-legacy";
+        "linux-x64-vulkan-legacy": "linux-x64-vulkan-legacy";
+        "linux-x64-rocm-legacy": "linux-x64-rocm-legacy";
+        "linux-x64-blas-legacy": "linux-x64-blas-legacy";
+        "linux-x64-cpu-legacy": "linux-x64-cpu-legacy";
+        "linux-arm64-cpu": "linux-arm64-cpu";
+        "windows-x64-cpu": "windows-x64-cpu";
+        "windows-x64-cuda-13.1.0": "windows-x64-cuda-13.1.0";
+        "windows-x64-cuda-12.9.0": "windows-x64-cuda-12.9.0";
+        "windows-x64-cuda-11.8.0": "windows-x64-cuda-11.8.0";
+        "windows-x64-vulkan": "windows-x64-vulkan";
+    }>>;
+}, z.z.core.$strip>;
 /**
  * Only to be used by the CLI, not the API/programmatic use.
  * Mostly to remember the last used model and variant.
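
Both declaration files change for the same reason: zod v4 types ZodEnum over a record of literal key/value pairs rather than v3's tuple of strings, so the regenerated .d.ts inlines every whisper model and build variant, and object schemas carry v4's $strip marker for the default strip-unknown-keys mode. A sketch of how such a declaration arises, assuming plain zod 4.x; the constant below is an illustrative subset, not the package's full list:

import { z } from "zod/v4";

// A readonly tuple of string literals...
const WHISPER_MODELS = ["tiny", "tiny.en", "base", "base.en"] as const;

// ...yields a ZodEnum whose type parameter is a record, e.g.
// z.ZodEnum<{ tiny: "tiny"; "tiny.en": "tiny.en"; base: "base"; "base.en": "base.en" }>
const whisperModelSchema = z.enum(WHISPER_MODELS);

// Nullable + object wrapping mirrors the cliConfigSchema shape above.
const cliConfigSchema = z.object({
  lastUsedModel: whisperModelSchema.nullable()
});

type CliConfig = z.infer<typeof cliConfigSchema>;
// => { lastUsedModel: "tiny" | "tiny.en" | "base" | "base.en" | null }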
package/dist/index.cjs
CHANGED
@@ -72,6 +72,9 @@ __export(index_exports, {
   serviceCapabilities: () => import_audio.serviceCapabilities,
   setConversionMode: () => import_config2.setConversionMode,
   setTimingEnabled: () => import_config2.setTimingEnabled,
+  spacelessScriptPattern: () => import_SpacelessScripts.spacelessScriptPattern,
+  spacelessScripts: () => import_SpacelessScripts.spacelessScripts,
+  startsWithSpacelessScript: () => import_SpacelessScripts.startsWithSpacelessScript,
   toBuffer: () => import_audio.toBuffer,
   toFilePath: () => import_audio.toFilePath,
   toReadStream: () => import_audio.toReadStream,
@@ -87,6 +90,7 @@ var import_config2 = require("./config.cjs");
 var import_OpenAICloudSTT = require("./recognition/OpenAICloudSTT.cjs");
 var import_WhisperCppSTT = require("./recognition/WhisperCppSTT.cjs");
 var import_WhisperServerSTT = require("./recognition/WhisperServerSTT.cjs");
+var import_SpacelessScripts = require("./utilities/SpacelessScripts.cjs");
 var import_Timing = require("./utilities/Timing.cjs");
 var import_Silero = require("./vad/Silero.cjs");
 // Annotate the CommonJS export names for ESM import in node:
@@ -145,6 +149,9 @@ var import_Silero = require("./vad/Silero.cjs");
   serviceCapabilities,
   setConversionMode,
   setTimingEnabled,
+  spacelessScriptPattern,
+  spacelessScripts,
+  startsWithSpacelessScript,
   toBuffer,
   toFilePath,
   toReadStream,
package/dist/index.d.cts
CHANGED
@@ -9,6 +9,7 @@ export { OpenAICloudSTTOptions, RecognitionResult as OpenAIResult, inputPreferen
 export { WhisperCppModelId, WhisperCppOptions, RecognitionResult as WhisperCppResult, recognize as recognizeWhisperCpp, inputPreference as whisperCppInputPreference } from './recognition/WhisperCppSTT.cjs';
 export { WhisperServerOptions, RecognitionResult as WhisperServerResult, recognize as recognizeWhisperServer, inputPreference as whisperServerInputPreference } from './recognition/WhisperServerSTT.cjs';
 export { Timeline, TimelineEntry, TimelineEntryType } from './utilities/Timeline.cjs';
+export { spacelessScriptPattern, spacelessScripts, startsWithSpacelessScript } from './utilities/SpacelessScripts.cjs';
 export { AggregatedStats, PhaseTiming, Timing, TimingAggregator, TimingSpan, TimingSummary, createAggregator, createTiming, formatDuration, formatPercentage, formatSingleReport, printSingleReport } from './utilities/Timing.cjs';
 export { SileroOptions, VadSegment, detectVoiceActivity, ensureVadInstalled, segmentsToTimeline } from './vad/Silero.cjs';
 import 'node:fs';
package/dist/index.d.ts
CHANGED
@@ -9,6 +9,7 @@ export { OpenAICloudSTTOptions, RecognitionResult as OpenAIResult, inputPreferen
 export { WhisperCppModelId, WhisperCppOptions, RecognitionResult as WhisperCppResult, recognize as recognizeWhisperCpp, inputPreference as whisperCppInputPreference } from './recognition/WhisperCppSTT.js';
 export { WhisperServerOptions, RecognitionResult as WhisperServerResult, recognize as recognizeWhisperServer, inputPreference as whisperServerInputPreference } from './recognition/WhisperServerSTT.js';
 export { Timeline, TimelineEntry, TimelineEntryType } from './utilities/Timeline.js';
+export { spacelessScriptPattern, spacelessScripts, startsWithSpacelessScript } from './utilities/SpacelessScripts.js';
 export { AggregatedStats, PhaseTiming, Timing, TimingAggregator, TimingSpan, TimingSummary, createAggregator, createTiming, formatDuration, formatPercentage, formatSingleReport, printSingleReport } from './utilities/Timing.js';
 export { SileroOptions, VadSegment, detectVoiceActivity, ensureVadInstalled, segmentsToTimeline } from './vad/Silero.js';
 import 'node:fs';
package/dist/index.js
CHANGED
@@ -62,6 +62,11 @@ import {
   inputPreference as inputPreference3,
   recognize as recognize4
 } from "./recognition/WhisperServerSTT.js";
+import {
+  spacelessScriptPattern,
+  spacelessScripts,
+  startsWithSpacelessScript
+} from "./utilities/SpacelessScripts.js";
 import {
   Timing,
   TimingAggregator,
@@ -132,6 +137,9 @@ export {
   serviceCapabilities,
   setConversionMode,
   setTimingEnabled,
+  spacelessScriptPattern,
+  spacelessScripts,
+  startsWithSpacelessScript,
   toBuffer,
   toFilePath,
   toReadStream,
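
All four index entry points above now re-export three helpers from a new utilities/SpacelessScripts module. Only the names are visible in this diff, so the usage sketch below is hypothetical: it assumes startsWithSpacelessScript takes a string and returns a boolean, and that spacelessScriptPattern is a RegExp over scripts written without word spaces (CJK, Thai, and similar); the diff confirms none of these signatures.

import {
  spacelessScriptPattern,
  spacelessScripts,
  startsWithSpacelessScript
} from "@storyteller-platform/ghost-story";

// Assumed shapes; the diff exposes only the names.
if (startsWithSpacelessScript("日本語のテキスト")) {
  // e.g. avoid joining transcript entries with spaces for spaceless scripts
}
const hasSpacelessRun = spacelessScriptPattern.test("mixed 中文 text"); // assumption: RegExp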
package/dist/recognition/WhisperServerSTT.cjs
CHANGED
@@ -59,11 +59,49 @@ async function recognize(input, languageCode, timing, options) {
   if (languageCode) {
     form.append("language", languageCode);
   }
-  const
+  const baseUrl = opts.baseURL.replace(/\/+$/g, "");
+  const url = `${baseUrl}${opts.inferencePath}`;
   const headers = {};
   if (opts.apiKey) {
     headers["Authorization"] = `Bearer ${opts.apiKey}`;
   }
+  const configResponse = await fetch(`${baseUrl}/config`, {
+    headers,
+    dispatcher: (0, import_fetch.createTimeoutAgent)(opts.timeout)
+  });
+  let whisperConfig = null;
+  if (configResponse.ok) {
+    try {
+      const [config, audioLength] = await Promise.all([
+        configResponse.json(),
+        (0, import_audio.getAudioDuration)(filePath)
+      ]);
+      whisperConfig = {
+        ...config,
+        audioDuration: audioLength
+      };
+      const effectiveProcessors = (0, import_WhisperTimeline.calculateEffectiveProcessors)(
+        audioLength,
+        whisperConfig.maxProcessors
+      );
+      if (effectiveProcessors !== whisperConfig.processors) {
+        const configForm = new FormData();
+        configForm.append("processors", String(effectiveProcessors));
+        configForm.append("threads", String(whisperConfig.threads));
+        await fetch(`${baseUrl}/config`, {
+          method: "POST",
+          headers,
+          body: configForm
+        });
+        whisperConfig.processors = effectiveProcessors;
+      }
+    } catch (e) {
+      console.warn(
+        `Failed to get config from Whisper server, continuing with default config. If you aren't using ghost-story server, this is expected`,
+        e
+      );
+    }
+  }
   const response = await timing.timeAsync(
     "upload",
     async () => fetch(url, {
@@ -81,7 +119,8 @@ async function recognize(input, languageCode, timing, options) {
   const data = await response.json();
   const { timeline, transcript } = await extractTimelineAndTranscript(
     data,
-    filePath
+    filePath,
+    whisperConfig
   );
   if (!timeline) {
     throw new Error(
@@ -93,7 +132,7 @@ async function recognize(input, languageCode, timing, options) {
     await prepared.cleanup();
   }
 }
-async function extractTimelineAndTranscript(response, audioPath) {
+async function extractTimelineAndTranscript(response, audioPath, whisperConfig) {
   var _a, _b, _c, _d;
   if (response.segments.length === 0) {
     return { timeline: [], transcript: ((_a = response.text) == null ? void 0 : _a.trim()) ?? "" };
@@ -101,7 +140,10 @@ async function extractTimelineAndTranscript(response, audioPath) {
   const hasNestedWords = (((_c = (_b = response.segments[0]) == null ? void 0 : _b.words) == null ? void 0 : _c.length) ?? 0) > 0;
   if (hasNestedWords) {
     const rawSegments = (0, import_WhisperTimeline.parseWhisperServerOutput)(response.segments);
-    const splitBoundaries =
+    const splitBoundaries = (whisperConfig == null ? void 0 : whisperConfig.audioDuration) ? (0, import_WhisperTimeline.calculateWhisperSplits)(
+      whisperConfig.audioDuration,
+      whisperConfig.processors
+    ) : await detectSplitBoundaries(rawSegments, audioPath);
     const timeline2 = (0, import_WhisperTimeline.extractCorrectedTimeline)(rawSegments, { splitBoundaries });
     const transcript = timeline2.map((entry) => entry.text).join(" ");
     return { timeline: timeline2, transcript };
package/dist/recognition/WhisperServerSTT.js
CHANGED
@@ -9,6 +9,7 @@ import {
 } from "../audio/index.js";
 import { createTimeoutAgent } from "../fetch.js";
 import {
+  calculateEffectiveProcessors,
   calculateWhisperSplits,
   countProcessorBoundaries,
   extractCorrectedTimeline,
@@ -46,11 +47,49 @@ async function recognize(input, languageCode, timing, options) {
   if (languageCode) {
     form.append("language", languageCode);
   }
-  const
+  const baseUrl = opts.baseURL.replace(/\/+$/g, "");
+  const url = `${baseUrl}${opts.inferencePath}`;
   const headers = {};
   if (opts.apiKey) {
     headers["Authorization"] = `Bearer ${opts.apiKey}`;
   }
+  const configResponse = await fetch(`${baseUrl}/config`, {
+    headers,
+    dispatcher: createTimeoutAgent(opts.timeout)
+  });
+  let whisperConfig = null;
+  if (configResponse.ok) {
+    try {
+      const [config, audioLength] = await Promise.all([
+        configResponse.json(),
+        getAudioDuration(filePath)
+      ]);
+      whisperConfig = {
+        ...config,
+        audioDuration: audioLength
+      };
+      const effectiveProcessors = calculateEffectiveProcessors(
+        audioLength,
+        whisperConfig.maxProcessors
+      );
+      if (effectiveProcessors !== whisperConfig.processors) {
+        const configForm = new FormData();
+        configForm.append("processors", String(effectiveProcessors));
+        configForm.append("threads", String(whisperConfig.threads));
+        await fetch(`${baseUrl}/config`, {
+          method: "POST",
+          headers,
+          body: configForm
+        });
+        whisperConfig.processors = effectiveProcessors;
+      }
+    } catch (e) {
+      console.warn(
+        `Failed to get config from Whisper server, continuing with default config. If you aren't using ghost-story server, this is expected`,
+        e
+      );
+    }
+  }
   const response = await timing.timeAsync(
     "upload",
     async () => fetch(url, {
@@ -68,7 +107,8 @@ async function recognize(input, languageCode, timing, options) {
   const data = await response.json();
   const { timeline, transcript } = await extractTimelineAndTranscript(
     data,
-    filePath
+    filePath,
+    whisperConfig
   );
   if (!timeline) {
     throw new Error(
@@ -80,7 +120,7 @@ async function recognize(input, languageCode, timing, options) {
     await prepared.cleanup();
   }
 }
-async function extractTimelineAndTranscript(response, audioPath) {
+async function extractTimelineAndTranscript(response, audioPath, whisperConfig) {
   var _a, _b, _c, _d;
   if (response.segments.length === 0) {
     return { timeline: [], transcript: ((_a = response.text) == null ? void 0 : _a.trim()) ?? "" };
@@ -88,7 +128,10 @@ async function extractTimelineAndTranscript(response, audioPath) {
   const hasNestedWords = (((_c = (_b = response.segments[0]) == null ? void 0 : _b.words) == null ? void 0 : _c.length) ?? 0) > 0;
   if (hasNestedWords) {
     const rawSegments = parseWhisperServerOutput(response.segments);
-    const splitBoundaries =
+    const splitBoundaries = (whisperConfig == null ? void 0 : whisperConfig.audioDuration) ? calculateWhisperSplits(
+      whisperConfig.audioDuration,
+      whisperConfig.processors
+    ) : await detectSplitBoundaries(rawSegments, audioPath);
     const timeline2 = extractCorrectedTimeline(rawSegments, { splitBoundaries });
     const transcript = timeline2.map((entry) => entry.text).join(" ");
     return { timeline: timeline2, transcript };
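
Both WhisperServerSTT builds above gain the same behavior: before uploading audio, the client probes the server's /config endpoint, caps the processor count against the audio length via calculateEffectiveProcessors, POSTs the adjusted settings back as form fields, and passes the resulting whisperConfig through to extractTimelineAndTranscript so split boundaries can be computed exactly rather than detected heuristically. A condensed TypeScript sketch of that negotiation, assuming the /config response carries { processors, threads, maxProcessors } (the diff reads those fields but never declares a type):

// Condensed sketch of the negotiation added above; field names follow the diff.
interface WhisperServerConfig {
  processors: number;
  threads: number;
  maxProcessors: number;
  audioDuration?: number;
}

async function negotiateConfig(
  baseUrl: string,
  headers: Record<string, string>,
  audioDuration: number,
  calculateEffectiveProcessors: (duration: number, max: number) => number
): Promise<WhisperServerConfig | null> {
  const res = await fetch(`${baseUrl}/config`, { headers });
  if (!res.ok) return null; // likely not a ghost-story server; keep defaults

  const config: WhisperServerConfig = { ...(await res.json()), audioDuration };
  const effective = calculateEffectiveProcessors(audioDuration, config.maxProcessors);
  if (effective !== config.processors) {
    // The server accepts multipart form fields, not JSON, per the diff above.
    const form = new FormData();
    form.append("processors", String(effective));
    form.append("threads", String(config.threads));
    await fetch(`${baseUrl}/config`, { method: "POST", headers, body: form });
    config.processors = effective;
  }
  return config;
}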
package/dist/utilities/WhisperTimeline.cjs
CHANGED
@@ -302,14 +302,28 @@ function extractCorrectedTimeline(segments, options = {}) {
     if (!segment) continue;
     const segmentStart = segment.segmentStart;
     const segmentEnd = segment.segmentEnd < segment.segmentStart ? segment.segmentStart : segment.segmentEnd;
-
-
-
-
-
-
-
-
+    if (splitBoundaries.length > 0) {
+      const boundary = splitBoundaries.find((boundary2) => {
+        return Math.abs(boundary2 - segmentStart) < 2;
+      });
+      if (boundary) {
+        state.cumulativeOffset = getBetterCumulativeOffset(
+          state,
+          segment,
+          splitBoundaries,
+          usedSplits
+        );
+      }
+    } else {
+      const boundary = detectProcessorBoundary(segment, state);
+      if (boundary.isBoundary) {
+        state.cumulativeOffset = getBetterCumulativeOffset(
+          state,
+          segment,
+          splitBoundaries,
+          usedSplits
+        );
+      }
     }
     const nextSegment = segments[i + 1] ?? null;
     if (nextSegment && isTimeTravelingSegment(nextSegment)) {
package/dist/utilities/WhisperTimeline.js
CHANGED
@@ -272,14 +272,28 @@ function extractCorrectedTimeline(segments, options = {}) {
     if (!segment) continue;
     const segmentStart = segment.segmentStart;
     const segmentEnd = segment.segmentEnd < segment.segmentStart ? segment.segmentStart : segment.segmentEnd;
-
-
-
-
-
-
-
-
+    if (splitBoundaries.length > 0) {
+      const boundary = splitBoundaries.find((boundary2) => {
+        return Math.abs(boundary2 - segmentStart) < 2;
+      });
+      if (boundary) {
+        state.cumulativeOffset = getBetterCumulativeOffset(
+          state,
+          segment,
+          splitBoundaries,
+          usedSplits
+        );
+      }
+    } else {
+      const boundary = detectProcessorBoundary(segment, state);
+      if (boundary.isBoundary) {
+        state.cumulativeOffset = getBetterCumulativeOffset(
+          state,
+          segment,
+          splitBoundaries,
+          usedSplits
+        );
+      }
     }
     const nextSegment = segments[i + 1] ?? null;
     if (nextSegment && isTimeTravelingSegment(nextSegment)) {
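
In both WhisperTimeline builds above, extractCorrectedTimeline now prefers caller-supplied split boundaries (derived from the server config), treating a segment as a processor boundary when its start lies within 2 seconds of a known split, and falls back to the detectProcessorBoundary heuristic only when no boundaries were provided. The implementation of calculateWhisperSplits is not part of this diff; a hypothetical sketch, assuming whisper.cpp's multi-processor mode divides the audio into equal chunks:

// Hypothetical; the package's real calculateWhisperSplits is not shown here.
function calculateWhisperSplits(audioDuration: number, processors: number): number[] {
  // Assumption: with N processors, interior chunk starts fall at k * duration / N.
  const boundaries: number[] = [];
  for (let k = 1; k < processors; k++) {
    boundaries.push((audioDuration * k) / processors);
  }
  return boundaries;
}

// The matching rule itself comes straight from the diff: a segment start
// "hits" a boundary if it is within 2 seconds of one.
const nearBoundary = (boundaries: number[], segmentStart: number) =>
  boundaries.find((b) => Math.abs(b - segmentStart) < 2);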
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@storyteller-platform/ghost-story",
-  "version": "0.1.9",
+  "version": "0.1.11",
   "description": "An easy-to-use speech toolset. Fork of the original echogarden project.",
   "author": "Thomas F. K. Jorna",
   "license": "GPL-3.0",
@@ -81,7 +81,7 @@
     "openai": "~4.103.0",
     "tar": "^7.4.3",
     "undici": "7.14.0",
-    "zod": "^3.
+    "zod": "^4.3.6"
   },
   "devDependencies": {
     "@storyteller-platform/eslint": "0.1.0",