@remotion/whisper-web 4.0.302

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/.turbo/turbo-make.log +6 -0
  2. package/LICENSE.md +49 -0
  3. package/README.md +18 -0
  4. package/build-wasm.ts +117 -0
  5. package/bundle.ts +15 -0
  6. package/dist/can-use-whisper-web.d.ts +18 -0
  7. package/dist/can-use-whisper-web.js +80 -0
  8. package/dist/constants.d.ts +10 -0
  9. package/dist/constants.js +225 -0
  10. package/dist/db/delete-object.d.ts +3 -0
  11. package/dist/db/delete-object.js +13 -0
  12. package/dist/db/get-object-from-db.d.ts +10 -0
  13. package/dist/db/get-object-from-db.js +27 -0
  14. package/dist/db/open-db.d.ts +1 -0
  15. package/dist/db/open-db.js +53 -0
  16. package/dist/db/put-object.d.ts +4 -0
  17. package/dist/db/put-object.js +18 -0
  18. package/dist/delete-model.d.ts +2 -0
  19. package/dist/delete-model.js +6 -0
  20. package/dist/download-model.d.ts +5 -0
  21. package/dist/download-model.js +32 -0
  22. package/dist/download-whisper-model.d.ts +15 -0
  23. package/dist/download-whisper-model.js +48 -0
  24. package/dist/esm/index.mjs +652 -0
  25. package/dist/get-loaded-models.d.ts +2 -0
  26. package/dist/get-loaded-models.js +17 -0
  27. package/dist/get-model-url.d.ts +9 -0
  28. package/dist/get-model-url.js +10 -0
  29. package/dist/index.d.ts +14 -0
  30. package/dist/index.js +18 -0
  31. package/dist/load-mod/load-mod.d.ts +2 -0
  32. package/dist/load-mod/load-mod.js +6 -0
  33. package/dist/log.d.ts +10 -0
  34. package/dist/log.js +33 -0
  35. package/dist/mod.d.ts +6 -0
  36. package/dist/mod.js +1 -0
  37. package/dist/print-handler.d.ts +9 -0
  38. package/dist/print-handler.js +25 -0
  39. package/dist/resample-to-16khz.d.ts +8 -0
  40. package/dist/resample-to-16khz.js +66 -0
  41. package/dist/result.d.ts +53 -0
  42. package/dist/result.js +1 -0
  43. package/dist/simulate-progress.d.ts +9 -0
  44. package/dist/simulate-progress.js +53 -0
  45. package/dist/transcribe.d.ts +18 -0
  46. package/dist/transcribe.js +97 -0
  47. package/dist/transcription-speed.d.ts +3 -0
  48. package/dist/transcription-speed.js +13 -0
  49. package/emscripten.cpp +303 -0
  50. package/eslint.config.mjs +5 -0
  51. package/main.d.ts +46 -0
  52. package/main.js +3 -0
  53. package/package.json +52 -0
  54. package/src/can-use-whisper-web.ts +103 -0
  55. package/src/constants.ts +232 -0
  56. package/src/db/delete-object.ts +16 -0
  57. package/src/db/get-object-from-db.ts +43 -0
  58. package/src/db/open-db.ts +62 -0
  59. package/src/db/put-object.ts +27 -0
  60. package/src/delete-model.ts +8 -0
  61. package/src/download-model.ts +52 -0
  62. package/src/download-whisper-model.ts +86 -0
  63. package/src/get-loaded-models.ts +22 -0
  64. package/src/get-model-url.ts +13 -0
  65. package/src/index.module.ts +9 -0
  66. package/src/index.ts +72 -0
  67. package/src/load-mod/load-mod.ts +11 -0
  68. package/src/log.ts +41 -0
  69. package/src/mod.ts +13 -0
  70. package/src/print-handler.ts +39 -0
  71. package/src/resample-to-16khz.ts +105 -0
  72. package/src/result.ts +59 -0
  73. package/src/simulate-progress.ts +74 -0
  74. package/src/transcribe.ts +184 -0
  75. package/src/transcription-speed.ts +21 -0
  76. package/tsconfig.json +11 -0
  77. package/tsconfig.tsbuildinfo +1 -0
  78. package/worker.js +3 -0
package/emscripten.cpp ADDED
@@ -0,0 +1,303 @@
1
#include "whisper.h"

#include <emscripten.h>
#include <emscripten/bind.h>

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
8
+
9
+ std::thread g_worker; // Thread object that full_default assigns the transcription worker to.
10
+
11
// Returns the largest power of two that is <= n, or 0 when n < 1.
//
// The loop compares against n / 2 instead of doubling past n, so `p` never
// exceeds n and the multiplication cannot overflow a signed int (the previous
// form doubled p beyond INT_MAX for n >= 2^30, which is undefined behaviour).
static inline int mpow2(int n) {
    if (n < 1) {
        return 0;
    }

    int p = 1;
    while (p <= n / 2) {
        p *= 2;
    }
    return p;
}
16
+
17
// Returns a newly allocated copy of `str` that is safe to place between the
// double quotes of a JSON string literal.
//
//   - '"' and '\\' are prefixed with a backslash.
//   - Control characters (bytes below 0x20, e.g. newline or tab) are written
//     as \u00XX. JSON forbids them unescaped inside strings, and a raw newline
//     would also split the single-line messages this file prints.
//   - All other bytes (including UTF-8 multi-byte sequences) are copied as-is.
//
// Returns NULL when `str` is NULL or when the allocation fails.
// The caller owns the result and must release it with free().
static char * escape_double_quotes_and_backslashes(const char * str) {
    if (str == NULL) {
        return NULL;
    }

    const unsigned char * in = (const unsigned char *) str;

    // First pass: compute the exact output size (including the terminator).
    size_t escaped_length = 1;
    for (size_t i = 0; in[i] != '\0'; i++) {
        if (in[i] == '"' || in[i] == '\\') {
            escaped_length += 2; // backslash + character
        } else if (in[i] < 0x20) {
            escaped_length += 6; // \u00XX
        } else {
            escaped_length += 1;
        }
    }

    char * escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
    if (escaped == NULL) {
        return NULL;
    }

    static const char hex_digits[] = "0123456789abcdef";

    // Second pass: write the escaped bytes.
    size_t pos = 0;
    for (size_t i = 0; in[i] != '\0'; i++) {
        const unsigned char c = in[i];
        if (c == '"' || c == '\\') {
            escaped[pos++] = '\\';
            escaped[pos++] = (char) c;
        } else if (c < 0x20) {
            escaped[pos++] = '\\';
            escaped[pos++] = 'u';
            escaped[pos++] = '0';
            escaped[pos++] = '0';
            escaped[pos++] = hex_digits[c >> 4];
            escaped[pos++] = hex_digits[c & 0x0F];
        } else {
            escaped[pos++] = (char) c;
        }
    }

    // The terminator is already in place because calloc() zeroed the buffer.

    return escaped;
}
47
+
48
// Formats a time given in units of 10 ms as "HH:MM:SS<sep>mmm", where <sep>
// is "," when `comma` is true and "." otherwise.
//
//   to_timestamp(500,  false) -> "00:00:05.000"
//   to_timestamp(6000, true)  -> "00:01:00,000"
std::string to_timestamp(int64_t t, bool comma) {
    const int64_t ms_per_sec  = 1000;
    const int64_t ms_per_min  = 60 * ms_per_sec;
    const int64_t ms_per_hour = 60 * ms_per_min;

    const int64_t total_ms = t * 10;

    const int hours   = (int) (total_ms / ms_per_hour);
    const int minutes = (int) ((total_ms % ms_per_hour) / ms_per_min);
    const int seconds = (int) ((total_ms % ms_per_min) / ms_per_sec);
    const int millis  = (int) (total_ms % ms_per_sec);

    const char * separator = comma ? "," : ".";

    char formatted[32];
    snprintf(formatted, sizeof(formatted), "%02d:%02d:%02d%s%03d", hours, minutes, seconds, separator, millis);

    return std::string(formatted);
}
64
+
65
+ static bool output_json(
66
+ struct whisper_context * ctx,
67
+ bool final,
68
+ int first_segment,
69
+ int n_segments) {
70
+ int indent = 0;
71
+ std::string output;
72
+
73
+
74
+ auto doindent = [&]() {
75
+ for (int i = 0; i < indent; i++) output += "\t";
76
+ };
77
+
78
+ auto start_arr = [&](const char *name) {
79
+ doindent();
80
+ output += "\"" + std::string(name) + "\": [";
81
+ indent++;
82
+ };
83
+
84
+ auto end_arr = [&](bool end) {
85
+ indent--;
86
+ doindent();
87
+ output += (end ? "]" : "],");
88
+ };
89
+
90
+ auto start_obj = [&](const char *name) {
91
+ doindent();
92
+ if (name) {
93
+ output += "\"" + std::string(name) + "\": {";
94
+ } else {
95
+ output += "{";
96
+ }
97
+ indent++;
98
+ };
99
+
100
+ auto end_obj = [&](bool end) {
101
+ indent--;
102
+ doindent();
103
+ output += (end ? "}" : "},");
104
+ };
105
+
106
+ auto start_value = [&](const char *name) {
107
+ doindent();
108
+ output += "\"" + std::string(name) + "\": ";
109
+ };
110
+
111
+ auto value_s = [&](const char *name, const char *val, bool end) {
112
+ start_value(name);
113
+ char * val_escaped = escape_double_quotes_and_backslashes(val);
114
+ output += "\"" + std::string(val_escaped) + (end ? "\"" : "\",");
115
+ free(val_escaped);
116
+ };
117
+
118
+ auto end_value = [&](bool end) {
119
+ output += (end ? "" : ",");
120
+ };
121
+
122
+ auto value_i = [&](const char *name, const int64_t val, bool end) {
123
+ start_value(name);
124
+ output += std::to_string(val);
125
+ end_value(end);
126
+ };
127
+
128
+ auto value_f = [&](const char *name, const float val, bool end) {
129
+ start_value(name);
130
+ output += std::to_string(val);
131
+ end_value(end);
132
+ };
133
+
134
+ auto value_b = [&](const char *name, const bool val, bool end) {
135
+ start_value(name);
136
+ output += (val ? "true" : "false");
137
+ end_value(end);
138
+ };
139
+
140
+ auto times_o = [&](int64_t t0, int64_t t1, bool end) {
141
+ start_obj("timestamps");
142
+ value_s("from", to_timestamp(t0, true).c_str(), false);
143
+ value_s("to", to_timestamp(t1, true).c_str(), true);
144
+ end_obj(false);
145
+ start_obj("offsets");
146
+ value_i("from", t0 * 10, false);
147
+ value_i("to", t1 * 10, true);
148
+ end_obj(end);
149
+ };
150
+
151
+ start_obj(nullptr);
152
+ value_s("systeminfo", whisper_print_system_info(), false);
153
+ start_obj("model");
154
+ value_s("type", whisper_model_type_readable(ctx), false);
155
+ value_b("multilingual", whisper_is_multilingual(ctx), false);
156
+ value_i("vocab", whisper_model_n_vocab(ctx), false);
157
+ start_obj("audio");
158
+ value_i("ctx", whisper_model_n_audio_ctx(ctx), false);
159
+ value_i("state", whisper_model_n_audio_state(ctx), false);
160
+ value_i("head", whisper_model_n_audio_head(ctx), false);
161
+ value_i("layer", whisper_model_n_audio_layer(ctx), true);
162
+ end_obj(false);
163
+ start_obj("text");
164
+ value_i("ctx", whisper_model_n_text_ctx(ctx), false);
165
+ value_i("state", whisper_model_n_text_state(ctx), false);
166
+ value_i("head", whisper_model_n_text_head(ctx), false);
167
+ value_i("layer", whisper_model_n_text_layer(ctx), true);
168
+ end_obj(false);
169
+ value_i("mels", whisper_model_n_mels(ctx), false);
170
+ value_i("ftype", whisper_model_ftype(ctx), true);
171
+ end_obj(false);
172
+ start_obj("result");
173
+ value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
174
+ end_obj(false);
175
+ start_arr("transcription");
176
+
177
+ for (int i = first_segment; i < n_segments; ++i) {
178
+ const char * text = whisper_full_get_segment_text(ctx, i);
179
+
180
+ const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
181
+ const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
182
+
183
+ start_obj(nullptr);
184
+ times_o(t0, t1, false);
185
+ value_s("text", text, false);
186
+
187
+ start_arr("tokens");
188
+ const int n = whisper_full_n_tokens(ctx, i);
189
+ for (int j = 0; j < n; ++j) {
190
+ auto token = whisper_full_get_token_data(ctx, i, j);
191
+ start_obj(nullptr);
192
+ value_s("text", whisper_token_to_str(ctx, token.id), false);
193
+ if(token.t0 > -1 && token.t1 > -1) {
194
+ // If we have per-token timestamps, write them out
195
+ times_o(token.t0, token.t1, false);
196
+ }
197
+ value_i("id", token.id, false);
198
+ value_f("p", token.p, false);
199
+ value_f("t_dtw", token.t_dtw, true);
200
+ end_obj(j == (n - 1));
201
+ }
202
+ end_arr(true);
203
+
204
+ end_obj(i == (n_segments - 1));
205
+ }
206
+
207
+ end_arr(true);
208
+ end_obj(true);
209
+
210
+ if (final) {
211
+ printf("remotion_final:%s\n", output.c_str());
212
+ } else {
213
+ printf("remotion_update:%s\n", output.c_str());
214
+ }
215
+
216
+ return true;
217
+ }
218
+
219
+
220
+ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
221
+ const int n_segments = whisper_full_n_segments(ctx);
222
+ const int s0 = n_segments - n_new;
223
+
224
+ if (s0 == 0) {
225
+ printf("\n");
226
+ }
227
+
228
+ output_json(ctx, false, s0, n_segments);
229
+ }
230
+
231
// Progress callback: prints the reported value as "remotion_progress:<n>%"
// followed by a newline. The context, state and user-data arguments are unused.
void progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * /*user_data*/) {
    printf("remotion_progress:%d%%\n", progress);
}
235
+
236
+ std::vector<struct whisper_context *> g_contexts(1, nullptr);
237
+
238
+
239
+ EMSCRIPTEN_BINDINGS(whisper) {
240
+ emscripten::function("full_default", emscripten::optional_override([](const std::string & path_model, const emscripten::val & audio, const std::string & model, const std::string & lang, int nthreads, bool translate) {
241
+ if (g_contexts[0] != nullptr) {
242
+ printf("remotion_busy:\n");
243
+ return 0;
244
+ }
245
+
246
+ g_contexts[0] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
247
+
248
+ struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
249
+
250
+ std::vector<float> pcmf32;
251
+
252
+ params.print_realtime = false;
253
+ params.new_segment_callback = whisper_print_segment_callback;
254
+ params.print_progress = false;
255
+ params.print_timestamps = false;
256
+ params.print_special = false;
257
+ params.translate = translate;
258
+ params.token_timestamps = true;
259
+ params.language = lang.c_str(); // Convert std::string to const char*
260
+ params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
261
+ params.offset_ms = 0;
262
+ params.progress_callback = progress_callback; // Assigning the callback
263
+
264
+ const int n = audio["length"].as<int>();
265
+
266
+ emscripten::val heap = emscripten::val::module_property("HEAPU8");
267
+ emscripten::val memory = heap["buffer"];
268
+
269
+ pcmf32.resize(n);
270
+
271
+ emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(pcmf32.data()), n);
272
+ memoryView.call<void>("set", audio);
273
+
274
+ // Print system information
275
+ {
276
+ printf("system_info: n_threads = %d / %d | %s\n",
277
+ params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
278
+
279
+ printf("%s: processing %d samples, %.1f sec, %d threads, lang = %s, task = %s ...\n",
280
+ __func__, int(pcmf32.size()), float(pcmf32.size()) / WHISPER_SAMPLE_RATE,
281
+ params.n_threads,
282
+ params.language,
283
+ params.translate ? "translate" : "transcribe");
284
+
285
+ printf("\n");
286
+ }
287
+
288
+
289
+ // Run the worker
290
+ {
291
+ g_worker = std::thread([params, pcm = std::move(pcmf32)]() {
292
+ whisper_reset_timings(g_contexts[0]);
293
+ whisper_full(g_contexts[0], params, pcm.data(), pcm.size());
294
+ const int n_segments = whisper_full_n_segments(g_contexts[0]);
295
+ output_json(g_contexts[0], true, 0, n_segments);
296
+ whisper_free(g_contexts[0]);
297
+ g_contexts[0] = nullptr;
298
+ });
299
+ }
300
+
301
+ return 0;
302
+ }));
303
+ }
package/eslint.config.mjs ADDED
@@ -0,0 +1,5 @@
1
+ import {remotionFlatConfig} from '@remotion/eslint-config-internal';
2
+
3
// ESLint flat config: a single entry produced by the shared Remotion preset,
// with its `react` option set to false.
export default [remotionFlatConfig({react: false})];
package/main.d.ts ADDED
@@ -0,0 +1,46 @@
1
+ // TypeScript bindings for emscripten-generated code. Automatically generated at compile time.
2
+ declare namespace RuntimeExports { // Generated declarations; merged into MainModule via `typeof RuntimeExports`.
3
+ let print: any;
4
+ let printErr: any;
5
+ /**
6
+ * @param {string|null=} returnType
7
+ * @param {Array=} argTypes
8
+ * @param {Arguments|Array=} args
9
+ * @param {Object=} opts
10
+ */
11
+ function ccall(ident: any, returnType?: (string | null) | undefined, argTypes?: any[] | undefined, args?: (Arguments | any[]) | undefined, opts?: any | undefined): any;
12
+ /**
13
+ * @param {string=} returnType
14
+ * @param {Array=} argTypes
15
+ * @param {Object=} opts
16
+ */
17
+ function cwrap(ident: any, returnType?: string | undefined, argTypes?: any[] | undefined, opts?: any | undefined): (...args: any[]) => any;
18
+ let HEAPF32: any;
19
+ let HEAPF64: any;
20
+ let HEAP_DATA_VIEW: any;
21
+ let HEAP8: any;
22
+ let HEAPU8: any; // emscripten.cpp reads this module property and uses its `buffer` when copying audio samples.
23
+ let HEAP16: any;
24
+ let HEAPU16: any;
25
+ let HEAP32: any;
26
+ let HEAPU32: any;
27
+ let HEAP64: any;
28
+ let HEAPU64: any;
29
+ let FS_createPath: any;
30
+ function FS_createDataFile(parent: any, name: any, fileData: any, canRead: any, canWrite: any, canOwn: any): void;
31
+ function FS_createPreloadedFile(parent: any, name: any, url: any, canRead: any, canWrite: any, onload: any, onerror: any, dontCreateFile: any, canOwn: any, preFinish: any): void;
32
+ function FS_unlink(path: any): any;
33
+ let FS_createLazyFile: any;
34
+ let FS_createDevice: any;
35
+ let addRunDependency: any;
36
+ let removeRunDependency: any;
37
+ }
38
+ interface WasmModule { // Declares no members in this build.
39
+ }
40
+
41
+ type EmbindString = ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string; // Value types accepted for the string parameters of full_default.
42
+ interface EmbindModule { // Functions registered in emscripten.cpp's EMSCRIPTEN_BINDINGS(whisper) block.
43
+ full_default(_0: EmbindString, _1: any, _2: EmbindString, _3: EmbindString, _4: number, _5: boolean): number; // Arguments in order: path_model, audio, model, lang, nthreads, translate.
44
+ }
45
+
46
+ export type MainModule = WasmModule & typeof RuntimeExports & EmbindModule; // Intersection of the three declarations above.