@ai-sdk/assemblyai 0.0.0-1c33ba03-20260114162300 → 0.0.0-4115c213-20260122152721
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -3
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/docs/100-assemblyai.mdx +282 -0
- package/package.json +14 -5
- package/src/assemblyai-api-types.ts +362 -0
- package/src/assemblyai-config.ts +9 -0
- package/src/assemblyai-error.ts +16 -0
- package/src/assemblyai-provider.ts +112 -0
- package/src/assemblyai-transcription-model.ts +430 -0
- package/src/assemblyai-transcription-settings.ts +1 -0
- package/src/index.ts +6 -0
- package/src/transcript-test.mp3 +0 -0
- package/src/version.ts +6 -0
package/CHANGELOG.md
CHANGED
@@ -1,11 +1,40 @@
 # @ai-sdk/assemblyai
 
-## 0.0.0-1c33ba03-20260114162300
+## 0.0.0-4115c213-20260122152721
 
 ### Patch Changes
 
--
-
+- 4caafb2: chore: excluded tests from src folder in npm package
+- Updated dependencies [4caafb2]
+  - @ai-sdk/provider@0.0.0-4115c213-20260122152721
+  - @ai-sdk/provider-utils@0.0.0-4115c213-20260122152721
+
+## 2.0.10
+
+### Patch Changes
+
+- 2b8369d: chore: add docs to package dist
+
+## 2.0.9
+
+### Patch Changes
+
+- 8dc54db: chore: add src folders to package bundle
+
+## 2.0.8
+
+### Patch Changes
+
+- Updated dependencies [5c090e7]
+  - @ai-sdk/provider@3.0.4
+  - @ai-sdk/provider-utils@4.0.8
+
+## 2.0.7
+
+### Patch Changes
+
+- Updated dependencies [46f46e4]
+  - @ai-sdk/provider-utils@4.0.7
 
 ## 2.0.6
 
package/dist/index.js
CHANGED
@@ -401,7 +401,7 @@ var assemblyaiTranscriptionResponseSchema = import_v42.z.object({
 });
 
 // src/version.ts
-var VERSION = true ? "0.0.0-1c33ba03-20260114162300" : "0.0.0-test";
+var VERSION = true ? "0.0.0-4115c213-20260122152721" : "0.0.0-test";
 
 // src/assemblyai-provider.ts
 function createAssemblyAI(options = {}) {
package/dist/index.mjs
CHANGED
@@ -385,7 +385,7 @@ var assemblyaiTranscriptionResponseSchema = z2.object({
 });
 
 // src/version.ts
-var VERSION = true ? "0.0.0-1c33ba03-20260114162300" : "0.0.0-test";
+var VERSION = true ? "0.0.0-4115c213-20260122152721" : "0.0.0-test";
 
 // src/assemblyai-provider.ts
 function createAssemblyAI(options = {}) {
package/docs/100-assemblyai.mdx
ADDED
@@ -0,0 +1,282 @@
+---
+title: AssemblyAI
+description: Learn how to use the AssemblyAI provider for the AI SDK.
+---
+
+# AssemblyAI Provider
+
+The [AssemblyAI](https://assemblyai.com/) provider contains transcription model support for the AssemblyAI transcription API.
+
+## Setup
+
+The AssemblyAI provider is available in the `@ai-sdk/assemblyai` module. You can install it with:
+
+<Tabs items={['pnpm', 'npm', 'yarn', 'bun']}>
+  <Tab>
+    <Snippet text="pnpm add @ai-sdk/assemblyai" dark />
+  </Tab>
+  <Tab>
+    <Snippet text="npm install @ai-sdk/assemblyai" dark />
+  </Tab>
+  <Tab>
+    <Snippet text="yarn add @ai-sdk/assemblyai" dark />
+  </Tab>
+
+  <Tab>
+    <Snippet text="bun add @ai-sdk/assemblyai" dark />
+  </Tab>
+</Tabs>
+
+## Provider Instance
+
+You can import the default provider instance `assemblyai` from `@ai-sdk/assemblyai`:
+
+```ts
+import { assemblyai } from '@ai-sdk/assemblyai';
+```
+
+If you need a customized setup, you can import `createAssemblyAI` from `@ai-sdk/assemblyai` and create a provider instance with your settings:
+
+```ts
+import { createAssemblyAI } from '@ai-sdk/assemblyai';
+
+const assemblyai = createAssemblyAI({
+  // custom settings, e.g.
+  fetch: customFetch,
+});
+```
+
+You can use the following optional settings to customize the AssemblyAI provider instance:
+
+- **apiKey** _string_
+
+  API key that is sent using the `Authorization` header.
+  It defaults to the `ASSEMBLYAI_API_KEY` environment variable.
+
+- **headers** _Record<string,string>_
+
+  Custom headers to include in the requests.
+
+- **fetch** _(input: RequestInfo, init?: RequestInit) => Promise<Response>_
+
+  Custom [fetch](https://developer.mozilla.org/en-US/docs/Web/API/fetch) implementation.
+  Defaults to the global `fetch` function.
+  You can use it as a middleware to intercept requests,
+  or to provide a custom fetch implementation for e.g. testing.
+
+## Transcription Models
+
+You can create models that call the [AssemblyAI transcription API](https://www.assemblyai.com/docs/getting-started/transcribe-an-audio-file/typescript)
+using the `.transcription()` factory method.
+
+The first argument is the model id, e.g. `best`.
+
+```ts
+const model = assemblyai.transcription('best');
+```
+
+You can also pass additional provider-specific options using the `providerOptions` argument. For example, supplying the `contentSafety` option will enable content safety filtering.
+
+```ts highlight="6"
+import { experimental_transcribe as transcribe } from 'ai';
+import { assemblyai } from '@ai-sdk/assemblyai';
+import { readFile } from 'fs/promises';
+
+const result = await transcribe({
+  model: assemblyai.transcription('best'),
+  audio: await readFile('audio.mp3'),
+  providerOptions: { assemblyai: { contentSafety: true } },
+});
+```
+
+The following provider options are available:
+
+- **audioEndAt** _number_
+
+  End time of the audio in milliseconds.
+  Optional.
+
+- **audioStartFrom** _number_
+
+  Start time of the audio in milliseconds.
+  Optional.
+
+- **autoChapters** _boolean_
+
+  Whether to automatically generate chapters for the transcription.
+  Optional.
+
+- **autoHighlights** _boolean_
+
+  Whether to automatically generate highlights for the transcription.
+  Optional.
+
+- **boostParam** _enum_
+
+  Boost parameter for the transcription.
+  Allowed values: `'low'`, `'default'`, `'high'`.
+  Optional.
+
+- **contentSafety** _boolean_
+
+  Whether to enable content safety filtering.
+  Optional.
+
+- **contentSafetyConfidence** _number_
+
+  Confidence threshold for content safety filtering (25-100).
+  Optional.
+
+- **customSpelling** _array of objects_
+
+  Custom spelling rules for the transcription.
+  Each object has `from` (array of strings) and `to` (string) properties.
+  Optional.
+
+- **disfluencies** _boolean_
+
+  Whether to include disfluencies (um, uh, etc.) in the transcription.
+  Optional.
+
+- **entityDetection** _boolean_
+
+  Whether to detect entities in the transcription.
+  Optional.
+
+- **filterProfanity** _boolean_
+
+  Whether to filter profanity in the transcription.
+  Optional.
+
+- **formatText** _boolean_
+
+  Whether to format the text in the transcription.
+  Optional.
+
+- **iabCategories** _boolean_
+
+  Whether to include IAB categories in the transcription.
+  Optional.
+
+- **languageCode** _string_
+
+  Language code for the audio.
+  Supports numerous ISO-639-1 and ISO-639-3 language codes.
+  Optional.
+
+- **languageConfidenceThreshold** _number_
+
+  Confidence threshold for language detection.
+  Optional.
+
+- **languageDetection** _boolean_
+
+  Whether to enable language detection.
+  Optional.
+
+- **multichannel** _boolean_
+
+  Whether to process multiple audio channels separately.
+  Optional.
+
+- **punctuate** _boolean_
+
+  Whether to add punctuation to the transcription.
+  Optional.
+
+- **redactPii** _boolean_
+
+  Whether to redact personally identifiable information.
+  Optional.
+
+- **redactPiiAudio** _boolean_
+
+  Whether to redact PII in the audio file.
+  Optional.
+
+- **redactPiiAudioQuality** _enum_
+
+  Quality of the redacted audio file.
+  Allowed values: `'mp3'`, `'wav'`.
+  Optional.
+
+- **redactPiiPolicies** _array of enums_
+
+  Policies for PII redaction, specifying which types of information to redact.
+  Supports numerous types like `'person_name'`, `'phone_number'`, etc.
+  Optional.
+
+- **redactPiiSub** _enum_
+
+  Substitution method for redacted PII.
+  Allowed values: `'entity_name'`, `'hash'`.
+  Optional.
+
+- **sentimentAnalysis** _boolean_
+
+  Whether to perform sentiment analysis on the transcription.
+  Optional.
+
+- **speakerLabels** _boolean_
+
+  Whether to label different speakers in the transcription.
+  Optional.
+
+- **speakersExpected** _number_
+
+  Expected number of speakers in the audio.
+  Optional.
+
+- **speechThreshold** _number_
+
+  Threshold for speech detection (0-1).
+  Optional.
+
+- **summarization** _boolean_
+
+  Whether to generate a summary of the transcription.
+  Optional.
+
+- **summaryModel** _enum_
+
+  Model to use for summarization.
+  Allowed values: `'informative'`, `'conversational'`, `'catchy'`.
+  Optional.
+
+- **summaryType** _enum_
+
+  Type of summary to generate.
+  Allowed values: `'bullets'`, `'bullets_verbose'`, `'gist'`, `'headline'`, `'paragraph'`.
+  Optional.
+
+- **topics** _array of strings_
+
+  List of topics to detect in the transcription.
+  Optional.
+
+- **webhookAuthHeaderName** _string_
+
+  Name of the authentication header for webhook requests.
+  Optional.
+
+- **webhookAuthHeaderValue** _string_
+
+  Value of the authentication header for webhook requests.
+  Optional.
+
+- **webhookUrl** _string_
+
+  URL to send webhook notifications to.
+  Optional.
+
+- **wordBoost** _array of strings_
+
+  List of words to boost in the transcription.
+  Optional.
+
+### Model Capabilities
+
+| Model  | Transcription       | Duration            | Segments            | Language            |
+| ------ | ------------------- | ------------------- | ------------------- | ------------------- |
+| `best` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> |
+| `nano` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> |
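The camelCase provider options above map one-to-one onto the snake_case request fields of AssemblyAI's transcript endpoint (see `assemblyai-api-types.ts` later in this diff). As a sketch combining several of the documented options, assuming a local `meeting.mp3` file as a placeholder input:

```ts
import { experimental_transcribe as transcribe } from 'ai';
import { assemblyai } from '@ai-sdk/assemblyai';
import { readFile } from 'fs/promises';

// Sketch: speaker diarization plus a bullet-point summary in one request.
// Option names come from the docs above; 'meeting.mp3' is a placeholder.
const result = await transcribe({
  model: assemblyai.transcription('best'),
  audio: await readFile('meeting.mp3'),
  providerOptions: {
    assemblyai: {
      speakerLabels: true,
      speakersExpected: 2,
      summarization: true,
      summaryModel: 'informative',
      summaryType: 'bullets',
    },
  },
});

console.log(result.text);
```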
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@ai-sdk/assemblyai",
-  "version": "0.0.0-1c33ba03-20260114162300",
+  "version": "0.0.0-4115c213-20260122152721",
   "license": "Apache-2.0",
   "sideEffects": false,
   "main": "./dist/index.js",
@@ -8,9 +8,18 @@
   "types": "./dist/index.d.ts",
   "files": [
     "dist/**/*",
+    "docs/**/*",
+    "src",
+    "!src/**/*.test.ts",
+    "!src/**/*.test-d.ts",
+    "!src/**/__snapshots__",
+    "!src/**/__fixtures__",
     "CHANGELOG.md",
     "README.md"
   ],
+  "directories": {
+    "doc": "./docs"
+  },
   "exports": {
     "./package.json": "./package.json",
     ".": {
@@ -20,15 +29,15 @@
     }
   },
   "dependencies": {
-    "@ai-sdk/provider": "0.0.0-1c33ba03-20260114162300",
-    "@ai-sdk/provider-utils": "0.0.0-1c33ba03-20260114162300"
+    "@ai-sdk/provider": "0.0.0-4115c213-20260122152721",
+    "@ai-sdk/provider-utils": "0.0.0-4115c213-20260122152721"
   },
   "devDependencies": {
     "@types/node": "20.17.24",
     "tsup": "^8",
     "typescript": "5.6.3",
     "zod": "3.25.76",
-    "@ai-sdk/test-server": "0.0.0-1c33ba03-20260114162300",
+    "@ai-sdk/test-server": "0.0.0-4115c213-20260122152721",
     "@vercel/ai-tsconfig": "0.0.0"
   },
   "peerDependencies": {
@@ -54,7 +63,7 @@
   "scripts": {
     "build": "pnpm clean && tsup --tsconfig tsconfig.build.json",
     "build:watch": "pnpm clean && tsup --watch --tsconfig tsconfig.build.json",
-    "clean": "del-cli dist *.tsbuildinfo",
+    "clean": "del-cli dist docs *.tsbuildinfo",
     "lint": "eslint \"./**/*.ts*\"",
     "type-check": "tsc --build",
     "prettier-check": "prettier --check \"./**/*.ts*\"",
package/src/assemblyai-api-types.ts
ADDED
@@ -0,0 +1,362 @@
+export type AssemblyAITranscriptionAPITypes = {
+  /**
+   * The URL of the audio or video file to transcribe.
+   */
+  audio_url: string;
+
+  /**
+   * The point in time, in milliseconds, to stop transcribing in your media file
+   */
+  audio_end_at?: number;
+
+  /**
+   * The point in time, in milliseconds, to begin transcribing in your media file
+   */
+  audio_start_from?: number;
+
+  /**
+   * Enable Auto Chapters, can be true or false
+   * @default false
+   */
+  auto_chapters?: boolean;
+
+  /**
+   * Enable Key Phrases, either true or false
+   * @default false
+   */
+  auto_highlights?: boolean;
+
+  /**
+   * How much to boost specified words
+   */
+  boost_param?: 'low' | 'default' | 'high';
+
+  /**
+   * Enable Content Moderation, can be true or false
+   * @default false
+   */
+  content_safety?: boolean;
+
+  /**
+   * The confidence threshold for the Content Moderation model. Values must be between 25 and 100.
+   * @default 50
+   */
+  content_safety_confidence?: number;
+
+  /**
+   * Customize how words are spelled and formatted using to and from values
+   */
+  custom_spelling?: Array<{
+    /**
+     * Words or phrases to replace
+     */
+    from: string[];
+    /**
+     * Word to replace with
+     */
+    to: string;
+  }>;
+
+  /**
+   * Transcribe Filler Words, like "umm", in your media file; can be true or false
+   * @default false
+   */
+  disfluencies?: boolean;
+
+  /**
+   * Enable Entity Detection, can be true or false
+   * @default false
+   */
+  entity_detection?: boolean;
+
+  /**
+   * Filter profanity from the transcribed text, can be true or false
+   * @default false
+   */
+  filter_profanity?: boolean;
+
+  /**
+   * Enable Text Formatting, can be true or false
+   * @default true
+   */
+  format_text?: boolean;
+
+  /**
+   * Enable Topic Detection, can be true or false
+   * @default false
+   */
+  iab_categories?: boolean;
+
+  /**
+   * The language of your audio file. Possible values are found in Supported Languages.
+   * @default 'en_us'
+   */
+  language_code?:
+    | 'en'
+    | 'en_au'
+    | 'en_uk'
+    | 'en_us'
+    | 'es'
+    | 'fr'
+    | 'de'
+    | 'it'
+    | 'pt'
+    | 'nl'
+    | 'af'
+    | 'sq'
+    | 'am'
+    | 'ar'
+    | 'hy'
+    | 'as'
+    | 'az'
+    | 'ba'
+    | 'eu'
+    | 'be'
+    | 'bn'
+    | 'bs'
+    | 'br'
+    | 'bg'
+    | 'my'
+    | 'ca'
+    | 'zh'
+    | 'hr'
+    | 'cs'
+    | 'da'
+    | 'et'
+    | 'fo'
+    | 'fi'
+    | 'gl'
+    | 'ka'
+    | 'el'
+    | 'gu'
+    | 'ht'
+    | 'ha'
+    | 'haw'
+    | 'he'
+    | 'hi'
+    | 'hu'
+    | 'is'
+    | 'id'
+    | 'ja'
+    | 'jw'
+    | 'kn'
+    | 'kk'
+    | 'km'
+    | 'ko'
+    | 'lo'
+    | 'la'
+    | 'lv'
+    | 'ln'
+    | 'lt'
+    | 'lb'
+    | 'mk'
+    | 'mg'
+    | 'ms'
+    | 'ml'
+    | 'mt'
+    | 'mi'
+    | 'mr'
+    | 'mn'
+    | 'ne'
+    | 'no'
+    | 'nn'
+    | 'oc'
+    | 'pa'
+    | 'ps'
+    | 'fa'
+    | 'pl'
+    | 'ro'
+    | 'ru'
+    | 'sa'
+    | 'sr'
+    | 'sn'
+    | 'sd'
+    | 'si'
+    | 'sk'
+    | 'sl'
+    | 'so'
+    | 'su'
+    | 'sw'
+    | 'sv'
+    | 'tl'
+    | 'tg'
+    | 'ta'
+    | 'tt'
+    | 'te'
+    | 'th'
+    | 'bo'
+    | 'tr'
+    | 'tk'
+    | 'uk'
+    | 'ur'
+    | 'uz'
+    | 'vi'
+    | 'cy'
+    | 'yi'
+    | 'yo';
+
+  /**
+   * The confidence threshold for the automatically detected language. An error will be returned if the language confidence is below this threshold.
+   * @default 0
+   */
+  language_confidence_threshold?: number;
+
+  /**
+   * Enable Automatic language detection, either true or false.
+   * @default false
+   */
+  language_detection?: boolean;
+
+  /**
+   * Enable Multichannel transcription, can be true or false.
+   * @default false
+   */
+  multichannel?: boolean;
+
+  /**
+   * Enable Automatic Punctuation, can be true or false
+   * @default true
+   */
+  punctuate?: boolean;
+
+  /**
+   * Redact PII from the transcribed text using the Redact PII model, can be true or false
+   * @default false
+   */
+  redact_pii?: boolean;
+
+  /**
+   * Generate a copy of the original media file with spoken PII "beeped" out, can be true or false.
+   * @default false
+   */
+  redact_pii_audio?: boolean;
+
+  /**
+   * Controls the filetype of the audio created by redact_pii_audio. Currently supports mp3 (default) and wav.
+   */
+  redact_pii_audio_quality?: 'mp3' | 'wav';
+
+  /**
+   * The list of PII Redaction policies to enable.
+   */
+  redact_pii_policies?: Array<
+    | 'account_number'
+    | 'banking_information'
+    | 'blood_type'
+    | 'credit_card_cvv'
+    | 'credit_card_expiration'
+    | 'credit_card_number'
+    | 'date'
+    | 'date_interval'
+    | 'date_of_birth'
+    | 'drivers_license'
+    | 'drug'
+    | 'duration'
+    | 'email_address'
+    | 'event'
+    | 'filename'
+    | 'gender_sexuality'
+    | 'healthcare_number'
+    | 'injury'
+    | 'ip_address'
+    | 'language'
+    | 'location'
+    | 'marital_status'
+    | 'medical_condition'
+    | 'medical_process'
+    | 'money_amount'
+    | 'nationality'
+    | 'number_sequence'
+    | 'occupation'
+    | 'organization'
+    | 'passport_number'
+    | 'password'
+    | 'person_age'
+    | 'person_name'
+    | 'phone_number'
+    | 'physical_attribute'
+    | 'political_affiliation'
+    | 'religion'
+    | 'statistics'
+    | 'time'
+    | 'url'
+    | 'us_social_security_number'
+    | 'username'
+    | 'vehicle_id'
+    | 'zodiac_sign'
+  >;
+
+  /**
+   * The replacement logic for detected PII, can be "entity_name" or "hash".
+   */
+  redact_pii_sub?: 'entity_name' | 'hash';
+
+  /**
+   * Enable Sentiment Analysis, can be true or false
+   * @default false
+   */
+  sentiment_analysis?: boolean;
+
+  /**
+   * Enable Speaker diarization, can be true or false
+   * @default false
+   */
+  speaker_labels?: boolean;
+
+  /**
+   * Tells the speaker label model how many speakers it should attempt to identify, up to 10.
+   */
+  speakers_expected?: number;
+
+  /**
+   * The speech model to use for the transcription.
+   */
+  speech_model?: 'best' | 'nano';
+
+  /**
+   * Reject audio files that contain less than this fraction of speech. Valid values are in the range [0, 1] inclusive.
+   */
+  speech_threshold?: number;
+
+  /**
+   * Enable Summarization, can be true or false
+   * @default false
+   */
+  summarization?: boolean;
+
+  /**
+   * The model to summarize the transcript
+   */
+  summary_model?: 'informative' | 'conversational' | 'catchy';
+
+  /**
+   * The type of summary
+   */
+  summary_type?:
+    | 'bullets'
+    | 'bullets_verbose'
+    | 'gist'
+    | 'headline'
+    | 'paragraph';
+
+  /**
+   * The header name to be sent with the transcript completed or failed webhook requests
+   */
+  webhook_auth_header_name?: string;
+
+  /**
+   * The header value to send back with the transcript completed or failed webhook requests for added security
+   */
+  webhook_auth_header_value?: string;
+
+  /**
+   * The URL to which we send webhook requests. We send two different types of webhook requests.
+   * One request when a transcript is completed or failed, and one request when the redacted audio is ready if redact_pii_audio is enabled.
+   */
+  webhook_url?: string;
+
+  /**
+   * The list of custom vocabulary to boost transcription probability for
+   */
+  word_boost?: string[];
+};
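Because `AssemblyAITranscriptionAPITypes` mirrors the JSON body of AssemblyAI's `POST /v2/transcript` request, request objects can be type-checked against it directly. A minimal sketch; the `audio_url` value is a placeholder for the URL returned by the upload endpoint:

```ts
import { AssemblyAITranscriptionAPITypes } from './assemblyai-api-types';

// Placeholder URL; in practice this comes from the /v2/upload response.
const request: AssemblyAITranscriptionAPITypes = {
  audio_url: 'https://cdn.assemblyai.com/upload/placeholder',
  speech_model: 'best',
  punctuate: true,
  redact_pii: true,
  redact_pii_policies: ['person_name', 'phone_number'],
  custom_spelling: [{ from: ['assembly ai'], to: 'AssemblyAI' }],
};
```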
package/src/assemblyai-config.ts
ADDED
@@ -0,0 +1,9 @@
+import { FetchFunction } from '@ai-sdk/provider-utils';
+
+export type AssemblyAIConfig = {
+  provider: string;
+  url: (options: { modelId: string; path: string }) => string;
+  headers: () => Record<string, string | undefined>;
+  fetch?: FetchFunction;
+  generateId?: () => string;
+};
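For reference, `createAssemblyAI` in `assemblyai-provider.ts` (below) fills this config with the fixed `https://api.assemblyai.com` base URL and lazily evaluated headers. A hand-built config of the same shape, e.g. for tests; the key value is a placeholder:

```ts
import { AssemblyAIConfig } from './assemblyai-config';

// Sketch of a manual config; 'YOUR_ASSEMBLYAI_API_KEY' is a placeholder.
const config: AssemblyAIConfig = {
  provider: 'assemblyai.transcription',
  url: ({ path }) => `https://api.assemblyai.com${path}`,
  headers: () => ({ authorization: 'YOUR_ASSEMBLYAI_API_KEY' }),
};
```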
package/src/assemblyai-error.ts
ADDED
@@ -0,0 +1,16 @@
+import { z } from 'zod/v4';
+import { createJsonErrorResponseHandler } from '@ai-sdk/provider-utils';
+
+export const assemblyaiErrorDataSchema = z.object({
+  error: z.object({
+    message: z.string(),
+    code: z.number(),
+  }),
+});
+
+export type AssemblyAIErrorData = z.infer<typeof assemblyaiErrorDataSchema>;
+
+export const assemblyaiFailedResponseHandler = createJsonErrorResponseHandler({
+  errorSchema: assemblyaiErrorDataSchema,
+  errorToMessage: data => data.error.message,
+});
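The handler above expects AssemblyAI error responses shaped like `{ "error": { "message": ..., "code": ... } }`. A quick sketch validating a hypothetical payload against the schema:

```ts
import { assemblyaiErrorDataSchema } from './assemblyai-error';

// Hypothetical payload for illustration only.
const parsed = assemblyaiErrorDataSchema.parse({
  error: { message: 'Invalid API key', code: 401 },
});

console.log(parsed.error.message); // 'Invalid API key'
```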
package/src/assemblyai-provider.ts
ADDED
@@ -0,0 +1,112 @@
+import {
+  TranscriptionModelV3,
+  ProviderV3,
+  NoSuchModelError,
+} from '@ai-sdk/provider';
+import {
+  FetchFunction,
+  loadApiKey,
+  withUserAgentSuffix,
+} from '@ai-sdk/provider-utils';
+import { AssemblyAITranscriptionModel } from './assemblyai-transcription-model';
+import { AssemblyAITranscriptionModelId } from './assemblyai-transcription-settings';
+import { VERSION } from './version';
+
+export interface AssemblyAIProvider extends ProviderV3 {
+  (
+    modelId: 'best',
+    settings?: {},
+  ): {
+    transcription: AssemblyAITranscriptionModel;
+  };
+
+  /**
+Creates a model for transcription.
+   */
+  transcription(modelId: AssemblyAITranscriptionModelId): TranscriptionModelV3;
+
+  /**
+   * @deprecated Use `embeddingModel` instead.
+   */
+  textEmbeddingModel(modelId: string): never;
+}
+
+export interface AssemblyAIProviderSettings {
+  /**
+API key for authenticating requests.
+   */
+  apiKey?: string;
+
+  /**
+Custom headers to include in the requests.
+   */
+  headers?: Record<string, string>;
+
+  /**
+Custom fetch implementation. You can use it as a middleware to intercept requests,
+or to provide a custom fetch implementation for e.g. testing.
+   */
+  fetch?: FetchFunction;
+}
+
+/**
+Create an AssemblyAI provider instance.
+ */
+export function createAssemblyAI(
+  options: AssemblyAIProviderSettings = {},
+): AssemblyAIProvider {
+  const getHeaders = () =>
+    withUserAgentSuffix(
+      {
+        authorization: loadApiKey({
+          apiKey: options.apiKey,
+          environmentVariableName: 'ASSEMBLYAI_API_KEY',
+          description: 'AssemblyAI',
+        }),
+        ...options.headers,
+      },
+      `ai-sdk/assemblyai/${VERSION}`,
+    );
+
+  const createTranscriptionModel = (modelId: AssemblyAITranscriptionModelId) =>
+    new AssemblyAITranscriptionModel(modelId, {
+      provider: `assemblyai.transcription`,
+      url: ({ path }) => `https://api.assemblyai.com${path}`,
+      headers: getHeaders,
+      fetch: options.fetch,
+    });
+
+  const provider = function (modelId: AssemblyAITranscriptionModelId) {
+    return {
+      transcription: createTranscriptionModel(modelId),
+    };
+  };
+
+  provider.specificationVersion = 'v3' as const;
+  provider.transcription = createTranscriptionModel;
+  provider.transcriptionModel = createTranscriptionModel;
+
+  provider.languageModel = () => {
+    throw new NoSuchModelError({
+      modelId: 'unknown',
+      modelType: 'languageModel',
+      message: 'AssemblyAI does not provide language models',
+    });
+  };
+
+  provider.embeddingModel = (modelId: string) => {
+    throw new NoSuchModelError({ modelId, modelType: 'embeddingModel' });
+  };
+  provider.textEmbeddingModel = provider.embeddingModel;
+
+  provider.imageModel = (modelId: string) => {
+    throw new NoSuchModelError({ modelId, modelType: 'imageModel' });
+  };
+
+  return provider as AssemblyAIProvider;
+}
+
+/**
+Default AssemblyAI provider instance.
+ */
+export const assemblyai = createAssemblyAI();
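Taken together, both the default `assemblyai` instance and a customized one expose the `.transcription()` factory. A usage sketch; the explicit key is a placeholder for setups that do not rely on the `ASSEMBLYAI_API_KEY` environment variable:

```ts
import { createAssemblyAI, assemblyai } from '@ai-sdk/assemblyai';

// Default instance reads ASSEMBLYAI_API_KEY from the environment.
const defaultModel = assemblyai.transcription('best');

// Customized instance with an explicit (placeholder) key.
const custom = createAssemblyAI({ apiKey: 'YOUR_ASSEMBLYAI_API_KEY' });
const nanoModel = custom.transcription('nano');
```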
package/src/assemblyai-transcription-model.ts
ADDED
@@ -0,0 +1,430 @@
+import { TranscriptionModelV3, SharedV3Warning } from '@ai-sdk/provider';
+import {
+  combineHeaders,
+  createJsonResponseHandler,
+  extractResponseHeaders,
+  parseProviderOptions,
+  postJsonToApi,
+  postToApi,
+} from '@ai-sdk/provider-utils';
+import { z } from 'zod/v4';
+import { AssemblyAIConfig } from './assemblyai-config';
+import { assemblyaiFailedResponseHandler } from './assemblyai-error';
+import { AssemblyAITranscriptionModelId } from './assemblyai-transcription-settings';
+import { AssemblyAITranscriptionAPITypes } from './assemblyai-api-types';
+
+// https://www.assemblyai.com/docs/api-reference/transcripts/submit
+const assemblyaiProviderOptionsSchema = z.object({
+  /**
+   * End time of the audio in milliseconds.
+   */
+  audioEndAt: z.number().int().nullish(),
+  /**
+   * Start time of the audio in milliseconds.
+   */
+  audioStartFrom: z.number().int().nullish(),
+  /**
+   * Whether to automatically generate chapters for the transcription.
+   */
+  autoChapters: z.boolean().nullish(),
+  /**
+   * Whether to automatically generate highlights for the transcription.
+   */
+  autoHighlights: z.boolean().nullish(),
+  /**
+   * Boost parameter for the transcription.
+   * Allowed values: 'low', 'default', 'high'.
+   */
+  boostParam: z.string().nullish(),
+  /**
+   * Whether to enable content safety filtering.
+   */
+  contentSafety: z.boolean().nullish(),
+  /**
+   * Confidence threshold for content safety filtering (25-100).
+   */
+  contentSafetyConfidence: z.number().int().min(25).max(100).nullish(),
+  /**
+   * Custom spelling rules for the transcription.
+   */
+  customSpelling: z
+    .array(
+      z.object({
+        from: z.array(z.string()),
+        to: z.string(),
+      }),
+    )
+    .nullish(),
+  /**
+   * Whether to include filler words (um, uh, etc.) in the transcription.
+   */
+  disfluencies: z.boolean().nullish(),
+  /**
+   * Whether to enable entity detection.
+   */
+  entityDetection: z.boolean().nullish(),
+  /**
+   * Whether to filter profanity from the transcription.
+   */
+  filterProfanity: z.boolean().nullish(),
+  /**
+   * Whether to format text with punctuation and capitalization.
+   */
+  formatText: z.boolean().nullish(),
+  /**
+   * Whether to enable IAB categories detection.
+   */
+  iabCategories: z.boolean().nullish(),
+  /**
+   * Language code for the transcription.
+   */
+  languageCode: z.union([z.literal('en'), z.string()]).nullish(),
+  /**
+   * Confidence threshold for language detection.
+   */
+  languageConfidenceThreshold: z.number().nullish(),
+  /**
+   * Whether to enable language detection.
+   */
+  languageDetection: z.boolean().nullish(),
+  /**
+   * Whether to process audio as multichannel.
+   */
+  multichannel: z.boolean().nullish(),
+  /**
+   * Whether to add punctuation to the transcription.
+   */
+  punctuate: z.boolean().nullish(),
+  /**
+   * Whether to redact personally identifiable information (PII).
+   */
+  redactPii: z.boolean().nullish(),
+  /**
+   * Whether to redact PII in the audio file.
+   */
+  redactPiiAudio: z.boolean().nullish(),
+  /**
+   * Audio format for PII redaction.
+   */
+  redactPiiAudioQuality: z.string().nullish(),
+  /**
+   * List of PII types to redact.
+   */
+  redactPiiPolicies: z.array(z.string()).nullish(),
+  /**
+   * Substitution method for redacted PII.
+   */
+  redactPiiSub: z.string().nullish(),
+  /**
+   * Whether to enable sentiment analysis.
+   */
+  sentimentAnalysis: z.boolean().nullish(),
+  /**
+   * Whether to identify different speakers in the audio.
+   */
+  speakerLabels: z.boolean().nullish(),
+  /**
+   * Number of speakers expected in the audio.
+   */
+  speakersExpected: z.number().int().nullish(),
+  /**
+   * Threshold for speech detection (0-1).
+   */
+  speechThreshold: z.number().min(0).max(1).nullish(),
+  /**
+   * Whether to generate a summary of the transcription.
+   */
+  summarization: z.boolean().nullish(),
+  /**
+   * Model to use for summarization.
+   */
+  summaryModel: z.string().nullish(),
+  /**
+   * Type of summary to generate.
+   */
+  summaryType: z.string().nullish(),
+  /**
+   * Name of the authentication header for webhook requests.
+   */
+  webhookAuthHeaderName: z.string().nullish(),
+  /**
+   * Value of the authentication header for webhook requests.
+   */
+  webhookAuthHeaderValue: z.string().nullish(),
+  /**
+   * URL to send webhook notifications to.
+   */
+  webhookUrl: z.string().nullish(),
+  /**
+   * List of words to boost recognition for.
+   */
+  wordBoost: z.array(z.string()).nullish(),
+});
+
+export type AssemblyAITranscriptionCallOptions = z.infer<
+  typeof assemblyaiProviderOptionsSchema
+>;
+
+interface AssemblyAITranscriptionModelConfig extends AssemblyAIConfig {
+  _internal?: {
+    currentDate?: () => Date;
+  };
+  /**
+   * The polling interval for checking transcript status in milliseconds.
+   */
+  pollingInterval?: number;
+}
+
+export class AssemblyAITranscriptionModel implements TranscriptionModelV3 {
+  readonly specificationVersion = 'v3';
+  private readonly POLLING_INTERVAL_MS = 3000;
+
+  get provider(): string {
+    return this.config.provider;
+  }
+
+  constructor(
+    readonly modelId: AssemblyAITranscriptionModelId,
+    private readonly config: AssemblyAITranscriptionModelConfig,
+  ) {}
+
+  private async getArgs({
+    providerOptions,
+  }: Parameters<TranscriptionModelV3['doGenerate']>[0]) {
+    const warnings: SharedV3Warning[] = [];
+
+    // Parse provider options
+    const assemblyaiOptions = await parseProviderOptions({
+      provider: 'assemblyai',
+      providerOptions,
+      schema: assemblyaiProviderOptionsSchema,
+    });
+
+    const body: Omit<AssemblyAITranscriptionAPITypes, 'audio_url'> = {
+      speech_model: this.modelId,
+    };
+
+    // Add provider-specific options
+    if (assemblyaiOptions) {
+      body.audio_end_at = assemblyaiOptions.audioEndAt ?? undefined;
+      body.audio_start_from = assemblyaiOptions.audioStartFrom ?? undefined;
+      body.auto_chapters = assemblyaiOptions.autoChapters ?? undefined;
+      body.auto_highlights = assemblyaiOptions.autoHighlights ?? undefined;
+      body.boost_param = (assemblyaiOptions.boostParam as never) ?? undefined;
+      body.content_safety = assemblyaiOptions.contentSafety ?? undefined;
+      body.content_safety_confidence =
+        assemblyaiOptions.contentSafetyConfidence ?? undefined;
+      body.custom_spelling =
+        (assemblyaiOptions.customSpelling as never) ?? undefined;
+      body.disfluencies = assemblyaiOptions.disfluencies ?? undefined;
+      body.entity_detection = assemblyaiOptions.entityDetection ?? undefined;
+      body.filter_profanity = assemblyaiOptions.filterProfanity ?? undefined;
+      body.format_text = assemblyaiOptions.formatText ?? undefined;
+      body.iab_categories = assemblyaiOptions.iabCategories ?? undefined;
+      body.language_code =
+        (assemblyaiOptions.languageCode as never) ?? undefined;
+      body.language_confidence_threshold =
+        assemblyaiOptions.languageConfidenceThreshold ?? undefined;
+      body.language_detection =
+        assemblyaiOptions.languageDetection ?? undefined;
+      body.multichannel = assemblyaiOptions.multichannel ?? undefined;
+      body.punctuate = assemblyaiOptions.punctuate ?? undefined;
+      body.redact_pii = assemblyaiOptions.redactPii ?? undefined;
+      body.redact_pii_audio = assemblyaiOptions.redactPiiAudio ?? undefined;
+      body.redact_pii_audio_quality =
+        (assemblyaiOptions.redactPiiAudioQuality as never) ?? undefined;
+      body.redact_pii_policies =
+        (assemblyaiOptions.redactPiiPolicies as never) ?? undefined;
+      body.redact_pii_sub =
+        (assemblyaiOptions.redactPiiSub as never) ?? undefined;
+      body.sentiment_analysis =
+        assemblyaiOptions.sentimentAnalysis ?? undefined;
+      body.speaker_labels = assemblyaiOptions.speakerLabels ?? undefined;
+      body.speakers_expected = assemblyaiOptions.speakersExpected ?? undefined;
+      body.speech_threshold = assemblyaiOptions.speechThreshold ?? undefined;
+      body.summarization = assemblyaiOptions.summarization ?? undefined;
+      body.summary_model =
+        (assemblyaiOptions.summaryModel as never) ?? undefined;
+      body.summary_type = (assemblyaiOptions.summaryType as never) ?? undefined;
+      body.webhook_auth_header_name =
+        assemblyaiOptions.webhookAuthHeaderName ?? undefined;
+      body.webhook_auth_header_value =
+        assemblyaiOptions.webhookAuthHeaderValue ?? undefined;
+      body.webhook_url = assemblyaiOptions.webhookUrl ?? undefined;
+      body.word_boost = assemblyaiOptions.wordBoost ?? undefined;
+    }
+
+    return {
+      body,
+      warnings,
+    };
+  }
+
+  /**
+   * Polls the given transcript until we have a status other than `processing` or `queued`.
+   *
+   * @see https://www.assemblyai.com/docs/getting-started/transcribe-an-audio-file#step-33
+   */
+  private async waitForCompletion(
+    transcriptId: string,
+    headers: Record<string, string | undefined> | undefined,
+    abortSignal?: AbortSignal,
+  ): Promise<{
+    transcript: z.infer<typeof assemblyaiTranscriptionResponseSchema>;
+    responseHeaders: Record<string, string>;
+  }> {
+    const pollingInterval =
+      this.config.pollingInterval ?? this.POLLING_INTERVAL_MS;
+
+    while (true) {
+      if (abortSignal?.aborted) {
+        throw new Error('Transcription request was aborted');
+      }
+
+      const response = await fetch(
+        this.config.url({
+          path: `/v2/transcript/${transcriptId}`,
+          modelId: this.modelId,
+        }),
+        {
+          method: 'GET',
+          headers: combineHeaders(
+            this.config.headers(),
+            headers,
+          ) as HeadersInit,
+          signal: abortSignal,
+        },
+      );
+
+      if (!response.ok) {
+        throw await assemblyaiFailedResponseHandler({
+          response,
+          url: this.config.url({
+            path: `/v2/transcript/${transcriptId}`,
+            modelId: this.modelId,
+          }),
+          requestBodyValues: {},
+        });
+      }
+
+      const transcript = assemblyaiTranscriptionResponseSchema.parse(
+        await response.json(),
+      );
+
+      if (transcript.status === 'completed') {
+        return {
+          transcript,
+          responseHeaders: extractResponseHeaders(response),
+        };
+      }
+
+      if (transcript.status === 'error') {
+        throw new Error(
+          `Transcription failed: ${transcript.error ?? 'Unknown error'}`,
+        );
+      }
+
+      await new Promise(resolve => setTimeout(resolve, pollingInterval));
+    }
+  }
+
+  async doGenerate(
+    options: Parameters<TranscriptionModelV3['doGenerate']>[0],
+  ): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
+    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
+
+    const { value: uploadResponse } = await postToApi({
+      url: this.config.url({
+        path: '/v2/upload',
+        modelId: '',
+      }),
+      headers: {
+        'Content-Type': 'application/octet-stream',
+        ...combineHeaders(this.config.headers(), options.headers),
+      },
+      body: {
+        content: options.audio,
+        values: options.audio,
+      },
+      failedResponseHandler: assemblyaiFailedResponseHandler,
+      successfulResponseHandler: createJsonResponseHandler(
+        assemblyaiUploadResponseSchema,
+      ),
+      abortSignal: options.abortSignal,
+      fetch: this.config.fetch,
+    });
+
+    const { body, warnings } = await this.getArgs(options);
+
+    const { value: submitResponse } = await postJsonToApi({
+      url: this.config.url({
+        path: '/v2/transcript',
+        modelId: this.modelId,
+      }),
+      headers: combineHeaders(this.config.headers(), options.headers),
+      body: {
+        ...body,
+        audio_url: uploadResponse.upload_url,
+      },
+      failedResponseHandler: assemblyaiFailedResponseHandler,
+      successfulResponseHandler: createJsonResponseHandler(
+        assemblyaiSubmitResponseSchema,
+      ),
+      abortSignal: options.abortSignal,
+      fetch: this.config.fetch,
+    });
+
+    const { transcript, responseHeaders } = await this.waitForCompletion(
+      submitResponse.id,
+      options.headers,
+      options.abortSignal,
+    );
+
+    return {
+      text: transcript.text ?? '',
+      segments:
+        transcript.words?.map(word => ({
+          text: word.text,
+          startSecond: word.start,
+          endSecond: word.end,
+        })) ?? [],
+      language: transcript.language_code ?? undefined,
+      durationInSeconds:
+        transcript.audio_duration ?? transcript.words?.at(-1)?.end ?? undefined,
+      warnings,
+      response: {
+        timestamp: currentDate,
+        modelId: this.modelId,
+        headers: responseHeaders, // Headers from final GET request
+        body: transcript, // Raw response from final GET request
+      },
+    };
+  }
+}
+
+const assemblyaiUploadResponseSchema = z.object({
+  upload_url: z.string(),
+});
+
+const assemblyaiSubmitResponseSchema = z.object({
+  id: z.string(),
+  status: z.enum(['queued', 'processing', 'completed', 'error']),
+});
+
+const assemblyaiTranscriptionResponseSchema = z.object({
+  id: z.string(),
+  status: z.enum(['queued', 'processing', 'completed', 'error']),
+  text: z.string().nullish(),
+  language_code: z.string().nullish(),
+  words: z
+    .array(
+      z.object({
+        start: z.number(),
+        end: z.number(),
+        text: z.string(),
+      }),
+    )
+    .nullish(),
+  audio_duration: z.number().nullish(),
+  error: z.string().nullish(),
+});
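`doGenerate` above implements AssemblyAI's three-step flow: upload the raw audio to `/v2/upload`, submit a transcription job to `/v2/transcript`, and poll `/v2/transcript/{id}` until the status leaves `queued`/`processing`. A standalone sketch of the same flow with plain `fetch`, stripped of the SDK's response handlers; the API key is a placeholder and error handling is minimal:

```ts
// Standalone sketch of the upload -> submit -> poll flow implemented above.
// 'YOUR_ASSEMBLYAI_API_KEY' is a placeholder; most error handling is omitted.
async function transcribeRaw(audio: Uint8Array): Promise<string> {
  const headers = { authorization: 'YOUR_ASSEMBLYAI_API_KEY' };

  // 1. Upload the raw audio bytes.
  const upload = await fetch('https://api.assemblyai.com/v2/upload', {
    method: 'POST',
    headers: { ...headers, 'Content-Type': 'application/octet-stream' },
    body: audio,
  }).then(r => r.json());

  // 2. Submit the transcription job against the uploaded file.
  const submitted = await fetch('https://api.assemblyai.com/v2/transcript', {
    method: 'POST',
    headers: { ...headers, 'Content-Type': 'application/json' },
    body: JSON.stringify({ audio_url: upload.upload_url, speech_model: 'best' }),
  }).then(r => r.json());

  // 3. Poll until the job leaves 'queued'/'processing'.
  while (true) {
    const transcript = await fetch(
      `https://api.assemblyai.com/v2/transcript/${submitted.id}`,
      { headers },
    ).then(r => r.json());

    if (transcript.status === 'completed') return transcript.text ?? '';
    if (transcript.status === 'error') throw new Error(transcript.error);

    await new Promise(resolve => setTimeout(resolve, 3000));
  }
}
```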
package/src/assemblyai-transcription-settings.ts
ADDED
@@ -0,0 +1 @@
+export type AssemblyAITranscriptionModelId = 'best' | 'nano';
package/src/index.ts
ADDED
Binary file