@vertesia/workflow 0.60.0 → 0.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/activities/executeInteraction.js +7 -1
- package/lib/cjs/activities/executeInteraction.js.map +1 -1
- package/lib/cjs/activities/generateEmbeddings.js +23 -6
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
- package/lib/cjs/activities/media/processPdfWithTextract.js +3 -2
- package/lib/cjs/activities/media/processPdfWithTextract.js.map +1 -1
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js +1 -1
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -1
- package/lib/cjs/errors.js +16 -2
- package/lib/cjs/errors.js.map +1 -1
- package/lib/cjs/utils/client.js +6 -3
- package/lib/cjs/utils/client.js.map +1 -1
- package/lib/esm/activities/executeInteraction.js +7 -1
- package/lib/esm/activities/executeInteraction.js.map +1 -1
- package/lib/esm/activities/generateEmbeddings.js +23 -6
- package/lib/esm/activities/generateEmbeddings.js.map +1 -1
- package/lib/esm/activities/media/processPdfWithTextract.js +3 -2
- package/lib/esm/activities/media/processPdfWithTextract.js.map +1 -1
- package/lib/esm/activities/media/transcribeMediaWithGladia.js +1 -1
- package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -1
- package/lib/esm/errors.js +14 -1
- package/lib/esm/errors.js.map +1 -1
- package/lib/esm/utils/client.js +5 -3
- package/lib/esm/utils/client.js.map +1 -1
- package/lib/types/activities/executeInteraction.d.ts.map +1 -1
- package/lib/types/activities/generateEmbeddings.d.ts.map +1 -1
- package/lib/types/errors.d.ts +5 -0
- package/lib/types/errors.d.ts.map +1 -1
- package/lib/types/utils/client.d.ts +5 -0
- package/lib/types/utils/client.d.ts.map +1 -1
- package/lib/workflows-bundle.js +848 -230
- package/package.json +5 -6
- package/src/activities/executeInteraction.ts +8 -1
- package/src/activities/generateEmbeddings.ts +440 -418
- package/src/activities/media/processPdfWithTextract.ts +3 -3
- package/src/activities/media/transcribeMediaWithGladia.ts +1 -1
- package/src/errors.ts +17 -1
- package/src/utils/client.ts +5 -5
package/src/activities/generateEmbeddings.ts

@@ -2,12 +2,12 @@ import { EmbeddingsResult } from "@llumiverse/common";
 import { log } from "@temporalio/activity";
 import { VertesiaClient } from "@vertesia/client";
 import {
-
-
-
-
-
-
+    ContentObject,
+    DSLActivityExecutionPayload,
+    DSLActivitySpec,
+    ImageRenditionFormat,
+    ProjectConfigurationEmbeddings,
+    SupportedEmbeddingTypes,
 } from "@vertesia/common";
 import { setupActivity } from "../dsl/setup/ActivityContext.js";
 import { NoDocumentFound } from "../errors.js";
@@ -16,486 +16,508 @@ import { DocPart, getContentParts } from "../utils/chunks.js";
 import { countTokens } from "../utils/tokens.js";

 export interface GenerateEmbeddingsParams {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    /**
+     * The model to use for embedding generation
+     * If not set, the default model for the project will be used
+     */
+    model?: string;
+
+    /**
+     * The environment to use for embedding generation
+     * If not set, the default environment for the project will be used
+     */
+    environment?: string;
+
+    /**
+     * If true, force embedding generation even if the document already has embeddings
+     */
+    force?: boolean;
+
+    /**
+     * The embedding type to generate
+     */
+    type: SupportedEmbeddingTypes;
+
+    /**
+     * The DocParts to use for long documents
+     */
+    parts?: DocPart[];
 }

 export interface GenerateEmbeddings
-
-
+    extends DSLActivitySpec<GenerateEmbeddingsParams> {
+    name: "generateEmbeddings";
 }

 export async function generateEmbeddings(
-
+    payload: DSLActivityExecutionPayload<GenerateEmbeddingsParams>,
 ) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        throw new NoDocumentFound("Project not found", [payload.project_id]);
-    }
-
-    if (!projectData?.configuration.embeddings[type]?.enabled) {
-        log.info(
-            `Embeddings generation disabled for type ${type} on project: ${projectData.name} (${projectData.namespace})`,
-            { config },
-        );
-        return {
-            id: objectId,
-            status: "skipped",
-            message: `Embeddings generation disabled for type ${type}`,
-        };
-    }
+    const { params, client, objectId, fetchProject } =
+        await setupActivity<GenerateEmbeddingsParams>(payload);
+    const { force, type } = params;
+
+    const projectData = await fetchProject();
+    const config = projectData?.configuration.embeddings[type];
+    if (!projectData) {
+        throw new NoDocumentFound("Project not found", [payload.project_id]);
+    }
+    if (!config) {
+        throw new NoDocumentFound("Embeddings configuration not found", [
+            objectId,
+        ]);
+    }

-
-
-
-    });
+    if (!projectData) {
+        throw new NoDocumentFound("Project not found", [payload.project_id]);
+    }

-
-
-
-
-
+    if (!projectData?.configuration.embeddings[type]?.enabled) {
+        log.info(
+            `Embeddings generation disabled for type ${type} on project: ${projectData.name} (${projectData.namespace})`,
+            { config },
+        );
+        return {
+            id: objectId,
+            status: "skipped",
+            message: `Embeddings generation disabled for type ${type}`,
+        };
+    }

-
-
-
-
+    log.info(`${type} embedding generation starting for object ${objectId}`, {
+        force,
+        config,
+    });

-
-
-
+    if (!config.environment) {
+        throw new Error(
+            "No environment found in project configuration. Set environment in project configuration to generate embeddings.",
+        );
+    }

-
-
-
+    const document = await client.objects.retrieve(
+        objectId,
+        "+text +parts +embeddings +tokens +properties",
+    );

-
+    if (!document) {
+        throw new NoDocumentFound("Document not found", [objectId]);
+    }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if (!document.content) {
+        throw new NoDocumentFound("Document content not found", [objectId]);
+    }
+
+    let res;
+
+    switch (type) {
+        case SupportedEmbeddingTypes.text:
+            res = await generateTextEmbeddings({
+                client,
+                config,
+                document,
+                type,
+            });
+            break;
+        case SupportedEmbeddingTypes.properties:
+            res = await generateTextEmbeddings({
+                client,
+                config,
+                document,
+                type,
+            });
+            break;
+        case SupportedEmbeddingTypes.image:
+            res = await generateImageEmbeddings({
+                client,
+                config,
+                document,
+                type,
+            });
+            break;
+        default:
+            res = {
+                id: objectId,
+                status: "failed",
+                message: `unsupported embedding type: ${type}`,
+            };
+    }
+
+    return res;
 }

 interface ExecuteGenerateEmbeddingsParams {
-
-
-
-
-
-
+    document: ContentObject;
+    client: VertesiaClient;
+    type: SupportedEmbeddingTypes;
+    config: ProjectConfigurationEmbeddings;
+    property?: string;
+    force?: boolean;
 }

 async function generateTextEmbeddings(
-
-
+    { document, client, type, config }: ExecuteGenerateEmbeddingsParams,
+    parts?: DocPart[],
 ) {
-
-
-
-
-    if (!document) {
-        return { status: "error", message: "document is null or undefined" };
-    }
-
-    if (
-        type !== SupportedEmbeddingTypes.text &&
-        type !== SupportedEmbeddingTypes.properties
-    ) {
-        return {
-            id: document.id,
-            status: "failed",
-            message: `unsupported embedding type: ${type}`,
-        };
-    }
-
-    if (type === SupportedEmbeddingTypes.text && !document.text) {
-        return { id: document.id, status: "failed", message: "no text found" };
-    }
-    if (type === SupportedEmbeddingTypes.properties && !document?.properties) {
-        return {
-            id: document.id,
-            status: "failed",
-            message: "no properties found",
-        };
-    }
+    // if (!force && document.embeddings[type]?.etag === (document.text_etag ?? md5(document.text))) {
+    //     return { id: objectId, status: "skipped", message: "embeddings already generated" }
+    // }

-
-
-
+    if (!document) {
+        return { status: "error", message: "document is null or undefined" };
+    }

-
-
-
-
-
-
-
-
-
-        });
-        document.tokens = {
-            ...tokensData,
-            etag: document.text_etag ?? md5(document.text!),
-        };
-    }
-
-    const maxTokens = config.max_tokens ?? 8000;
-
-    //generate embeddings for the main doc if document isn't too large
-    //if too large, we'll just generate embeddings for the parts
-    //then we can generate embeddings for the main document by averaging the tensors
-    log.info(`Generating ${type} embeddings for document ${document.id}`);
-    if (
-        type === SupportedEmbeddingTypes.text &&
-        document.tokens?.count &&
-        document.tokens?.count > maxTokens
-    ) {
-        log.info("Document too large, generating embeddings for parts");
-
-        if (!document.text) {
-            return { id: document.id, status: "failed", message: "no text found" };
+    if (
+        type !== SupportedEmbeddingTypes.text &&
+        type !== SupportedEmbeddingTypes.properties
+    ) {
+        return {
+            id: document.id,
+            status: "failed",
+            message: `unsupported embedding type: ${type}`,
+        };
    }

-        if (
-
-
+    if (type === SupportedEmbeddingTypes.text && !document.text) {
+        return { id: document.id, status: "failed", message: "no text found" };
+    }
+    if (type === SupportedEmbeddingTypes.properties && !document?.properties) {
+        return {
+            id: document.id,
+            status: "failed",
+            message: "no properties found",
+        };
    }

-
-            parts: partDefinitions,
-            max_tokens: maxTokens,
-        });
-        const docParts = getContentParts(document.text, partDefinitions);
+    const { environment, model } = config;

-
-        const start = new Date().getTime();
-        const generatePartEmbeddings = async (partContent: string, i: number) => {
-            const localStart = new Date().getTime();
-            try {
-                log.info(`Generating embeddings for part ${i}`, {
-                    text_len: partContent.length,
-                });
-                if (!partContent) {
-                    return {
-                        id: i,
-                        number: i,
-                        result: null,
-                        status: "skipped",
-                        message: "no text found",
-                    };
-                }
+    const partDefinitions = parts ?? [];

-
-
-
-
-
-
-
-
-
-                });
-                return null;
+    // Count tokens if not already done
+    if (!document.tokens?.count && type === SupportedEmbeddingTypes.text) {
+        log.debug("Updating token count for document: " + document.id);
+        const tokensData = countTokens(document.text!);
+        await client.objects.update(document.id, {
+            tokens: {
+                ...tokensData,
+                etag: document.text_etag ?? md5(document.text!),
+            },
        });
+        document.tokens = {
+            ...tokensData,
+            etag: document.text_etag ?? md5(document.text!),
+        };
+    }

-
-
-
-
-
-
-
+    const maxTokens = config.max_tokens ?? 8000;
+
+    //generate embeddings for the main doc if document isn't too large
+    //if too large, we'll just generate embeddings for the parts
+    //then we can generate embeddings for the main document by averaging the tensors
+    log.info(`Generating ${type} embeddings for document ${document.id}`);
+    if (
+        type === SupportedEmbeddingTypes.text &&
+        document.tokens?.count &&
+        document.tokens?.count > maxTokens
+    ) {
+        log.info("Document too large, generating embeddings for parts");
+
+        if (!document.text) {
+            return {
+                id: document.id,
+                status: "failed",
+                message: "no text found",
+            };
        }

-        if (
-
-
-
-
-
-
+        if (!partDefinitions || partDefinitions.length === 0) {
+            log.info(
+                "No parts found for document, skipping embeddings generation",
+            );
+            return {
+                id: document.id,
+                status: "failed",
+                message: "no parts found",
+            };
        }
-
-
-
+
+        log.info("Generating embeddings for parts", {
+            parts: partDefinitions,
+            max_tokens: maxTokens,
        });
+        const docParts = getContentParts(document.text, partDefinitions);
+
+        log.info(`Retrieved ${docParts.length} parts`);
+        const start = new Date().getTime();
+        const generatePartEmbeddings = async (
+            partContent: string,
+            i: number,
+        ) => {
+            const localStart = new Date().getTime();
+            try {
+                log.info(`Generating embeddings for part ${i}`, {
+                    text_len: partContent.length,
+                });
+                if (!partContent) {
+                    return {
+                        id: i,
+                        number: i,
+                        result: null,
+                        status: "skipped",
+                        message: "no text found",
+                    };
+                }
+
+                const e = await generateEmbeddingsFromStudio(
+                    partContent,
+                    environment,
+                    client,
+                    model,
+                ).catch((e) => {
+                    log.error("Error generating embeddings for part " + i, {
+                        text_length: partContent.length,
+                        error: e,
+                    });
+                    return null;
+                });
+
+                if (!e || !e.values) {
+                    return {
+                        id: i,
+                        number: i,
+                        result: null,
+                        message: "no embeddings generated",
+                    };
+                }
+
+                if (e.values.length === 0) {
+                    return {
+                        id: i,
+                        number: i,
+                        result: null,
+                        message: "no embeddings generated",
+                    };
+                }
+                log.info(`Generated embeddings for part ${i}`, {
+                    len: e.values.length,
+                    duration: new Date().getTime() - localStart,
+                });
+
+                return { number: i, result: e };
+            } catch (err: any) {
+                log.info(
+                    `Error generating ${type} embeddings for part ${i} of ${document.id}`,
+                    { error: err },
+                );
+                return {
+                    number: i,
+                    result: null,
+                    message: "error generating embeddings",
+                    error: err.message,
+                };
+            }
+        };

-
-
+        const partEmbeddings = await Promise.all(
+            docParts.map((part, i) => generatePartEmbeddings(part, i)),
+        );
+        const validPartEmbeddings = partEmbeddings
+            .filter((e) => e.result !== null)
+            .map((e) => e.result);
+        const averagedEmbedding = computeAttentionEmbedding(
+            validPartEmbeddings.map((e) => e.values),
+        );
        log.info(
-
-
+            `Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`,
+            {
+                len: averagedEmbedding.length,
+                count: validPartEmbeddings.length,
+                max_tokens: maxTokens,
+            },
        );
+        await client.objects.setEmbedding(document.id, type, {
+            values: averagedEmbedding,
+            model: validPartEmbeddings[0].model,
+            etag: document.text_etag,
+        });
+        log.info(`Object ${document.id} embedding set`, {
+            type,
+            len: averagedEmbedding.length,
+        });
+    } else {
+        log.info(`Generating ${type} embeddings for document`);
+
+        const res = await generateEmbeddingsFromStudio(
+            JSON.stringify(document[type]),
+            environment,
+            client,
+        );
+        if (!res || !res.values) {
+            return {
+                id: document.id,
+                status: "failed",
+                message: "no embeddings generated",
+            };
+        }
+
+        log.info(`${type} embeddings generated for document ${document.id}`, {
+            len: res.values.length,
+        });
+        await client.objects.setEmbedding(document.id, type, {
+            values: res.values,
+            model: res.model,
+            etag: document.text_etag,
+        });
+
        return {
-
-
-
-
+            id: document.id,
+            type,
+            status: "completed",
+            len: res.values.length,
        };
-
-
+    }
+}

-
-
-
-
-
-
-
-
-        );
-        log.info(
-            `Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`,
-            {
-                len: averagedEmbedding.length,
-                count: validPartEmbeddings.length,
-                max_tokens: maxTokens,
-            },
-        );
-        await client.objects.setEmbedding(document.id, type, {
-            values: averagedEmbedding,
-            model: validPartEmbeddings[0].model,
-            etag: document.text_etag,
+async function generateImageEmbeddings({
+    document,
+    client,
+    type,
+    config,
+}: ExecuteGenerateEmbeddingsParams) {
+    log.info("Generating image embeddings for document " + document.id, {
+        content: document.content,
    });
-
-
-
+    if (
+        !document.content?.type?.startsWith("image/") &&
+        !document.content?.type?.includes("pdf")
+    ) {
+        return {
+            id: document.id,
+            type,
+            status: "failed",
+            message: "content is not an image",
+        };
+    }
+    const { environment, model } = config;
+
+    const resRnd = await client.store.objects.getRendition(document.id, {
+        format: ImageRenditionFormat.jpeg,
+        max_hw: 1024,
+        generate_if_missing: true,
+        sign_url: true,
    });
-    } else {
-        log.info(`Generating ${type} embeddings for document`);

-
-
-
-
-
-
-
-
-                status: "failed",
-                message: "no embeddings generated",
-            };
+    if (resRnd.status === "generating") {
+        throw new Error("Rendition is generating, will retry later");
+    } else if (
+        resRnd.status === "failed" ||
+        !resRnd.renditions ||
+        !resRnd.renditions.length
+    ) {
+        throw new NoDocumentFound("Rendition retrieval failed", [document.id]);
    }

-
-
-
-
-
-
-            etag: document.text_etag,
-        });
+    const renditions = resRnd.renditions;
+    if (!renditions?.length) {
+        throw new NoDocumentFound("No source found in rendition", [
+            document.id,
+        ]);
+    }

-
-
-
-
-
-
-
-    }
+    const rendition = renditions[0];
+    const image = await fetchBlobAsBase64(client, rendition);
+
+    const res = await client.environments
+        .embeddings(environment, {
+            image,
+            model,
+        })
+        .then((res) => res)
+        .catch((e) => {
+            log.error("Error generating embeddings for image", { error: e });
+            throw e;
+        });

-
-
-
-
-
-    }
-
-
-
-
-
-
-
-
-
-
-
-            message: "content is not an image",
-        };
-    }
-    const { environment, model } = config;
-
-    const resRnd = await client.store.objects.getRendition(document.id, {
-        format: ImageRenditionFormat.jpeg,
-        max_hw: 1024,
-        generate_if_missing: true,
-        sign_url: false,
-    });
-
-    if (resRnd.status === "generating") {
-        throw new Error("Rendition is generating, will retry later");
-    } else if (
-        resRnd.status === "failed" ||
-        !resRnd.renditions ||
-        !resRnd.renditions.length
-    ) {
-        throw new NoDocumentFound("Rendition retrieval failed", [document.id]);
-    }
-
-    const renditions = resRnd.renditions;
-    if (!renditions?.length) {
-        throw new NoDocumentFound("No source found in rendition", [document.id]);
-    }
-
-    const rendition = renditions[0];
-    const image = await fetchBlobAsBase64(client, rendition);
-
-    const res = await client.environments
-        .embeddings(environment, {
-            image,
-            model,
-        })
-        .then((res) => res)
-        .catch((e) => {
-            log.error("Error generating embeddings for image", { error: e });
-            throw e;
-        });
+    if (!res || !res.values) {
+        return {
+            id: document.id,
+            status: "failed",
+            message: "no embeddings generated",
+        };
+    }
+
+    await client.objects.setEmbedding(
+        document.id,
+        SupportedEmbeddingTypes.image,
+        {
+            values: res.values,
+            model: res.model,
+            etag: document.text_etag,
+        },
+    );

-    if (!res || !res.values) {
    return {
-
-
-
+        id: document.id,
+        type,
+        status: "completed",
+        len: res.values.length,
    };
-    }
-
-    await client.objects.setEmbedding(
-        document.id,
-        SupportedEmbeddingTypes.image,
-        {
-            values: res.values,
-            model: res.model,
-            etag: document.text_etag,
-        },
-    );
-
-    return { id: document.id, type, status: "completed", len: res.values.length };
 }

 async function generateEmbeddingsFromStudio(
-
-
-
-
+    text: string,
+    env: string,
+    client: VertesiaClient,
+    model?: string,
 ): Promise<EmbeddingsResult> {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    log.info(
+        `Generating embeddings for text of ${text.length} chars with environment ${env}`,
+    );
+
+    return client.environments
+        .embeddings(env, {
+            text,
+            model,
+        })
+        .then((res) => res)
+        .catch((e) => {
+            log.error("Error generating embeddings for text", { error: e });
+            throw e;
+        });
 }

 //Simplified attention mechanism
 // This is a naive implementation and should be replaced with a more sophisticated
 // using tensorflow in a specific package
 function computeAttentionEmbedding(chunkEmbeddings: number[][]): number[] {
-
+    if (chunkEmbeddings.length === 0) return [];

-
+    const start = new Date().getTime();

-
-
+    // Generate random attention weights
+    const attentionWeights = chunkEmbeddings.map(() => Math.random());

-
-
-
-
+    // Apply softmax to get attention scores
+    const expWeights = attentionWeights.map((w) => Math.exp(w));
+    const sumExpWeights = expWeights.reduce((sum, val) => sum + val, 0);
+    const attentionScores = expWeights.map((w) => w / sumExpWeights);

-
-
+    // Get embedding dimension
+    const embeddingDim = chunkEmbeddings[0].length;

-
-
+    // Initialize document embedding
+    const documentEmbedding = new Array(embeddingDim).fill(0);

-
-
-
-
+    // Weighted sum of embeddings
+    for (let i = 0; i < chunkEmbeddings.length; i++) {
+        for (let j = 0; j < embeddingDim; j++) {
+            documentEmbedding[j] += chunkEmbeddings[i][j] * attentionScores[i];
+        }
    }
-    }

-
-
-
-
+    const duration = new Date().getTime() - start;
+    console.log(
+        `Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`,
+    );

-
+    return documentEmbedding;
 }