@vertesia/workflow 0.60.0 → 0.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/lib/cjs/activities/executeInteraction.js +7 -1
  2. package/lib/cjs/activities/executeInteraction.js.map +1 -1
  3. package/lib/cjs/activities/generateEmbeddings.js +23 -6
  4. package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
  5. package/lib/cjs/activities/media/processPdfWithTextract.js +3 -2
  6. package/lib/cjs/activities/media/processPdfWithTextract.js.map +1 -1
  7. package/lib/cjs/activities/media/transcribeMediaWithGladia.js +1 -1
  8. package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -1
  9. package/lib/cjs/errors.js +16 -2
  10. package/lib/cjs/errors.js.map +1 -1
  11. package/lib/cjs/utils/client.js +6 -3
  12. package/lib/cjs/utils/client.js.map +1 -1
  13. package/lib/esm/activities/executeInteraction.js +7 -1
  14. package/lib/esm/activities/executeInteraction.js.map +1 -1
  15. package/lib/esm/activities/generateEmbeddings.js +23 -6
  16. package/lib/esm/activities/generateEmbeddings.js.map +1 -1
  17. package/lib/esm/activities/media/processPdfWithTextract.js +3 -2
  18. package/lib/esm/activities/media/processPdfWithTextract.js.map +1 -1
  19. package/lib/esm/activities/media/transcribeMediaWithGladia.js +1 -1
  20. package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -1
  21. package/lib/esm/errors.js +14 -1
  22. package/lib/esm/errors.js.map +1 -1
  23. package/lib/esm/utils/client.js +5 -3
  24. package/lib/esm/utils/client.js.map +1 -1
  25. package/lib/types/activities/executeInteraction.d.ts.map +1 -1
  26. package/lib/types/activities/generateEmbeddings.d.ts.map +1 -1
  27. package/lib/types/errors.d.ts +5 -0
  28. package/lib/types/errors.d.ts.map +1 -1
  29. package/lib/types/utils/client.d.ts +5 -0
  30. package/lib/types/utils/client.d.ts.map +1 -1
  31. package/lib/workflows-bundle.js +848 -230
  32. package/package.json +5 -6
  33. package/src/activities/executeInteraction.ts +8 -1
  34. package/src/activities/generateEmbeddings.ts +440 -418
  35. package/src/activities/media/processPdfWithTextract.ts +3 -3
  36. package/src/activities/media/transcribeMediaWithGladia.ts +1 -1
  37. package/src/errors.ts +17 -1
  38. package/src/utils/client.ts +5 -5
@@ -2,12 +2,12 @@ import { EmbeddingsResult } from "@llumiverse/common";
 import { log } from "@temporalio/activity";
 import { VertesiaClient } from "@vertesia/client";
 import {
-  ContentObject,
-  DSLActivityExecutionPayload,
-  DSLActivitySpec,
-  ImageRenditionFormat,
-  ProjectConfigurationEmbeddings,
-  SupportedEmbeddingTypes,
+    ContentObject,
+    DSLActivityExecutionPayload,
+    DSLActivitySpec,
+    ImageRenditionFormat,
+    ProjectConfigurationEmbeddings,
+    SupportedEmbeddingTypes,
 } from "@vertesia/common";
 import { setupActivity } from "../dsl/setup/ActivityContext.js";
 import { NoDocumentFound } from "../errors.js";
@@ -16,486 +16,508 @@ import { DocPart, getContentParts } from "../utils/chunks.js";
 import { countTokens } from "../utils/tokens.js";

 export interface GenerateEmbeddingsParams {
-  /**
-   * The model to use for embedding generation
-   * If not set, the default model for the project will be used
-   */
-  model?: string;
-
-  /**
-   * The environment to use for embedding generation
-   * If not set, the default environment for the project will be used
-   */
-  environment?: string;
-
-  /**
-   * If true, force embedding generation even if the document already has embeddings
-   */
-  force?: boolean;
-
-  /**
-   * The embedding type to generate
-   */
-  type: SupportedEmbeddingTypes;
-
-  /**
-   * The DocParts to use for long documents
-   */
-  parts?: DocPart[];
+    /**
+     * The model to use for embedding generation
+     * If not set, the default model for the project will be used
+     */
+    model?: string;
+
+    /**
+     * The environment to use for embedding generation
+     * If not set, the default environment for the project will be used
+     */
+    environment?: string;
+
+    /**
+     * If true, force embedding generation even if the document already has embeddings
+     */
+    force?: boolean;
+
+    /**
+     * The embedding type to generate
+     */
+    type: SupportedEmbeddingTypes;
+
+    /**
+     * The DocParts to use for long documents
+     */
+    parts?: DocPart[];
 }

 export interface GenerateEmbeddings
-  extends DSLActivitySpec<GenerateEmbeddingsParams> {
-  name: "generateEmbeddings";
+    extends DSLActivitySpec<GenerateEmbeddingsParams> {
+    name: "generateEmbeddings";
 }

 export async function generateEmbeddings(
-  payload: DSLActivityExecutionPayload<GenerateEmbeddingsParams>,
+    payload: DSLActivityExecutionPayload<GenerateEmbeddingsParams>,
 ) {
-  const { params, client, objectId, fetchProject } =
-    await setupActivity<GenerateEmbeddingsParams>(payload);
-  const { force, type } = params;
-
-  const projectData = await fetchProject();
-  const config = projectData?.configuration.embeddings[type];
-  if (!projectData) {
-    throw new NoDocumentFound("Project not found", [payload.project_id]);
-  }
-  if (!config) {
-    throw new NoDocumentFound("Embeddings configuration not found", [objectId]);
-  }
-
-  if (!projectData) {
-    throw new NoDocumentFound("Project not found", [payload.project_id]);
-  }
-
-  if (!projectData?.configuration.embeddings[type]?.enabled) {
-    log.info(
-      `Embeddings generation disabled for type ${type} on project: ${projectData.name} (${projectData.namespace})`,
-      { config },
-    );
-    return {
-      id: objectId,
-      status: "skipped",
-      message: `Embeddings generation disabled for type ${type}`,
-    };
-  }
+    const { params, client, objectId, fetchProject } =
+        await setupActivity<GenerateEmbeddingsParams>(payload);
+    const { force, type } = params;
+
+    const projectData = await fetchProject();
+    const config = projectData?.configuration.embeddings[type];
+    if (!projectData) {
+        throw new NoDocumentFound("Project not found", [payload.project_id]);
+    }
+    if (!config) {
+        throw new NoDocumentFound("Embeddings configuration not found", [
+            objectId,
+        ]);
+    }

-  log.info(`${type} embedding generation starting for object ${objectId}`, {
-    force,
-    config,
-  });
+    if (!projectData) {
+        throw new NoDocumentFound("Project not found", [payload.project_id]);
+    }

-  if (!config.environment) {
-    throw new Error(
-      "No environment found in project configuration. Set environment in project configuration to generate embeddings.",
-    );
-  }
+    if (!projectData?.configuration.embeddings[type]?.enabled) {
+        log.info(
+            `Embeddings generation disabled for type ${type} on project: ${projectData.name} (${projectData.namespace})`,
+            { config },
+        );
+        return {
+            id: objectId,
+            status: "skipped",
+            message: `Embeddings generation disabled for type ${type}`,
+        };
+    }

-  const document = await client.objects.retrieve(
-    objectId,
-    "+text +parts +embeddings +tokens +properties",
-  );
+    log.info(`${type} embedding generation starting for object ${objectId}`, {
+        force,
+        config,
+    });

-  if (!document) {
-    throw new NoDocumentFound("Document not found", [objectId]);
-  }
+    if (!config.environment) {
+        throw new Error(
+            "No environment found in project configuration. Set environment in project configuration to generate embeddings.",
+        );
+    }

-  if (!document.content) {
-    throw new NoDocumentFound("Document content not found", [objectId]);
-  }
+    const document = await client.objects.retrieve(
+        objectId,
+        "+text +parts +embeddings +tokens +properties",
+    );

-  let res;
+    if (!document) {
+        throw new NoDocumentFound("Document not found", [objectId]);
+    }

-  switch (type) {
-    case SupportedEmbeddingTypes.text:
-      res = await generateTextEmbeddings({
-        client,
-        config,
-        document,
-        type,
-      });
-      break;
-    case SupportedEmbeddingTypes.properties:
-      res = await generateTextEmbeddings({
-        client,
-        config,
-        document,
-        type,
-      });
-      break;
-    case SupportedEmbeddingTypes.image:
-      res = await generateImageEmbeddings({
-        client,
-        config,
-        document,
-        type,
-      });
-      break;
-    default:
-      res = {
-        id: objectId,
-        status: "failed",
-        message: `unsupported embedding type: ${type}`,
-      };
-  }
-
-  return res;
+    if (!document.content) {
+        throw new NoDocumentFound("Document content not found", [objectId]);
+    }
+
+    let res;
+
+    switch (type) {
+        case SupportedEmbeddingTypes.text:
+            res = await generateTextEmbeddings({
+                client,
+                config,
+                document,
+                type,
+            });
+            break;
+        case SupportedEmbeddingTypes.properties:
+            res = await generateTextEmbeddings({
+                client,
+                config,
+                document,
+                type,
+            });
+            break;
+        case SupportedEmbeddingTypes.image:
+            res = await generateImageEmbeddings({
+                client,
+                config,
+                document,
+                type,
+            });
+            break;
+        default:
+            res = {
+                id: objectId,
+                status: "failed",
+                message: `unsupported embedding type: ${type}`,
+            };
+    }
+
+    return res;
 }

 interface ExecuteGenerateEmbeddingsParams {
-  document: ContentObject;
-  client: VertesiaClient;
-  type: SupportedEmbeddingTypes;
-  config: ProjectConfigurationEmbeddings;
-  property?: string;
-  force?: boolean;
+    document: ContentObject;
+    client: VertesiaClient;
+    type: SupportedEmbeddingTypes;
+    config: ProjectConfigurationEmbeddings;
+    property?: string;
+    force?: boolean;
 }

 async function generateTextEmbeddings(
-  { document, client, type, config }: ExecuteGenerateEmbeddingsParams,
-  parts?: DocPart[],
+    { document, client, type, config }: ExecuteGenerateEmbeddingsParams,
+    parts?: DocPart[],
 ) {
-  // if (!force && document.embeddings[type]?.etag === (document.text_etag ?? md5(document.text))) {
-  //   return { id: objectId, status: "skipped", message: "embeddings already generated" }
-  // }
-
-  if (!document) {
-    return { status: "error", message: "document is null or undefined" };
-  }
-
-  if (
-    type !== SupportedEmbeddingTypes.text &&
-    type !== SupportedEmbeddingTypes.properties
-  ) {
-    return {
-      id: document.id,
-      status: "failed",
-      message: `unsupported embedding type: ${type}`,
-    };
-  }
-
-  if (type === SupportedEmbeddingTypes.text && !document.text) {
-    return { id: document.id, status: "failed", message: "no text found" };
-  }
-  if (type === SupportedEmbeddingTypes.properties && !document?.properties) {
-    return {
-      id: document.id,
-      status: "failed",
-      message: "no properties found",
-    };
-  }
+    // if (!force && document.embeddings[type]?.etag === (document.text_etag ?? md5(document.text))) {
+    //   return { id: objectId, status: "skipped", message: "embeddings already generated" }
+    // }

-  const { environment, model } = config;
-
-  const partDefinitions = parts ?? [];
+    if (!document) {
+        return { status: "error", message: "document is null or undefined" };
+    }

-  // Count tokens if not already done
-  if (!document.tokens?.count && type === SupportedEmbeddingTypes.text) {
-    log.debug("Updating token count for document: " + document.id);
-    const tokensData = countTokens(document.text!);
-    await client.objects.update(document.id, {
-      tokens: {
-        ...tokensData,
-        etag: document.text_etag ?? md5(document.text!),
-      },
-    });
-    document.tokens = {
-      ...tokensData,
-      etag: document.text_etag ?? md5(document.text!),
-    };
-  }
-
-  const maxTokens = config.max_tokens ?? 8000;
-
-  //generate embeddings for the main doc if document isn't too large
-  //if too large, we'll just generate embeddings for the parts
-  //then we can generate embeddings for the main document by averaging the tensors
-  log.info(`Generating ${type} embeddings for document ${document.id}`);
-  if (
-    type === SupportedEmbeddingTypes.text &&
-    document.tokens?.count &&
-    document.tokens?.count > maxTokens
-  ) {
-    log.info("Document too large, generating embeddings for parts");
-
-    if (!document.text) {
-      return { id: document.id, status: "failed", message: "no text found" };
+    if (
+        type !== SupportedEmbeddingTypes.text &&
+        type !== SupportedEmbeddingTypes.properties
+    ) {
+        return {
+            id: document.id,
+            status: "failed",
+            message: `unsupported embedding type: ${type}`,
+        };
     }

-    if (!partDefinitions || partDefinitions.length === 0) {
-      log.info("No parts found for document, skipping embeddings generation");
-      return { id: document.id, status: "failed", message: "no parts found" };
+    if (type === SupportedEmbeddingTypes.text && !document.text) {
+        return { id: document.id, status: "failed", message: "no text found" };
+    }
+    if (type === SupportedEmbeddingTypes.properties && !document?.properties) {
+        return {
+            id: document.id,
+            status: "failed",
+            message: "no properties found",
+        };
     }

-    log.info("Generating embeddings for parts", {
-      parts: partDefinitions,
-      max_tokens: maxTokens,
-    });
-    const docParts = getContentParts(document.text, partDefinitions);
+    const { environment, model } = config;

-    log.info(`Retrieved ${docParts.length} parts`);
-    const start = new Date().getTime();
-    const generatePartEmbeddings = async (partContent: string, i: number) => {
-      const localStart = new Date().getTime();
-      try {
-        log.info(`Generating embeddings for part ${i}`, {
-          text_len: partContent.length,
-        });
-        if (!partContent) {
-          return {
-            id: i,
-            number: i,
-            result: null,
-            status: "skipped",
-            message: "no text found",
-          };
-        }
+    const partDefinitions = parts ?? [];

-        const e = await generateEmbeddingsFromStudio(
-          partContent,
-          environment,
-          client,
-          model,
-        ).catch((e) => {
-          log.error("Error generating embeddings for part " + i, {
-            text_length: partContent.length,
-            error: e,
-          });
-          return null;
+    // Count tokens if not already done
+    if (!document.tokens?.count && type === SupportedEmbeddingTypes.text) {
+        log.debug("Updating token count for document: " + document.id);
+        const tokensData = countTokens(document.text!);
+        await client.objects.update(document.id, {
+            tokens: {
+                ...tokensData,
+                etag: document.text_etag ?? md5(document.text!),
+            },
         });
+        document.tokens = {
+            ...tokensData,
+            etag: document.text_etag ?? md5(document.text!),
+        };
+    }

-        if (!e || !e.values) {
-          return {
-            id: i,
-            number: i,
-            result: null,
-            message: "no embeddings generated",
-          };
+    const maxTokens = config.max_tokens ?? 8000;
+
+    //generate embeddings for the main doc if document isn't too large
+    //if too large, we'll just generate embeddings for the parts
+    //then we can generate embeddings for the main document by averaging the tensors
+    log.info(`Generating ${type} embeddings for document ${document.id}`);
+    if (
+        type === SupportedEmbeddingTypes.text &&
+        document.tokens?.count &&
+        document.tokens?.count > maxTokens
+    ) {
+        log.info("Document too large, generating embeddings for parts");
+
+        if (!document.text) {
+            return {
+                id: document.id,
+                status: "failed",
+                message: "no text found",
+            };
         }

-        if (e.values.length === 0) {
-          return {
-            id: i,
-            number: i,
-            result: null,
-            message: "no embeddings generated",
-          };
+        if (!partDefinitions || partDefinitions.length === 0) {
+            log.info(
+                "No parts found for document, skipping embeddings generation",
+            );
+            return {
+                id: document.id,
+                status: "failed",
+                message: "no parts found",
+            };
         }
-        log.info(`Generated embeddings for part ${i}`, {
-          len: e.values.length,
-          duration: new Date().getTime() - localStart,
+
+        log.info("Generating embeddings for parts", {
+            parts: partDefinitions,
+            max_tokens: maxTokens,
         });
+        const docParts = getContentParts(document.text, partDefinitions);
+
+        log.info(`Retrieved ${docParts.length} parts`);
+        const start = new Date().getTime();
+        const generatePartEmbeddings = async (
+            partContent: string,
+            i: number,
+        ) => {
+            const localStart = new Date().getTime();
+            try {
+                log.info(`Generating embeddings for part ${i}`, {
+                    text_len: partContent.length,
+                });
+                if (!partContent) {
+                    return {
+                        id: i,
+                        number: i,
+                        result: null,
+                        status: "skipped",
+                        message: "no text found",
+                    };
+                }
+
+                const e = await generateEmbeddingsFromStudio(
+                    partContent,
+                    environment,
+                    client,
+                    model,
+                ).catch((e) => {
+                    log.error("Error generating embeddings for part " + i, {
+                        text_length: partContent.length,
+                        error: e,
+                    });
+                    return null;
+                });
+
+                if (!e || !e.values) {
+                    return {
+                        id: i,
+                        number: i,
+                        result: null,
+                        message: "no embeddings generated",
+                    };
+                }
+
+                if (e.values.length === 0) {
+                    return {
+                        id: i,
+                        number: i,
+                        result: null,
+                        message: "no embeddings generated",
+                    };
+                }
+                log.info(`Generated embeddings for part ${i}`, {
+                    len: e.values.length,
+                    duration: new Date().getTime() - localStart,
+                });
+
+                return { number: i, result: e };
+            } catch (err: any) {
+                log.info(
+                    `Error generating ${type} embeddings for part ${i} of ${document.id}`,
+                    { error: err },
+                );
+                return {
+                    number: i,
+                    result: null,
+                    message: "error generating embeddings",
+                    error: err.message,
+                };
+            }
+        };

-        return { number: i, result: e };
-      } catch (err: any) {
+        const partEmbeddings = await Promise.all(
+            docParts.map((part, i) => generatePartEmbeddings(part, i)),
+        );
+        const validPartEmbeddings = partEmbeddings
+            .filter((e) => e.result !== null)
+            .map((e) => e.result);
+        const averagedEmbedding = computeAttentionEmbedding(
+            validPartEmbeddings.map((e) => e.values),
+        );
         log.info(
-          `Error generating ${type} embeddings for part ${i} of ${document.id}`,
-          { error: err },
+            `Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`,
+            {
+                len: averagedEmbedding.length,
+                count: validPartEmbeddings.length,
+                max_tokens: maxTokens,
+            },
         );
+        await client.objects.setEmbedding(document.id, type, {
+            values: averagedEmbedding,
+            model: validPartEmbeddings[0].model,
+            etag: document.text_etag,
+        });
+        log.info(`Object ${document.id} embedding set`, {
+            type,
+            len: averagedEmbedding.length,
+        });
+    } else {
+        log.info(`Generating ${type} embeddings for document`);
+
+        const res = await generateEmbeddingsFromStudio(
+            JSON.stringify(document[type]),
+            environment,
+            client,
+        );
+        if (!res || !res.values) {
+            return {
+                id: document.id,
+                status: "failed",
+                message: "no embeddings generated",
+            };
+        }
+
+        log.info(`${type} embeddings generated for document ${document.id}`, {
+            len: res.values.length,
+        });
+        await client.objects.setEmbedding(document.id, type, {
+            values: res.values,
+            model: res.model,
+            etag: document.text_etag,
+        });
+
         return {
-          number: i,
-          result: null,
-          message: "error generating embeddings",
-          error: err.message,
+            id: document.id,
+            type,
+            status: "completed",
+            len: res.values.length,
         };
-      }
-    };
+    }
+}

-    const partEmbeddings = await Promise.all(
-      docParts.map((part, i) => generatePartEmbeddings(part, i)),
-    );
-    const validPartEmbeddings = partEmbeddings
-      .filter((e) => e.result !== null)
-      .map((e) => e.result);
-    const averagedEmbedding = computeAttentionEmbedding(
-      validPartEmbeddings.map((e) => e.values),
-    );
-    log.info(
-      `Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`,
-      {
-        len: averagedEmbedding.length,
-        count: validPartEmbeddings.length,
-        max_tokens: maxTokens,
-      },
-    );
-    await client.objects.setEmbedding(document.id, type, {
-      values: averagedEmbedding,
-      model: validPartEmbeddings[0].model,
-      etag: document.text_etag,
+async function generateImageEmbeddings({
+    document,
+    client,
+    type,
+    config,
+}: ExecuteGenerateEmbeddingsParams) {
+    log.info("Generating image embeddings for document " + document.id, {
+        content: document.content,
     });
-    log.info(`Object ${document.id} embedding set`, {
-      type,
-      len: averagedEmbedding.length,
+    if (
+        !document.content?.type?.startsWith("image/") &&
+        !document.content?.type?.includes("pdf")
+    ) {
+        return {
+            id: document.id,
+            type,
+            status: "failed",
+            message: "content is not an image",
+        };
+    }
+    const { environment, model } = config;
+
+    const resRnd = await client.store.objects.getRendition(document.id, {
+        format: ImageRenditionFormat.jpeg,
+        max_hw: 1024,
+        generate_if_missing: true,
+        sign_url: true,
     });
-  } else {
-    log.info(`Generating ${type} embeddings for document`);

-    const res = await generateEmbeddingsFromStudio(
-      JSON.stringify(document[type]),
-      environment,
-      client,
-    );
-    if (!res || !res.values) {
-      return {
-        id: document.id,
-        status: "failed",
-        message: "no embeddings generated",
-      };
+    if (resRnd.status === "generating") {
+        throw new Error("Rendition is generating, will retry later");
+    } else if (
+        resRnd.status === "failed" ||
+        !resRnd.renditions ||
+        !resRnd.renditions.length
+    ) {
+        throw new NoDocumentFound("Rendition retrieval failed", [document.id]);
     }

-    log.info(`${type} embeddings generated for document ${document.id}`, {
-      len: res.values.length,
-    });
-    await client.objects.setEmbedding(document.id, type, {
-      values: res.values,
-      model: res.model,
-      etag: document.text_etag,
-    });
+    const renditions = resRnd.renditions;
+    if (!renditions?.length) {
+        throw new NoDocumentFound("No source found in rendition", [
+            document.id,
+        ]);
+    }

-    return {
-      id: document.id,
-      type,
-      status: "completed",
-      len: res.values.length,
-    };
-  }
-}
+    const rendition = renditions[0];
+    const image = await fetchBlobAsBase64(client, rendition);
+
+    const res = await client.environments
+        .embeddings(environment, {
+            image,
+            model,
+        })
+        .then((res) => res)
+        .catch((e) => {
+            log.error("Error generating embeddings for image", { error: e });
+            throw e;
+        });

-async function generateImageEmbeddings({
-  document,
-  client,
-  type,
-  config,
-}: ExecuteGenerateEmbeddingsParams) {
-  log.info("Generating image embeddings for document " + document.id, {
-    content: document.content,
-  });
-  if (
-    !document.content?.type?.startsWith("image/") &&
-    !document.content?.type?.includes("pdf")
-  ) {
-    return {
-      id: document.id,
-      type,
-      status: "failed",
-      message: "content is not an image",
-    };
-  }
-  const { environment, model } = config;
-
-  const resRnd = await client.store.objects.getRendition(document.id, {
-    format: ImageRenditionFormat.jpeg,
-    max_hw: 1024,
-    generate_if_missing: true,
-    sign_url: false,
-  });
-
-  if (resRnd.status === "generating") {
-    throw new Error("Rendition is generating, will retry later");
-  } else if (
-    resRnd.status === "failed" ||
-    !resRnd.renditions ||
-    !resRnd.renditions.length
-  ) {
-    throw new NoDocumentFound("Rendition retrieval failed", [document.id]);
-  }
-
-  const renditions = resRnd.renditions;
-  if (!renditions?.length) {
-    throw new NoDocumentFound("No source found in rendition", [document.id]);
-  }
-
-  const rendition = renditions[0];
-  const image = await fetchBlobAsBase64(client, rendition);
-
-  const res = await client.environments
-    .embeddings(environment, {
-      image,
-      model,
-    })
-    .then((res) => res)
-    .catch((e) => {
-      log.error("Error generating embeddings for image", { error: e });
-      throw e;
-    });
+    if (!res || !res.values) {
+        return {
+            id: document.id,
+            status: "failed",
+            message: "no embeddings generated",
+        };
+    }
+
+    await client.objects.setEmbedding(
+        document.id,
+        SupportedEmbeddingTypes.image,
+        {
+            values: res.values,
+            model: res.model,
+            etag: document.text_etag,
+        },
+    );

-  if (!res || !res.values) {
     return {
-      id: document.id,
-      status: "failed",
-      message: "no embeddings generated",
+        id: document.id,
+        type,
+        status: "completed",
+        len: res.values.length,
     };
-  }
-
-  await client.objects.setEmbedding(
-    document.id,
-    SupportedEmbeddingTypes.image,
-    {
-      values: res.values,
-      model: res.model,
-      etag: document.text_etag,
-    },
-  );
-
-  return { id: document.id, type, status: "completed", len: res.values.length };
 }

 async function generateEmbeddingsFromStudio(
-  text: string,
-  env: string,
-  client: VertesiaClient,
-  model?: string,
+    text: string,
+    env: string,
+    client: VertesiaClient,
+    model?: string,
 ): Promise<EmbeddingsResult> {
-  log.info(
-    `Generating embeddings for text of ${text.length} chars with environment ${env}`,
-  );
-
-  return client.environments
-    .embeddings(env, {
-      text,
-      model,
-    })
-    .then((res) => res)
-    .catch((e) => {
-      log.error("Error generating embeddings for text", { error: e });
-      throw e;
-    });
+    log.info(
+        `Generating embeddings for text of ${text.length} chars with environment ${env}`,
+    );
+
+    return client.environments
+        .embeddings(env, {
+            text,
+            model,
+        })
+        .then((res) => res)
+        .catch((e) => {
+            log.error("Error generating embeddings for text", { error: e });
+            throw e;
+        });
 }

 //Simplified attention mechanism
 // This is a naive implementation and should be replaced with a more sophisticated
 // using tensorflow in a specific package
 function computeAttentionEmbedding(chunkEmbeddings: number[][]): number[] {
-  if (chunkEmbeddings.length === 0) return [];
+    if (chunkEmbeddings.length === 0) return [];

-  const start = new Date().getTime();
+    const start = new Date().getTime();

-  // Generate random attention weights
-  const attentionWeights = chunkEmbeddings.map(() => Math.random());
+    // Generate random attention weights
+    const attentionWeights = chunkEmbeddings.map(() => Math.random());

-  // Apply softmax to get attention scores
-  const expWeights = attentionWeights.map((w) => Math.exp(w));
-  const sumExpWeights = expWeights.reduce((sum, val) => sum + val, 0);
-  const attentionScores = expWeights.map((w) => w / sumExpWeights);
+    // Apply softmax to get attention scores
+    const expWeights = attentionWeights.map((w) => Math.exp(w));
+    const sumExpWeights = expWeights.reduce((sum, val) => sum + val, 0);
+    const attentionScores = expWeights.map((w) => w / sumExpWeights);

-  // Get embedding dimension
-  const embeddingDim = chunkEmbeddings[0].length;
+    // Get embedding dimension
+    const embeddingDim = chunkEmbeddings[0].length;

-  // Initialize document embedding
-  const documentEmbedding = new Array(embeddingDim).fill(0);
+    // Initialize document embedding
+    const documentEmbedding = new Array(embeddingDim).fill(0);

-  // Weighted sum of embeddings
-  for (let i = 0; i < chunkEmbeddings.length; i++) {
-    for (let j = 0; j < embeddingDim; j++) {
-      documentEmbedding[j] += chunkEmbeddings[i][j] * attentionScores[i];
+    // Weighted sum of embeddings
+    for (let i = 0; i < chunkEmbeddings.length; i++) {
+        for (let j = 0; j < embeddingDim; j++) {
+            documentEmbedding[j] += chunkEmbeddings[i][j] * attentionScores[i];
+        }
     }
-  }

-  const duration = new Date().getTime() - start;
-  console.log(
-    `Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`,
-  );
+    const duration = new Date().getTime() - start;
+    console.log(
+        `Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`,
+    );

-  return documentEmbedding;
+    return documentEmbedding;
 }
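
Note on the computeAttentionEmbedding helper at the end of the diff: it folds the per-part vectors into a single document vector using softmax-normalized random weights, i.e. a random convex combination of the chunk embeddings. A minimal standalone sketch of that strategy follows; the helper name and the two 4-dimensional chunk vectors are invented for illustration and are not part of the package.

    // Sketch of the weighted-average strategy used by the workflow above.
    // Softmax-normalized random weights make the output a convex
    // combination of the chunk embeddings.
    function weightedAverage(chunkEmbeddings: number[][]): number[] {
        if (chunkEmbeddings.length === 0) return [];

        // Random weights, normalized with softmax so they sum to 1
        const weights = chunkEmbeddings.map(() => Math.random());
        const exp = weights.map((w) => Math.exp(w));
        const total = exp.reduce((sum, v) => sum + v, 0);
        const scores = exp.map((v) => v / total);

        // Each output coordinate is a weighted sum of the corresponding
        // coordinates of the chunk embeddings
        const dim = chunkEmbeddings[0].length;
        const out = new Array(dim).fill(0);
        for (let i = 0; i < chunkEmbeddings.length; i++) {
            for (let j = 0; j < dim; j++) {
                out[j] += chunkEmbeddings[i][j] * scores[i];
            }
        }
        return out;
    }

    // Two made-up 4-dimensional part embeddings
    const doc = weightedAverage([
        [0.1, 0.2, 0.3, 0.4],
        [0.5, 0.6, 0.7, 0.8],
    ]);
    console.log(doc); // each entry lies between the two inputs' entries

Because the weights are random rather than learned, the result is an arbitrary blend of the parts, which is consistent with the in-code comment calling this a naive placeholder for a proper attention implementation.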