@vertesia/workflow 0.81.0 → 0.81.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vertesia/workflow",
3
- "version": "0.81.0",
3
+ "version": "0.81.1",
4
4
  "type": "module",
5
5
  "description": "Vertesia workflow DSL",
6
6
  "main": "./lib/esm/index.js",
@@ -44,11 +44,11 @@
44
44
  "tmp": "^0.2.4",
45
45
  "tmp-promise": "^3.0.3",
46
46
  "yaml": "^2.6.0",
47
+ "@vertesia/memory": "0.81.1",
48
+ "@vertesia/common": "0.81.1",
49
+ "@vertesia/client": "0.81.1",
47
50
  "@llumiverse/common": "0.24.0",
48
- "@vertesia/common": "0.81.0",
49
- "@vertesia/client": "0.81.0",
50
- "@vertesia/api-fetch-client": "0.81.0",
51
- "@vertesia/memory": "0.81.0"
51
+ "@vertesia/api-fetch-client": "0.81.1"
52
52
  },
53
53
  "ts_dual_module": {
54
54
  "outDir": "lib",
@@ -11,8 +11,8 @@ import {
11
11
  } from "@vertesia/common";
12
12
  import { setupActivity } from "../dsl/setup/ActivityContext.js";
13
13
  import { DocumentNotFoundError } from "../errors.js";
14
- import { fetchBlobAsBase64, md5 } from "../utils/blobs.js";
15
- import { DocPart, getContentParts } from "../utils/chunks.js";
14
+ import { fetchBlobAsBase64 } from "../utils/blobs.js";
15
+ import { DocPart } from "../utils/chunks.js";
16
16
  import { countTokens } from "../utils/tokens.js";
17
17
 
18
18
  export interface GenerateEmbeddingsParams {
@@ -155,8 +155,7 @@ interface ExecuteGenerateEmbeddingsParams {
155
155
  }
156
156
 
157
157
  async function generateTextEmbeddings(
158
- { document, client, type, config }: ExecuteGenerateEmbeddingsParams,
159
- parts?: DocPart[],
158
+ { document, client, type, config }: ExecuteGenerateEmbeddingsParams
160
159
  ) {
161
160
 
162
161
  if (!document) {
@@ -185,161 +184,32 @@ async function generateTextEmbeddings(
185
184
  };
186
185
  }
187
186
 
188
- const { environment, model } = config;
187
+ const { environment } = config;
189
188
 
190
- const partDefinitions = parts ?? [];
191
-
192
- // Count tokens if not already done
193
- if (!document.tokens?.count && type === SupportedEmbeddingTypes.text) {
194
- log.debug("Updating token count for document: " + document.id);
195
- const tokensData = countTokens(document.text!);
196
- await client.objects.update(document.id, {
197
- tokens: {
198
- ...tokensData,
199
- etag: document.text_etag ?? md5(document.text!),
200
- },
201
- });
202
- document.tokens = {
203
- ...tokensData,
204
- etag: document.text_etag ?? md5(document.text!),
205
- };
189
+ // Count tokens if needed, do not rely on existing token count
190
+ let tokenCount : number | undefined = undefined;
191
+ if (type === SupportedEmbeddingTypes.text && document.text) {
192
+ tokenCount = countTokens(document.text).count;
206
193
  }
207
194
 
208
195
  const maxTokens = config.max_tokens ?? 8000;
209
196
 
210
197
  //generate embeddings for the main doc if document isn't too large
211
- //if too large, we'll just generate embeddings for the parts
212
- //then we can generate embeddings for the main document by averaging the tensors
213
198
  log.info(`Generating ${type} embeddings for document ${document.id}`);
214
199
  if (
215
200
  type === SupportedEmbeddingTypes.text &&
216
- document.tokens?.count &&
217
- document.tokens?.count > maxTokens
201
+ tokenCount !== undefined &&
202
+ tokenCount > maxTokens
218
203
  ) {
219
- log.info("Document too large, generating embeddings for parts");
220
-
221
- if (!document.text) {
222
- return {
223
- id: document.id,
224
- status: "failed",
225
- message: "no text found",
226
- };
227
- }
228
-
229
- if (!partDefinitions || partDefinitions.length === 0) {
230
- log.info(
231
- "No parts found for document, skipping embeddings generation",
232
- );
233
- return {
234
- id: document.id,
235
- status: "failed",
236
- message: "no parts found",
237
- };
238
- }
239
-
240
- log.info("Generating embeddings for parts", {
241
- parts: partDefinitions,
242
- max_tokens: maxTokens,
243
- });
244
- const docParts = getContentParts(document.text, partDefinitions);
245
-
246
- log.info(`Retrieved ${docParts.length} parts`);
247
- const start = new Date().getTime();
248
- const generatePartEmbeddings = async (
249
- partContent: string,
250
- i: number,
251
- ) => {
252
- const localStart = new Date().getTime();
253
- try {
254
- log.info(`Generating embeddings for part ${i}`, {
255
- text_len: partContent.length,
256
- });
257
- if (!partContent) {
258
- return {
259
- id: i,
260
- number: i,
261
- result: null,
262
- status: "skipped",
263
- message: "no text found",
264
- };
265
- }
266
-
267
- const e = await generateEmbeddingsFromStudio(
268
- partContent,
269
- environment,
270
- client,
271
- model,
272
- ).catch((e) => {
273
- log.error("Error generating embeddings for part " + i, {
274
- text_length: partContent.length,
275
- error: e,
276
- });
277
- return null;
278
- });
279
-
280
- if (!e || !e.values) {
281
- return {
282
- id: i,
283
- number: i,
284
- result: null,
285
- message: "no embeddings generated",
286
- };
287
- }
288
-
289
- if (e.values.length === 0) {
290
- return {
291
- id: i,
292
- number: i,
293
- result: null,
294
- message: "no embeddings generated",
295
- };
296
- }
297
- log.info(`Generated embeddings for part ${i}`, {
298
- len: e.values.length,
299
- duration: new Date().getTime() - localStart,
300
- });
301
-
302
- return { number: i, result: e };
303
- } catch (err: any) {
304
- log.info(
305
- `Error generating ${type} embeddings for part ${i} of ${document.id}`,
306
- { error: err },
307
- );
308
- return {
309
- number: i,
310
- result: null,
311
- message: "error generating embeddings",
312
- error: err.message,
313
- };
314
- }
315
- };
316
-
317
- const partEmbeddings = await Promise.all(
318
- docParts.map((part, i) => generatePartEmbeddings(part, i)),
319
- );
320
- const validPartEmbeddings = partEmbeddings
321
- .filter((e) => e.result !== null)
322
- .map((e) => e.result);
323
- const averagedEmbedding = computeAttentionEmbedding(
324
- validPartEmbeddings.map((e) => e.values),
325
- );
326
- log.info(
327
- `Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`,
328
- {
329
- len: averagedEmbedding.length,
330
- count: validPartEmbeddings.length,
331
- max_tokens: maxTokens,
332
- },
204
+ //TODO: Review strategy for large documents
205
+ log.warn(
206
+ `Document too large for ${type} embeddings generation, skipping (${tokenCount} tokens)`,
333
207
  );
334
- await client.objects.setEmbedding(document.id, type, {
335
- values: averagedEmbedding,
336
- model: validPartEmbeddings[0].model,
337
- etag: document.text_etag,
338
- });
339
- log.info(`Object ${document.id} embedding set`, {
340
- type,
341
- len: averagedEmbedding.length,
342
- });
208
+ return {
209
+ id: document.id,
210
+ status: "skipped",
211
+ message: `${type} embeddings generation, skipped for large document (${tokenCount} tokens)`,
212
+ }
343
213
  } else {
344
214
  log.info(`Generating ${type} embeddings for document`);
345
215
 
@@ -480,40 +350,3 @@ async function generateEmbeddingsFromStudio(
480
350
  throw e;
481
351
  });
482
352
  }
483
-
484
- //Simplified attention mechanism
485
- // This is a naive implementation and should be replaced with a more sophisticated
486
- // using tensorflow in a specific package
487
- function computeAttentionEmbedding(chunkEmbeddings: number[][]): number[] {
488
- if (chunkEmbeddings.length === 0) return [];
489
-
490
- const start = new Date().getTime();
491
-
492
- // Generate random attention weights
493
- const attentionWeights = chunkEmbeddings.map(() => Math.random());
494
-
495
- // Apply softmax to get attention scores
496
- const expWeights = attentionWeights.map((w) => Math.exp(w));
497
- const sumExpWeights = expWeights.reduce((sum, val) => sum + val, 0);
498
- const attentionScores = expWeights.map((w) => w / sumExpWeights);
499
-
500
- // Get embedding dimension
501
- const embeddingDim = chunkEmbeddings[0].length;
502
-
503
- // Initialize document embedding
504
- const documentEmbedding = new Array(embeddingDim).fill(0);
505
-
506
- // Weighted sum of embeddings
507
- for (let i = 0; i < chunkEmbeddings.length; i++) {
508
- for (let j = 0; j < embeddingDim; j++) {
509
- documentEmbedding[j] += chunkEmbeddings[i][j] * attentionScores[i];
510
- }
511
- }
512
-
513
- const duration = new Date().getTime() - start;
514
- console.log(
515
- `Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`,
516
- );
517
-
518
- return documentEmbedding;
519
- }