@vertesia/workflow 0.81.0 → 0.81.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/activities/generateEmbeddings.js +14 -144
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
- package/lib/esm/activities/generateEmbeddings.js +15 -145
- package/lib/esm/activities/generateEmbeddings.js.map +1 -1
- package/lib/types/activities/generateEmbeddings.d.ts +1 -1
- package/lib/types/activities/generateEmbeddings.d.ts.map +1 -1
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +1 -1
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +1 -1
- package/lib/workflows-bundle.js +12 -2
- package/package.json +5 -5
- package/src/activities/generateEmbeddings.ts +18 -185
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@vertesia/workflow",
-  "version": "0.81.0",
+  "version": "0.81.1",
   "type": "module",
   "description": "Vertesia workflow DSL",
   "main": "./lib/esm/index.js",
@@ -44,11 +44,11 @@
     "tmp": "^0.2.4",
     "tmp-promise": "^3.0.3",
     "yaml": "^2.6.0",
+    "@vertesia/memory": "0.81.1",
+    "@vertesia/common": "0.81.1",
+    "@vertesia/client": "0.81.1",
     "@llumiverse/common": "0.24.0",
-    "@vertesia/common": "0.81.0",
-    "@vertesia/client": "0.81.0",
-    "@vertesia/api-fetch-client": "0.81.0",
-    "@vertesia/memory": "0.81.0"
+    "@vertesia/api-fetch-client": "0.81.1"
   },
   "ts_dual_module": {
     "outDir": "lib",
package/src/activities/generateEmbeddings.ts
CHANGED
@@ -11,8 +11,8 @@ import {
 } from "@vertesia/common";
 import { setupActivity } from "../dsl/setup/ActivityContext.js";
 import { DocumentNotFoundError } from "../errors.js";
-import { fetchBlobAsBase64
-import { DocPart
+import { fetchBlobAsBase64 } from "../utils/blobs.js";
+import { DocPart } from "../utils/chunks.js";
 import { countTokens } from "../utils/tokens.js";
 
 export interface GenerateEmbeddingsParams {
@@ -155,8 +155,7 @@ interface ExecuteGenerateEmbeddingsParams {
 }
 
 async function generateTextEmbeddings(
-    { document, client, type, config }: ExecuteGenerateEmbeddingsParams,
-    parts?: DocPart[],
+    { document, client, type, config }: ExecuteGenerateEmbeddingsParams
 ) {
 
     if (!document) {
@@ -185,161 +184,32 @@ async function generateTextEmbeddings(
         };
     }
 
-    const { environment
+    const { environment } = config;
 
-
-
-
-
-        log.debug("Updating token count for document: " + document.id);
-        const tokensData = countTokens(document.text!);
-        await client.objects.update(document.id, {
-            tokens: {
-                ...tokensData,
-                etag: document.text_etag ?? md5(document.text!),
-            },
-        });
-        document.tokens = {
-            ...tokensData,
-            etag: document.text_etag ?? md5(document.text!),
-        };
+    // Count tokens if needed, do not rely on existing token count
+    let tokenCount: number | undefined = undefined;
+    if (type === SupportedEmbeddingTypes.text && document.text) {
+        tokenCount = countTokens(document.text).count;
     }
 
     const maxTokens = config.max_tokens ?? 8000;
 
     //generate embeddings for the main doc if document isn't too large
-    //if too large, we'll just generate embeddings for the parts
-    //then we can generate embeddings for the main document by averaging the tensors
     log.info(`Generating ${type} embeddings for document ${document.id}`);
     if (
         type === SupportedEmbeddingTypes.text &&
-
-
+        tokenCount !== undefined &&
+        tokenCount > maxTokens
     ) {
-
-
-
-            return {
-                id: document.id,
-                status: "failed",
-                message: "no text found",
-            };
-        }
-
-        if (!partDefinitions || partDefinitions.length === 0) {
-            log.info(
-                "No parts found for document, skipping embeddings generation",
-            );
-            return {
-                id: document.id,
-                status: "failed",
-                message: "no parts found",
-            };
-        }
-
-        log.info("Generating embeddings for parts", {
-            parts: partDefinitions,
-            max_tokens: maxTokens,
-        });
-        const docParts = getContentParts(document.text, partDefinitions);
-
-        log.info(`Retrieved ${docParts.length} parts`);
-        const start = new Date().getTime();
-        const generatePartEmbeddings = async (
-            partContent: string,
-            i: number,
-        ) => {
-            const localStart = new Date().getTime();
-            try {
-                log.info(`Generating embeddings for part ${i}`, {
-                    text_len: partContent.length,
-                });
-                if (!partContent) {
-                    return {
-                        id: i,
-                        number: i,
-                        result: null,
-                        status: "skipped",
-                        message: "no text found",
-                    };
-                }
-
-                const e = await generateEmbeddingsFromStudio(
-                    partContent,
-                    environment,
-                    client,
-                    model,
-                ).catch((e) => {
-                    log.error("Error generating embeddings for part " + i, {
-                        text_length: partContent.length,
-                        error: e,
-                    });
-                    return null;
-                });
-
-                if (!e || !e.values) {
-                    return {
-                        id: i,
-                        number: i,
-                        result: null,
-                        message: "no embeddings generated",
-                    };
-                }
-
-                if (e.values.length === 0) {
-                    return {
-                        id: i,
-                        number: i,
-                        result: null,
-                        message: "no embeddings generated",
-                    };
-                }
-                log.info(`Generated embeddings for part ${i}`, {
-                    len: e.values.length,
-                    duration: new Date().getTime() - localStart,
-                });
-
-                return { number: i, result: e };
-            } catch (err: any) {
-                log.info(
-                    `Error generating ${type} embeddings for part ${i} of ${document.id}`,
-                    { error: err },
-                );
-                return {
-                    number: i,
-                    result: null,
-                    message: "error generating embeddings",
-                    error: err.message,
-                };
-            }
-        };
-
-        const partEmbeddings = await Promise.all(
-            docParts.map((part, i) => generatePartEmbeddings(part, i)),
-        );
-        const validPartEmbeddings = partEmbeddings
-            .filter((e) => e.result !== null)
-            .map((e) => e.result);
-        const averagedEmbedding = computeAttentionEmbedding(
-            validPartEmbeddings.map((e) => e.values),
-        );
-        log.info(
-            `Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`,
-            {
-                len: averagedEmbedding.length,
-                count: validPartEmbeddings.length,
-                max_tokens: maxTokens,
-            },
+        //TODO: Review strategy for large documents
+        log.warn(
+            `Document too large for ${type} embeddings generation, skipping (${tokenCount} tokens)`,
         );
-
-
-
-
-        }
-        log.info(`Object ${document.id} embedding set`, {
-            type,
-            len: averagedEmbedding.length,
-        });
+        return {
+            id: document.id,
+            status: "skipped",
+            message: `${type} embeddings generation, skipped for large document (${tokenCount} tokens)`,
+        }
     } else {
         log.info(`Generating ${type} embeddings for document`);
 
@@ -480,40 +350,3 @@ async function generateEmbeddingsFromStudio(
         throw e;
     });
 }
-
-//Simplified attention mechanism
-// This is a naive implementation and should be replaced with a more sophisticated
-// using tensorflow in a specific package
-function computeAttentionEmbedding(chunkEmbeddings: number[][]): number[] {
-    if (chunkEmbeddings.length === 0) return [];
-
-    const start = new Date().getTime();
-
-    // Generate random attention weights
-    const attentionWeights = chunkEmbeddings.map(() => Math.random());
-
-    // Apply softmax to get attention scores
-    const expWeights = attentionWeights.map((w) => Math.exp(w));
-    const sumExpWeights = expWeights.reduce((sum, val) => sum + val, 0);
-    const attentionScores = expWeights.map((w) => w / sumExpWeights);
-
-    // Get embedding dimension
-    const embeddingDim = chunkEmbeddings[0].length;
-
-    // Initialize document embedding
-    const documentEmbedding = new Array(embeddingDim).fill(0);
-
-    // Weighted sum of embeddings
-    for (let i = 0; i < chunkEmbeddings.length; i++) {
-        for (let j = 0; j < embeddingDim; j++) {
-            documentEmbedding[j] += chunkEmbeddings[i][j] * attentionScores[i];
-        }
-    }
-
-    const duration = new Date().getTime() - start;
-    console.log(
-        `Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`,
-    );
-
-    return documentEmbedding;
-}