@vertesia/workflow 1.0.0-dev.20260128.144200 → 1.0.0-dev.20260225.024852Z
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +1 -1
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -1
- package/lib/cjs/activities/chunkDocument.js +3 -1
- package/lib/cjs/activities/chunkDocument.js.map +1 -1
- package/lib/cjs/activities/extractDocumentText.js +56 -16
- package/lib/cjs/activities/extractDocumentText.js.map +1 -1
- package/lib/cjs/activities/generateDocumentProperties.js +4 -2
- package/lib/cjs/activities/generateDocumentProperties.js.map +1 -1
- package/lib/cjs/activities/generateEmbeddings.js +20 -10
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
- package/lib/cjs/activities/generateOrAssignContentType.js +2 -2
- package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -1
- package/lib/cjs/activities/index-dsl.js +7 -7
- package/lib/cjs/activities/index-dsl.js.map +1 -1
- package/lib/cjs/activities/media/saveGladiaTranscription.js +38 -24
- package/lib/cjs/activities/media/saveGladiaTranscription.js.map +1 -1
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js +41 -24
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -1
- package/lib/cjs/activities/notifyWebhook.js +11 -2
- package/lib/cjs/activities/notifyWebhook.js.map +1 -1
- package/lib/cjs/activities/renditions/generateImageRendition.js +2 -2
- package/lib/cjs/activities/renditions/generateImageRendition.js.map +1 -1
- package/lib/cjs/activities/setDocumentStatus.js +13 -2
- package/lib/cjs/activities/setDocumentStatus.js.map +1 -1
- package/lib/cjs/conversion/image.js +10 -10
- package/lib/cjs/conversion/image.js.map +1 -1
- package/lib/cjs/dsl/dsl-workflow.js +44 -7
- package/lib/cjs/dsl/dsl-workflow.js.map +1 -1
- package/lib/cjs/dsl/setup/ActivityContext.js +56 -0
- package/lib/cjs/dsl/setup/ActivityContext.js.map +1 -1
- package/lib/cjs/errors.js +11 -1
- package/lib/cjs/errors.js.map +1 -1
- package/lib/cjs/index.js +6 -5
- package/lib/cjs/index.js.map +1 -1
- package/lib/cjs/result-types.js.map +1 -1
- package/lib/cjs/utils/renditions.js +9 -5
- package/lib/cjs/utils/renditions.js.map +1 -1
- package/lib/cjs/utils/text-preview-utils.js +43 -0
- package/lib/cjs/utils/text-preview-utils.js.map +1 -0
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +1 -1
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -1
- package/lib/esm/activities/chunkDocument.js +3 -1
- package/lib/esm/activities/chunkDocument.js.map +1 -1
- package/lib/esm/activities/extractDocumentText.js +56 -16
- package/lib/esm/activities/extractDocumentText.js.map +1 -1
- package/lib/esm/activities/generateDocumentProperties.js +4 -2
- package/lib/esm/activities/generateDocumentProperties.js.map +1 -1
- package/lib/esm/activities/generateEmbeddings.js +20 -10
- package/lib/esm/activities/generateEmbeddings.js.map +1 -1
- package/lib/esm/activities/generateOrAssignContentType.js +2 -2
- package/lib/esm/activities/generateOrAssignContentType.js.map +1 -1
- package/lib/esm/activities/index-dsl.js +3 -3
- package/lib/esm/activities/index-dsl.js.map +1 -1
- package/lib/esm/activities/media/saveGladiaTranscription.js +38 -24
- package/lib/esm/activities/media/saveGladiaTranscription.js.map +1 -1
- package/lib/esm/activities/media/transcribeMediaWithGladia.js +41 -24
- package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -1
- package/lib/esm/activities/notifyWebhook.js +11 -2
- package/lib/esm/activities/notifyWebhook.js.map +1 -1
- package/lib/esm/activities/renditions/generateImageRendition.js +2 -2
- package/lib/esm/activities/renditions/generateImageRendition.js.map +1 -1
- package/lib/esm/activities/setDocumentStatus.js +13 -2
- package/lib/esm/activities/setDocumentStatus.js.map +1 -1
- package/lib/esm/conversion/image.js +10 -10
- package/lib/esm/conversion/image.js.map +1 -1
- package/lib/esm/dsl/dsl-workflow.js +44 -7
- package/lib/esm/dsl/dsl-workflow.js.map +1 -1
- package/lib/esm/dsl/setup/ActivityContext.js +57 -1
- package/lib/esm/dsl/setup/ActivityContext.js.map +1 -1
- package/lib/esm/errors.js +9 -0
- package/lib/esm/errors.js.map +1 -1
- package/lib/esm/index.js +6 -5
- package/lib/esm/index.js.map +1 -1
- package/lib/esm/result-types.js.map +1 -1
- package/lib/esm/utils/renditions.js +9 -5
- package/lib/esm/utils/renditions.js.map +1 -1
- package/lib/esm/utils/text-preview-utils.js +38 -0
- package/lib/esm/utils/text-preview-utils.js.map +1 -0
- package/lib/tsconfig.tsbuildinfo +1 -1
- package/lib/types/activities/chunkDocument.d.ts.map +1 -1
- package/lib/types/activities/extractDocumentText.d.ts +1 -0
- package/lib/types/activities/extractDocumentText.d.ts.map +1 -1
- package/lib/types/activities/generateDocumentProperties.d.ts.map +1 -1
- package/lib/types/activities/generateEmbeddings.d.ts.map +1 -1
- package/lib/types/activities/index-dsl.d.ts +3 -3
- package/lib/types/activities/index-dsl.d.ts.map +1 -1
- package/lib/types/activities/media/saveGladiaTranscription.d.ts +1 -0
- package/lib/types/activities/media/saveGladiaTranscription.d.ts.map +1 -1
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts +1 -0
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts.map +1 -1
- package/lib/types/activities/setDocumentStatus.d.ts +1 -1
- package/lib/types/activities/setDocumentStatus.d.ts.map +1 -1
- package/lib/types/dsl/dsl-workflow.d.ts.map +1 -1
- package/lib/types/dsl/setup/ActivityContext.d.ts +32 -2
- package/lib/types/dsl/setup/ActivityContext.d.ts.map +1 -1
- package/lib/types/errors.d.ts +4 -0
- package/lib/types/errors.d.ts.map +1 -1
- package/lib/types/index.d.ts +6 -5
- package/lib/types/index.d.ts.map +1 -1
- package/lib/types/result-types.d.ts +5 -1
- package/lib/types/result-types.d.ts.map +1 -1
- package/lib/types/utils/renditions.d.ts +2 -0
- package/lib/types/utils/renditions.d.ts.map +1 -1
- package/lib/types/utils/text-preview-utils.d.ts +15 -0
- package/lib/types/utils/text-preview-utils.d.ts.map +1 -0
- package/lib/workflows-bundle.js +11747 -11141
- package/package.json +6 -7
- package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +1 -1
- package/src/activities/chunkDocument.ts +3 -1
- package/src/activities/extractDocumentText.ts +85 -26
- package/src/activities/generateDocumentProperties.ts +4 -2
- package/src/activities/generateEmbeddings.ts +22 -14
- package/src/activities/generateOrAssignContentType.ts +2 -2
- package/src/activities/index-dsl.ts +4 -3
- package/src/activities/media/saveGladiaTranscription.test.ts +406 -0
- package/src/activities/media/saveGladiaTranscription.ts +41 -26
- package/src/activities/media/transcribeMediaWithGladia.test.ts +583 -0
- package/src/activities/media/transcribeMediaWithGladia.ts +46 -25
- package/src/activities/notifyWebhook.test.ts +121 -8
- package/src/activities/notifyWebhook.ts +10 -2
- package/src/activities/renditions/generateImageRendition.ts +2 -2
- package/src/activities/setDocumentStatus.ts +12 -4
- package/src/conversion/image.test.ts +1 -0
- package/src/conversion/image.ts +10 -10
- package/src/dsl/dsl-workflow.ts +57 -9
- package/src/dsl/setup/ActivityContext.ts +73 -0
- package/src/dsl.ts +1 -0
- package/src/errors.ts +15 -0
- package/src/index.ts +6 -5
- package/src/result-types.ts +5 -1
- package/src/utils/renditions.ts +11 -5
- package/src/utils/text-preview-utils.ts +62 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vertesia/workflow",
|
|
3
|
-
"version": "1.0.0-dev.20260128.144200",
|
|
3
|
+
"version": "1.0.0-dev.20260225.024852Z",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Vertesia workflow DSL",
|
|
6
6
|
"main": "./lib/esm/index.js",
|
|
@@ -30,7 +30,6 @@
|
|
|
30
30
|
"@aws-sdk/credential-providers": "^3.948.0",
|
|
31
31
|
"@temporalio/activity": "^1.11.5",
|
|
32
32
|
"@temporalio/workflow": "^1.11.5",
|
|
33
|
-
"@types/json-schema": "^7.0.15",
|
|
34
33
|
"fast-deep-equal": "^3.1.3",
|
|
35
34
|
"jsonwebtoken": "^9.0.3",
|
|
36
35
|
"mime": "^4.0.0",
|
|
@@ -44,11 +43,11 @@
|
|
|
44
43
|
"tmp": "^0.2.4",
|
|
45
44
|
"tmp-promise": "^3.0.3",
|
|
46
45
|
"yaml": "^2.6.0",
|
|
47
|
-
"@
|
|
48
|
-
"@
|
|
49
|
-
"@vertesia/
|
|
50
|
-
"@vertesia/
|
|
51
|
-
"@vertesia/memory": "1.0.0-dev.
|
|
46
|
+
"@vertesia/api-fetch-client": "1.0.0-dev.20260225.024852Z",
|
|
47
|
+
"@llumiverse/common": "1.0.0-dev.20260224.234313Z",
|
|
48
|
+
"@vertesia/common": "1.0.0-dev.20260225.024852Z",
|
|
49
|
+
"@vertesia/client": "1.0.0-dev.20260225.024852Z",
|
|
50
|
+
"@vertesia/memory": "1.0.0-dev.20260225.024852Z"
|
|
52
51
|
},
|
|
53
52
|
"ts_dual_module": {
|
|
54
53
|
"outDir": "lib",
|
|
@@ -90,7 +90,7 @@ export async function createOrUpdateDocumentFromInteractionRun(payload: DSLActiv
|
|
|
90
90
|
generation_run_info: {
|
|
91
91
|
id: run.id,
|
|
92
92
|
date: new Date().toISOString(),
|
|
93
|
-
model: run.modelId,
|
|
93
|
+
model: run.modelId ?? "",
|
|
94
94
|
target: jsonResult ? 'properties' : 'text'
|
|
95
95
|
}
|
|
96
96
|
};
|
|
@@ -55,7 +55,9 @@ export async function chunkDocument(payload: DSLActivityExecutionPayload<ChunkDo
|
|
|
55
55
|
|
|
56
56
|
const document = await client.objects.retrieve(objectId, "+text");
|
|
57
57
|
|
|
58
|
-
const type = document.type
|
|
58
|
+
const type = document.type
|
|
59
|
+
? await client.types.catalog.resolve(document.type)
|
|
60
|
+
: undefined;
|
|
59
61
|
|
|
60
62
|
if (!type?.is_chunkable) {
|
|
61
63
|
log.warn('Type is not chunkable for object ID: ' + objectId);
|
|
@@ -4,6 +4,7 @@ import {
|
|
|
4
4
|
CreateContentObjectPayload,
|
|
5
5
|
DSLActivityExecutionPayload,
|
|
6
6
|
DSLActivitySpec,
|
|
7
|
+
WorkflowInputFile,
|
|
7
8
|
} from "@vertesia/common";
|
|
8
9
|
import { markdownWithMarkitdown } from "../conversion/markitdown.js";
|
|
9
10
|
import { mutoolPdfToText } from "../conversion/mutool.js";
|
|
@@ -12,6 +13,10 @@ import { setupActivity } from "../dsl/setup/ActivityContext.js";
|
|
|
12
13
|
import { DocumentNotFoundError } from "../errors.js";
|
|
13
14
|
import { TextExtractionResult, TextExtractionStatus } from "../result-types.js";
|
|
14
15
|
import { fetchBlobAsBuffer, md5 } from "../utils/blobs.js";
|
|
16
|
+
import {
|
|
17
|
+
createFileSourceResult,
|
|
18
|
+
uploadTextPreviewToStorage
|
|
19
|
+
} from "../utils/text-preview-utils.js";
|
|
15
20
|
import { countTokens } from "../utils/tokens.js";
|
|
16
21
|
|
|
17
22
|
//@ts-ignore
|
|
@@ -19,8 +24,9 @@ const JSON: DSLActivitySpec = {
|
|
|
19
24
|
name: "extractDocumentText",
|
|
20
25
|
};
|
|
21
26
|
|
|
22
|
-
|
|
23
|
-
|
|
27
|
+
export interface ExtractDocumentTextParams {
|
|
28
|
+
output_storage_path?: string;
|
|
29
|
+
}
|
|
24
30
|
export interface ExtractDocumentText extends DSLActivitySpec<ExtractDocumentTextParams> {
|
|
25
31
|
name: "extractDocumentText";
|
|
26
32
|
projection?: never;
|
|
@@ -29,8 +35,27 @@ export interface ExtractDocumentText extends DSLActivitySpec<ExtractDocumentText
|
|
|
29
35
|
export async function extractDocumentText(
|
|
30
36
|
payload: DSLActivityExecutionPayload<ExtractDocumentTextParams>,
|
|
31
37
|
): Promise<TextExtractionResult> {
|
|
32
|
-
const
|
|
38
|
+
const context = await setupActivity(payload);
|
|
39
|
+
const { client, inputType, params } = context;
|
|
40
|
+
const { output_storage_path } = params;
|
|
41
|
+
|
|
42
|
+
if (inputType === 'files') {
|
|
43
|
+
// File mode: extract from file source
|
|
44
|
+
if (!output_storage_path) {
|
|
45
|
+
throw new Error('output_storage_path is required when extracting text from file sources');
|
|
46
|
+
}
|
|
47
|
+
return extractFromFileSource(client, context.file, output_storage_path);
|
|
48
|
+
} else {
|
|
49
|
+
// Object mode: fetch from object store
|
|
50
|
+
return extractFromObject(client, context.objectId, context.objectIds || []);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
33
53
|
|
|
54
|
+
async function extractFromObject(
|
|
55
|
+
client: any,
|
|
56
|
+
objectId: string,
|
|
57
|
+
objectIds: string[],
|
|
58
|
+
): Promise<TextExtractionResult> {
|
|
34
59
|
const r = await client.objects.find({
|
|
35
60
|
query: { _id: objectId },
|
|
36
61
|
limit: 1,
|
|
@@ -39,7 +64,7 @@ export async function extractDocumentText(
|
|
|
39
64
|
const doc = r[0] as ContentObject;
|
|
40
65
|
if (!doc) {
|
|
41
66
|
log.error(`Document ${objectId} not found`);
|
|
42
|
-
throw new DocumentNotFoundError(`Document ${objectId} not found`,
|
|
67
|
+
throw new DocumentNotFoundError(`Document ${objectId} not found`, objectIds);
|
|
43
68
|
}
|
|
44
69
|
|
|
45
70
|
log.info(`Extracting text for object ${doc.id}`);
|
|
@@ -65,9 +90,62 @@ export async function extractDocumentText(
|
|
|
65
90
|
return createResponse(doc, "", TextExtractionStatus.error, e.message);
|
|
66
91
|
}
|
|
67
92
|
|
|
93
|
+
const txt = await extractTextFromBuffer(fileBuffer, doc.content.type);
|
|
94
|
+
if (!txt) {
|
|
95
|
+
return createResponse(
|
|
96
|
+
doc,
|
|
97
|
+
doc.text ?? "",
|
|
98
|
+
TextExtractionStatus.skipped,
|
|
99
|
+
`Unsupported mime type: ${doc.content.type}`,
|
|
100
|
+
);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
const tokensData = countTokens(txt);
|
|
104
|
+
const etag = doc.content.etag ?? md5(txt);
|
|
105
|
+
|
|
106
|
+
const updateData: CreateContentObjectPayload = {
|
|
107
|
+
text: txt,
|
|
108
|
+
text_etag: etag,
|
|
109
|
+
tokens: {
|
|
110
|
+
...tokensData,
|
|
111
|
+
etag: etag,
|
|
112
|
+
},
|
|
113
|
+
};
|
|
114
|
+
|
|
115
|
+
await client.objects.update(doc.id, updateData);
|
|
116
|
+
|
|
117
|
+
return createResponse(doc, txt, TextExtractionStatus.success);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
async function extractFromFileSource(
|
|
121
|
+
client: any,
|
|
122
|
+
input_file: WorkflowInputFile,
|
|
123
|
+
output_storage_path: string
|
|
124
|
+
): Promise<TextExtractionResult> {
|
|
125
|
+
log.info(`Extracting text from ${input_file}`);
|
|
126
|
+
|
|
127
|
+
let fileBuffer: Buffer;
|
|
128
|
+
try {
|
|
129
|
+
fileBuffer = await fetchBlobAsBuffer(client, input_file.url);
|
|
130
|
+
} catch (e: any) {
|
|
131
|
+
log.error(`Error reading file: ${e}`);
|
|
132
|
+
return createFileSourceResult(input_file.url, output_storage_path, null);
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
const txt = await extractTextFromBuffer(fileBuffer, input_file.mimetype);
|
|
136
|
+
|
|
137
|
+
// Upload extracted text to storage
|
|
138
|
+
if (txt && output_storage_path) {
|
|
139
|
+
await uploadTextPreviewToStorage(client, txt, output_storage_path, "Document");
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
return createFileSourceResult(input_file.url, output_storage_path, txt);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
async function extractTextFromBuffer(fileBuffer: Buffer, mimeType: string): Promise<string | null> {
|
|
68
146
|
let txt: string;
|
|
69
147
|
|
|
70
|
-
switch (
|
|
148
|
+
switch (mimeType) {
|
|
71
149
|
case "application/pdf":
|
|
72
150
|
txt = await mutoolPdfToText(fileBuffer);
|
|
73
151
|
break;
|
|
@@ -131,29 +209,10 @@ export async function extractDocumentText(
|
|
|
131
209
|
txt = fileBuffer.toString("utf8"); //TODO: add charset detection
|
|
132
210
|
break;
|
|
133
211
|
}
|
|
134
|
-
return createResponse(
|
|
135
|
-
doc,
|
|
136
|
-
doc.text ?? "",
|
|
137
|
-
TextExtractionStatus.skipped,
|
|
138
|
-
`Unsupported mime type: ${doc.content.type}`,
|
|
139
|
-
);
|
|
212
|
+
return null;
|
|
140
213
|
}
|
|
141
214
|
|
|
142
|
-
|
|
143
|
-
const etag = doc.content.etag ?? md5(txt);
|
|
144
|
-
|
|
145
|
-
const updateData: CreateContentObjectPayload = {
|
|
146
|
-
text: txt,
|
|
147
|
-
text_etag: etag,
|
|
148
|
-
tokens: {
|
|
149
|
-
...tokensData,
|
|
150
|
-
etag: etag,
|
|
151
|
-
},
|
|
152
|
-
};
|
|
153
|
-
|
|
154
|
-
await client.objects.update(doc.id, updateData);
|
|
155
|
-
|
|
156
|
-
return createResponse(doc, txt, TextExtractionStatus.success);
|
|
215
|
+
return txt;
|
|
157
216
|
}
|
|
158
217
|
|
|
159
218
|
function createResponse(
|
|
@@ -30,7 +30,9 @@ export async function generateDocumentProperties(
|
|
|
30
30
|
const project = await context.fetchProject();
|
|
31
31
|
|
|
32
32
|
const doc = await client.objects.retrieve(objectId, "+text");
|
|
33
|
-
const type = doc.type
|
|
33
|
+
const type = doc.type
|
|
34
|
+
? await client.types.catalog.resolve(doc.type)
|
|
35
|
+
: undefined;
|
|
34
36
|
|
|
35
37
|
if (!doc?.text && !params.use_vision && !doc?.content?.type?.startsWith("image/")) {
|
|
36
38
|
log.warn(`Object ${objectId} not found or text is empty`);
|
|
@@ -112,7 +114,7 @@ export async function generateDocumentProperties(
|
|
|
112
114
|
generation_run_info: {
|
|
113
115
|
id: infoRes.id,
|
|
114
116
|
date: new Date().toISOString(),
|
|
115
|
-
model: infoRes.modelId,
|
|
117
|
+
model: infoRes.modelId ?? "",
|
|
116
118
|
},
|
|
117
119
|
}, { suppressWorkflows: true });
|
|
118
120
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { EmbeddingsResult } from "@llumiverse/common";
|
|
2
2
|
import { log } from "@temporalio/activity";
|
|
3
|
-
import { VertesiaClient } from "@vertesia/client";
|
|
3
|
+
import { VertesiaClient, ZenoClientNotFoundError } from "@vertesia/client";
|
|
4
4
|
import {
|
|
5
5
|
ContentObject,
|
|
6
6
|
DSLActivityExecutionPayload,
|
|
@@ -72,7 +72,7 @@ export async function generateEmbeddings(
|
|
|
72
72
|
}
|
|
73
73
|
|
|
74
74
|
if (!projectData?.configuration.embeddings[type]?.enabled) {
|
|
75
|
-
log.
|
|
75
|
+
log.debug(
|
|
76
76
|
`Embeddings generation disabled for type ${type} on project: ${projectData.name} (${projectData.namespace})`,
|
|
77
77
|
{ config },
|
|
78
78
|
);
|
|
@@ -83,7 +83,7 @@ export async function generateEmbeddings(
|
|
|
83
83
|
};
|
|
84
84
|
}
|
|
85
85
|
|
|
86
|
-
log.
|
|
86
|
+
log.debug(`${type} embedding generation starting for object ${objectId}`, {
|
|
87
87
|
force,
|
|
88
88
|
config,
|
|
89
89
|
});
|
|
@@ -94,10 +94,18 @@ export async function generateEmbeddings(
|
|
|
94
94
|
);
|
|
95
95
|
}
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
97
|
+
let document;
|
|
98
|
+
try {
|
|
99
|
+
document = await client.objects.retrieve(
|
|
100
|
+
objectId,
|
|
101
|
+
"+text +parts +embeddings +tokens +properties",
|
|
102
|
+
);
|
|
103
|
+
} catch (error) {
|
|
104
|
+
if (error instanceof ZenoClientNotFoundError) {
|
|
105
|
+
throw new DocumentNotFoundError(`Document not found: ${objectId}`, [objectId]);
|
|
106
|
+
}
|
|
107
|
+
throw error;
|
|
108
|
+
}
|
|
101
109
|
|
|
102
110
|
if (!document) {
|
|
103
111
|
throw new DocumentNotFoundError("Document not found", [objectId]);
|
|
@@ -195,7 +203,7 @@ async function generateTextEmbeddings(
|
|
|
195
203
|
// Skip if embeddings already exist with matching etag (unless force=true)
|
|
196
204
|
const existingEmbedding = document.embeddings?.[type];
|
|
197
205
|
if (!force && existingEmbedding?.etag && textEtag && existingEmbedding.etag === textEtag) {
|
|
198
|
-
log.
|
|
206
|
+
log.debug(`Skipping ${type} embeddings for document ${document.id} - etag unchanged`);
|
|
199
207
|
return {
|
|
200
208
|
id: document.id,
|
|
201
209
|
type,
|
|
@@ -213,7 +221,7 @@ async function generateTextEmbeddings(
|
|
|
213
221
|
const maxTokens = config.max_tokens ?? 8000;
|
|
214
222
|
|
|
215
223
|
//generate embeddings for the main doc if document isn't too large
|
|
216
|
-
log.
|
|
224
|
+
log.debug(`Generating ${type} embeddings for document ${document.id}`);
|
|
217
225
|
if (
|
|
218
226
|
type === SupportedEmbeddingTypes.text &&
|
|
219
227
|
tokenCount !== undefined &&
|
|
@@ -229,7 +237,7 @@ async function generateTextEmbeddings(
|
|
|
229
237
|
message: `${type} embeddings generation, skipped for large document (${tokenCount} tokens)`,
|
|
230
238
|
}
|
|
231
239
|
} else {
|
|
232
|
-
log.
|
|
240
|
+
log.debug(`Generating ${type} embeddings for document`);
|
|
233
241
|
|
|
234
242
|
const res = await generateEmbeddingsFromStudio(
|
|
235
243
|
JSON.stringify(document[type]),
|
|
@@ -244,7 +252,7 @@ async function generateTextEmbeddings(
|
|
|
244
252
|
};
|
|
245
253
|
}
|
|
246
254
|
|
|
247
|
-
log.
|
|
255
|
+
log.debug(`${type} embeddings generated for document ${document.id}`, {
|
|
248
256
|
len: res.values.length,
|
|
249
257
|
});
|
|
250
258
|
await client.objects.setEmbedding(document.id, type, {
|
|
@@ -269,7 +277,7 @@ async function generateImageEmbeddings({
|
|
|
269
277
|
config,
|
|
270
278
|
force,
|
|
271
279
|
}: ExecuteGenerateEmbeddingsParams) {
|
|
272
|
-
log.
|
|
280
|
+
log.debug("Generating image embeddings for document " + document.id, {
|
|
273
281
|
content: document.content,
|
|
274
282
|
});
|
|
275
283
|
if (
|
|
@@ -290,7 +298,7 @@ async function generateImageEmbeddings({
|
|
|
290
298
|
// Skip if embeddings already exist with matching etag (unless force=true)
|
|
291
299
|
const existingEmbedding = document.embeddings?.[type];
|
|
292
300
|
if (!force && existingEmbedding?.etag && contentEtag && existingEmbedding.etag === contentEtag) {
|
|
293
|
-
log.
|
|
301
|
+
log.debug(`Skipping ${type} embeddings for document ${document.id} - content etag unchanged`);
|
|
294
302
|
return {
|
|
295
303
|
id: document.id,
|
|
296
304
|
type,
|
|
@@ -370,7 +378,7 @@ async function generateEmbeddingsFromStudio(
|
|
|
370
378
|
client: VertesiaClient,
|
|
371
379
|
model?: string,
|
|
372
380
|
): Promise<EmbeddingsResult> {
|
|
373
|
-
log.
|
|
381
|
+
log.debug(
|
|
374
382
|
`Generating embeddings for text of ${text.length} chars with environment ${env}`,
|
|
375
383
|
);
|
|
376
384
|
|
|
@@ -52,7 +52,7 @@ export async function generateOrAssignContentType(
|
|
|
52
52
|
const interactionName =
|
|
53
53
|
params.interactionNames?.selectDocumentType ?? INT_SELECT_DOCUMENT_TYPE;
|
|
54
54
|
|
|
55
|
-
log.
|
|
55
|
+
log.debug("SelectDocumentType for object: " + objectId, { payload });
|
|
56
56
|
|
|
57
57
|
const object = await client.objects.retrieve(objectId, "+text");
|
|
58
58
|
|
|
@@ -80,7 +80,7 @@ export async function generateOrAssignContentType(
|
|
|
80
80
|
return { status: "failed", error: "no-text" };
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
-
const types = await client.types.list(
|
|
83
|
+
const types = await client.types.catalog.list({
|
|
84
84
|
schema: true,
|
|
85
85
|
});
|
|
86
86
|
|
|
@@ -3,6 +3,7 @@ export { createDocumentTypeFromInteractionRun } from "./advanced/createDocumentT
|
|
|
3
3
|
export { createOrUpdateDocumentFromInteractionRun } from "./advanced/createOrUpdateDocumentFromInteractionRun.js";
|
|
4
4
|
export { updateDocumentFromInteractionRun } from "./advanced/updateDocumentFromInteractionRun.js";
|
|
5
5
|
export { chunkDocument } from "./chunkDocument.js";
|
|
6
|
+
export { copyParentArtifacts } from "./copyParentArtifacts.js";
|
|
6
7
|
export { createPdfDocumentFromSource } from "./createDocumentFromOther.js";
|
|
7
8
|
export { executeInteraction } from "./executeInteraction.js";
|
|
8
9
|
export { extractDocumentText } from "./extractDocumentText.js";
|
|
@@ -11,16 +12,16 @@ export { generateEmbeddings } from "./generateEmbeddings.js";
|
|
|
11
12
|
export { generateOrAssignContentType } from "./generateOrAssignContentType.js";
|
|
12
13
|
export { getObjectFromStore } from "./getObjectFromStore.js";
|
|
13
14
|
export { handleDslError } from "./handleError.js";
|
|
14
|
-
export { prepareVideo } from "./media/prepareVideo.js";
|
|
15
15
|
export { prepareAudio } from "./media/prepareAudio.js";
|
|
16
|
+
export { prepareVideo } from "./media/prepareVideo.js";
|
|
16
17
|
export { convertPdfToStructuredText } from "./media/processPdfWithTextract.js";
|
|
17
18
|
export { saveGladiaTranscription } from "./media/saveGladiaTranscription.js";
|
|
18
19
|
export { transcribeMedia } from "./media/transcribeMediaWithGladia.js";
|
|
19
20
|
export type { TranscriptMediaResult } from "./media/transcribeMediaWithGladia.js";
|
|
21
|
+
export { mergeChildArtifacts } from "./mergeChildArtifacts.js";
|
|
20
22
|
export { notifyWebhook } from "./notifyWebhook.js";
|
|
21
23
|
export { checkRateLimit } from "./rateLimiter.js";
|
|
22
|
-
export { copyParentArtifacts } from "./copyParentArtifacts.js";
|
|
23
|
-
export { mergeChildArtifacts } from "./mergeChildArtifacts.js";
|
|
24
24
|
export { generateImageRendition } from "./renditions/generateImageRendition.js";
|
|
25
25
|
export { generateVideoRendition } from "./renditions/generateVideoRendition.js";
|
|
26
26
|
export { setDocumentStatus } from "./setDocumentStatus.js";
|
|
27
|
+
|