@vertesia/workflow 0.52.0 → 0.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/activities/createDocumentFromOther.js +1 -1
- package/lib/cjs/activities/executeInteraction.js +29 -15
- package/lib/cjs/activities/executeInteraction.js.map +1 -1
- package/lib/cjs/activities/extractDocumentText.js +33 -30
- package/lib/cjs/activities/extractDocumentText.js.map +1 -1
- package/lib/cjs/activities/generateEmbeddings.js +1 -1
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
- package/lib/cjs/activities/generateImageRendition.js +31 -11
- package/lib/cjs/activities/generateImageRendition.js.map +1 -1
- package/lib/cjs/activities/generateOrAssignContentType.js +25 -12
- package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -1
- package/lib/cjs/activities/getObjectFromStore.js +1 -1
- package/lib/cjs/activities/handleError.js +22 -0
- package/lib/cjs/activities/handleError.js.map +1 -0
- package/lib/cjs/activities/index-dsl.js +3 -1
- package/lib/cjs/activities/index-dsl.js.map +1 -1
- package/lib/cjs/activities/index.js +0 -1
- package/lib/cjs/activities/index.js.map +1 -1
- package/lib/cjs/activities/media/processPdfWithTextract.js +4 -4
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js +1 -1
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -1
- package/lib/cjs/activities/setDocumentStatus.js +1 -1
- package/lib/cjs/conversion/TextractProcessor.js +9 -9
- package/lib/cjs/conversion/image.js +6 -2
- package/lib/cjs/conversion/image.js.map +1 -1
- package/lib/cjs/conversion/markitdown.js +42 -0
- package/lib/cjs/conversion/markitdown.js.map +1 -0
- package/lib/cjs/conversion/mutool.js +1 -1
- package/lib/cjs/conversion/pandoc.js +9 -9
- package/lib/cjs/conversion/pandoc.js.map +1 -1
- package/lib/cjs/dsl/dsl-workflow.js +59 -11
- package/lib/cjs/dsl/dsl-workflow.js.map +1 -1
- package/lib/cjs/dsl/vars.js +6 -6
- package/lib/cjs/dsl/vars.js.map +1 -1
- package/lib/cjs/index.js +1 -1
- package/lib/cjs/iterative-generation/activities/extractToc.js +1 -1
- package/lib/cjs/iterative-generation/activities/extractToc.js.map +1 -1
- package/lib/cjs/iterative-generation/activities/generatePart.js +2 -2
- package/lib/cjs/iterative-generation/activities/generatePart.js.map +1 -1
- package/lib/cjs/iterative-generation/activities/generateToc.js +1 -1
- package/lib/cjs/iterative-generation/activities/generateToc.js.map +1 -1
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +1 -1
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +1 -1
- package/lib/cjs/iterative-generation/utils.js +4 -4
- package/lib/cjs/iterative-generation/utils.js.map +1 -1
- package/lib/esm/activities/createDocumentFromOther.js +1 -1
- package/lib/esm/activities/executeInteraction.js +31 -17
- package/lib/esm/activities/executeInteraction.js.map +1 -1
- package/lib/esm/activities/extractDocumentText.js +39 -36
- package/lib/esm/activities/extractDocumentText.js.map +1 -1
- package/lib/esm/activities/generateEmbeddings.js +1 -1
- package/lib/esm/activities/generateEmbeddings.js.map +1 -1
- package/lib/esm/activities/generateImageRendition.js +31 -11
- package/lib/esm/activities/generateImageRendition.js.map +1 -1
- package/lib/esm/activities/generateOrAssignContentType.js +25 -12
- package/lib/esm/activities/generateOrAssignContentType.js.map +1 -1
- package/lib/esm/activities/getObjectFromStore.js +1 -1
- package/lib/esm/activities/handleError.js +19 -0
- package/lib/esm/activities/handleError.js.map +1 -0
- package/lib/esm/activities/index-dsl.js +1 -0
- package/lib/esm/activities/index-dsl.js.map +1 -1
- package/lib/esm/activities/index.js +0 -1
- package/lib/esm/activities/index.js.map +1 -1
- package/lib/esm/activities/media/processPdfWithTextract.js +4 -4
- package/lib/esm/activities/media/transcribeMediaWithGladia.js +1 -1
- package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -1
- package/lib/esm/activities/setDocumentStatus.js +1 -1
- package/lib/esm/conversion/TextractProcessor.js +9 -9
- package/lib/esm/conversion/image.js +6 -2
- package/lib/esm/conversion/image.js.map +1 -1
- package/lib/esm/conversion/markitdown.js +36 -0
- package/lib/esm/conversion/markitdown.js.map +1 -0
- package/lib/esm/conversion/mutool.js +1 -1
- package/lib/esm/conversion/pandoc.js +11 -11
- package/lib/esm/conversion/pandoc.js.map +1 -1
- package/lib/esm/dsl/dsl-workflow.js +60 -12
- package/lib/esm/dsl/dsl-workflow.js.map +1 -1
- package/lib/esm/dsl/vars.js +6 -6
- package/lib/esm/dsl/vars.js.map +1 -1
- package/lib/esm/index.js +1 -1
- package/lib/esm/iterative-generation/activities/extractToc.js +1 -1
- package/lib/esm/iterative-generation/activities/extractToc.js.map +1 -1
- package/lib/esm/iterative-generation/activities/generatePart.js +2 -2
- package/lib/esm/iterative-generation/activities/generatePart.js.map +1 -1
- package/lib/esm/iterative-generation/activities/generateToc.js +1 -1
- package/lib/esm/iterative-generation/activities/generateToc.js.map +1 -1
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +1 -1
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +1 -1
- package/lib/esm/iterative-generation/utils.js +4 -4
- package/lib/esm/iterative-generation/utils.js.map +1 -1
- package/lib/types/activities/createDocumentFromOther.d.ts +1 -1
- package/lib/types/activities/executeInteraction.d.ts +4 -4
- package/lib/types/activities/executeInteraction.d.ts.map +1 -1
- package/lib/types/activities/extractDocumentText.d.ts +3 -3
- package/lib/types/activities/extractDocumentText.d.ts.map +1 -1
- package/lib/types/activities/generateImageRendition.d.ts +1 -1
- package/lib/types/activities/generateImageRendition.d.ts.map +1 -1
- package/lib/types/activities/generateOrAssignContentType.d.ts +1 -1
- package/lib/types/activities/generateOrAssignContentType.d.ts.map +1 -1
- package/lib/types/activities/getObjectFromStore.d.ts +1 -1
- package/lib/types/activities/handleError.d.ts +6 -0
- package/lib/types/activities/handleError.d.ts.map +1 -0
- package/lib/types/activities/index-dsl.d.ts +1 -0
- package/lib/types/activities/index-dsl.d.ts.map +1 -1
- package/lib/types/activities/index.d.ts +0 -1
- package/lib/types/activities/index.d.ts.map +1 -1
- package/lib/types/activities/setDocumentStatus.d.ts +1 -1
- package/lib/types/conversion/image.d.ts.map +1 -1
- package/lib/types/conversion/markitdown.d.ts +2 -0
- package/lib/types/conversion/markitdown.d.ts.map +1 -0
- package/lib/types/conversion/mutool.d.ts +1 -1
- package/lib/types/conversion/pandoc.d.ts +1 -1
- package/lib/types/conversion/pandoc.d.ts.map +1 -1
- package/lib/types/dsl/dsl-workflow.d.ts +1 -1
- package/lib/types/dsl/dsl-workflow.d.ts.map +1 -1
- package/lib/types/dsl/vars.d.ts +2 -2
- package/lib/types/index.d.ts +1 -1
- package/lib/types/iterative-generation/types.d.ts +3 -3
- package/lib/types/iterative-generation/types.d.ts.map +1 -1
- package/lib/workflows-bundle.js +396 -94
- package/package.json +5 -4
- package/src/activities/createDocumentFromOther.ts +1 -1
- package/src/activities/executeInteraction.ts +66 -39
- package/src/activities/extractDocumentText.ts +67 -51
- package/src/activities/generateEmbeddings.ts +1 -1
- package/src/activities/generateImageRendition.ts +35 -14
- package/src/activities/generateOrAssignContentType.ts +52 -26
- package/src/activities/getObjectFromStore.ts +1 -1
- package/src/activities/handleError.ts +25 -0
- package/src/activities/index-dsl.ts +1 -0
- package/src/activities/index.ts +0 -1
- package/src/activities/media/processPdfWithTextract.ts +4 -4
- package/src/activities/media/transcribeMediaWithGladia.ts +1 -1
- package/src/activities/setDocumentStatus.ts +1 -1
- package/src/conversion/TextractProcessor.ts +9 -9
- package/src/conversion/image.ts +8 -2
- package/src/conversion/markitdown.ts +41 -0
- package/src/conversion/mutool.ts +1 -1
- package/src/conversion/pandoc.test.ts +2 -2
- package/src/conversion/pandoc.ts +38 -42
- package/src/dsl/dsl-workflow.ts +80 -12
- package/src/dsl/validation.test.ts +2 -2
- package/src/dsl/vars.test.ts +1 -1
- package/src/dsl/vars.ts +6 -6
- package/src/dsl/workflow-exec-child.test.ts +14 -4
- package/src/dsl/workflow-fetch.test.ts +1 -1
- package/src/dsl/workflow-import.test.ts +1 -1
- package/src/dsl/workflow.test.ts +12 -2
- package/src/index.ts +1 -1
- package/src/iterative-generation/activities/extractToc.ts +1 -1
- package/src/iterative-generation/activities/generatePart.ts +2 -2
- package/src/iterative-generation/activities/generateToc.ts +1 -1
- package/src/iterative-generation/iterativeGenerationWorkflow.ts +1 -1
- package/src/iterative-generation/types.ts +4 -4
- package/src/iterative-generation/utils.ts +4 -4
@@ -1,11 +1,16 @@
|
|
1
1
|
import { log } from "@temporalio/activity";
|
2
|
-
import {
|
2
|
+
import {
|
3
|
+
ContentObjectTypeItem,
|
4
|
+
CreateContentObjectTypePayload,
|
5
|
+
DSLActivityExecutionPayload,
|
6
|
+
DSLActivitySpec,
|
7
|
+
} from "@vertesia/common";
|
3
8
|
import { ActivityContext, setupActivity } from "../dsl/setup/ActivityContext.js";
|
4
9
|
import { TruncateSpec, truncByMaxTokens } from "../utils/tokens.js";
|
5
10
|
import { InteractionExecutionParams, executeInteractionFromActivity } from "./executeInteraction.js";
|
6
11
|
|
7
|
-
const INT_SELECT_DOCUMENT_TYPE = "sys:SelectDocumentType"
|
8
|
-
const INT_GENERATE_METADATA_MODEL = "sys:GenerateMetadataModel"
|
12
|
+
const INT_SELECT_DOCUMENT_TYPE = "sys:SelectDocumentType";
|
13
|
+
const INT_GENERATE_METADATA_MODEL = "sys:GenerateMetadataModel";
|
9
14
|
|
10
15
|
export interface GenerateOrAssignContentTypeParams extends InteractionExecutionParams {
|
11
16
|
typesHint?: string[];
|
@@ -21,20 +26,21 @@ export interface GenerateOrAssignContentTypeParams extends InteractionExecutionP
|
|
21
26
|
interactionNames?: {
|
22
27
|
selectDocumentType?: string;
|
23
28
|
generateMetadataModel?: string;
|
24
|
-
}
|
29
|
+
};
|
25
30
|
}
|
26
31
|
|
27
32
|
export interface GenerateOrAssignContentType extends DSLActivitySpec<GenerateOrAssignContentTypeParams> {
|
28
|
-
name:
|
33
|
+
name: "generateOrAssignContentType";
|
29
34
|
}
|
30
35
|
|
31
|
-
export async function generateOrAssignContentType(
|
36
|
+
export async function generateOrAssignContentType(
|
37
|
+
payload: DSLActivityExecutionPayload<GenerateOrAssignContentTypeParams>,
|
38
|
+
) {
|
32
39
|
const context = await setupActivity<GenerateOrAssignContentTypeParams>(payload);
|
33
40
|
const { params, client, objectId } = context;
|
34
41
|
|
35
42
|
const interactionName = params.interactionNames?.selectDocumentType ?? INT_SELECT_DOCUMENT_TYPE;
|
36
43
|
|
37
|
-
|
38
44
|
log.info("SelectDocumentType for object: " + objectId, { payload });
|
39
45
|
|
40
46
|
const object = await client.objects.retrieve(objectId, "+text");
|
@@ -48,50 +54,66 @@ export async function generateOrAssignContentType(payload: DSLActivityExecutionP
|
|
48
54
|
return { status: "skipped", message: "Object already has a type: " + object.type.name };
|
49
55
|
}
|
50
56
|
|
51
|
-
if (
|
57
|
+
if (
|
58
|
+
!object ||
|
59
|
+
(!object.text &&
|
60
|
+
!object.content?.type?.startsWith("image/") &&
|
61
|
+
!object.content?.type?.startsWith("application/pdf"))
|
62
|
+
) {
|
52
63
|
log.info(`Object ${objectId} not found or text is empty and not an image`, { object });
|
53
64
|
return { status: "failed", error: "no-text" };
|
54
65
|
}
|
55
66
|
|
56
|
-
const types = await client.types.list(
|
67
|
+
const types = await client.types.list(undefined, {
|
68
|
+
schema: true,
|
69
|
+
});
|
57
70
|
|
58
71
|
//make a list of all existing types, and add hints if any
|
59
|
-
const existing_types = types.filter(t => !["DocumentPart", "Rendition"].includes(t.name));
|
60
|
-
const content = object.text ? truncByMaxTokens(object.text, params.truncate ||
|
72
|
+
const existing_types = types.filter((t) => !["DocumentPart", "Rendition"].includes(t.name));
|
73
|
+
const content = object.text ? truncByMaxTokens(object.text, params.truncate || 30000) : undefined;
|
61
74
|
|
62
75
|
const getImage = async () => {
|
63
76
|
if (object.content?.type?.includes("pdf") && object.text?.length && object.text?.length < 100) {
|
64
|
-
return "store:" + objectId
|
77
|
+
return "store:" + objectId;
|
65
78
|
}
|
66
79
|
if (!object.content?.type?.startsWith("image/")) {
|
67
80
|
return undefined;
|
68
81
|
}
|
69
|
-
const res = await client.objects.getRendition(objectId, {
|
82
|
+
const res = await client.objects.getRendition(objectId, {
|
83
|
+
max_hw: 1024,
|
84
|
+
format: "image/png",
|
85
|
+
generate_if_missing: true,
|
86
|
+
});
|
70
87
|
if (!res.rendition && res.status === "generating") {
|
71
88
|
//throw to try again
|
72
89
|
throw new Error(`Rendition for object ${objectId} is in progress`);
|
73
90
|
} else if (res.rendition) {
|
74
91
|
return "store:" + objectId;
|
75
92
|
}
|
76
|
-
}
|
93
|
+
};
|
77
94
|
|
78
95
|
const fileRef = await getImage();
|
79
96
|
|
80
|
-
log.info(
|
97
|
+
log.info(
|
98
|
+
"Execute SelectDocumentType interaction on content with \nexisting types - passing full types: " +
|
99
|
+
existing_types.filter((t) => !t.tags?.includes("system")),
|
100
|
+
);
|
81
101
|
|
82
102
|
const res = await executeInteractionFromActivity(client, interactionName, params, {
|
83
|
-
existing_types,
|
103
|
+
existing_types,
|
104
|
+
content,
|
105
|
+
image: fileRef,
|
84
106
|
});
|
85
107
|
|
86
108
|
log.info("Selected Content Type Result: " + JSON.stringify(res.result));
|
87
109
|
|
88
110
|
//if type is not identified or not present in the database, generate a new type
|
89
|
-
let selectedType: { id: string
|
111
|
+
let selectedType: { id: string; name: string } | undefined = undefined;
|
90
112
|
|
91
|
-
selectedType = types.find(t => t.name === res.result.document_type);
|
113
|
+
selectedType = types.find((t) => t.name === res.result.document_type);
|
92
114
|
|
93
115
|
if (!selectedType) {
|
94
|
-
log.warn("Document type not
|
116
|
+
log.warn("Document type not identified: starting type generation");
|
95
117
|
const newType = await generateNewType(context, existing_types, content, fileRef);
|
96
118
|
selectedType = { id: newType.id, name: newType.name };
|
97
119
|
}
|
@@ -109,24 +131,28 @@ export async function generateOrAssignContentType(payload: DSLActivityExecutionP
|
|
109
131
|
return {
|
110
132
|
id: selectedType.id,
|
111
133
|
name: selectedType.name,
|
112
|
-
isNew: !types.find(t => t.name === selectedType.name)
|
134
|
+
isNew: !types.find((t) => t.name === selectedType.name),
|
113
135
|
};
|
114
136
|
}
|
115
137
|
|
116
|
-
async function generateNewType(
|
138
|
+
async function generateNewType(
|
139
|
+
context: ActivityContext<GenerateOrAssignContentTypeParams>,
|
140
|
+
existing_types: ContentObjectTypeItem[],
|
141
|
+
content?: string,
|
142
|
+
fileRef?: string,
|
143
|
+
) {
|
117
144
|
const { client, params } = context;
|
118
145
|
|
119
146
|
const project = await context.fetchProject();
|
120
147
|
const interactionName = params.interactionNames?.generateMetadataModel ?? INT_GENERATE_METADATA_MODEL;
|
121
148
|
|
122
149
|
const genTypeRes = await executeInteractionFromActivity(client, interactionName, params, {
|
123
|
-
existing_types
|
150
|
+
existing_types,
|
124
151
|
content: content,
|
125
152
|
human_context: project?.configuration?.human_context ?? undefined,
|
126
|
-
image: fileRef ? fileRef : undefined
|
153
|
+
image: fileRef ? fileRef : undefined,
|
127
154
|
});
|
128
155
|
|
129
|
-
|
130
156
|
if (!genTypeRes.result.document_type) {
|
131
157
|
log.error("No name generated for type", genTypeRes);
|
132
158
|
throw new Error("No name generated for type");
|
@@ -137,10 +163,10 @@ async function generateNewType(context: ActivityContext<GenerateOrAssignContentT
|
|
137
163
|
name: genTypeRes.result.document_type,
|
138
164
|
object_schema: genTypeRes.result.metadata_schema,
|
139
165
|
is_chunkable: genTypeRes.result.is_chunkable,
|
140
|
-
|
166
|
+
table_layout: genTypeRes.result.table_layout,
|
167
|
+
};
|
141
168
|
|
142
169
|
const type = await client.types.create(typeData);
|
143
170
|
|
144
171
|
return type;
|
145
|
-
|
146
172
|
}
|
@@ -12,7 +12,7 @@ export interface GetObject extends DSLActivitySpec<GetObjectParams> {
|
|
12
12
|
}
|
13
13
|
|
14
14
|
/**
|
15
|
-
* We are using a union type for the status parameter since typescript
|
15
|
+
* We are using a union type for the status parameter since TypeScript enums break the workflow code generation
|
16
16
|
* @param objectId
|
17
17
|
* @param status
|
18
18
|
*/
|
@@ -0,0 +1,25 @@
|
|
1
|
+
import { ContentObjectStatus, DSLActivityExecutionPayload } from "@vertesia/common";
|
2
|
+
import { setupActivity } from "../dsl/setup/ActivityContext.js";
|
3
|
+
import { log } from "@temporalio/activity"
|
4
|
+
|
5
|
+
/**
 * Parameters accepted by the `handleDslError` activity.
 */
export interface HandleDslErrorParams {
    /** Message of the error that made the workflow fail. */
    errorMessage: string;
}

/**
 * Activity invoked when a DSL workflow fails.
 *
 * For the two intake workflows ("StandardDocumentIntake" and "StandardImageIntake")
 * the object referenced by the payload is updated with the `failed` status.
 * For any other workflow the failure is only logged.
 *
 * This is a best-effort handler: a failure while updating the object is logged
 * and never rethrown, so it cannot hide the error that triggered it.
 *
 * @param payload activity payload carrying the failing workflow name and the error message
 */
export async function handleDslError(payload: DSLActivityExecutionPayload<HandleDslErrorParams>): Promise<void> {
    const { client, params, objectId } = await setupActivity<HandleDslErrorParams>(payload);
    const workflowName = payload.workflow_name;
    const isIntake = workflowName === "StandardDocumentIntake" || workflowName === "StandardImageIntake";

    if (!isIntake) {
        log.warn(`Workflow execution failed, but no error handler registered for this workflow: ${workflowName}`,
            { error: params.errorMessage },
        );
        return;
    }

    // Keep a trace of the original failure: without it the reason why the object
    // ends up in the failed status is not visible in the activity logs.
    log.warn(`Workflow ${workflowName} failed, marking object ${objectId} as failed`, {
        error: params.errorMessage,
    });

    try {
        await client.objects.update(objectId, { status: ContentObjectStatus.failed });
    } catch (e) {
        // Deliberately swallowed (best effort), but logged with enough context to diagnose it.
        log.error("Failed to handle error", {
            error: e,
            objectId,
            workflow: workflowName,
            originalError: params.errorMessage,
        });
    }
}
|
@@ -11,6 +11,7 @@ export { generateEmbeddings } from "./generateEmbeddings.js";
|
|
11
11
|
export { generateImageRendition } from "./generateImageRendition.js";
|
12
12
|
export { generateOrAssignContentType } from "./generateOrAssignContentType.js";
|
13
13
|
export { getObjectFromStore } from "./getObjectFromStore.js";
|
14
|
+
export { handleDslError } from "./handleError.js";
|
14
15
|
export { convertPdfToStructuredText } from "./media/processPdfWithTextract.js";
|
15
16
|
export { transcribeMedia } from "./media/transcribeMediaWithGladia.js";
|
16
17
|
export { notifyWebhook } from "./notifyWebhook.js";
|
package/src/activities/index.ts
CHANGED
@@ -91,11 +91,11 @@ export async function convertPdfToStructuredText(payload: DSLActivityExecutionPa
|
|
91
91
|
|
92
92
|
if (jobStatus === "SUCCEEDED") {
|
93
93
|
log.info(`Job ${jobId} succeeded, saving results`, { jobId });
|
94
|
-
const
|
95
|
-
const tokensData = countTokens(
|
96
|
-
const etag = object.content.etag ?? md5(
|
94
|
+
const fText = await processor.processResults(jobId);
|
95
|
+
const tokensData = countTokens(fText);
|
96
|
+
const etag = object.content.etag ?? md5(fText);
|
97
97
|
const updateData: CreateContentObjectPayload = {
|
98
|
-
text:
|
98
|
+
text: fText,
|
99
99
|
text_etag: etag,
|
100
100
|
tokens: {
|
101
101
|
...tokensData,
|
@@ -74,7 +74,7 @@ export async function transcribeMedia(payload: DSLActivityExecutionPayload<Trans
|
|
74
74
|
|
75
75
|
|
76
76
|
/**
 * Builds the webhook URL that Gladia calls back when a transcription is done.
 *
 * The object id and both tokens are percent-encoded so that reserved characters
 * (`+`, `/`, `=`, `&`, spaces, ...) cannot corrupt the path or the query string.
 * Values made only of URL-safe characters produce exactly the same URL as before.
 *
 * NOTE(review): assumes callers pass raw (not already percent-encoded) values — confirm.
 *
 * @param baseUrl base URL of the API, without a trailing slash
 * @param authToken value sent as the `access_token` query parameter
 * @param taskToken value sent as the `task_token` query parameter
 * @param objectId id of the object being transcribed, used as the last path segment
 * @returns the callback URL
 */
function generateCallbackUrlForGladia(baseUrl: string, authToken: string, taskToken: string, objectId: string) {
    const path = `/api/v1/webhooks/gladia/${encodeURIComponent(objectId)}`;
    const query = `access_token=${encodeURIComponent(authToken)}&task_token=${encodeURIComponent(taskToken)}`;
    return `${baseUrl}${path}?${query}`;
}
|
79
79
|
|
80
80
|
interface GladiaTranscriptRequestResponse {
|
@@ -11,7 +11,7 @@ export interface SetDocumentStatus extends DSLActivitySpec<SetDocumentStatusPara
|
|
11
11
|
}
|
12
12
|
|
13
13
|
/**
|
14
|
-
* We are using a union type for the status parameter since typescript
|
14
|
+
* We are using a union type for the status parameter since TypeScript enums break the workflow code generation
|
15
15
|
* @param objectId
|
16
16
|
* @param status
|
17
17
|
*/
|
@@ -472,21 +472,21 @@ export class TextractProcessor {
|
|
472
472
|
}
|
473
473
|
|
474
474
|
// Build final output
|
475
|
-
let
|
475
|
+
let fullText = '';
|
476
476
|
let imgNumber = 1;
|
477
477
|
let tableNumber = 1;
|
478
478
|
for (const page of pageContents) {
|
479
|
-
|
479
|
+
fullText += `<page number="${page.pageNumber}">\n`;
|
480
480
|
for (const block of page.blocks) {
|
481
481
|
if (block.type === 'text') {
|
482
|
-
|
482
|
+
fullText += `<text>\n${block.content}\n</text>\n\n`;
|
483
483
|
} else if (block.type === 'table') {
|
484
484
|
const confidenceAttr = block.confidence !== undefined && this.includeConfidenceInTables
|
485
485
|
? ` confidence="${block.confidence.toFixed(2)}"`
|
486
486
|
: '';
|
487
|
-
|
488
|
-
|
489
|
-
|
487
|
+
fullText += `<table number=${tableNumber++} type="csv" ${confidenceAttr}>\n`;
|
488
|
+
fullText += `${block.content}\n`;
|
489
|
+
fullText += `</table>\n\n`;
|
490
490
|
} else if (block.type === 'image') {
|
491
491
|
// Include geometry if you like
|
492
492
|
const leftAttr = block.left ? ` left="${block.left.toFixed(4)}"` : '';
|
@@ -494,13 +494,13 @@ export class TextractProcessor {
|
|
494
494
|
const widthAttr = block.width ? ` width="${block.width.toFixed(4)}"` : '';
|
495
495
|
const heightAttr = block.height ? ` height="${block.height.toFixed(4)}"` : '';
|
496
496
|
|
497
|
-
|
497
|
+
fullText += `<image id="${imgNumber++}" ${leftAttr}${topAttr}${widthAttr}${heightAttr}>\n${block.content.trim()}\n</image>\n\n`;
|
498
498
|
}
|
499
499
|
}
|
500
|
-
|
500
|
+
fullText += `</page>\n\n`;
|
501
501
|
}
|
502
502
|
|
503
|
-
return
|
503
|
+
return fullText;
|
504
504
|
}
|
505
505
|
|
506
506
|
}
|
package/src/conversion/image.ts
CHANGED
@@ -20,6 +20,8 @@ export async function imageResizer(
|
|
20
20
|
format: string,
|
21
21
|
progressive: boolean = true,
|
22
22
|
): Promise<string> {
|
23
|
+
log.info(`[image-resizer] Resizing image: ${inputPath} to max_hw: ${max_hw}, format: ${format}, progressive: ${progressive}`);
|
24
|
+
|
23
25
|
const allowedFormats = ["jpg", "jpeg", "png", "webp"];
|
24
26
|
|
25
27
|
if (!format || format.trim() === "") {
|
@@ -69,13 +71,17 @@ export async function imageResizer(
|
|
69
71
|
|
70
72
|
log.info(`Resizing image using ImageMagick: ${inputPath} -> ${outputPath}`);
|
71
73
|
|
72
|
-
const
|
74
|
+
const command = `convert`
|
75
|
+
const args = [
|
73
76
|
inputPath,
|
74
77
|
"-resize",
|
75
78
|
`${max_hw}x${max_hw}>`,
|
76
79
|
...(conversionOption ? conversionOption.split(" ") : []),
|
77
80
|
outputPath,
|
78
|
-
]
|
81
|
+
];
|
82
|
+
log.info(`ImageMagick command: ${command} ${args.join(" ")}`);
|
83
|
+
|
84
|
+
const { stderr } = await execFile(command, args);
|
79
85
|
|
80
86
|
if (stderr) {
|
81
87
|
log.warn(`ImageMagick warning: ${stderr}`);
|
@@ -0,0 +1,41 @@
|
|
1
|
+
import { log } from "@temporalio/activity";
|
2
|
+
import { spawn } from "child_process";
|
3
|
+
import fs from "fs";
|
4
|
+
import tmp from "tmp";
|
5
|
+
|
6
|
+
/**
 * Converts a document to markdown by running the `markitdown` command line tool.
 *
 * The buffer is written to a temporary input file, the tool writes its result to
 * a temporary `.md` file, and the content of that file is returned. Both temporary
 * files are removed once the conversion has succeeded or failed.
 *
 * @param buffer content of the document to convert
 * @param ext optional file extension used as postfix of the temporary input file
 * @returns the markdown produced by the tool
 * @throws synchronously if the temporary input file cannot be written; the returned
 *         promise rejects if the tool cannot be started, exits with a non-zero code,
 *         is terminated by a signal, or if its output file cannot be read
 */
export function markdownWithMarkitdown(buffer: Buffer, ext?: string): Promise<string> {
    const tool = "markitdown";
    const inputFile = tmp.fileSync({ postfix: ext });
    const targetFileName = tmp.tmpNameSync({ postfix: ".md" });

    let cleaned = false;
    // Removes both temporary files; safe to call several times.
    const cleanup = () => {
        if (cleaned) {
            return;
        }
        cleaned = true;
        try {
            inputFile.removeCallback();
        } catch (e) {
            log.warn(`Failed to remove temporary input file ${inputFile.name}`, { error: e });
        }
        // The output file may not exist if the tool failed: ignore unlink errors.
        fs.unlink(targetFileName, () => undefined);
    };

    try {
        // writeFileSync keeps writing until the whole buffer is flushed to the descriptor.
        fs.writeFileSync(inputFile.fd, buffer);
    } catch (e) {
        cleanup();
        throw e;
    }

    return new Promise<string>((resolve, reject) => {
        log.info(`Converting document to markdown with ${tool}`, { inputFile: inputFile.name, targetFileName });

        const fail = (err: Error) => {
            cleanup();
            reject(err);
        };

        const command = spawn(tool, [inputFile.name, "-o", targetFileName]);

        // The child's pipes must be consumed, otherwise it can block once a pipe buffer is full.
        command.stdout.resume();
        const stderrChunks: Buffer[] = [];
        command.stderr.on("data", (chunk: Buffer) => {
            stderrChunks.push(chunk);
        });

        command.on("error", fail);

        // "close" fires after the process ended and its stdio streams are closed,
        // so a separate "exit" handler is not needed.
        command.on("close", (code, signal) => {
            if (code) {
                const stderr = Buffer.concat(stderrChunks).toString("utf8");
                if (stderr) {
                    log.warn(`${tool} stderr: ${stderr}`);
                }
                fail(new Error(`${tool} exited with code ${code}`));
                return;
            }
            if (signal) {
                fail(new Error(`${tool} was terminated by signal ${signal}`));
                return;
            }
            fs.readFile(targetFileName, "utf8", (err, data) => {
                cleanup();
                if (err) {
                    reject(err);
                    return;
                }
                resolve(data);
            });
        });
    });
}
|
package/src/conversion/mutool.ts
CHANGED
@@ -120,7 +120,7 @@ export async function pdfToImages(file: Buffer | string, pages?: number[]): Prom
|
|
120
120
|
|
121
121
|
|
122
122
|
/**
|
123
|
-
* Get
|
123
|
+
* Get some pages from a PDF to create a new one
|
124
124
|
*/
|
125
125
|
|
126
126
|
export async function pdfExtractPages(file: Buffer | string, pages: number[]): Promise<string> {
|
@@ -2,7 +2,7 @@ import { MockActivityEnvironment, TestWorkflowEnvironment } from '@temporalio/te
|
|
2
2
|
import fs from 'fs';
|
3
3
|
import path from 'path';
|
4
4
|
import { beforeAll, expect, test } from 'vitest';
|
5
|
-
import {
|
5
|
+
import { markdownWithPandoc } from '../conversion/pandoc';
|
6
6
|
|
7
7
|
|
8
8
|
let testEnv: TestWorkflowEnvironment;
|
@@ -19,6 +19,6 @@ test('should convert docx to markdown', async () => {
|
|
19
19
|
const filepath = path.join(__dirname, '../../fixtures', 'us-ciia.docx');
|
20
20
|
console.log("Converting file from", filepath);
|
21
21
|
const docx = fs.readFileSync(filepath);
|
22
|
-
const result = await activityContext.run(
|
22
|
+
const result = await activityContext.run(markdownWithPandoc, Buffer.from(docx), 'docx');
|
23
23
|
expect(result).to.include('confidential');
|
24
24
|
});
|
package/src/conversion/pandoc.ts
CHANGED
@@ -1,44 +1,40 @@
|
|
1
|
-
import { log } from
|
2
|
-
import { spawn } from
|
3
|
-
import { PassThrough } from
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
1
|
+
import { log } from "@temporalio/activity";
|
2
|
+
import { spawn } from "child_process";
|
3
|
+
import { PassThrough } from "stream";
|
4
|
+
|
5
|
+
/**
 * Converts a document to markdown by piping it through the `pandoc` command line tool.
 *
 * The buffer is streamed to pandoc's standard input and the markdown is collected
 * from its standard output.
 *
 * @param buffer content of the document to convert
 * @param fromFormat pandoc input format, passed to the `-f` option (for example "docx")
 * @returns the markdown written by pandoc on its standard output
 * @throws the returned promise rejects if pandoc cannot be started, exits with a
 *         non-zero code or is terminated by a signal
 */
export function markdownWithPandoc(buffer: Buffer, fromFormat: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
        log.info(`Converting ${fromFormat} to markdown`);

        const command = spawn("pandoc", ["-t", "markdown", "-f", fromFormat], {
            stdio: "pipe",
        });

        // Collect raw buffers and decode once at the end: decoding chunk by chunk
        // corrupts multi-byte UTF-8 characters that are split across two chunks.
        const stdoutChunks: Buffer[] = [];
        const stderrChunks: Buffer[] = [];
        command.stdout.on("data", (chunk: Buffer) => {
            stdoutChunks.push(chunk);
        });
        // stderr must be drained too, otherwise pandoc can block on a full pipe.
        command.stderr.on("data", (chunk: Buffer) => {
            stderrChunks.push(chunk);
        });

        // If pandoc exits before reading all its input, the write fails with EPIPE.
        // Without a listener that "error" event would be thrown as an uncaught exception;
        // the actual failure is reported by the "close" handler below.
        command.stdin.on("error", (err) => {
            log.warn("Failed to write to pandoc stdin", { error: err });
        });

        command.on("error", (err) => {
            reject(err);
        });

        // "close" fires after the process ended and its stdio streams are closed,
        // so all the output is available and no separate "exit" handler is needed.
        command.on("close", (code, signal) => {
            if (code) {
                const stderr = Buffer.concat(stderrChunks).toString("utf8");
                if (stderr) {
                    log.warn(`pandoc stderr: ${stderr}`);
                }
                reject(new Error(`pandoc exited with code ${code}`));
            } else if (signal) {
                reject(new Error(`pandoc was terminated by signal ${signal}`));
            } else {
                resolve(Buffer.concat(stdoutChunks).toString("utf8"));
            }
        });

        const input = new PassThrough();
        input.pipe(command.stdin);
        input.end(buffer);
    });
}
|
package/src/dsl/dsl-workflow.ts
CHANGED
@@ -1,15 +1,30 @@
|
|
1
|
+
import {
|
2
|
+
ActivityInterfaceFor,
|
3
|
+
ActivityOptions,
|
4
|
+
CancellationScope,
|
5
|
+
executeChild,
|
6
|
+
isCancellation,
|
7
|
+
log,
|
8
|
+
patched,
|
9
|
+
proxyActivities,
|
10
|
+
startChild,
|
11
|
+
UntypedActivities,
|
12
|
+
} from "@temporalio/workflow";
|
1
13
|
import {
|
2
14
|
DSLActivityExecutionPayload,
|
3
15
|
DSLActivityOptions,
|
4
16
|
DSLActivitySpec,
|
5
17
|
DSLChildWorkflowStep,
|
6
18
|
DSLWorkflowExecutionPayload,
|
19
|
+
DSLWorkflowSpec,
|
20
|
+
getDocumentIds,
|
7
21
|
WorkflowExecutionPayload
|
8
22
|
} from "@vertesia/common";
|
9
|
-
import { ActivityInterfaceFor, ActivityOptions, executeChild, log, proxyActivities, startChild, UntypedActivities } from "@temporalio/workflow";
|
10
23
|
import ms, { StringValue } from 'ms';
|
11
24
|
import { ActivityParamNotFound, NoDocumentFound, WorkflowParamNotFound } from "../errors.js";
|
12
25
|
import { Vars } from "./vars.js";
|
26
|
+
import { HandleDslErrorParams } from "../activities/handleError.js";
|
27
|
+
import * as activities from "../activities/index.js";
|
13
28
|
|
14
29
|
interface BaseActivityPayload extends WorkflowExecutionPayload {
|
15
30
|
workflow_name: string;
|
@@ -30,7 +45,7 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
|
|
30
45
|
if (!definition) {
|
31
46
|
throw new WorkflowParamNotFound("workflow");
|
32
47
|
}
|
33
|
-
// the base payload
|
48
|
+
// the base payload will be used to create the activities payload
|
34
49
|
const basePayload: BaseActivityPayload = {
|
35
50
|
...payload,
|
36
51
|
workflow_name: definition.name,
|
@@ -42,9 +57,9 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
|
|
42
57
|
...convertDSLActivityOptions(definition.options),
|
43
58
|
startToCloseTimeout: "5 minute",
|
44
59
|
retry: {
|
45
|
-
initialInterval: '
|
60
|
+
initialInterval: '10s',
|
46
61
|
backoffCoefficient: 2,
|
47
|
-
maximumAttempts:
|
62
|
+
maximumAttempts: 10,
|
48
63
|
maximumInterval: 100 * 30 * 1000, //ms
|
49
64
|
nonRetryableErrorTypes: [
|
50
65
|
NoDocumentFound.name,
|
@@ -58,7 +73,7 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
|
|
58
73
|
});
|
59
74
|
const defaultProxy = proxyActivities(defaultOptions);
|
60
75
|
log.debug("Default activity proxy is ready");
|
61
|
-
// merge default vars with the payload vars and add objectIds and
|
76
|
+
// merge default vars with the payload vars and add objectIds and objectId
|
62
77
|
const vars = new Vars({
|
63
78
|
...definition.vars,
|
64
79
|
...payload.vars,
|
@@ -68,6 +83,26 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
|
|
68
83
|
|
69
84
|
log.info("Executing workflow", { payload });
|
70
85
|
|
86
|
+
// TODO(mhuang): remove patch when all workflows are migrated to v2
|
87
|
+
// It avoids breaking the ongoing workflow execution running in v1 and also allows us to
|
88
|
+
// deploy the new error handler in production.
|
89
|
+
// See https://docs.temporal.io/develop/typescript/versioning
|
90
|
+
if (patched('dsl-workflow-error-handling')) {
|
91
|
+
// v2: new version with error handler
|
92
|
+
try {
|
93
|
+
await executeSteps(definition, payload, basePayload, vars, defaultProxy, defaultOptions);
|
94
|
+
} catch (e) {
|
95
|
+
await handleError(e, basePayload, defaultOptions);
|
96
|
+
}
|
97
|
+
} else {
|
98
|
+
// v1: old version without error handler, deprecated since v0.52.0
|
99
|
+
await executeSteps(definition, payload, basePayload, vars, defaultProxy, defaultOptions);
|
100
|
+
}
|
101
|
+
|
102
|
+
return vars.getValue(definition.result || 'result');
|
103
|
+
}
|
104
|
+
|
105
|
+
async function executeSteps(definition: DSLWorkflowSpec, payload: DSLWorkflowExecutionPayload, basePayload: BaseActivityPayload, vars: Vars, defaultProxy: ActivityInterfaceFor<UntypedActivities>, defaultOptions: ActivityOptions) {
|
71
106
|
if (definition.steps) {
|
72
107
|
for (const step of definition.steps) {
|
73
108
|
const stepType = step.type;
|
@@ -89,7 +124,32 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
|
|
89
124
|
} else {
|
90
125
|
throw new Error("No steps or activities found in the workflow definition");
|
91
126
|
}
|
92
|
-
|
127
|
+
}
|
128
|
+
|
129
|
+
/**
 * Runs the `handleDslError` activity for a failed workflow, then rethrows the
 * error that made the workflow fail.
 *
 * The activity is always awaited: if it is only started, the workflow fails
 * (through the final `throw`) before the activity has a chance to complete.
 * A failure of the error handler itself is logged and never replaces the original error.
 *
 * @param originalError the error caught while executing the workflow steps
 * @param basePayload base payload used to build the activity payload
 * @param defaultOptions activity options used to create the activity proxy
 * @throws always rethrows `originalError`
 */
async function handleError(originalError: any, basePayload: BaseActivityPayload, defaultOptions: ActivityOptions) {
    const { handleDslError } = proxyActivities<typeof activities>(defaultOptions);

    // The caught value is not guaranteed to be an Error instance.
    const errorMessage = originalError instanceof Error ? originalError.message : String(originalError);

    const payload = dslActivityPayload(
        basePayload,
        {
            name: "handleDslError",
            params: { errorMessage },
        } as DSLActivitySpec,
        { errorMessage } satisfies HandleDslErrorParams,
    );

    try {
        if (isCancellation(originalError)) {
            log.warn(`Workflow execution cancelled, executing error handler to update document status`, { error: originalError });
            // Cleanup logic must be in a nonCancellable scope
            // If we'd run cleanup outside of a nonCancellable scope it would've been cancelled
            // before being started because the Workflow's root scope is cancelled.
            // see https://docs.temporal.io/develop/typescript/cancellation
            await CancellationScope.nonCancellable(() => handleDslError(payload));
        } else {
            log.warn(`Workflow execution failed, executing error handler to update document status`, { error: originalError });
            await handleDslError(payload);
        }
    } catch (handlerError) {
        // Never let a failure of the handler hide the error that triggered it.
        log.error(`Error handler failed, rethrowing the original workflow error`, { error: handlerError });
    }
    throw originalError;
}
|
94
154
|
|
95
155
|
async function startChildWorkflow(step: DSLChildWorkflowStep, payload: DSLWorkflowExecutionPayload, vars: Vars, debug_mode?: boolean) {
|
@@ -101,14 +161,18 @@ async function startChildWorkflow(step: DSLChildWorkflowStep, payload: DSLWorkfl
|
|
101
161
|
if (debug_mode) {
|
102
162
|
log.debug(`Workflow vars before starting child workflow ${step.name}`, { vars: resolvedVars });
|
103
163
|
}
|
104
|
-
//@ts-ignore
|
105
164
|
const handle = await startChild(step.name, {
|
106
165
|
...step.options,
|
107
166
|
args: [{
|
108
167
|
...payload,
|
109
168
|
workflow: step.spec,
|
110
169
|
vars: resolvedVars
|
111
|
-
}]
|
170
|
+
}],
|
171
|
+
searchAttributes: {
|
172
|
+
AccountId: [payload.account_id],
|
173
|
+
DocumentId: getDocumentIds(payload),
|
174
|
+
ProjectId: [payload.project_id],
|
175
|
+
},
|
112
176
|
});
|
113
177
|
if (step.output) {
|
114
178
|
vars.setValue(step.output, handle.workflowId);
|
@@ -122,16 +186,20 @@ async function executeChildWorkflow(step: DSLChildWorkflowStep, payload: DSLWork
|
|
122
186
|
Object.assign(resolvedVars, step.vars);
|
123
187
|
}
|
124
188
|
if (debug_mode) {
|
125
|
-
log.debug(`Workflow vars before
|
189
|
+
log.debug(`Workflow vars before executing child workflow ${step.name}`, { vars: resolvedVars });
|
126
190
|
}
|
127
|
-
//@ts-ignore
|
128
191
|
const result = await executeChild(step.name, {
|
129
192
|
...step.options,
|
130
193
|
args: [{
|
131
194
|
...payload,
|
132
195
|
workflow: step.spec,
|
133
196
|
vars: resolvedVars,
|
134
|
-
}]
|
197
|
+
}],
|
198
|
+
searchAttributes: {
|
199
|
+
AccountId: [payload.account_id],
|
200
|
+
DocumentId: getDocumentIds(payload),
|
201
|
+
ProjectId: [payload.project_id],
|
202
|
+
},
|
135
203
|
});
|
136
204
|
|
137
205
|
if (step.output) {
|
@@ -149,7 +217,7 @@ async function runActivity(activity: DSLActivitySpec, basePayload: BaseActivityP
|
|
149
217
|
log.debug(`Workflow vars before executing activity ${activity.name}`, { vars: vars.resolve() });
|
150
218
|
}
|
151
219
|
if (activity.condition && !vars.match(activity.condition)) {
|
152
|
-
log.info("Activity
|
220
|
+
log.info("Activity skipped: condition not satisfied", activity.condition);
|
153
221
|
return;
|
154
222
|
}
|
155
223
|
const importParams = vars.createImportVars(activity.import);
|