@vertesia/workflow 0.52.0 → 0.55.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/lib/cjs/activities/createDocumentFromOther.js +1 -1
  2. package/lib/cjs/activities/executeInteraction.js +29 -15
  3. package/lib/cjs/activities/executeInteraction.js.map +1 -1
  4. package/lib/cjs/activities/extractDocumentText.js +33 -30
  5. package/lib/cjs/activities/extractDocumentText.js.map +1 -1
  6. package/lib/cjs/activities/generateEmbeddings.js +1 -1
  7. package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
  8. package/lib/cjs/activities/generateImageRendition.js +31 -11
  9. package/lib/cjs/activities/generateImageRendition.js.map +1 -1
  10. package/lib/cjs/activities/generateOrAssignContentType.js +25 -12
  11. package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -1
  12. package/lib/cjs/activities/getObjectFromStore.js +1 -1
  13. package/lib/cjs/activities/handleError.js +22 -0
  14. package/lib/cjs/activities/handleError.js.map +1 -0
  15. package/lib/cjs/activities/index-dsl.js +3 -1
  16. package/lib/cjs/activities/index-dsl.js.map +1 -1
  17. package/lib/cjs/activities/index.js +0 -1
  18. package/lib/cjs/activities/index.js.map +1 -1
  19. package/lib/cjs/activities/media/processPdfWithTextract.js +4 -4
  20. package/lib/cjs/activities/media/transcribeMediaWithGladia.js +1 -1
  21. package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -1
  22. package/lib/cjs/activities/setDocumentStatus.js +1 -1
  23. package/lib/cjs/conversion/TextractProcessor.js +9 -9
  24. package/lib/cjs/conversion/image.js +6 -2
  25. package/lib/cjs/conversion/image.js.map +1 -1
  26. package/lib/cjs/conversion/markitdown.js +42 -0
  27. package/lib/cjs/conversion/markitdown.js.map +1 -0
  28. package/lib/cjs/conversion/mutool.js +1 -1
  29. package/lib/cjs/conversion/pandoc.js +9 -9
  30. package/lib/cjs/conversion/pandoc.js.map +1 -1
  31. package/lib/cjs/dsl/dsl-workflow.js +59 -11
  32. package/lib/cjs/dsl/dsl-workflow.js.map +1 -1
  33. package/lib/cjs/dsl/vars.js +6 -6
  34. package/lib/cjs/dsl/vars.js.map +1 -1
  35. package/lib/cjs/index.js +1 -1
  36. package/lib/cjs/iterative-generation/activities/extractToc.js +1 -1
  37. package/lib/cjs/iterative-generation/activities/extractToc.js.map +1 -1
  38. package/lib/cjs/iterative-generation/activities/generatePart.js +2 -2
  39. package/lib/cjs/iterative-generation/activities/generatePart.js.map +1 -1
  40. package/lib/cjs/iterative-generation/activities/generateToc.js +1 -1
  41. package/lib/cjs/iterative-generation/activities/generateToc.js.map +1 -1
  42. package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +1 -1
  43. package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +1 -1
  44. package/lib/cjs/iterative-generation/utils.js +4 -4
  45. package/lib/cjs/iterative-generation/utils.js.map +1 -1
  46. package/lib/esm/activities/createDocumentFromOther.js +1 -1
  47. package/lib/esm/activities/executeInteraction.js +31 -17
  48. package/lib/esm/activities/executeInteraction.js.map +1 -1
  49. package/lib/esm/activities/extractDocumentText.js +39 -36
  50. package/lib/esm/activities/extractDocumentText.js.map +1 -1
  51. package/lib/esm/activities/generateEmbeddings.js +1 -1
  52. package/lib/esm/activities/generateEmbeddings.js.map +1 -1
  53. package/lib/esm/activities/generateImageRendition.js +31 -11
  54. package/lib/esm/activities/generateImageRendition.js.map +1 -1
  55. package/lib/esm/activities/generateOrAssignContentType.js +25 -12
  56. package/lib/esm/activities/generateOrAssignContentType.js.map +1 -1
  57. package/lib/esm/activities/getObjectFromStore.js +1 -1
  58. package/lib/esm/activities/handleError.js +19 -0
  59. package/lib/esm/activities/handleError.js.map +1 -0
  60. package/lib/esm/activities/index-dsl.js +1 -0
  61. package/lib/esm/activities/index-dsl.js.map +1 -1
  62. package/lib/esm/activities/index.js +0 -1
  63. package/lib/esm/activities/index.js.map +1 -1
  64. package/lib/esm/activities/media/processPdfWithTextract.js +4 -4
  65. package/lib/esm/activities/media/transcribeMediaWithGladia.js +1 -1
  66. package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -1
  67. package/lib/esm/activities/setDocumentStatus.js +1 -1
  68. package/lib/esm/conversion/TextractProcessor.js +9 -9
  69. package/lib/esm/conversion/image.js +6 -2
  70. package/lib/esm/conversion/image.js.map +1 -1
  71. package/lib/esm/conversion/markitdown.js +36 -0
  72. package/lib/esm/conversion/markitdown.js.map +1 -0
  73. package/lib/esm/conversion/mutool.js +1 -1
  74. package/lib/esm/conversion/pandoc.js +11 -11
  75. package/lib/esm/conversion/pandoc.js.map +1 -1
  76. package/lib/esm/dsl/dsl-workflow.js +60 -12
  77. package/lib/esm/dsl/dsl-workflow.js.map +1 -1
  78. package/lib/esm/dsl/vars.js +6 -6
  79. package/lib/esm/dsl/vars.js.map +1 -1
  80. package/lib/esm/index.js +1 -1
  81. package/lib/esm/iterative-generation/activities/extractToc.js +1 -1
  82. package/lib/esm/iterative-generation/activities/extractToc.js.map +1 -1
  83. package/lib/esm/iterative-generation/activities/generatePart.js +2 -2
  84. package/lib/esm/iterative-generation/activities/generatePart.js.map +1 -1
  85. package/lib/esm/iterative-generation/activities/generateToc.js +1 -1
  86. package/lib/esm/iterative-generation/activities/generateToc.js.map +1 -1
  87. package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +1 -1
  88. package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +1 -1
  89. package/lib/esm/iterative-generation/utils.js +4 -4
  90. package/lib/esm/iterative-generation/utils.js.map +1 -1
  91. package/lib/types/activities/createDocumentFromOther.d.ts +1 -1
  92. package/lib/types/activities/executeInteraction.d.ts +4 -4
  93. package/lib/types/activities/executeInteraction.d.ts.map +1 -1
  94. package/lib/types/activities/extractDocumentText.d.ts +3 -3
  95. package/lib/types/activities/extractDocumentText.d.ts.map +1 -1
  96. package/lib/types/activities/generateImageRendition.d.ts +1 -1
  97. package/lib/types/activities/generateImageRendition.d.ts.map +1 -1
  98. package/lib/types/activities/generateOrAssignContentType.d.ts +1 -1
  99. package/lib/types/activities/generateOrAssignContentType.d.ts.map +1 -1
  100. package/lib/types/activities/getObjectFromStore.d.ts +1 -1
  101. package/lib/types/activities/handleError.d.ts +6 -0
  102. package/lib/types/activities/handleError.d.ts.map +1 -0
  103. package/lib/types/activities/index-dsl.d.ts +1 -0
  104. package/lib/types/activities/index-dsl.d.ts.map +1 -1
  105. package/lib/types/activities/index.d.ts +0 -1
  106. package/lib/types/activities/index.d.ts.map +1 -1
  107. package/lib/types/activities/setDocumentStatus.d.ts +1 -1
  108. package/lib/types/conversion/image.d.ts.map +1 -1
  109. package/lib/types/conversion/markitdown.d.ts +2 -0
  110. package/lib/types/conversion/markitdown.d.ts.map +1 -0
  111. package/lib/types/conversion/mutool.d.ts +1 -1
  112. package/lib/types/conversion/pandoc.d.ts +1 -1
  113. package/lib/types/conversion/pandoc.d.ts.map +1 -1
  114. package/lib/types/dsl/dsl-workflow.d.ts +1 -1
  115. package/lib/types/dsl/dsl-workflow.d.ts.map +1 -1
  116. package/lib/types/dsl/vars.d.ts +2 -2
  117. package/lib/types/index.d.ts +1 -1
  118. package/lib/types/iterative-generation/types.d.ts +3 -3
  119. package/lib/types/iterative-generation/types.d.ts.map +1 -1
  120. package/lib/workflows-bundle.js +396 -94
  121. package/package.json +5 -4
  122. package/src/activities/createDocumentFromOther.ts +1 -1
  123. package/src/activities/executeInteraction.ts +66 -39
  124. package/src/activities/extractDocumentText.ts +67 -51
  125. package/src/activities/generateEmbeddings.ts +1 -1
  126. package/src/activities/generateImageRendition.ts +35 -14
  127. package/src/activities/generateOrAssignContentType.ts +52 -26
  128. package/src/activities/getObjectFromStore.ts +1 -1
  129. package/src/activities/handleError.ts +25 -0
  130. package/src/activities/index-dsl.ts +1 -0
  131. package/src/activities/index.ts +0 -1
  132. package/src/activities/media/processPdfWithTextract.ts +4 -4
  133. package/src/activities/media/transcribeMediaWithGladia.ts +1 -1
  134. package/src/activities/setDocumentStatus.ts +1 -1
  135. package/src/conversion/TextractProcessor.ts +9 -9
  136. package/src/conversion/image.ts +8 -2
  137. package/src/conversion/markitdown.ts +41 -0
  138. package/src/conversion/mutool.ts +1 -1
  139. package/src/conversion/pandoc.test.ts +2 -2
  140. package/src/conversion/pandoc.ts +38 -42
  141. package/src/dsl/dsl-workflow.ts +80 -12
  142. package/src/dsl/validation.test.ts +2 -2
  143. package/src/dsl/vars.test.ts +1 -1
  144. package/src/dsl/vars.ts +6 -6
  145. package/src/dsl/workflow-exec-child.test.ts +14 -4
  146. package/src/dsl/workflow-fetch.test.ts +1 -1
  147. package/src/dsl/workflow-import.test.ts +1 -1
  148. package/src/dsl/workflow.test.ts +12 -2
  149. package/src/index.ts +1 -1
  150. package/src/iterative-generation/activities/extractToc.ts +1 -1
  151. package/src/iterative-generation/activities/generatePart.ts +2 -2
  152. package/src/iterative-generation/activities/generateToc.ts +1 -1
  153. package/src/iterative-generation/iterativeGenerationWorkflow.ts +1 -1
  154. package/src/iterative-generation/types.ts +4 -4
  155. package/src/iterative-generation/utils.ts +4 -4
@@ -1,11 +1,16 @@
1
1
  import { log } from "@temporalio/activity";
2
- import { ContentObjectTypeItem, CreateContentObjectTypePayload, DSLActivityExecutionPayload, DSLActivitySpec } from "@vertesia/common";
2
+ import {
3
+ ContentObjectTypeItem,
4
+ CreateContentObjectTypePayload,
5
+ DSLActivityExecutionPayload,
6
+ DSLActivitySpec,
7
+ } from "@vertesia/common";
3
8
  import { ActivityContext, setupActivity } from "../dsl/setup/ActivityContext.js";
4
9
  import { TruncateSpec, truncByMaxTokens } from "../utils/tokens.js";
5
10
  import { InteractionExecutionParams, executeInteractionFromActivity } from "./executeInteraction.js";
6
11
 
7
- const INT_SELECT_DOCUMENT_TYPE = "sys:SelectDocumentType"
8
- const INT_GENERATE_METADATA_MODEL = "sys:GenerateMetadataModel"
12
+ const INT_SELECT_DOCUMENT_TYPE = "sys:SelectDocumentType";
13
+ const INT_GENERATE_METADATA_MODEL = "sys:GenerateMetadataModel";
9
14
 
10
15
  export interface GenerateOrAssignContentTypeParams extends InteractionExecutionParams {
11
16
  typesHint?: string[];
@@ -21,20 +26,21 @@ export interface GenerateOrAssignContentTypeParams extends InteractionExecutionP
21
26
  interactionNames?: {
22
27
  selectDocumentType?: string;
23
28
  generateMetadataModel?: string;
24
- }
29
+ };
25
30
  }
26
31
 
27
32
  export interface GenerateOrAssignContentType extends DSLActivitySpec<GenerateOrAssignContentTypeParams> {
28
- name: 'generateOrAssignContentType';
33
+ name: "generateOrAssignContentType";
29
34
  }
30
35
 
31
- export async function generateOrAssignContentType(payload: DSLActivityExecutionPayload<GenerateOrAssignContentTypeParams>) {
36
+ export async function generateOrAssignContentType(
37
+ payload: DSLActivityExecutionPayload<GenerateOrAssignContentTypeParams>,
38
+ ) {
32
39
  const context = await setupActivity<GenerateOrAssignContentTypeParams>(payload);
33
40
  const { params, client, objectId } = context;
34
41
 
35
42
  const interactionName = params.interactionNames?.selectDocumentType ?? INT_SELECT_DOCUMENT_TYPE;
36
43
 
37
-
38
44
  log.info("SelectDocumentType for object: " + objectId, { payload });
39
45
 
40
46
  const object = await client.objects.retrieve(objectId, "+text");
@@ -48,50 +54,66 @@ export async function generateOrAssignContentType(payload: DSLActivityExecutionP
48
54
  return { status: "skipped", message: "Object already has a type: " + object.type.name };
49
55
  }
50
56
 
51
- if (!object || (!object.text && !object.content?.type?.startsWith("image/") && !object.content?.type?.startsWith("application/pdf"))) {
57
+ if (
58
+ !object ||
59
+ (!object.text &&
60
+ !object.content?.type?.startsWith("image/") &&
61
+ !object.content?.type?.startsWith("application/pdf"))
62
+ ) {
52
63
  log.info(`Object ${objectId} not found or text is empty and not an image`, { object });
53
64
  return { status: "failed", error: "no-text" };
54
65
  }
55
66
 
56
- const types = await client.types.list();
67
+ const types = await client.types.list(undefined, {
68
+ schema: true,
69
+ });
57
70
 
58
71
  //make a list of all existing types, and add hints if any
59
- const existing_types = types.filter(t => !["DocumentPart", "Rendition"].includes(t.name));
60
- const content = object.text ? truncByMaxTokens(object.text, params.truncate || 4000) : undefined;
72
+ const existing_types = types.filter((t) => !["DocumentPart", "Rendition"].includes(t.name));
73
+ const content = object.text ? truncByMaxTokens(object.text, params.truncate || 30000) : undefined;
61
74
 
62
75
  const getImage = async () => {
63
76
  if (object.content?.type?.includes("pdf") && object.text?.length && object.text?.length < 100) {
64
- return "store:" + objectId
77
+ return "store:" + objectId;
65
78
  }
66
79
  if (!object.content?.type?.startsWith("image/")) {
67
80
  return undefined;
68
81
  }
69
- const res = await client.objects.getRendition(objectId, { max_hw: 1024, format: "image/png", generate_if_missing: true });
82
+ const res = await client.objects.getRendition(objectId, {
83
+ max_hw: 1024,
84
+ format: "image/png",
85
+ generate_if_missing: true,
86
+ });
70
87
  if (!res.rendition && res.status === "generating") {
71
88
  //throw to try again
72
89
  throw new Error(`Rendition for object ${objectId} is in progress`);
73
90
  } else if (res.rendition) {
74
91
  return "store:" + objectId;
75
92
  }
76
- }
93
+ };
77
94
 
78
95
  const fileRef = await getImage();
79
96
 
80
- log.info("Execute SelectDocumentType interaction on content with \nexisting types: " + existing_types.map(t => t.name).join(","));
97
+ log.info(
98
+ "Execute SelectDocumentType interaction on content with \nexisting types - passing full types: " +
99
+ existing_types.filter((t) => !t.tags?.includes("system")),
100
+ );
81
101
 
82
102
  const res = await executeInteractionFromActivity(client, interactionName, params, {
83
- existing_types, content, image: fileRef
103
+ existing_types,
104
+ content,
105
+ image: fileRef,
84
106
  });
85
107
 
86
108
  log.info("Selected Content Type Result: " + JSON.stringify(res.result));
87
109
 
88
110
  //if type is not identified or not present in the database, generate a new type
89
- let selectedType: { id: string, name: string } | undefined = undefined;
111
+ let selectedType: { id: string; name: string } | undefined = undefined;
90
112
 
91
- selectedType = types.find(t => t.name === res.result.document_type);
113
+ selectedType = types.find((t) => t.name === res.result.document_type);
92
114
 
93
115
  if (!selectedType) {
94
- log.warn("Document type not idenfified: starting type generation");
116
+ log.warn("Document type not identified: starting type generation");
95
117
  const newType = await generateNewType(context, existing_types, content, fileRef);
96
118
  selectedType = { id: newType.id, name: newType.name };
97
119
  }
@@ -109,24 +131,28 @@ export async function generateOrAssignContentType(payload: DSLActivityExecutionP
109
131
  return {
110
132
  id: selectedType.id,
111
133
  name: selectedType.name,
112
- isNew: !types.find(t => t.name === selectedType.name)
134
+ isNew: !types.find((t) => t.name === selectedType.name),
113
135
  };
114
136
  }
115
137
 
116
- async function generateNewType(context: ActivityContext<GenerateOrAssignContentTypeParams>, existing_types: ContentObjectTypeItem[], content?: string, fileRef?: string) {
138
+ async function generateNewType(
139
+ context: ActivityContext<GenerateOrAssignContentTypeParams>,
140
+ existing_types: ContentObjectTypeItem[],
141
+ content?: string,
142
+ fileRef?: string,
143
+ ) {
117
144
  const { client, params } = context;
118
145
 
119
146
  const project = await context.fetchProject();
120
147
  const interactionName = params.interactionNames?.generateMetadataModel ?? INT_GENERATE_METADATA_MODEL;
121
148
 
122
149
  const genTypeRes = await executeInteractionFromActivity(client, interactionName, params, {
123
- existing_types: existing_types.map(t => t.name),
150
+ existing_types,
124
151
  content: content,
125
152
  human_context: project?.configuration?.human_context ?? undefined,
126
- image: fileRef ? fileRef : undefined
153
+ image: fileRef ? fileRef : undefined,
127
154
  });
128
155
 
129
-
130
156
  if (!genTypeRes.result.document_type) {
131
157
  log.error("No name generated for type", genTypeRes);
132
158
  throw new Error("No name generated for type");
@@ -137,10 +163,10 @@ async function generateNewType(context: ActivityContext<GenerateOrAssignContentT
137
163
  name: genTypeRes.result.document_type,
138
164
  object_schema: genTypeRes.result.metadata_schema,
139
165
  is_chunkable: genTypeRes.result.is_chunkable,
140
- }
166
+ table_layout: genTypeRes.result.table_layout,
167
+ };
141
168
 
142
169
  const type = await client.types.create(typeData);
143
170
 
144
171
  return type;
145
-
146
172
  }
@@ -12,7 +12,7 @@ export interface GetObject extends DSLActivitySpec<GetObjectParams> {
12
12
  }
13
13
 
14
14
  /**
15
- * We are using a union type for the status parameter since typescript enumbs breaks the workflow code generation
15
+ * We are using a union type for the status parameter since typescript enums breaks the workflow code generation
16
16
  * @param objectId
17
17
  * @param status
18
18
  */
@@ -0,0 +1,25 @@
1
+ import { ContentObjectStatus, DSLActivityExecutionPayload } from "@vertesia/common";
2
+ import { setupActivity } from "../dsl/setup/ActivityContext.js";
3
+ import { log } from "@temporalio/activity"
4
+
5
+ export interface HandleDslErrorParams {
6
+ errorMessage: string;
7
+ }
8
+
9
+ export async function handleDslError(payload: DSLActivityExecutionPayload<HandleDslErrorParams>): Promise<void> {
10
+ const { client, params, objectId } = await setupActivity<HandleDslErrorParams>(payload);
11
+ const isIntake = payload.workflow_name === "StandardDocumentIntake" || payload.workflow_name === "StandardImageIntake";
12
+ if (!isIntake) {
13
+ log.warn(`Workflow execution failed, but no error handler registered for this workflow: ${payload.workflow_name}`,
14
+ { error: params.errorMessage },
15
+ );
16
+ return;
17
+ }
18
+
19
+ try {
20
+ await client.objects.update(objectId, { status: ContentObjectStatus.failed });
21
+ } catch (e) {
22
+ log.error("Failed to handle error", { error: e });
23
+ }
24
+ return;
25
+ }
@@ -11,6 +11,7 @@ export { generateEmbeddings } from "./generateEmbeddings.js";
11
11
  export { generateImageRendition } from "./generateImageRendition.js";
12
12
  export { generateOrAssignContentType } from "./generateOrAssignContentType.js";
13
13
  export { getObjectFromStore } from "./getObjectFromStore.js";
14
+ export { handleDslError } from "./handleError.js";
14
15
  export { convertPdfToStructuredText } from "./media/processPdfWithTextract.js";
15
16
  export { transcribeMedia } from "./media/transcribeMediaWithGladia.js";
16
17
  export { notifyWebhook } from "./notifyWebhook.js";
@@ -2,4 +2,3 @@
2
2
  * Here we export all activities to be registered with the temporal worker
3
3
  */
4
4
  export * from "./index-dsl.js";
5
- export * from "../iterative-generation/activities/index.js";
@@ -91,11 +91,11 @@ export async function convertPdfToStructuredText(payload: DSLActivityExecutionPa
91
91
 
92
92
  if (jobStatus === "SUCCEEDED") {
93
93
  log.info(`Job ${jobId} succeeded, saving results`, { jobId });
94
- const ftext = await processor.processResults(jobId);
95
- const tokensData = countTokens(ftext);
96
- const etag = object.content.etag ?? md5(ftext);
94
+ const fText = await processor.processResults(jobId);
95
+ const tokensData = countTokens(fText);
96
+ const etag = object.content.etag ?? md5(fText);
97
97
  const updateData: CreateContentObjectPayload = {
98
- text: ftext,
98
+ text: fText,
99
99
  text_etag: etag,
100
100
  tokens: {
101
101
  ...tokensData,
@@ -74,7 +74,7 @@ export async function transcribeMedia(payload: DSLActivityExecutionPayload<Trans
74
74
 
75
75
 
76
76
  function generateCallbackUrlForGladia(baseUrl: string, authToken: string, taskToken: string, objectId: string) {
77
- return `${baseUrl}/api/v1/webhooks/gladia/${objectId}?auth_token=${authToken}&task_token=${taskToken}`;
77
+ return `${baseUrl}/api/v1/webhooks/gladia/${objectId}?access_token=${authToken}&task_token=${taskToken}`;
78
78
  }
79
79
 
80
80
  interface GladiaTranscriptRequestResponse {
@@ -11,7 +11,7 @@ export interface SetDocumentStatus extends DSLActivitySpec<SetDocumentStatusPara
11
11
  }
12
12
 
13
13
  /**
14
- * We are using a union type for the status parameter since typescript enumbs breaks the workflow code generation
14
+ * We are using a union type for the status parameter since typescript enums breaks the workflow code generation
15
15
  * @param objectId
16
16
  * @param status
17
17
  */
@@ -472,21 +472,21 @@ export class TextractProcessor {
472
472
  }
473
473
 
474
474
  // Build final output
475
- let fulltext = '';
475
+ let fullText = '';
476
476
  let imgNumber = 1;
477
477
  let tableNumber = 1;
478
478
  for (const page of pageContents) {
479
- fulltext += `<page number="${page.pageNumber}">\n`;
479
+ fullText += `<page number="${page.pageNumber}">\n`;
480
480
  for (const block of page.blocks) {
481
481
  if (block.type === 'text') {
482
- fulltext += `<text>\n${block.content}\n</text>\n\n`;
482
+ fullText += `<text>\n${block.content}\n</text>\n\n`;
483
483
  } else if (block.type === 'table') {
484
484
  const confidenceAttr = block.confidence !== undefined && this.includeConfidenceInTables
485
485
  ? ` confidence="${block.confidence.toFixed(2)}"`
486
486
  : '';
487
- fulltext += `<table number=${tableNumber++} type="csv" ${confidenceAttr}>\n`;
488
- fulltext += `${block.content}\n`;
489
- fulltext += `</table>\n\n`;
487
+ fullText += `<table number=${tableNumber++} type="csv" ${confidenceAttr}>\n`;
488
+ fullText += `${block.content}\n`;
489
+ fullText += `</table>\n\n`;
490
490
  } else if (block.type === 'image') {
491
491
  // Include geometry if you like
492
492
  const leftAttr = block.left ? ` left="${block.left.toFixed(4)}"` : '';
@@ -494,13 +494,13 @@ export class TextractProcessor {
494
494
  const widthAttr = block.width ? ` width="${block.width.toFixed(4)}"` : '';
495
495
  const heightAttr = block.height ? ` height="${block.height.toFixed(4)}"` : '';
496
496
 
497
- fulltext += `<image id="${imgNumber++}" ${leftAttr}${topAttr}${widthAttr}${heightAttr}>\n${block.content.trim()}\n</image>\n\n`;
497
+ fullText += `<image id="${imgNumber++}" ${leftAttr}${topAttr}${widthAttr}${heightAttr}>\n${block.content.trim()}\n</image>\n\n`;
498
498
  }
499
499
  }
500
- fulltext += `</page>\n\n`;
500
+ fullText += `</page>\n\n`;
501
501
  }
502
502
 
503
- return fulltext;
503
+ return fullText;
504
504
  }
505
505
 
506
506
  }
@@ -20,6 +20,8 @@ export async function imageResizer(
20
20
  format: string,
21
21
  progressive: boolean = true,
22
22
  ): Promise<string> {
23
+ log.info(`[image-resizer] Resizing image: ${inputPath} to max_hw: ${max_hw}, format: ${format}, progressive: ${progressive}`);
24
+
23
25
  const allowedFormats = ["jpg", "jpeg", "png", "webp"];
24
26
 
25
27
  if (!format || format.trim() === "") {
@@ -69,13 +71,17 @@ export async function imageResizer(
69
71
 
70
72
  log.info(`Resizing image using ImageMagick: ${inputPath} -> ${outputPath}`);
71
73
 
72
- const { stderr } = await execFile("convert", [
74
+ const command = `convert`
75
+ const args = [
73
76
  inputPath,
74
77
  "-resize",
75
78
  `${max_hw}x${max_hw}>`,
76
79
  ...(conversionOption ? conversionOption.split(" ") : []),
77
80
  outputPath,
78
- ]);
81
+ ];
82
+ log.info(`ImageMagick command: ${command} ${args.join(" ")}`);
83
+
84
+ const { stderr } = await execFile(command, args);
79
85
 
80
86
  if (stderr) {
81
87
  log.warn(`ImageMagick warning: ${stderr}`);
@@ -0,0 +1,41 @@
1
+ import { log } from "@temporalio/activity";
2
+ import { spawn } from "child_process";
3
+ import fs from "fs";
4
+ import tmp from "tmp";
5
+
6
+ export function markdownWithMarkitdown(buffer: Buffer, ext?: string): Promise<string> {
7
+ const inputFile = tmp.fileSync({ postfix: ext });
8
+ const targetFileName = tmp.tmpNameSync({ postfix: ".md" });
9
+
10
+ fs.writeSync(inputFile.fd, buffer);
11
+
12
+ return new Promise((resolve, reject) => {
13
+ const tool = "markitdown";
14
+ log.info(`Converting document to markdown with ${tool}`, { inputFile: inputFile.name, targetFileName });
15
+
16
+ const command = spawn(tool, [inputFile.name, "-o", targetFileName]);
17
+
18
+ command.on("exit", function (code) {
19
+ if (code) {
20
+ reject(new Error(`${tool} exited with code ${code}`));
21
+ }
22
+ });
23
+
24
+ command.on("close", function (code) {
25
+ if (code) {
26
+ reject(new Error(`${tool} exited with code ${code}`));
27
+ } else {
28
+ return fs.readFile(targetFileName, "utf8", (err, data) => {
29
+ if (err) {
30
+ reject(err);
31
+ }
32
+ return resolve(data);
33
+ });
34
+ }
35
+ });
36
+
37
+ command.on("error", (err) => {
38
+ reject(err);
39
+ });
40
+ });
41
+ }
@@ -120,7 +120,7 @@ export async function pdfToImages(file: Buffer | string, pages?: number[]): Prom
120
120
 
121
121
 
122
122
  /**
123
- * Get somes pages from a PDF to create a new one
123
+ * Get some pages from a PDF to create a new one
124
124
  */
125
125
 
126
126
  export async function pdfExtractPages(file: Buffer | string, pages: number[]): Promise<string> {
@@ -2,7 +2,7 @@ import { MockActivityEnvironment, TestWorkflowEnvironment } from '@temporalio/te
2
2
  import fs from 'fs';
3
3
  import path from 'path';
4
4
  import { beforeAll, expect, test } from 'vitest';
5
- import { manyToMarkdown } from '../conversion/pandoc';
5
+ import { markdownWithPandoc } from '../conversion/pandoc';
6
6
 
7
7
 
8
8
  let testEnv: TestWorkflowEnvironment;
@@ -19,6 +19,6 @@ test('should convert docx to markdown', async () => {
19
19
  const filepath = path.join(__dirname, '../../fixtures', 'us-ciia.docx');
20
20
  console.log("Converting file from", filepath);
21
21
  const docx = fs.readFileSync(filepath);
22
- const result = await activityContext.run(manyToMarkdown, Buffer.from(docx), 'docx');
22
+ const result = await activityContext.run(markdownWithPandoc, Buffer.from(docx), 'docx');
23
23
  expect(result).to.include('confidential');
24
24
  });
@@ -1,44 +1,40 @@
1
- import { log } from '@temporalio/activity';
2
- import { spawn } from 'child_process';
3
- import { PassThrough } from 'stream';
4
-
5
-
6
- export function manyToMarkdown(buffer: Buffer, fromFormat: string): Promise<string> {
7
-
8
- const fromType = undefined;
9
-
10
- return new Promise((resolve, reject) => {
11
- log.info(`Converting ${fromType} to markdown`);
12
- const input = new PassThrough();
13
- input.end(buffer);
14
-
15
- let result: string[] = [];
16
-
17
- const command = spawn("pandoc", ["-t", "markdown", '-f', fromFormat], {
18
- stdio: 'pipe',
19
- });
20
- input.pipe(command.stdin);
21
-
22
- command.stdout.on('data', function (data: string) {
23
- result.push(data.toString());
24
- });
25
- command.on('exit', function (code) {
26
- if (code) {
27
- reject(new Error(`pandoc exited with code ${code}`));
28
- }
1
+ import { log } from "@temporalio/activity";
2
+ import { spawn } from "child_process";
3
+ import { PassThrough } from "stream";
4
+
5
+ export function markdownWithPandoc(buffer: Buffer, fromFormat: string): Promise<string> {
6
+ const fromType = undefined;
7
+
8
+ return new Promise((resolve, reject) => {
9
+ log.info(`Converting ${fromType} to markdown`);
10
+ const input = new PassThrough();
11
+ input.end(buffer);
12
+
13
+ let result: string[] = [];
14
+
15
+ const command = spawn("pandoc", ["-t", "markdown", "-f", fromFormat], {
16
+ stdio: "pipe",
17
+ });
18
+ input.pipe(command.stdin);
19
+
20
+ command.stdout.on("data", function (data: string) {
21
+ result.push(data.toString());
22
+ });
23
+ command.on("exit", function (code) {
24
+ if (code) {
25
+ reject(new Error(`pandoc exited with code ${code}`));
26
+ }
27
+ });
28
+ command.on("close", function (code) {
29
+ if (code) {
30
+ reject(new Error(`pandoc exited with code ${code}`));
31
+ } else {
32
+ resolve(result.join(""));
33
+ }
34
+ });
35
+
36
+ command.on("error", (err) => {
37
+ reject(err);
38
+ });
29
39
  });
30
- command.on('close', function (code) {
31
- if (code) {
32
- reject(new Error(`pandoc exited with code ${code}`));
33
- } else {
34
- resolve(result.join(''))
35
- }
36
- });
37
-
38
- command.on('error', (err) => {
39
- reject(err);
40
- });
41
-
42
- });
43
-
44
40
  }
@@ -1,15 +1,30 @@
1
+ import {
2
+ ActivityInterfaceFor,
3
+ ActivityOptions,
4
+ CancellationScope,
5
+ executeChild,
6
+ isCancellation,
7
+ log,
8
+ patched,
9
+ proxyActivities,
10
+ startChild,
11
+ UntypedActivities,
12
+ } from "@temporalio/workflow";
1
13
  import {
2
14
  DSLActivityExecutionPayload,
3
15
  DSLActivityOptions,
4
16
  DSLActivitySpec,
5
17
  DSLChildWorkflowStep,
6
18
  DSLWorkflowExecutionPayload,
19
+ DSLWorkflowSpec,
20
+ getDocumentIds,
7
21
  WorkflowExecutionPayload
8
22
  } from "@vertesia/common";
9
- import { ActivityInterfaceFor, ActivityOptions, executeChild, log, proxyActivities, startChild, UntypedActivities } from "@temporalio/workflow";
10
23
  import ms, { StringValue } from 'ms';
11
24
  import { ActivityParamNotFound, NoDocumentFound, WorkflowParamNotFound } from "../errors.js";
12
25
  import { Vars } from "./vars.js";
26
+ import { HandleDslErrorParams } from "../activities/handleError.js";
27
+ import * as activities from "../activities/index.js";
13
28
 
14
29
  interface BaseActivityPayload extends WorkflowExecutionPayload {
15
30
  workflow_name: string;
@@ -30,7 +45,7 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
30
45
  if (!definition) {
31
46
  throw new WorkflowParamNotFound("workflow");
32
47
  }
33
- // the base payload wiull be used to create the activities payload
48
+ // the base payload will be used to create the activities payload
34
49
  const basePayload: BaseActivityPayload = {
35
50
  ...payload,
36
51
  workflow_name: definition.name,
@@ -42,9 +57,9 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
42
57
  ...convertDSLActivityOptions(definition.options),
43
58
  startToCloseTimeout: "5 minute",
44
59
  retry: {
45
- initialInterval: '30s',
60
+ initialInterval: '10s',
46
61
  backoffCoefficient: 2,
47
- maximumAttempts: 20,
62
+ maximumAttempts: 10,
48
63
  maximumInterval: 100 * 30 * 1000, //ms
49
64
  nonRetryableErrorTypes: [
50
65
  NoDocumentFound.name,
@@ -58,7 +73,7 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
58
73
  });
59
74
  const defaultProxy = proxyActivities(defaultOptions);
60
75
  log.debug("Default activity proxy is ready");
61
- // merge default vars with the payload vars and add objectIds and obejctId
76
+ // merge default vars with the payload vars and add objectIds and objectId
62
77
  const vars = new Vars({
63
78
  ...definition.vars,
64
79
  ...payload.vars,
@@ -68,6 +83,26 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
68
83
 
69
84
  log.info("Executing workflow", { payload });
70
85
 
86
+ // TODO(mhuang): remove patch when all workflows are migrated to v2
87
+ // It avoids breaking the ongoing workflow execution running in v1 and also allows us to
88
+ // deploy the new error handler in production.
89
+ // See https://docs.temporal.io/develop/typescript/versioning
90
+ if (patched('dsl-workflow-error-handling')) {
91
+ // v2: new version with error handler
92
+ try {
93
+ await executeSteps(definition, payload, basePayload, vars, defaultProxy, defaultOptions);
94
+ } catch (e) {
95
+ await handleError(e, basePayload, defaultOptions);
96
+ }
97
+ } else {
98
+ // v1: old version without error handler, deprecated since v0.52.0
99
+ await executeSteps(definition, payload, basePayload, vars, defaultProxy, defaultOptions);
100
+ }
101
+
102
+ return vars.getValue(definition.result || 'result');
103
+ }
104
+
105
+ async function executeSteps(definition: DSLWorkflowSpec, payload: DSLWorkflowExecutionPayload, basePayload: BaseActivityPayload, vars: Vars, defaultProxy: ActivityInterfaceFor<UntypedActivities>, defaultOptions: ActivityOptions) {
71
106
  if (definition.steps) {
72
107
  for (const step of definition.steps) {
73
108
  const stepType = step.type;
@@ -89,7 +124,32 @@ export async function dslWorkflow(payload: DSLWorkflowExecutionPayload) {
89
124
  } else {
90
125
  throw new Error("No steps or activities found in the workflow definition");
91
126
  }
92
- return vars.getValue(definition.result || 'result');
127
+ }
128
+
129
/**
 * Runs the `handleDslError` activity after the workflow steps failed or were
 * cancelled, then rethrows the original error so the workflow still ends with it.
 *
 * @param originalError - the value caught around the step execution; may be any
 *   thrown value, not necessarily an `Error`.
 * @param basePayload - base activity payload used to build the handler payload.
 * @param defaultOptions - activity options used to create the `handleDslError` proxy.
 * @throws always rethrows `originalError`, even when the error handler itself fails.
 */
async function handleError(originalError: any, basePayload: BaseActivityPayload, defaultOptions: ActivityOptions) {
    const { handleDslError } = proxyActivities<typeof activities>(defaultOptions);

    // `throw` accepts any value (including null/undefined), so read the message defensively.
    const errorMessage: string = originalError?.message ?? String(originalError);

    const payload = dslActivityPayload(
        basePayload,
        {
            name: "handleDslError",
            params: { errorMessage },
        } as DSLActivitySpec,
        { errorMessage } satisfies HandleDslErrorParams,
    );

    try {
        if (isCancellation(originalError)) {
            log.warn(`Workflow execution cancelled, executing error handler to update document status`, { error: originalError });
            // Cleanup logic must be in a nonCancellable scope.
            // If we'd run cleanup outside of a nonCancellable scope it would've been cancelled
            // before being started because the Workflow's root scope is cancelled.
            // see https://docs.temporal.io/develop/typescript/cancellation
            await CancellationScope.nonCancellable(() => handleDslError(payload));
        } else {
            log.warn(`Workflow execution failed, executing error handler to update document status`, { error: originalError });
            // The activity must be awaited: otherwise the rethrow below ends the workflow
            // before the handler has completed, and its rejection would go unhandled.
            await handleDslError(payload);
        }
    } catch (handlerError) {
        // A failing handler must not mask the error that caused the workflow to stop.
        log.error(`Error handler failed`, { error: handlerError });
    }
    throw originalError;
}
94
154
 
95
155
  async function startChildWorkflow(step: DSLChildWorkflowStep, payload: DSLWorkflowExecutionPayload, vars: Vars, debug_mode?: boolean) {
@@ -101,14 +161,18 @@ async function startChildWorkflow(step: DSLChildWorkflowStep, payload: DSLWorkfl
101
161
  if (debug_mode) {
102
162
  log.debug(`Workflow vars before starting child workflow ${step.name}`, { vars: resolvedVars });
103
163
  }
104
- //@ts-ignore
105
164
  const handle = await startChild(step.name, {
106
165
  ...step.options,
107
166
  args: [{
108
167
  ...payload,
109
168
  workflow: step.spec,
110
169
  vars: resolvedVars
111
- }]
170
+ }],
171
+ searchAttributes: {
172
+ AccountId: [payload.account_id],
173
+ DocumentId: getDocumentIds(payload),
174
+ ProjectId: [payload.project_id],
175
+ },
112
176
  });
113
177
  if (step.output) {
114
178
  vars.setValue(step.output, handle.workflowId);
@@ -122,16 +186,20 @@ async function executeChildWorkflow(step: DSLChildWorkflowStep, payload: DSLWork
122
186
  Object.assign(resolvedVars, step.vars);
123
187
  }
124
188
  if (debug_mode) {
125
- log.debug(`Workflow vars before excuting child workflow ${step.name}`, { vars: resolvedVars });
189
+ log.debug(`Workflow vars before executing child workflow ${step.name}`, { vars: resolvedVars });
126
190
  }
127
- //@ts-ignore
128
191
  const result = await executeChild(step.name, {
129
192
  ...step.options,
130
193
  args: [{
131
194
  ...payload,
132
195
  workflow: step.spec,
133
196
  vars: resolvedVars,
134
- }]
197
+ }],
198
+ searchAttributes: {
199
+ AccountId: [payload.account_id],
200
+ DocumentId: getDocumentIds(payload),
201
+ ProjectId: [payload.project_id],
202
+ },
135
203
  });
136
204
 
137
205
  if (step.output) {
@@ -149,7 +217,7 @@ async function runActivity(activity: DSLActivitySpec, basePayload: BaseActivityP
149
217
  log.debug(`Workflow vars before executing activity ${activity.name}`, { vars: vars.resolve() });
150
218
  }
151
219
  if (activity.condition && !vars.match(activity.condition)) {
152
- log.info("Activity skiped: condition not satisfied", activity.condition);
220
+ log.info("Activity skipped: condition not satisfied", activity.condition);
153
221
  return;
154
222
  }
155
223
  const importParams = vars.createImportVars(activity.import);