@vertesia/workflow 0.50.1 → 0.52.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +7 -1
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -1
- package/lib/cjs/activities/chunkDocument.js +39 -34
- package/lib/cjs/activities/chunkDocument.js.map +1 -1
- package/lib/cjs/activities/createDocumentFromOther.js +2 -2
- package/lib/cjs/activities/createDocumentFromOther.js.map +1 -1
- package/lib/cjs/activities/executeInteraction.js +12 -7
- package/lib/cjs/activities/executeInteraction.js.map +1 -1
- package/lib/cjs/activities/extractDocumentText.js +25 -13
- package/lib/cjs/activities/extractDocumentText.js.map +1 -1
- package/lib/cjs/activities/generateDocumentProperties.js +22 -4
- package/lib/cjs/activities/generateDocumentProperties.js.map +1 -1
- package/lib/cjs/activities/generateEmbeddings.js +58 -102
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -1
- package/lib/cjs/activities/generateImageRendition.js +77 -34
- package/lib/cjs/activities/generateImageRendition.js.map +1 -1
- package/lib/cjs/activities/generateOrAssignContentType.js +3 -7
- package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -1
- package/lib/cjs/activities/notifyWebhook.js.map +1 -1
- package/lib/cjs/conversion/image.js +80 -12
- package/lib/cjs/conversion/image.js.map +1 -1
- package/lib/cjs/dsl/setup/ActivityContext.js +32 -8
- package/lib/cjs/dsl/setup/ActivityContext.js.map +1 -1
- package/lib/cjs/dsl.js +20 -0
- package/lib/cjs/dsl.js.map +1 -0
- package/lib/cjs/errors.js +13 -1
- package/lib/cjs/errors.js.map +1 -1
- package/lib/cjs/index.js +8 -2
- package/lib/cjs/index.js.map +1 -1
- package/lib/cjs/iterative-generation/activities/extractToc.js +2 -2
- package/lib/cjs/iterative-generation/activities/extractToc.js.map +1 -1
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js +1 -1
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js.map +1 -1
- package/lib/cjs/iterative-generation/activities/generatePart.js +1 -1
- package/lib/cjs/iterative-generation/activities/generatePart.js.map +1 -1
- package/lib/cjs/iterative-generation/activities/generateToc.js +1 -1
- package/lib/cjs/iterative-generation/activities/generateToc.js.map +1 -1
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +2 -1
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +1 -1
- package/lib/cjs/iterative-generation/utils.js +7 -4
- package/lib/cjs/iterative-generation/utils.js.map +1 -1
- package/lib/cjs/system/notifyWebhookWorkflow.js +2 -1
- package/lib/cjs/system/notifyWebhookWorkflow.js.map +1 -1
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js +1 -1
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js.map +1 -1
- package/lib/cjs/utils/blobs.js +13 -7
- package/lib/cjs/utils/blobs.js.map +1 -1
- package/lib/cjs/utils/chunks.js +14 -0
- package/lib/cjs/utils/chunks.js.map +1 -0
- package/lib/cjs/utils/client.js +6 -5
- package/lib/cjs/utils/client.js.map +1 -1
- package/lib/cjs/utils/memory.js +2 -9
- package/lib/cjs/utils/memory.js.map +1 -1
- package/lib/cjs/workflows.js +1 -3
- package/lib/cjs/workflows.js.map +1 -1
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +7 -1
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -1
- package/lib/esm/activities/chunkDocument.js +39 -34
- package/lib/esm/activities/chunkDocument.js.map +1 -1
- package/lib/esm/activities/createDocumentFromOther.js +1 -1
- package/lib/esm/activities/createDocumentFromOther.js.map +1 -1
- package/lib/esm/activities/executeInteraction.js +12 -7
- package/lib/esm/activities/executeInteraction.js.map +1 -1
- package/lib/esm/activities/extractDocumentText.js +25 -13
- package/lib/esm/activities/extractDocumentText.js.map +1 -1
- package/lib/esm/activities/generateDocumentProperties.js +22 -4
- package/lib/esm/activities/generateDocumentProperties.js.map +1 -1
- package/lib/esm/activities/generateEmbeddings.js +58 -69
- package/lib/esm/activities/generateEmbeddings.js.map +1 -1
- package/lib/esm/activities/generateImageRendition.js +78 -35
- package/lib/esm/activities/generateImageRendition.js.map +1 -1
- package/lib/esm/activities/generateOrAssignContentType.js +3 -7
- package/lib/esm/activities/generateOrAssignContentType.js.map +1 -1
- package/lib/esm/activities/notifyWebhook.js.map +1 -1
- package/lib/esm/conversion/image.js +80 -12
- package/lib/esm/conversion/image.js.map +1 -1
- package/lib/esm/dsl/setup/ActivityContext.js +34 -10
- package/lib/esm/dsl/setup/ActivityContext.js.map +1 -1
- package/lib/esm/dsl.js +4 -0
- package/lib/esm/dsl.js.map +1 -0
- package/lib/esm/errors.js +11 -0
- package/lib/esm/errors.js.map +1 -1
- package/lib/esm/index.js +8 -2
- package/lib/esm/index.js.map +1 -1
- package/lib/esm/iterative-generation/activities/extractToc.js +3 -3
- package/lib/esm/iterative-generation/activities/extractToc.js.map +1 -1
- package/lib/esm/iterative-generation/activities/finalizeOutput.js +2 -2
- package/lib/esm/iterative-generation/activities/finalizeOutput.js.map +1 -1
- package/lib/esm/iterative-generation/activities/generatePart.js +2 -2
- package/lib/esm/iterative-generation/activities/generatePart.js.map +1 -1
- package/lib/esm/iterative-generation/activities/generateToc.js +2 -2
- package/lib/esm/iterative-generation/activities/generateToc.js.map +1 -1
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +2 -1
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +1 -1
- package/lib/esm/iterative-generation/utils.js +7 -4
- package/lib/esm/iterative-generation/utils.js.map +1 -1
- package/lib/esm/system/notifyWebhookWorkflow.js +2 -1
- package/lib/esm/system/notifyWebhookWorkflow.js.map +1 -1
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js +2 -2
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js.map +1 -1
- package/lib/esm/utils/blobs.js +13 -7
- package/lib/esm/utils/blobs.js.map +1 -1
- package/lib/esm/utils/chunks.js +9 -0
- package/lib/esm/utils/chunks.js.map +1 -0
- package/lib/esm/utils/client.js +5 -4
- package/lib/esm/utils/client.js.map +1 -1
- package/lib/esm/utils/memory.js +2 -7
- package/lib/esm/utils/memory.js.map +1 -1
- package/lib/esm/workflows.js +0 -1
- package/lib/esm/workflows.js.map +1 -1
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts +10 -0
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts.map +1 -1
- package/lib/types/activities/chunkDocument.d.ts +15 -0
- package/lib/types/activities/chunkDocument.d.ts.map +1 -1
- package/lib/types/activities/createDocumentFromOther.d.ts.map +1 -1
- package/lib/types/activities/executeInteraction.d.ts +19 -4
- package/lib/types/activities/executeInteraction.d.ts.map +1 -1
- package/lib/types/activities/extractDocumentText.d.ts.map +1 -1
- package/lib/types/activities/generateDocumentProperties.d.ts +1 -1
- package/lib/types/activities/generateDocumentProperties.d.ts.map +1 -1
- package/lib/types/activities/generateEmbeddings.d.ts +21 -17
- package/lib/types/activities/generateEmbeddings.d.ts.map +1 -1
- package/lib/types/activities/generateImageRendition.d.ts +3 -5
- package/lib/types/activities/generateImageRendition.d.ts.map +1 -1
- package/lib/types/activities/generateOrAssignContentType.d.ts.map +1 -1
- package/lib/types/activities/notifyWebhook.d.ts +1 -2
- package/lib/types/activities/notifyWebhook.d.ts.map +1 -1
- package/lib/types/conversion/image.d.ts +8 -6
- package/lib/types/conversion/image.d.ts.map +1 -1
- package/lib/types/dsl/dslProxyActivities.d.ts +2 -2
- package/lib/types/dsl/dslProxyActivities.d.ts.map +1 -1
- package/lib/types/dsl/setup/ActivityContext.d.ts +3 -0
- package/lib/types/dsl/setup/ActivityContext.d.ts.map +1 -1
- package/lib/types/dsl.d.ts +4 -0
- package/lib/types/dsl.d.ts.map +1 -0
- package/lib/types/errors.d.ts +6 -0
- package/lib/types/errors.d.ts.map +1 -1
- package/lib/types/index.d.ts +8 -2
- package/lib/types/index.d.ts.map +1 -1
- package/lib/types/iterative-generation/activities/extractToc.d.ts.map +1 -1
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts.map +1 -1
- package/lib/types/iterative-generation/activities/generatePart.d.ts.map +1 -1
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts.map +1 -1
- package/lib/types/iterative-generation/utils.d.ts +2 -2
- package/lib/types/iterative-generation/utils.d.ts.map +1 -1
- package/lib/types/system/notifyWebhookWorkflow.d.ts.map +1 -1
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +2 -17
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +1 -1
- package/lib/types/utils/blobs.d.ts.map +1 -1
- package/lib/types/utils/chunks.d.ts +9 -0
- package/lib/types/utils/chunks.d.ts.map +1 -0
- package/lib/types/utils/client.d.ts +2 -2
- package/lib/types/utils/client.d.ts.map +1 -1
- package/lib/types/utils/memory.d.ts +1 -5
- package/lib/types/utils/memory.d.ts.map +1 -1
- package/lib/types/workflows.d.ts +0 -1
- package/lib/types/workflows.d.ts.map +1 -1
- package/lib/workflows-bundle.js +8311 -5790
- package/package.json +28 -10
- package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +20 -1
- package/src/activities/chunkDocument.ts +62 -42
- package/src/activities/createDocumentFromOther.ts +2 -2
- package/src/activities/executeInteraction.ts +33 -12
- package/src/activities/extractDocumentText.ts +30 -14
- package/src/activities/generateDocumentProperties.ts +37 -16
- package/src/activities/generateEmbeddings.ts +91 -79
- package/src/activities/generateImageRendition.ts +100 -53
- package/src/activities/generateOrAssignContentType.ts +5 -11
- package/src/activities/notifyWebhook.ts +2 -2
- package/src/conversion/image.test.ts +110 -18
- package/src/conversion/image.ts +90 -15
- package/src/conversion/pandoc.test.ts +7 -5
- package/src/dsl/dslProxyActivities.ts +2 -2
- package/src/dsl/setup/ActivityContext.ts +60 -19
- package/src/dsl.ts +3 -0
- package/src/errors.ts +27 -6
- package/src/index.ts +9 -2
- package/src/iterative-generation/activities/extractToc.ts +3 -3
- package/src/iterative-generation/activities/finalizeOutput.ts +3 -3
- package/src/iterative-generation/activities/generatePart.ts +3 -3
- package/src/iterative-generation/activities/generateToc.ts +2 -2
- package/src/iterative-generation/iterativeGenerationWorkflow.ts +2 -1
- package/src/iterative-generation/utils.ts +10 -6
- package/src/system/notifyWebhookWorkflow.ts +3 -2
- package/src/system/recalculateEmbeddingsWorkflow.ts +2 -2
- package/src/utils/blobs.ts +12 -7
- package/src/utils/chunks.ts +17 -0
- package/src/utils/client.ts +6 -5
- package/src/utils/memory.ts +3 -8
- package/src/workflows.ts +0 -2
- package/lib/cjs/conversion/pdf.js +0 -13
- package/lib/cjs/conversion/pdf.js.map +0 -1
- package/lib/cjs/system/generateObjectText.js +0 -76
- package/lib/cjs/system/generateObjectText.js.map +0 -1
- package/lib/esm/conversion/pdf.js +0 -7
- package/lib/esm/conversion/pdf.js.map +0 -1
- package/lib/esm/system/generateObjectText.js +0 -73
- package/lib/esm/system/generateObjectText.js.map +0 -1
- package/lib/types/conversion/pdf.d.ts +0 -2
- package/lib/types/conversion/pdf.d.ts.map +0 -1
- package/lib/types/system/generateObjectText.d.ts +0 -4
- package/lib/types/system/generateObjectText.d.ts.map +0 -1
- package/src/conversion/pdf.test.ts +0 -35
- package/src/conversion/pdf.ts +0 -8
- package/src/system/generateObjectText.ts +0 -95
@@ -1,8 +1,9 @@
|
|
1
1
|
|
2
2
|
import { log } from "@temporalio/workflow";
|
3
3
|
import { ContentEventName, WorkflowExecutionPayload } from "@vertesia/common";
|
4
|
-
import * as activities from "../activities/
|
4
|
+
import * as activities from "../activities/notifyWebhook.js";
|
5
5
|
import { dslProxyActivities } from "../dsl/dslProxyActivities.js";
|
6
|
+
import { WF_NON_RETRYABLE_ERRORS } from "../errors.js";
|
6
7
|
|
7
8
|
const {
|
8
9
|
notifyWebhook
|
@@ -13,7 +14,7 @@ const {
|
|
13
14
|
backoffCoefficient: 2,
|
14
15
|
maximumAttempts: 5,
|
15
16
|
maximumInterval: 100 * 30 * 1000, //ms
|
16
|
-
nonRetryableErrorTypes:
|
17
|
+
nonRetryableErrorTypes: WF_NON_RETRYABLE_ERRORS,
|
17
18
|
},
|
18
19
|
});
|
19
20
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
import { SupportedEmbeddingTypes, WorkflowExecutionPayload } from "@vertesia/common";
|
3
3
|
import * as activities from "../activities/index-dsl.js";
|
4
4
|
import { dslProxyActivities } from "../dsl/dslProxyActivities.js";
|
5
|
-
import {
|
5
|
+
import { WF_NON_RETRYABLE_ERRORS } from "../errors.js";
|
6
6
|
|
7
7
|
const {
|
8
8
|
generateEmbeddings,
|
@@ -13,7 +13,7 @@ const {
|
|
13
13
|
backoffCoefficient: 2,
|
14
14
|
maximumAttempts: 10,
|
15
15
|
maximumInterval: 100 * 30 * 1000, //ms
|
16
|
-
nonRetryableErrorTypes:
|
16
|
+
nonRetryableErrorTypes: WF_NON_RETRYABLE_ERRORS,
|
17
17
|
},
|
18
18
|
});
|
19
19
|
|
package/src/utils/blobs.ts
CHANGED
@@ -10,7 +10,12 @@ export async function fetchBlobAsStream(client: VertesiaClient, blobUri: string)
|
|
10
10
|
try {
|
11
11
|
return await client.files.downloadFile(blobUri);
|
12
12
|
} catch (err: any) {
|
13
|
-
|
13
|
+
if (err.message.includes("not found")) {
|
14
|
+
//TODO improve error handling with a fetch fail error class in the client
|
15
|
+
throw new NoDocumentFound(`Failed to download blob ${blobUri}: ${err.message}`, []);
|
16
|
+
} else {
|
17
|
+
throw new Error(`Failed to download blob ${blobUri}: ${err.message}`);
|
18
|
+
}
|
14
19
|
}
|
15
20
|
}
|
16
21
|
export async function fetchBlobAsBuffer(client: VertesiaClient, blobUri: string): Promise<Buffer> {
|
@@ -24,7 +29,7 @@ export async function fetchBlobAsBuffer(client: VertesiaClient, blobUri: string)
|
|
24
29
|
|
25
30
|
export async function fetchBlobAsBase64(client: VertesiaClient, blobUri: string): Promise<string> {
|
26
31
|
const buffer = await fetchBlobAsBuffer(client, blobUri);
|
27
|
-
return buffer.toString(
|
32
|
+
return buffer.toString("base64");
|
28
33
|
}
|
29
34
|
|
30
35
|
export async function saveBlobToFile(client: VertesiaClient, blobUri: string, toFile: string): Promise<void> {
|
@@ -35,9 +40,9 @@ export async function saveBlobToFile(client: VertesiaClient, blobUri: string, to
|
|
35
40
|
|
36
41
|
export async function saveBlobToTempFile(client: VertesiaClient, blobUri: string, fileExt?: string): Promise<string> {
|
37
42
|
const tmpFile = tmp.fileSync({
|
38
|
-
prefix: "
|
39
|
-
postfix: fileExt,
|
40
|
-
discardDescriptor: true
|
43
|
+
prefix: "vertesia-activity-",
|
44
|
+
postfix: fileExt ? "." + fileExt : "",
|
45
|
+
discardDescriptor: true,
|
41
46
|
});
|
42
47
|
await saveBlobToFile(client, blobUri, tmpFile.name);
|
43
48
|
return tmpFile.name;
|
@@ -47,12 +52,12 @@ async function writeChunksToStream(chunks: AsyncIterable<Uint8Array>, out: NodeJ
|
|
47
52
|
for await (const chunk of chunks) {
|
48
53
|
if (!out.write(chunk)) {
|
49
54
|
// If the internal buffer is full, wait until it's drained
|
50
|
-
await new Promise(resolve => out.once(
|
55
|
+
await new Promise((resolve) => out.once("drain", resolve));
|
51
56
|
}
|
52
57
|
}
|
53
58
|
out.end(); // Close the stream when done
|
54
59
|
}
|
55
60
|
|
56
61
|
export function md5(contents: string) {
|
57
|
-
return crypto.createHash(
|
62
|
+
return crypto.createHash("md5").update(contents).digest("hex");
|
58
63
|
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
export interface DocPart {
|
3
|
+
line_number_start: number
|
4
|
+
line_number_end: number
|
5
|
+
name: string
|
6
|
+
type: string
|
7
|
+
}
|
8
|
+
|
9
|
+
export const getContentPart = (content: string, part: DocPart): string => {
|
10
|
+
const lines = content.split('\n');
|
11
|
+
const text = lines.filter((_l, i) => i >= part.line_number_start && i <= part.line_number_end).join('\n');
|
12
|
+
return text;
|
13
|
+
}
|
14
|
+
|
15
|
+
export const getContentParts = (content: string, parts: DocPart[]): string[] => {
|
16
|
+
return parts.map(part => getContentPart(content, part));
|
17
|
+
}
|
package/src/utils/client.ts
CHANGED
@@ -3,21 +3,22 @@
|
|
3
3
|
*/
|
4
4
|
|
5
5
|
import { VertesiaClient } from "@vertesia/client";
|
6
|
-
import {
|
6
|
+
import { WorkflowExecutionBaseParams } from "@vertesia/common";
|
7
|
+
import { WorkflowParamNotFound } from "../errors.js";
|
7
8
|
|
8
9
|
|
9
|
-
export function
|
10
|
+
export function getVertesiaClient(payload: WorkflowExecutionBaseParams) {
|
10
11
|
|
11
12
|
if (!payload.auth_token) {
|
12
|
-
throw new
|
13
|
+
throw new WorkflowParamNotFound("Authentication Token is missing from WorkflowExecutionPayload.authToken");
|
13
14
|
}
|
14
15
|
|
15
16
|
if (!payload.config?.studio_url) {
|
16
|
-
throw new
|
17
|
+
throw new WorkflowParamNotFound("Content Store URL is missing from WorkflowExecutionPayload.servers.storeUrl");
|
17
18
|
}
|
18
19
|
|
19
20
|
if (!payload.config?.store_url) {
|
20
|
-
throw new
|
21
|
+
throw new WorkflowParamNotFound("Content Store URL is missing from WorkflowExecutionPayload.servers.storeUrl");
|
21
22
|
}
|
22
23
|
|
23
24
|
const client = new VertesiaClient({
|
package/src/utils/memory.ts
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
import { VertesiaClient
|
1
|
+
import { VertesiaClient } from "@vertesia/client";
|
2
|
+
import { NodeStreamSource } from "@vertesia/client/node";
|
2
3
|
import { Commands, MemoryPack, buildMemoryPack as _buildMemoryPack, loadMemoryPack as _loadMemoryPack } from "@vertesia/memory";
|
3
4
|
import { createReadStream, createWriteStream } from "fs";
|
4
5
|
import { rm } from "fs/promises";
|
5
|
-
import {
|
6
|
-
import { Readable } from "stream";
|
6
|
+
import { webStreamToReadable } from "node-web-stream-adapters";
|
7
7
|
import { pipeline } from "stream/promises";
|
8
8
|
|
9
9
|
import tmp from "tmp";
|
@@ -11,11 +11,6 @@ import zlib from "zlib";
|
|
11
11
|
|
12
12
|
tmp.setGracefulCleanup();
|
13
13
|
|
14
|
-
export class NodeStreamSource extends StreamSource {
|
15
|
-
constructor(stream: Readable, name: string, type?: string, id?: string) {
|
16
|
-
super(readableToWebStream(stream), name, type, id);
|
17
|
-
}
|
18
|
-
}
|
19
14
|
|
20
15
|
export async function publishMemoryPack(client: VertesiaClient, file: string, name: string): Promise<void> {
|
21
16
|
const stream = createReadStream(file);
|
package/src/workflows.ts
CHANGED
@@ -3,7 +3,5 @@
|
|
3
3
|
*/
|
4
4
|
export { dslWorkflow } from "./dsl/dsl-workflow.js";
|
5
5
|
export { iterativeGenerationWorkflow } from "./iterative-generation/iterativeGenerationWorkflow.js";
|
6
|
-
export { generateObjectText } from "./system/generateObjectText.js";
|
7
6
|
export { notifyWebhookWorkflow } from "./system/notifyWebhookWorkflow.js";
|
8
7
|
export { recalculateEmbeddingsWorkflow } from "./system/recalculateEmbeddingsWorkflow.js";
|
9
|
-
|
@@ -1,13 +0,0 @@
|
|
1
|
-
"use strict";
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
4
|
-
};
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
6
|
-
exports.trasformPdfToMarkdown = trasformPdfToMarkdown;
|
7
|
-
const pdf2md_1 = __importDefault(require("@opendocsg/pdf2md"));
|
8
|
-
const pdf2mdFn = pdf2md_1.default;
|
9
|
-
function trasformPdfToMarkdown(buffer) {
|
10
|
-
const arr = new Uint8Array(buffer);
|
11
|
-
return pdf2mdFn(arr);
|
12
|
-
}
|
13
|
-
//# sourceMappingURL=pdf.js.map
|
@@ -1 +0,0 @@
|
|
1
|
-
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../../src/conversion/pdf.ts"],"names":[],"mappings":";;;;;AAIA,sDAGC;AAPD,+DAAuC;AAEvC,MAAM,QAAQ,GAAG,gBAA4D,CAAC;AAE9E,SAAgB,qBAAqB,CAAC,MAAc;IAChD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACnC,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC"}
|
@@ -1,76 +0,0 @@
|
|
1
|
-
"use strict";
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.generateObjectText = generateObjectText;
|
4
|
-
const workflow_1 = require("@temporalio/workflow");
|
5
|
-
const dslProxyActivities_js_1 = require("../dsl/dslProxyActivities.js");
|
6
|
-
const errors_js_1 = require("../errors.js");
|
7
|
-
const { getObjectFromStore, extractDocumentText } = (0, dslProxyActivities_js_1.dslProxyActivities)("generateTextWorkflow", {
|
8
|
-
startToCloseTimeout: "5 minute",
|
9
|
-
retry: {
|
10
|
-
initialInterval: '5s',
|
11
|
-
backoffCoefficient: 2,
|
12
|
-
maximumAttempts: 5,
|
13
|
-
maximumInterval: 100 * 30 * 1000, //ms
|
14
|
-
nonRetryableErrorTypes: [],
|
15
|
-
},
|
16
|
-
});
|
17
|
-
const { transcribeMedia, convertPdfToStructuredText } = (0, dslProxyActivities_js_1.dslProxyActivities)("generateTextWorkflow", {
|
18
|
-
startToCloseTimeout: "30 minute",
|
19
|
-
retry: {
|
20
|
-
initialInterval: '30s',
|
21
|
-
backoffCoefficient: 2,
|
22
|
-
maximumAttempts: 5,
|
23
|
-
maximumInterval: 100 * 30 * 1000, //ms
|
24
|
-
nonRetryableErrorTypes: [],
|
25
|
-
},
|
26
|
-
});
|
27
|
-
async function generateObjectText(payload) {
|
28
|
-
const { objectIds } = payload;
|
29
|
-
const objectId = objectIds[0];
|
30
|
-
const object = await getObjectFromStore(payload, {});
|
31
|
-
if (!object.content?.source) {
|
32
|
-
throw new errors_js_1.NoDocumentFound(`No source or mimetype found for object ${objectId}`, objectIds);
|
33
|
-
}
|
34
|
-
const mimetype = object.content.type;
|
35
|
-
if (!mimetype) {
|
36
|
-
throw new errors_js_1.NoDocumentFound(`No mimetype found for object ${objectId}`, objectIds);
|
37
|
-
}
|
38
|
-
const converter = ConverterActivity.find(({ type }) => type.test(mimetype));
|
39
|
-
if (!converter) {
|
40
|
-
throw new errors_js_1.NoDocumentFound(`No converter found for mimetype ${mimetype}`, objectIds);
|
41
|
-
}
|
42
|
-
workflow_1.log.info(`Converting file type ${mimetype} to text with ${converter.name}`);
|
43
|
-
const res = await converter.activity(payload)(payload, converter.params);
|
44
|
-
workflow_1.log.info("Generated text for object", { res, objectId });
|
45
|
-
return res;
|
46
|
-
}
|
47
|
-
const ConverterActivity = [
|
48
|
-
{
|
49
|
-
type: /application\/pdf/,
|
50
|
-
activity: (payload) => {
|
51
|
-
const useTextractForPDF = payload.vars?.useTextractForPdf ?? false;
|
52
|
-
return useTextractForPDF ? convertPdfToStructuredText : extractDocumentText;
|
53
|
-
},
|
54
|
-
name: "ConvertPdfToStructuredText",
|
55
|
-
params: {},
|
56
|
-
},
|
57
|
-
{
|
58
|
-
type: /audio\/.+/,
|
59
|
-
activity: () => transcribeMedia,
|
60
|
-
name: "TranscribeMedia",
|
61
|
-
params: {},
|
62
|
-
},
|
63
|
-
{
|
64
|
-
type: /video\/.+/,
|
65
|
-
activity: () => transcribeMedia,
|
66
|
-
name: "TranscribeMedia",
|
67
|
-
params: {},
|
68
|
-
},
|
69
|
-
{
|
70
|
-
type: /.+/,
|
71
|
-
activity: () => extractDocumentText,
|
72
|
-
name: "extractText",
|
73
|
-
params: {},
|
74
|
-
}
|
75
|
-
];
|
76
|
-
//# sourceMappingURL=generateObjectText.js.map
|
@@ -1 +0,0 @@
|
|
1
|
-
{"version":3,"file":"generateObjectText.js","sourceRoot":"","sources":["../../../src/system/generateObjectText.ts"],"names":[],"mappings":";;AAqCA,gDA0BC;AA9DD,mDAA2C;AAG3C,wEAAkE;AAClE,4CAA+C;AAG/C,MAAM,EACF,kBAAkB,EAClB,mBAAmB,EACtB,GAAG,IAAA,0CAAkB,EAAoB,sBAAsB,EAAE;IAC9D,mBAAmB,EAAE,UAAU;IAC/B,KAAK,EAAE;QACH,eAAe,EAAE,IAAI;QACrB,kBAAkB,EAAE,CAAC;QACrB,eAAe,EAAE,CAAC;QAClB,eAAe,EAAE,GAAG,GAAG,EAAE,GAAG,IAAI,EAAE,IAAI;QACtC,sBAAsB,EAAE,EAAE;KAC7B;CACJ,CAAC,CAAC;AAEH,MAAM,EACF,eAAe,EACf,0BAA0B,EAC7B,GAAG,IAAA,0CAAkB,EAAoB,sBAAsB,EAAE;IAC9D,mBAAmB,EAAE,WAAW;IAChC,KAAK,EAAE;QACH,eAAe,EAAE,KAAK;QACtB,kBAAkB,EAAE,CAAC;QACrB,eAAe,EAAE,CAAC;QAClB,eAAe,EAAE,GAAG,GAAG,EAAE,GAAG,IAAI,EAAE,IAAI;QACtC,sBAAsB,EAAE,EAAE;KAC7B;CACJ,CAAC,CAAC;AAGI,KAAK,UAAU,kBAAkB,CAAC,OAAiC;IAEtE,MAAM,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;IAC9B,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;IAE9B,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAErD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,2BAAe,CAAC,0CAA0C,QAAQ,EAAE,EAAE,SAAS,CAAC,CAAC;IAC/F,CAAC;IACD,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;IACrC,IAAI,CAAC,QAAQ,EAAE,CAAC;QACZ,MAAM,IAAI,2BAAe,CAAC,gCAAgC,QAAQ,EAAE,EAAE,SAAS,CAAC,CAAC;IACrF,CAAC;IAED,MAAM,SAAS,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC5E,IAAI,CAAC,SAAS,EAAE,CAAC;QACb,MAAM,IAAI,2BAAe,CAAC,mCAAmC,QAAQ,EAAE,EAAE,SAAS,CAAC,CAAC;IACxF,CAAC;IACD,cAAG,CAAC,IAAI,CAAC,wBAAwB,QAAQ,iBAAiB,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC;IAE5E,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;IAEzE,cAAG,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,GAAG,EAAE,QAAQ,EAAE,CAAC,CAAC;IACzD,OAAO,GAAG,CAAC;AAEf,CAAC;AAGD,MAAM,iBAAiB,GAAG;IACtB;QACI,IAAI,EAAE,kBAAkB;QACxB,QAAQ,EAAE,CAAC,OAAiC,EAAE,EAAE;YAC5C,MAAM,iBAAiB,GAAG,OAAO,CAAC,IAAI,EAAE,iBAAiB,IAAI,KAAK,CAAC;YACnE,OAAO,iBAAiB,CAAC,CAAC,CAAC,0BAA0B,CAAC,CAAC,CAAC,mBAAmB,CAAC;QAChF,CAAC;QACD,IAAI,EAAE,4BAA4B;QAClC,MAAM,EAAE,EAAE;KACb;IACD;QACI,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,GAAG,EAAE,CAAC,eAAe;QAC/B,IAAI,EAAE,iBAAiB;QACvB,MAAM,EAAE,EAAE;KACb;IACD;QACI,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,GAAG,EAAE,CAAC,eAAe;QAC/B,IAAI,EAAE,iBAAiB;QACvB,MAAM,EAAE,EAAE;KACb;IACD;QACI,IAAI,EAAE,IAAI;QACV,QAAQ,EAAE,GAAG,EAAE,CAAC,mBAAmB;QACnC,IAAI,EAAE,aAAa;QACnB,MAAM,EAAE,EAAE;KACb;CACJ,CAAA"}
|
@@ -1 +0,0 @@
|
|
1
|
-
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../../src/conversion/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,mBAAmB,CAAC;AAEvC,MAAM,QAAQ,GAAG,MAA4D,CAAC;AAE9E,MAAM,UAAU,qBAAqB,CAAC,MAAc;IAChD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACnC,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC"}
|
@@ -1,73 +0,0 @@
|
|
1
|
-
import { log } from "@temporalio/workflow";
|
2
|
-
import { dslProxyActivities } from "../dsl/dslProxyActivities.js";
|
3
|
-
import { NoDocumentFound } from "../errors.js";
|
4
|
-
const { getObjectFromStore, extractDocumentText } = dslProxyActivities("generateTextWorkflow", {
|
5
|
-
startToCloseTimeout: "5 minute",
|
6
|
-
retry: {
|
7
|
-
initialInterval: '5s',
|
8
|
-
backoffCoefficient: 2,
|
9
|
-
maximumAttempts: 5,
|
10
|
-
maximumInterval: 100 * 30 * 1000, //ms
|
11
|
-
nonRetryableErrorTypes: [],
|
12
|
-
},
|
13
|
-
});
|
14
|
-
const { transcribeMedia, convertPdfToStructuredText } = dslProxyActivities("generateTextWorkflow", {
|
15
|
-
startToCloseTimeout: "30 minute",
|
16
|
-
retry: {
|
17
|
-
initialInterval: '30s',
|
18
|
-
backoffCoefficient: 2,
|
19
|
-
maximumAttempts: 5,
|
20
|
-
maximumInterval: 100 * 30 * 1000, //ms
|
21
|
-
nonRetryableErrorTypes: [],
|
22
|
-
},
|
23
|
-
});
|
24
|
-
export async function generateObjectText(payload) {
|
25
|
-
const { objectIds } = payload;
|
26
|
-
const objectId = objectIds[0];
|
27
|
-
const object = await getObjectFromStore(payload, {});
|
28
|
-
if (!object.content?.source) {
|
29
|
-
throw new NoDocumentFound(`No source or mimetype found for object ${objectId}`, objectIds);
|
30
|
-
}
|
31
|
-
const mimetype = object.content.type;
|
32
|
-
if (!mimetype) {
|
33
|
-
throw new NoDocumentFound(`No mimetype found for object ${objectId}`, objectIds);
|
34
|
-
}
|
35
|
-
const converter = ConverterActivity.find(({ type }) => type.test(mimetype));
|
36
|
-
if (!converter) {
|
37
|
-
throw new NoDocumentFound(`No converter found for mimetype ${mimetype}`, objectIds);
|
38
|
-
}
|
39
|
-
log.info(`Converting file type ${mimetype} to text with ${converter.name}`);
|
40
|
-
const res = await converter.activity(payload)(payload, converter.params);
|
41
|
-
log.info("Generated text for object", { res, objectId });
|
42
|
-
return res;
|
43
|
-
}
|
44
|
-
const ConverterActivity = [
|
45
|
-
{
|
46
|
-
type: /application\/pdf/,
|
47
|
-
activity: (payload) => {
|
48
|
-
const useTextractForPDF = payload.vars?.useTextractForPdf ?? false;
|
49
|
-
return useTextractForPDF ? convertPdfToStructuredText : extractDocumentText;
|
50
|
-
},
|
51
|
-
name: "ConvertPdfToStructuredText",
|
52
|
-
params: {},
|
53
|
-
},
|
54
|
-
{
|
55
|
-
type: /audio\/.+/,
|
56
|
-
activity: () => transcribeMedia,
|
57
|
-
name: "TranscribeMedia",
|
58
|
-
params: {},
|
59
|
-
},
|
60
|
-
{
|
61
|
-
type: /video\/.+/,
|
62
|
-
activity: () => transcribeMedia,
|
63
|
-
name: "TranscribeMedia",
|
64
|
-
params: {},
|
65
|
-
},
|
66
|
-
{
|
67
|
-
type: /.+/,
|
68
|
-
activity: () => extractDocumentText,
|
69
|
-
name: "extractText",
|
70
|
-
params: {},
|
71
|
-
}
|
72
|
-
];
|
73
|
-
//# sourceMappingURL=generateObjectText.js.map
|
@@ -1 +0,0 @@
|
|
1
|
-
{"version":3,"file":"generateObjectText.js","sourceRoot":"","sources":["../../../src/system/generateObjectText.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,GAAG,EAAE,MAAM,sBAAsB,CAAC;AAG3C,OAAO,EAAE,kBAAkB,EAAE,MAAM,8BAA8B,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAG/C,MAAM,EACF,kBAAkB,EAClB,mBAAmB,EACtB,GAAG,kBAAkB,CAAoB,sBAAsB,EAAE;IAC9D,mBAAmB,EAAE,UAAU;IAC/B,KAAK,EAAE;QACH,eAAe,EAAE,IAAI;QACrB,kBAAkB,EAAE,CAAC;QACrB,eAAe,EAAE,CAAC;QAClB,eAAe,EAAE,GAAG,GAAG,EAAE,GAAG,IAAI,EAAE,IAAI;QACtC,sBAAsB,EAAE,EAAE;KAC7B;CACJ,CAAC,CAAC;AAEH,MAAM,EACF,eAAe,EACf,0BAA0B,EAC7B,GAAG,kBAAkB,CAAoB,sBAAsB,EAAE;IAC9D,mBAAmB,EAAE,WAAW;IAChC,KAAK,EAAE;QACH,eAAe,EAAE,KAAK;QACtB,kBAAkB,EAAE,CAAC;QACrB,eAAe,EAAE,CAAC;QAClB,eAAe,EAAE,GAAG,GAAG,EAAE,GAAG,IAAI,EAAE,IAAI;QACtC,sBAAsB,EAAE,EAAE;KAC7B;CACJ,CAAC,CAAC;AAGH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,OAAiC;IAEtE,MAAM,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;IAC9B,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;IAE9B,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAErD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,eAAe,CAAC,0CAA0C,QAAQ,EAAE,EAAE,SAAS,CAAC,CAAC;IAC/F,CAAC;IACD,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;IACrC,IAAI,CAAC,QAAQ,EAAE,CAAC;QACZ,MAAM,IAAI,eAAe,CAAC,gCAAgC,QAAQ,EAAE,EAAE,SAAS,CAAC,CAAC;IACrF,CAAC;IAED,MAAM,SAAS,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC5E,IAAI,CAAC,SAAS,EAAE,CAAC;QACb,MAAM,IAAI,eAAe,CAAC,mCAAmC,QAAQ,EAAE,EAAE,SAAS,CAAC,CAAC;IACxF,CAAC;IACD,GAAG,CAAC,IAAI,CAAC,wBAAwB,QAAQ,iBAAiB,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC;IAE5E,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;IAEzE,GAAG,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,GAAG,EAAE,QAAQ,EAAE,CAAC,CAAC;IACzD,OAAO,GAAG,CAAC;AAEf,CAAC;AAGD,MAAM,iBAAiB,GAAG;IACtB;QACI,IAAI,EAAE,kBAAkB;QACxB,QAAQ,EAAE,CAAC,OAAiC,EAAE,EAAE;YAC5C,MAAM,iBAAiB,GAAG,OAAO,CAAC,IAAI,EAAE,iBAAiB,IAAI,KAAK,CAAC;YACnE,OAAO,iBAAiB,CAAC,CAAC,CAAC,0BAA0B,CAAC,CAAC,CAAC,mBAAmB,CAAC;QAChF,CAAC;QACD,IAAI,EAAE,4BAA4B;QAClC,MAAM,EAAE,EAAE;KACb;IACD;QACI,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,GAAG,EAAE,CAAC,eAAe;QAC/B,IAAI,EAAE,iBAAiB;QACvB,MAAM,EAAE,EAAE;KACb;IACD;QACI,IAAI,EAAE,WAAW;QACjB,QAAQ,EAAE,GAAG,EAAE,CAAC,eAAe;QAC/B,IAAI,EAAE,iBAAiB;QACvB,MAAM,EAAE,EAAE;KACb;IACD;QACI,IAAI,EAAE,IAAI;QACV,QAAQ,EAAE,GAAG,EAAE,CAAC,mBAAmB;QACnC,IAAI,EAAE,aAAa;QACnB,MAAM,EAAE,EAAE;KACb;CACJ,CAAA"}
|
@@ -1 +0,0 @@
|
|
1
|
-
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../../src/conversion/pdf.ts"],"names":[],"mappings":"AAIA,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,mBAGnD"}
|
@@ -1,4 +0,0 @@
|
|
1
|
-
import { WorkflowExecutionPayload } from "@vertesia/common";
|
2
|
-
import { TextExtractionResult } from "../index.js";
|
3
|
-
export declare function generateObjectText(payload: WorkflowExecutionPayload): Promise<TextExtractionResult>;
|
4
|
-
//# sourceMappingURL=generateObjectText.d.ts.map
|
@@ -1 +0,0 @@
|
|
1
|
-
{"version":3,"file":"generateObjectText.d.ts","sourceRoot":"","sources":["../../../src/system/generateObjectText.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,wBAAwB,EAAE,MAAM,kBAAkB,CAAC;AAI5D,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AA+BnD,wBAAsB,kBAAkB,CAAC,OAAO,EAAE,wBAAwB,GAAG,OAAO,CAAC,oBAAoB,CAAC,CA0BzG"}
|
@@ -1,35 +0,0 @@
|
|
1
|
-
import { MockActivityEnvironment, TestWorkflowEnvironment } from '@temporalio/testing';
|
2
|
-
import fs from 'fs';
|
3
|
-
import path from 'path';
|
4
|
-
import { beforeAll, expect, test } from 'vitest';
|
5
|
-
import { trasformPdfToMarkdown } from '../conversion/pdf';
|
6
|
-
|
7
|
-
|
8
|
-
let testEnv: TestWorkflowEnvironment;
|
9
|
-
let activityContext: MockActivityEnvironment;
|
10
|
-
|
11
|
-
beforeAll(async () => {
|
12
|
-
//testEnv = await TestWorkflowEnvironment.createLocal();
|
13
|
-
activityContext = new MockActivityEnvironment();
|
14
|
-
});
|
15
|
-
|
16
|
-
|
17
|
-
test('Converts a PDF to markdown', async () => {
|
18
|
-
const pdfPath = path.resolve(__dirname, '../../fixtures', 'test-pdf1.pdf');
|
19
|
-
const pdfBuffer = fs.readFileSync(pdfPath);
|
20
|
-
const result = await activityContext.run(trasformPdfToMarkdown, pdfBuffer);
|
21
|
-
|
22
|
-
expect(result).toContain('America');
|
23
|
-
});
|
24
|
-
|
25
|
-
// https://github.com/becomposable/studio/issues/432 Skip tests
|
26
|
-
// The file "test-pdf2.pdf" could not be opened. It may be damanged or use a file format that
|
27
|
-
// Preview doesn't recognize.
|
28
|
-
test.skip('Converts another PDF to markdown', async () => {
|
29
|
-
const pdfPath = path.resolve(__dirname, '../../fixtures', 'test-pdf2.pdf');
|
30
|
-
const pdfBuffer = fs.readFileSync(pdfPath);
|
31
|
-
const result = await activityContext.run(trasformPdfToMarkdown, pdfBuffer);
|
32
|
-
|
33
|
-
expect(result).toContain('America');
|
34
|
-
|
35
|
-
});
|
package/src/conversion/pdf.ts
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
|
2
|
-
import { log } from "@temporalio/workflow";
|
3
|
-
import { WorkflowExecutionPayload } from "@vertesia/common";
|
4
|
-
import * as activities from "../activities/index-dsl.js";
|
5
|
-
import { dslProxyActivities } from "../dsl/dslProxyActivities.js";
|
6
|
-
import { NoDocumentFound } from "../errors.js";
|
7
|
-
import { TextExtractionResult } from "../index.js";
|
8
|
-
|
9
|
-
const {
|
10
|
-
getObjectFromStore,
|
11
|
-
extractDocumentText
|
12
|
-
} = dslProxyActivities<typeof activities>("generateTextWorkflow", {
|
13
|
-
startToCloseTimeout: "5 minute",
|
14
|
-
retry: {
|
15
|
-
initialInterval: '5s',
|
16
|
-
backoffCoefficient: 2,
|
17
|
-
maximumAttempts: 5,
|
18
|
-
maximumInterval: 100 * 30 * 1000, //ms
|
19
|
-
nonRetryableErrorTypes: [],
|
20
|
-
},
|
21
|
-
});
|
22
|
-
|
23
|
-
const {
|
24
|
-
transcribeMedia,
|
25
|
-
convertPdfToStructuredText
|
26
|
-
} = dslProxyActivities<typeof activities>("generateTextWorkflow", {
|
27
|
-
startToCloseTimeout: "30 minute",
|
28
|
-
retry: {
|
29
|
-
initialInterval: '30s',
|
30
|
-
backoffCoefficient: 2,
|
31
|
-
maximumAttempts: 5,
|
32
|
-
maximumInterval: 100 * 30 * 1000, //ms
|
33
|
-
nonRetryableErrorTypes: [],
|
34
|
-
},
|
35
|
-
});
|
36
|
-
|
37
|
-
|
38
|
-
export async function generateObjectText(payload: WorkflowExecutionPayload): Promise<TextExtractionResult> {
|
39
|
-
|
40
|
-
const { objectIds } = payload;
|
41
|
-
const objectId = objectIds[0];
|
42
|
-
|
43
|
-
const object = await getObjectFromStore(payload, {});
|
44
|
-
|
45
|
-
if (!object.content?.source) {
|
46
|
-
throw new NoDocumentFound(`No source or mimetype found for object ${objectId}`, objectIds);
|
47
|
-
}
|
48
|
-
const mimetype = object.content.type;
|
49
|
-
if (!mimetype) {
|
50
|
-
throw new NoDocumentFound(`No mimetype found for object ${objectId}`, objectIds);
|
51
|
-
}
|
52
|
-
|
53
|
-
const converter = ConverterActivity.find(({ type }) => type.test(mimetype));
|
54
|
-
if (!converter) {
|
55
|
-
throw new NoDocumentFound(`No converter found for mimetype ${mimetype}`, objectIds);
|
56
|
-
}
|
57
|
-
log.info(`Converting file type ${mimetype} to text with ${converter.name}`);
|
58
|
-
|
59
|
-
const res = await converter.activity(payload)(payload, converter.params);
|
60
|
-
|
61
|
-
log.info("Generated text for object", { res, objectId });
|
62
|
-
return res;
|
63
|
-
|
64
|
-
}
|
65
|
-
|
66
|
-
|
67
|
-
const ConverterActivity = [
|
68
|
-
{
|
69
|
-
type: /application\/pdf/,
|
70
|
-
activity: (payload: WorkflowExecutionPayload) => {
|
71
|
-
const useTextractForPDF = payload.vars?.useTextractForPdf ?? false;
|
72
|
-
return useTextractForPDF ? convertPdfToStructuredText : extractDocumentText;
|
73
|
-
},
|
74
|
-
name: "ConvertPdfToStructuredText",
|
75
|
-
params: {},
|
76
|
-
},
|
77
|
-
{
|
78
|
-
type: /audio\/.+/,
|
79
|
-
activity: () => transcribeMedia,
|
80
|
-
name: "TranscribeMedia",
|
81
|
-
params: {},
|
82
|
-
},
|
83
|
-
{
|
84
|
-
type: /video\/.+/,
|
85
|
-
activity: () => transcribeMedia,
|
86
|
-
name: "TranscribeMedia",
|
87
|
-
params: {},
|
88
|
-
},
|
89
|
-
{
|
90
|
-
type: /.+/,
|
91
|
-
activity: () => extractDocumentText,
|
92
|
-
name: "extractText",
|
93
|
-
params: {},
|
94
|
-
}
|
95
|
-
]
|