@vertesia/workflow 0.51.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -6
- package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +20 -1
- package/src/activities/chunkDocument.ts +62 -42
- package/src/activities/createDocumentFromOther.ts +2 -2
- package/src/activities/executeInteraction.ts +92 -47
- package/src/activities/extractDocumentText.ts +91 -54
- package/src/activities/generateDocumentProperties.ts +37 -16
- package/src/activities/generateEmbeddings.ts +91 -79
- package/src/activities/generateImageRendition.ts +127 -59
- package/src/activities/generateOrAssignContentType.ts +52 -32
- package/src/activities/getObjectFromStore.ts +1 -1
- package/src/activities/handleError.ts +25 -0
- package/src/activities/index-dsl.ts +1 -0
- package/src/activities/index.ts +0 -1
- package/src/activities/media/processPdfWithTextract.ts +4 -4
- package/src/activities/media/transcribeMediaWithGladia.ts +1 -1
- package/src/activities/notifyWebhook.ts +2 -2
- package/src/activities/setDocumentStatus.ts +1 -1
- package/src/conversion/TextractProcessor.ts +9 -9
- package/src/conversion/image.test.ts +110 -18
- package/src/conversion/image.ts +96 -15
- package/src/conversion/markitdown.ts +41 -0
- package/src/conversion/mutool.ts +1 -1
- package/src/conversion/pandoc.test.ts +8 -6
- package/src/conversion/pandoc.ts +38 -42
- package/src/dsl/dsl-workflow.ts +80 -12
- package/src/dsl/setup/ActivityContext.ts +57 -16
- package/src/dsl/validation.test.ts +2 -2
- package/src/dsl/vars.test.ts +1 -1
- package/src/dsl/vars.ts +6 -6
- package/src/dsl/workflow-exec-child.test.ts +14 -4
- package/src/dsl/workflow-fetch.test.ts +1 -1
- package/src/dsl/workflow-import.test.ts +1 -1
- package/src/dsl/workflow.test.ts +12 -2
- package/src/dsl.ts +1 -1
- package/src/errors.ts +27 -6
- package/src/index.ts +1 -1
- package/src/iterative-generation/activities/extractToc.ts +1 -1
- package/src/iterative-generation/activities/generatePart.ts +2 -2
- package/src/iterative-generation/activities/generateToc.ts +1 -1
- package/src/iterative-generation/iterativeGenerationWorkflow.ts +3 -2
- package/src/iterative-generation/types.ts +4 -4
- package/src/iterative-generation/utils.ts +4 -4
- package/src/system/notifyWebhookWorkflow.ts +2 -1
- package/src/system/recalculateEmbeddingsWorkflow.ts +2 -2
- package/src/utils/blobs.ts +11 -6
- package/src/utils/chunks.ts +17 -0
- package/src/utils/client.ts +4 -3
- package/src/utils/memory.ts +3 -8
- package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js +0 -32
- package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js.map +0 -1
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +0 -66
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +0 -1
- package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js +0 -18
- package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js.map +0 -1
- package/lib/cjs/activities/chunkDocument.js +0 -79
- package/lib/cjs/activities/chunkDocument.js.map +0 -1
- package/lib/cjs/activities/createDocumentFromOther.js +0 -64
- package/lib/cjs/activities/createDocumentFromOther.js.map +0 -1
- package/lib/cjs/activities/executeInteraction.js +0 -134
- package/lib/cjs/activities/executeInteraction.js.map +0 -1
- package/lib/cjs/activities/extractDocumentText.js +0 -135
- package/lib/cjs/activities/extractDocumentText.js.map +0 -1
- package/lib/cjs/activities/generateDocumentProperties.js +0 -59
- package/lib/cjs/activities/generateDocumentProperties.js.map +0 -1
- package/lib/cjs/activities/generateEmbeddings.js +0 -292
- package/lib/cjs/activities/generateEmbeddings.js.map +0 -1
- package/lib/cjs/activities/generateImageRendition.js +0 -104
- package/lib/cjs/activities/generateImageRendition.js.map +0 -1
- package/lib/cjs/activities/generateOrAssignContentType.js +0 -103
- package/lib/cjs/activities/generateOrAssignContentType.js.map +0 -1
- package/lib/cjs/activities/getObjectFromStore.js +0 -20
- package/lib/cjs/activities/getObjectFromStore.js.map +0 -1
- package/lib/cjs/activities/index-dsl.js +0 -37
- package/lib/cjs/activities/index-dsl.js.map +0 -1
- package/lib/cjs/activities/index.js +0 -22
- package/lib/cjs/activities/index.js.map +0 -1
- package/lib/cjs/activities/media/processPdfWithTextract.js +0 -102
- package/lib/cjs/activities/media/processPdfWithTextract.js.map +0 -1
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js +0 -51
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +0 -1
- package/lib/cjs/activities/notifyWebhook.js +0 -34
- package/lib/cjs/activities/notifyWebhook.js.map +0 -1
- package/lib/cjs/activities/setDocumentStatus.js +0 -15
- package/lib/cjs/activities/setDocumentStatus.js.map +0 -1
- package/lib/cjs/conversion/TextractProcessor.js +0 -417
- package/lib/cjs/conversion/TextractProcessor.js.map +0 -1
- package/lib/cjs/conversion/image.js +0 -22
- package/lib/cjs/conversion/image.js.map +0 -1
- package/lib/cjs/conversion/mutool.js +0 -147
- package/lib/cjs/conversion/mutool.js.map +0 -1
- package/lib/cjs/conversion/pandoc.js +0 -39
- package/lib/cjs/conversion/pandoc.js.map +0 -1
- package/lib/cjs/dsl/conditions.js +0 -81
- package/lib/cjs/dsl/conditions.js.map +0 -1
- package/lib/cjs/dsl/dsl-workflow.js +0 -223
- package/lib/cjs/dsl/dsl-workflow.js.map +0 -1
- package/lib/cjs/dsl/dslProxyActivities.js +0 -23
- package/lib/cjs/dsl/dslProxyActivities.js.map +0 -1
- package/lib/cjs/dsl/projections.js +0 -59
- package/lib/cjs/dsl/projections.js.map +0 -1
- package/lib/cjs/dsl/setup/ActivityContext.js +0 -96
- package/lib/cjs/dsl/setup/ActivityContext.js.map +0 -1
- package/lib/cjs/dsl/setup/fetch/DataProvider.js +0 -51
- package/lib/cjs/dsl/setup/fetch/DataProvider.js.map +0 -1
- package/lib/cjs/dsl/setup/fetch/index.js +0 -16
- package/lib/cjs/dsl/setup/fetch/index.js.map +0 -1
- package/lib/cjs/dsl/setup/fetch/providers.js +0 -67
- package/lib/cjs/dsl/setup/fetch/providers.js.map +0 -1
- package/lib/cjs/dsl/test/test-child-workflow.js +0 -10
- package/lib/cjs/dsl/test/test-child-workflow.js.map +0 -1
- package/lib/cjs/dsl/validation.js +0 -122
- package/lib/cjs/dsl/validation.js.map +0 -1
- package/lib/cjs/dsl/vars.js +0 -341
- package/lib/cjs/dsl/vars.js.map +0 -1
- package/lib/cjs/dsl/walk.js +0 -100
- package/lib/cjs/dsl/walk.js.map +0 -1
- package/lib/cjs/dsl.js +0 -20
- package/lib/cjs/dsl.js.map +0 -1
- package/lib/cjs/errors.js +0 -36
- package/lib/cjs/errors.js.map +0 -1
- package/lib/cjs/index.js +0 -50
- package/lib/cjs/index.js.map +0 -1
- package/lib/cjs/iterative-generation/activities/extractToc.js +0 -47
- package/lib/cjs/iterative-generation/activities/extractToc.js.map +0 -1
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js +0 -69
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js.map +0 -1
- package/lib/cjs/iterative-generation/activities/generatePart.js +0 -73
- package/lib/cjs/iterative-generation/activities/generatePart.js.map +0 -1
- package/lib/cjs/iterative-generation/activities/generateToc.js +0 -91
- package/lib/cjs/iterative-generation/activities/generateToc.js.map +0 -1
- package/lib/cjs/iterative-generation/activities/index.js +0 -12
- package/lib/cjs/iterative-generation/activities/index.js.map +0 -1
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +0 -55
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +0 -1
- package/lib/cjs/iterative-generation/types.js +0 -5
- package/lib/cjs/iterative-generation/types.js.map +0 -1
- package/lib/cjs/iterative-generation/utils.js +0 -121
- package/lib/cjs/iterative-generation/utils.js.map +0 -1
- package/lib/cjs/package.json +0 -3
- package/lib/cjs/result-types.js +0 -10
- package/lib/cjs/result-types.js.map +0 -1
- package/lib/cjs/system/notifyWebhookWorkflow.js +0 -46
- package/lib/cjs/system/notifyWebhookWorkflow.js.map +0 -1
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js +0 -28
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js.map +0 -1
- package/lib/cjs/utils/auth.js +0 -15
- package/lib/cjs/utils/auth.js.map +0 -1
- package/lib/cjs/utils/blobs.js +0 -63
- package/lib/cjs/utils/blobs.js.map +0 -1
- package/lib/cjs/utils/client.js +0 -25
- package/lib/cjs/utils/client.js.map +0 -1
- package/lib/cjs/utils/expand-vars.js +0 -33
- package/lib/cjs/utils/expand-vars.js.map +0 -1
- package/lib/cjs/utils/memory.js +0 -72
- package/lib/cjs/utils/memory.js.map +0 -1
- package/lib/cjs/utils/tokens.js +0 -38
- package/lib/cjs/utils/tokens.js.map +0 -1
- package/lib/cjs/vars.js +0 -20
- package/lib/cjs/vars.js.map +0 -1
- package/lib/cjs/workflows.js +0 -15
- package/lib/cjs/workflows.js.map +0 -1
- package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js +0 -29
- package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js.map +0 -1
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +0 -63
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +0 -1
- package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js +0 -15
- package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js.map +0 -1
- package/lib/esm/activities/chunkDocument.js +0 -76
- package/lib/esm/activities/chunkDocument.js.map +0 -1
- package/lib/esm/activities/createDocumentFromOther.js +0 -58
- package/lib/esm/activities/createDocumentFromOther.js.map +0 -1
- package/lib/esm/activities/executeInteraction.js +0 -130
- package/lib/esm/activities/executeInteraction.js.map +0 -1
- package/lib/esm/activities/extractDocumentText.js +0 -132
- package/lib/esm/activities/extractDocumentText.js.map +0 -1
- package/lib/esm/activities/generateDocumentProperties.js +0 -56
- package/lib/esm/activities/generateDocumentProperties.js.map +0 -1
- package/lib/esm/activities/generateEmbeddings.js +0 -256
- package/lib/esm/activities/generateEmbeddings.js.map +0 -1
- package/lib/esm/activities/generateImageRendition.js +0 -98
- package/lib/esm/activities/generateImageRendition.js.map +0 -1
- package/lib/esm/activities/generateOrAssignContentType.js +0 -100
- package/lib/esm/activities/generateOrAssignContentType.js.map +0 -1
- package/lib/esm/activities/getObjectFromStore.js +0 -17
- package/lib/esm/activities/getObjectFromStore.js.map +0 -1
- package/lib/esm/activities/index-dsl.js +0 -18
- package/lib/esm/activities/index-dsl.js.map +0 -1
- package/lib/esm/activities/index.js +0 -6
- package/lib/esm/activities/index.js.map +0 -1
- package/lib/esm/activities/media/processPdfWithTextract.js +0 -98
- package/lib/esm/activities/media/processPdfWithTextract.js.map +0 -1
- package/lib/esm/activities/media/transcribeMediaWithGladia.js +0 -48
- package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +0 -1
- package/lib/esm/activities/notifyWebhook.js +0 -31
- package/lib/esm/activities/notifyWebhook.js.map +0 -1
- package/lib/esm/activities/setDocumentStatus.js +0 -12
- package/lib/esm/activities/setDocumentStatus.js.map +0 -1
- package/lib/esm/conversion/TextractProcessor.js +0 -410
- package/lib/esm/conversion/TextractProcessor.js.map +0 -1
- package/lib/esm/conversion/image.js +0 -16
- package/lib/esm/conversion/image.js.map +0 -1
- package/lib/esm/conversion/mutool.js +0 -139
- package/lib/esm/conversion/mutool.js.map +0 -1
- package/lib/esm/conversion/pandoc.js +0 -36
- package/lib/esm/conversion/pandoc.js.map +0 -1
- package/lib/esm/dsl/conditions.js +0 -75
- package/lib/esm/dsl/conditions.js.map +0 -1
- package/lib/esm/dsl/dsl-workflow.js +0 -216
- package/lib/esm/dsl/dsl-workflow.js.map +0 -1
- package/lib/esm/dsl/dslProxyActivities.js +0 -20
- package/lib/esm/dsl/dslProxyActivities.js.map +0 -1
- package/lib/esm/dsl/projections.js +0 -55
- package/lib/esm/dsl/projections.js.map +0 -1
- package/lib/esm/dsl/setup/ActivityContext.js +0 -91
- package/lib/esm/dsl/setup/ActivityContext.js.map +0 -1
- package/lib/esm/dsl/setup/fetch/DataProvider.js +0 -47
- package/lib/esm/dsl/setup/fetch/DataProvider.js.map +0 -1
- package/lib/esm/dsl/setup/fetch/index.js +0 -12
- package/lib/esm/dsl/setup/fetch/index.js.map +0 -1
- package/lib/esm/dsl/setup/fetch/providers.js +0 -61
- package/lib/esm/dsl/setup/fetch/providers.js.map +0 -1
- package/lib/esm/dsl/test/test-child-workflow.js +0 -5
- package/lib/esm/dsl/test/test-child-workflow.js.map +0 -1
- package/lib/esm/dsl/validation.js +0 -118
- package/lib/esm/dsl/validation.js.map +0 -1
- package/lib/esm/dsl/vars.js +0 -335
- package/lib/esm/dsl/vars.js.map +0 -1
- package/lib/esm/dsl/walk.js +0 -96
- package/lib/esm/dsl/walk.js.map +0 -1
- package/lib/esm/dsl.js +0 -4
- package/lib/esm/dsl.js.map +0 -1
- package/lib/esm/errors.js +0 -30
- package/lib/esm/errors.js.map +0 -1
- package/lib/esm/index.js +0 -32
- package/lib/esm/index.js.map +0 -1
- package/lib/esm/iterative-generation/activities/extractToc.js +0 -44
- package/lib/esm/iterative-generation/activities/extractToc.js.map +0 -1
- package/lib/esm/iterative-generation/activities/finalizeOutput.js +0 -66
- package/lib/esm/iterative-generation/activities/finalizeOutput.js.map +0 -1
- package/lib/esm/iterative-generation/activities/generatePart.js +0 -70
- package/lib/esm/iterative-generation/activities/generatePart.js.map +0 -1
- package/lib/esm/iterative-generation/activities/generateToc.js +0 -88
- package/lib/esm/iterative-generation/activities/generateToc.js.map +0 -1
- package/lib/esm/iterative-generation/activities/index.js +0 -5
- package/lib/esm/iterative-generation/activities/index.js.map +0 -1
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +0 -52
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +0 -1
- package/lib/esm/iterative-generation/types.js +0 -2
- package/lib/esm/iterative-generation/types.js.map +0 -1
- package/lib/esm/iterative-generation/utils.js +0 -112
- package/lib/esm/iterative-generation/utils.js.map +0 -1
- package/lib/esm/result-types.js +0 -7
- package/lib/esm/result-types.js.map +0 -1
- package/lib/esm/system/notifyWebhookWorkflow.js +0 -43
- package/lib/esm/system/notifyWebhookWorkflow.js.map +0 -1
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js +0 -25
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js.map +0 -1
- package/lib/esm/utils/auth.js +0 -8
- package/lib/esm/utils/auth.js.map +0 -1
- package/lib/esm/utils/blobs.js +0 -52
- package/lib/esm/utils/blobs.js.map +0 -1
- package/lib/esm/utils/client.js +0 -22
- package/lib/esm/utils/client.js.map +0 -1
- package/lib/esm/utils/expand-vars.js +0 -30
- package/lib/esm/utils/expand-vars.js.map +0 -1
- package/lib/esm/utils/memory.js +0 -60
- package/lib/esm/utils/memory.js.map +0 -1
- package/lib/esm/utils/tokens.js +0 -34
- package/lib/esm/utils/tokens.js.map +0 -1
- package/lib/esm/vars.js +0 -4
- package/lib/esm/vars.js.map +0 -1
- package/lib/esm/workflows.js +0 -8
- package/lib/esm/workflows.js.map +0 -1
- package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts +0 -17
- package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts.map +0 -1
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts +0 -29
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts.map +0 -1
- package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts +0 -19
- package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts.map +0 -1
- package/lib/types/activities/chunkDocument.d.ts +0 -18
- package/lib/types/activities/chunkDocument.d.ts.map +0 -1
- package/lib/types/activities/createDocumentFromOther.d.ts +0 -21
- package/lib/types/activities/createDocumentFromOther.d.ts.map +0 -1
- package/lib/types/activities/executeInteraction.d.ts +0 -44
- package/lib/types/activities/executeInteraction.d.ts.map +0 -1
- package/lib/types/activities/extractDocumentText.d.ts +0 -10
- package/lib/types/activities/extractDocumentText.d.ts.map +0 -1
- package/lib/types/activities/generateDocumentProperties.d.ts +0 -32
- package/lib/types/activities/generateDocumentProperties.d.ts.map +0 -1
- package/lib/types/activities/generateEmbeddings.d.ts +0 -49
- package/lib/types/activities/generateEmbeddings.d.ts.map +0 -1
- package/lib/types/activities/generateImageRendition.d.ts +0 -17
- package/lib/types/activities/generateImageRendition.d.ts.map +0 -1
- package/lib/types/activities/generateOrAssignContentType.d.ts +0 -44
- package/lib/types/activities/generateOrAssignContentType.d.ts.map +0 -1
- package/lib/types/activities/getObjectFromStore.d.ts +0 -14
- package/lib/types/activities/getObjectFromStore.d.ts.map +0 -1
- package/lib/types/activities/index-dsl.d.ts +0 -17
- package/lib/types/activities/index-dsl.d.ts.map +0 -1
- package/lib/types/activities/index.d.ts +0 -6
- package/lib/types/activities/index.d.ts.map +0 -1
- package/lib/types/activities/media/processPdfWithTextract.d.ts +0 -26
- package/lib/types/activities/media/processPdfWithTextract.d.ts.map +0 -1
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts +0 -14
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts.map +0 -1
- package/lib/types/activities/notifyWebhook.d.ts +0 -17
- package/lib/types/activities/notifyWebhook.d.ts.map +0 -1
- package/lib/types/activities/setDocumentStatus.d.ts +0 -15
- package/lib/types/activities/setDocumentStatus.d.ts.map +0 -1
- package/lib/types/conversion/TextractProcessor.d.ts +0 -45
- package/lib/types/conversion/TextractProcessor.d.ts.map +0 -1
- package/lib/types/conversion/image.d.ts +0 -9
- package/lib/types/conversion/image.d.ts.map +0 -1
- package/lib/types/conversion/mutool.d.ts +0 -19
- package/lib/types/conversion/mutool.d.ts.map +0 -1
- package/lib/types/conversion/pandoc.d.ts +0 -2
- package/lib/types/conversion/pandoc.d.ts.map +0 -1
- package/lib/types/dsl/conditions.d.ts +0 -2
- package/lib/types/dsl/conditions.d.ts.map +0 -1
- package/lib/types/dsl/dsl-workflow.d.ts +0 -5
- package/lib/types/dsl/dsl-workflow.d.ts.map +0 -1
- package/lib/types/dsl/dslProxyActivities.d.ts +0 -10
- package/lib/types/dsl/dslProxyActivities.d.ts.map +0 -1
- package/lib/types/dsl/projections.d.ts +0 -4
- package/lib/types/dsl/projections.d.ts.map +0 -1
- package/lib/types/dsl/setup/ActivityContext.d.ts +0 -14
- package/lib/types/dsl/setup/ActivityContext.d.ts.map +0 -1
- package/lib/types/dsl/setup/fetch/DataProvider.d.ts +0 -9
- package/lib/types/dsl/setup/fetch/DataProvider.d.ts.map +0 -1
- package/lib/types/dsl/setup/fetch/index.d.ts +0 -6
- package/lib/types/dsl/setup/fetch/index.d.ts.map +0 -1
- package/lib/types/dsl/setup/fetch/providers.d.ts +0 -25
- package/lib/types/dsl/setup/fetch/providers.d.ts.map +0 -1
- package/lib/types/dsl/test/test-child-workflow.d.ts +0 -4
- package/lib/types/dsl/test/test-child-workflow.d.ts.map +0 -1
- package/lib/types/dsl/validation.d.ts +0 -4
- package/lib/types/dsl/validation.d.ts.map +0 -1
- package/lib/types/dsl/vars.d.ts +0 -48
- package/lib/types/dsl/vars.d.ts.map +0 -1
- package/lib/types/dsl/walk.d.ts +0 -18
- package/lib/types/dsl/walk.d.ts.map +0 -1
- package/lib/types/dsl.d.ts +0 -4
- package/lib/types/dsl.d.ts.map +0 -1
- package/lib/types/errors.d.ts +0 -16
- package/lib/types/errors.d.ts.map +0 -1
- package/lib/types/index.d.ts +0 -31
- package/lib/types/index.d.ts.map +0 -1
- package/lib/types/iterative-generation/activities/extractToc.d.ts +0 -10
- package/lib/types/iterative-generation/activities/extractToc.d.ts.map +0 -1
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts +0 -3
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts.map +0 -1
- package/lib/types/iterative-generation/activities/generatePart.d.ts +0 -3
- package/lib/types/iterative-generation/activities/generatePart.d.ts.map +0 -1
- package/lib/types/iterative-generation/activities/generateToc.d.ts +0 -4
- package/lib/types/iterative-generation/activities/generateToc.d.ts.map +0 -1
- package/lib/types/iterative-generation/activities/index.d.ts +0 -5
- package/lib/types/iterative-generation/activities/index.d.ts.map +0 -1
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts +0 -3
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts.map +0 -1
- package/lib/types/iterative-generation/types.d.ts +0 -79
- package/lib/types/iterative-generation/types.d.ts.map +0 -1
- package/lib/types/iterative-generation/utils.d.ts +0 -27
- package/lib/types/iterative-generation/utils.d.ts.map +0 -1
- package/lib/types/result-types.d.ts +0 -22
- package/lib/types/result-types.d.ts.map +0 -1
- package/lib/types/system/notifyWebhookWorkflow.d.ts +0 -3
- package/lib/types/system/notifyWebhookWorkflow.d.ts.map +0 -1
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +0 -40
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +0 -1
- package/lib/types/utils/auth.d.ts +0 -4
- package/lib/types/utils/auth.d.ts.map +0 -1
- package/lib/types/utils/blobs.d.ts +0 -8
- package/lib/types/utils/blobs.d.ts.map +0 -1
- package/lib/types/utils/client.d.ts +0 -7
- package/lib/types/utils/client.d.ts.map +0 -1
- package/lib/types/utils/expand-vars.d.ts +0 -8
- package/lib/types/utils/expand-vars.d.ts.map +0 -1
- package/lib/types/utils/memory.d.ts +0 -12
- package/lib/types/utils/memory.d.ts.map +0 -1
- package/lib/types/utils/tokens.d.ts +0 -11
- package/lib/types/utils/tokens.d.ts.map +0 -1
- package/lib/types/vars.d.ts +0 -3
- package/lib/types/vars.d.ts.map +0 -1
- package/lib/types/workflows.d.ts +0 -8
- package/lib/types/workflows.d.ts.map +0 -1
- package/lib/workflows-bundle.js +0 -19897
@@ -1,33 +1,41 @@
|
|
1
1
|
import { log } from "@temporalio/activity";
|
2
|
-
import {
|
3
|
-
|
4
|
-
|
2
|
+
import {
|
3
|
+
ContentObject,
|
4
|
+
CreateContentObjectPayload,
|
5
|
+
DSLActivityExecutionPayload,
|
6
|
+
DSLActivitySpec,
|
7
|
+
} from "@vertesia/common";
|
8
|
+
import { mutoolPdfToText } from "../conversion/mutool.js";
|
9
|
+
import { markdownWithPandoc } from "../conversion/pandoc.js";
|
5
10
|
import { setupActivity } from "../dsl/setup/ActivityContext.js";
|
6
|
-
import { NoDocumentFound } from
|
7
|
-
import { TextExtractionResult, TextExtractionStatus } from
|
8
|
-
import { fetchBlobAsBuffer, md5 } from
|
9
|
-
import { countTokens } from
|
11
|
+
import { NoDocumentFound } from "../errors.js";
|
12
|
+
import { TextExtractionResult, TextExtractionStatus } from "../result-types.js";
|
13
|
+
import { fetchBlobAsBuffer, md5 } from "../utils/blobs.js";
|
14
|
+
import { countTokens } from "../utils/tokens.js";
|
15
|
+
import { markdownWithMarkitdown } from "../conversion/markitdown.js";
|
10
16
|
|
11
17
|
//@ts-ignore
|
12
18
|
const JSON: DSLActivitySpec = {
|
13
|
-
name:
|
14
|
-
}
|
19
|
+
name: "extractDocumentText",
|
20
|
+
};
|
15
21
|
|
16
22
|
// doesn't have any own param
|
17
|
-
export interface ExtractDocumentTextParams {
|
23
|
+
export interface ExtractDocumentTextParams {}
|
18
24
|
export interface ExtractDocumentText extends DSLActivitySpec<ExtractDocumentTextParams> {
|
19
|
-
name:
|
25
|
+
name: "extractDocumentText";
|
20
26
|
projection?: never;
|
21
27
|
}
|
22
28
|
|
23
|
-
export async function extractDocumentText(
|
29
|
+
export async function extractDocumentText(
|
30
|
+
payload: DSLActivityExecutionPayload<ExtractDocumentTextParams>,
|
31
|
+
): Promise<TextExtractionResult> {
|
24
32
|
const { client, objectId } = await setupActivity(payload);
|
25
33
|
|
26
34
|
const r = await client.objects.find({
|
27
35
|
query: { _id: objectId },
|
28
36
|
limit: 1,
|
29
|
-
select: "+text"
|
30
|
-
})
|
37
|
+
select: "+text",
|
38
|
+
});
|
31
39
|
const doc = r[0] as ContentObject;
|
32
40
|
if (!doc) {
|
33
41
|
log.error(`Document ${objectId} not found`);
|
@@ -36,7 +44,6 @@ export async function extractDocumentText(payload: DSLActivityExecutionPayload<E
|
|
36
44
|
|
37
45
|
log.info(`Extracting text for object ${doc.id}`);
|
38
46
|
|
39
|
-
|
40
47
|
if (!doc.content?.type || !doc.content?.source) {
|
41
48
|
if (doc.text) {
|
42
49
|
return createResponse(doc, doc.text, TextExtractionStatus.skipped, "Text present and no source or type");
|
@@ -58,74 +65,80 @@ export async function extractDocumentText(payload: DSLActivityExecutionPayload<E
|
|
58
65
|
return createResponse(doc, "", TextExtractionStatus.error, e.message);
|
59
66
|
}
|
60
67
|
|
61
|
-
|
62
68
|
let txt: string;
|
63
69
|
|
64
70
|
switch (doc.content.type) {
|
65
|
-
|
66
|
-
case 'application/pdf':
|
67
|
-
//if pdf is more than 2MB, use mutool
|
71
|
+
case "application/pdf":
|
68
72
|
txt = await mutoolPdfToText(fileBuffer);
|
69
73
|
break;
|
70
74
|
|
71
|
-
case
|
72
|
-
txt = fileBuffer.toString(
|
75
|
+
case "text/plain":
|
76
|
+
txt = fileBuffer.toString("utf8");
|
73
77
|
break;
|
74
78
|
|
75
79
|
//docx
|
76
|
-
case
|
77
|
-
txt = await
|
80
|
+
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
81
|
+
txt = await markdownWithMarkitdown(fileBuffer, "docx");
|
82
|
+
break;
|
83
|
+
|
84
|
+
//pptx
|
85
|
+
case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
86
|
+
txt = await markdownWithMarkitdown(fileBuffer, "pptx");
|
78
87
|
break;
|
79
88
|
|
80
89
|
//html
|
81
|
-
case
|
82
|
-
txt = await
|
90
|
+
case "text/html":
|
91
|
+
txt = await markdownWithPandoc(fileBuffer, "html");
|
83
92
|
break;
|
84
93
|
|
85
94
|
//opendocument
|
86
|
-
case
|
87
|
-
txt = await
|
95
|
+
case "application/vnd.oasis.opendocument.text":
|
96
|
+
txt = await markdownWithPandoc(fileBuffer, "odt");
|
88
97
|
break;
|
89
98
|
|
90
99
|
//rtf
|
91
|
-
case
|
92
|
-
txt = await
|
100
|
+
case "application/rtf":
|
101
|
+
txt = await markdownWithPandoc(fileBuffer, "rtf");
|
93
102
|
break;
|
94
103
|
|
95
104
|
//markdown
|
96
|
-
case
|
97
|
-
txt = fileBuffer.toString(
|
105
|
+
case "text/markdown":
|
106
|
+
txt = fileBuffer.toString("utf8");
|
98
107
|
break;
|
99
108
|
|
100
109
|
//csv
|
101
|
-
case
|
102
|
-
txt = fileBuffer.toString(
|
110
|
+
case "text/csv":
|
111
|
+
txt = fileBuffer.toString("utf8");
|
103
112
|
break;
|
104
113
|
|
105
114
|
//typescript
|
106
|
-
case
|
107
|
-
txt = fileBuffer.toString(
|
115
|
+
case "application/typescript":
|
116
|
+
txt = fileBuffer.toString("utf8");
|
108
117
|
break;
|
109
118
|
|
110
119
|
//javascript
|
111
|
-
case
|
112
|
-
txt = fileBuffer.toString(
|
120
|
+
case "application/javascript":
|
121
|
+
txt = fileBuffer.toString("utf8");
|
113
122
|
break;
|
114
123
|
|
115
124
|
//json
|
116
|
-
case
|
117
|
-
txt = fileBuffer.toString(
|
125
|
+
case "application/json":
|
126
|
+
txt = fileBuffer.toString("utf8");
|
118
127
|
break;
|
119
128
|
|
120
129
|
default:
|
121
130
|
if (sniffIfText(fileBuffer)) {
|
122
|
-
txt = fileBuffer.toString(
|
131
|
+
txt = fileBuffer.toString("utf8"); //TODO: add charset detection
|
123
132
|
break;
|
124
133
|
}
|
125
|
-
return createResponse(
|
134
|
+
return createResponse(
|
135
|
+
doc,
|
136
|
+
doc.text ?? "",
|
137
|
+
TextExtractionStatus.skipped,
|
138
|
+
`Unsupported mime type: ${doc.content.type}`,
|
139
|
+
);
|
126
140
|
}
|
127
141
|
|
128
|
-
|
129
142
|
const tokensData = countTokens(txt);
|
130
143
|
const etag = doc.content.etag ?? md5(txt);
|
131
144
|
|
@@ -135,15 +148,20 @@ export async function extractDocumentText(payload: DSLActivityExecutionPayload<E
|
|
135
148
|
tokens: {
|
136
149
|
...tokensData,
|
137
150
|
etag: etag,
|
138
|
-
}
|
139
|
-
}
|
151
|
+
},
|
152
|
+
};
|
140
153
|
|
141
154
|
await client.objects.update(doc.id, updateData);
|
142
155
|
|
143
156
|
return createResponse(doc, txt, TextExtractionStatus.success);
|
144
157
|
}
|
145
158
|
|
146
|
-
function createResponse(
|
159
|
+
function createResponse(
|
160
|
+
doc: ContentObject,
|
161
|
+
text: string,
|
162
|
+
status: TextExtractionStatus,
|
163
|
+
message?: string,
|
164
|
+
): TextExtractionResult {
|
147
165
|
return {
|
148
166
|
status,
|
149
167
|
message,
|
@@ -151,18 +169,37 @@ function createResponse(doc: ContentObject, text: string, status: TextExtraction
|
|
151
169
|
len: text.length,
|
152
170
|
objectId: doc.id,
|
153
171
|
hasText: !!text,
|
154
|
-
}
|
155
|
-
|
172
|
+
};
|
156
173
|
}
|
157
174
|
|
158
|
-
|
159
|
-
//if file is less than 100KB, check if it looks like text
|
160
175
|
function sniffIfText(buf: Buffer) {
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
176
|
+
// If file is too large, don't even try
|
177
|
+
if (buf.length > 500 * 1024) {
|
178
|
+
return false;
|
179
|
+
}
|
180
|
+
|
181
|
+
// Count binary/control characters
|
182
|
+
let binaryCount = 0;
|
183
|
+
const sampleSize = Math.min(buf.length, 1000); // Check first 1000 bytes
|
184
|
+
|
185
|
+
for (let i = 0; i < sampleSize; i++) {
|
186
|
+
// Count control characters (except common whitespace)
|
187
|
+
const byte = buf[i];
|
188
|
+
if ((byte < 32 && ![9, 10, 13].includes(byte)) || byte === 0) {
|
189
|
+
binaryCount++;
|
165
190
|
}
|
166
191
|
}
|
167
|
-
|
192
|
+
|
193
|
+
// If more than 10% binary/control chars, probably not text
|
194
|
+
if (binaryCount / sampleSize > 0.1) {
|
195
|
+
return false;
|
196
|
+
}
|
197
|
+
|
198
|
+
// Additional check for valid UTF-8 encoding
|
199
|
+
try {
|
200
|
+
const s = buf.toString("utf8");
|
201
|
+
return s.length > 0 && !s.includes("\uFFFD"); // Replacement character
|
202
|
+
} catch (e) {
|
203
|
+
return false;
|
204
|
+
}
|
168
205
|
}
|
@@ -1,10 +1,10 @@
|
|
1
|
-
import { DSLActivityExecutionPayload, DSLActivitySpec } from "@vertesia/common";
|
2
1
|
import { log } from "@temporalio/activity";
|
2
|
+
import { DSLActivityExecutionPayload, DSLActivitySpec } from "@vertesia/common";
|
3
3
|
import { setupActivity } from "../dsl/setup/ActivityContext.js";
|
4
4
|
import { TruncateSpec } from "../utils/tokens.js";
|
5
5
|
import { InteractionExecutionParams, executeInteractionFromActivity } from "./executeInteraction.js";
|
6
6
|
|
7
|
-
const INT_EXTRACT_INFORMATION = "sys:ExtractInformation"
|
7
|
+
const INT_EXTRACT_INFORMATION = "sys:ExtractInformation";
|
8
8
|
export interface GenerateDocumentPropertiesParams extends InteractionExecutionParams {
|
9
9
|
typesHint?: string[];
|
10
10
|
/**
|
@@ -17,10 +17,12 @@ export interface GenerateDocumentPropertiesParams extends InteractionExecutionPa
|
|
17
17
|
use_vision?: boolean;
|
18
18
|
}
|
19
19
|
export interface GenerateDocumentProperties extends DSLActivitySpec<GenerateDocumentPropertiesParams> {
|
20
|
-
name:
|
20
|
+
name: "generateDocumentProperties";
|
21
21
|
}
|
22
22
|
|
23
|
-
export async function generateDocumentProperties(
|
23
|
+
export async function generateDocumentProperties(
|
24
|
+
payload: DSLActivityExecutionPayload<GenerateDocumentPropertiesParams>,
|
25
|
+
) {
|
24
26
|
const context = await setupActivity<GenerateDocumentPropertiesParams>(payload);
|
25
27
|
const { params, client, objectId } = context;
|
26
28
|
const interactionName = params.interactionName ?? INT_EXTRACT_INFORMATION;
|
@@ -32,7 +34,7 @@ export async function generateDocumentProperties(payload: DSLActivityExecutionPa
|
|
32
34
|
|
33
35
|
if (!doc?.text && !params.use_vision && !doc?.content?.type?.startsWith("image/")) {
|
34
36
|
log.warn(`Object ${objectId} not found or text is empty`);
|
35
|
-
return { status: "failed", error: "no-text" }
|
37
|
+
return { status: "failed", error: "no-text" };
|
36
38
|
}
|
37
39
|
|
38
40
|
if (!type || !type.object_schema) {
|
@@ -50,16 +52,19 @@ export async function generateDocumentProperties(payload: DSLActivityExecutionPa
|
|
50
52
|
}
|
51
53
|
|
52
54
|
log.info(`Object ${objectId} is not an image or pdf`);
|
53
|
-
return undefined
|
54
|
-
}
|
55
|
+
return undefined;
|
56
|
+
};
|
55
57
|
|
56
58
|
const promptData = {
|
57
59
|
content: doc.text ?? undefined,
|
58
60
|
image: getImageRef() ?? undefined,
|
59
61
|
human_context: project?.configuration?.human_context ?? undefined,
|
60
|
-
}
|
62
|
+
};
|
61
63
|
|
62
|
-
log.info(
|
64
|
+
log.info(
|
65
|
+
` Extracting information from object ${objectId} with type ${type.name}`,
|
66
|
+
payload.debug_mode ? { params } : undefined,
|
67
|
+
);
|
63
68
|
|
64
69
|
const infoRes = await executeInteractionFromActivity(
|
65
70
|
client,
|
@@ -70,24 +75,40 @@ export async function generateDocumentProperties(payload: DSLActivityExecutionPa
|
|
70
75
|
result_schema: type.object_schema,
|
71
76
|
},
|
72
77
|
promptData,
|
73
|
-
payload.debug_mode ?? false
|
78
|
+
payload.debug_mode ?? false,
|
74
79
|
);
|
75
80
|
|
81
|
+
const getText = () => {
|
82
|
+
if (doc.text) {
|
83
|
+
return undefined;
|
84
|
+
}
|
85
|
+
let text = "";
|
86
|
+
if (infoRes.result.title) {
|
87
|
+
text += infoRes.result.title + "\n";
|
88
|
+
}
|
89
|
+
if (infoRes.result.description) {
|
90
|
+
text += infoRes.result.description;
|
91
|
+
}
|
92
|
+
if (text) {
|
93
|
+
return text;
|
94
|
+
} else {
|
95
|
+
return undefined;
|
96
|
+
}
|
97
|
+
};
|
98
|
+
|
76
99
|
log.info(`Extracted information from object ${objectId} with type ${type.name}`, { runId: infoRes.id });
|
77
100
|
await client.objects.update(doc.id, {
|
78
101
|
properties: {
|
79
102
|
...infoRes.result,
|
80
|
-
etag: doc.text_etag
|
103
|
+
etag: doc.text_etag,
|
81
104
|
},
|
82
|
-
text:
|
105
|
+
text: getText(),
|
83
106
|
generation_run_info: {
|
84
107
|
id: infoRes.id,
|
85
108
|
date: new Date().toISOString(),
|
86
109
|
model: infoRes.modelId,
|
87
|
-
}
|
110
|
+
},
|
88
111
|
});
|
89
112
|
|
90
|
-
|
91
113
|
return { status: "completed" };
|
92
|
-
|
93
|
-
}
|
114
|
+
}
|
@@ -1,19 +1,42 @@
|
|
1
|
-
import { VertesiaClient } from "@vertesia/client";
|
2
|
-
import { ContentObject, DSLActivityExecutionPayload, DSLActivitySpec, ProjectConfigurationEmbeddings, SupportedEmbeddingTypes } from "@vertesia/common";
|
3
1
|
import { EmbeddingsResult } from "@llumiverse/core";
|
4
2
|
import { log } from "@temporalio/activity";
|
5
|
-
import
|
3
|
+
import { VertesiaClient } from "@vertesia/client";
|
4
|
+
import { ContentObject, DSLActivityExecutionPayload, DSLActivitySpec, ProjectConfigurationEmbeddings, SupportedEmbeddingTypes } from "@vertesia/common";
|
6
5
|
import { setupActivity } from "../dsl/setup/ActivityContext.js";
|
7
6
|
import { NoDocumentFound } from '../errors.js';
|
8
7
|
import { fetchBlobAsBase64, md5 } from "../utils/blobs.js";
|
8
|
+
import { DocPart, getContentParts } from "../utils/chunks.js";
|
9
9
|
import { countTokens } from "../utils/tokens.js";
|
10
10
|
|
11
11
|
|
12
12
|
/**
 * Parameters accepted by the embeddings generation activity.
 */
export interface GenerateEmbeddingsParams {
    /**
     * Model used to generate the embeddings.
     * Falls back to the project's default model when omitted.
     */
    model?: string;

    /**
     * Environment used to generate the embeddings.
     * Falls back to the project's default environment when omitted.
     */
    environment?: string;

    /**
     * When true, embeddings are generated even if the document already has some.
     */
    force?: boolean;

    /**
     * Which kind of embedding to generate.
     */
    type: SupportedEmbeddingTypes;

    /**
     * Part definitions (DocParts) used to split long documents.
     */
    parts?: DocPart[];
}
|
18
41
|
|
19
42
|
export interface GenerateEmbeddings extends DSLActivitySpec<GenerateEmbeddingsParams> {
|
@@ -103,7 +126,7 @@ interface ExecuteGenerateEmbeddingsParams {
|
|
103
126
|
force?: boolean;
|
104
127
|
}
|
105
128
|
|
106
|
-
async function generateTextEmbeddings({ document, client, type, config }: ExecuteGenerateEmbeddingsParams) {
|
129
|
+
async function generateTextEmbeddings({ document, client, type, config }: ExecuteGenerateEmbeddingsParams, parts?: DocPart[],) {
|
107
130
|
// if (!force && document.embeddings[type]?.etag === (document.text_etag ?? md5(document.text))) {
|
108
131
|
// return { id: objectId, status: "skipped", message: "embeddings already generated" }
|
109
132
|
// }
|
@@ -125,6 +148,8 @@ async function generateTextEmbeddings({ document, client, type, config }: Execut
|
|
125
148
|
|
126
149
|
const { environment, model } = config;
|
127
150
|
|
151
|
+
const partDefinitions = parts ?? [];
|
152
|
+
|
128
153
|
// Count tokens if not already done
|
129
154
|
if (!document.tokens?.count && type === SupportedEmbeddingTypes.text) {
|
130
155
|
log.debug('Updating token count for document: ' + document.id);
|
@@ -150,79 +175,64 @@ async function generateTextEmbeddings({ document, client, type, config }: Execut
|
|
150
175
|
if (type === SupportedEmbeddingTypes.text && document.tokens?.count && document.tokens?.count > maxTokens) {
|
151
176
|
log.info('Document too large, generating embeddings for parts');
|
152
177
|
|
153
|
-
|
154
|
-
|
178
|
+
|
179
|
+
if (!document.text) {
|
180
|
+
return { id: document.id, status: "failed", message: "no text found" }
|
155
181
|
}
|
156
182
|
|
157
|
-
|
158
|
-
|
183
|
+
if (!partDefinitions || partDefinitions.length === 0) {
|
184
|
+
log.info('No parts found for document, skipping embeddings generation');
|
185
|
+
return { id: document.id, status: "failed", message: "no parts found" }
|
186
|
+
}
|
159
187
|
|
160
|
-
const generatePartEmbeddings = async (part: ContentObject<any>, i: number) => {
|
161
|
-
try {
|
162
|
-
log.info(`Generating embeddings for part ${part.id}`, { text_len: part.text?.length })
|
163
|
-
if (!part.text) {
|
164
|
-
return { id: part.id, number: i, result: null, status: "skipped", message: "no text found" }
|
165
|
-
}
|
166
188
|
|
167
|
-
|
168
|
-
|
169
|
-
|
189
|
+
log.info('Generating embeddings for parts', { parts: partDefinitions, max_tokens: maxTokens });
|
190
|
+
const docParts = getContentParts(document.text, partDefinitions);
|
191
|
+
|
192
|
+
|
193
|
+
log.info(`Retrieved ${docParts.length} parts`)
|
194
|
+
const start = new Date().getTime();
|
195
|
+
const generatePartEmbeddings = async (partContent: string, i: number) => {
|
196
|
+
const localStart = new Date().getTime();
|
197
|
+
try {
|
198
|
+
log.info(`Generating embeddings for part ${i}`, { text_len: partContent.length })
|
199
|
+
if (!partContent) {
|
200
|
+
return { id: i, number: i, result: null, status: "skipped", message: "no text found" }
|
170
201
|
}
|
171
202
|
|
172
|
-
const e = await generateEmbeddingsFromStudio(
|
173
|
-
log.error('Error generating embeddings for part'
|
203
|
+
const e = await generateEmbeddingsFromStudio(partContent, environment, client, model).catch(e => {
|
204
|
+
log.error('Error generating embeddings for part ' + i, { text_length: partContent.length, error: e });
|
174
205
|
return null;
|
175
206
|
});
|
176
207
|
|
177
208
|
if (!e || !e.values) {
|
178
|
-
return { id:
|
209
|
+
return { id: i, number: i, result: null, message: "no embeddings generated" }
|
179
210
|
}
|
180
211
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
}).catch(err => {
|
188
|
-
log.info(`Error updating embeddings on part ${part.id}`);
|
189
|
-
return { id: part.id, number: i, result: null, message: "error setting embeddings on part", error: err.message }
|
190
|
-
})
|
191
|
-
|
192
|
-
log.info('Generated embeddings for part: ' + part.id);
|
193
|
-
return { id: part.id, number: i, result: e }
|
212
|
+
if (e.values.length === 0) {
|
213
|
+
return { id: i, number: i, result: null, message: "no embeddings generated" }
|
214
|
+
}
|
215
|
+
log.info(`Generated embeddings for part ${i}`, { len: e.values.length, duration: new Date().getTime() - localStart });
|
216
|
+
|
217
|
+
return { number: i, result: e }
|
194
218
|
} catch (err: any) {
|
195
|
-
log.info(`Error generating ${type} embeddings for part ${
|
196
|
-
return {
|
219
|
+
log.info(`Error generating ${type} embeddings for part ${i} of ${document.id}`, { error: err });
|
220
|
+
return { number: i, result: null, message: "error generating embeddings", error: err.message }
|
197
221
|
}
|
198
222
|
}
|
199
223
|
|
200
|
-
const
|
201
|
-
const
|
202
|
-
|
203
|
-
|
204
|
-
// log.info(`Processing part ${p.id}`)
|
205
|
-
// const r = await generatePartEmbeddings(p, i++);
|
206
|
-
// res.push(r)
|
207
|
-
// }
|
208
|
-
|
209
|
-
|
210
|
-
// Filter out parts without embeddings
|
211
|
-
const validEmbeddings = res.filter(item => item.result !== null) as { id: string, number: number, result: EmbeddingsResult }[];
|
212
|
-
|
213
|
-
// Compute the document-level embedding using TensorFlow for attention mechanism
|
214
|
-
log.info('Computing document-level embedding using TF');
|
215
|
-
const documentEmbedding = computeAttentionEmbedding(validEmbeddings.map(item => item.result.values));
|
216
|
-
|
217
|
-
// Save the document-level embedding
|
224
|
+
const partEmbeddings = await Promise.all(docParts.map((part, i) => generatePartEmbeddings(part, i)));
|
225
|
+
const validPartEmbeddings = partEmbeddings.filter(e => e.result !== null).map(e => e.result);
|
226
|
+
const averagedEmbedding = computeAttentionEmbedding(validPartEmbeddings.map(e => e.values));
|
227
|
+
log.info(`Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`, { len: averagedEmbedding.length, count: validPartEmbeddings.length, max_tokens: maxTokens });
|
218
228
|
await client.objects.setEmbedding(document.id, type,
|
219
229
|
{
|
220
|
-
values:
|
221
|
-
model:
|
230
|
+
values: averagedEmbedding,
|
231
|
+
model: validPartEmbeddings[0].model,
|
222
232
|
etag: document.text_etag
|
223
233
|
}
|
224
234
|
);
|
225
|
-
|
235
|
+
log.info(`Object ${document.id} embedding set`, { type, len: averagedEmbedding.length });
|
226
236
|
|
227
237
|
} else {
|
228
238
|
log.info(`Generating ${type} embeddings for document`);
|
@@ -311,35 +321,37 @@ async function generateEmbeddingsFromStudio(text: string, env: string, client: V
|
|
311
321
|
|
312
322
|
}
|
313
323
|
|
314
|
-
|
315
|
-
|
316
|
-
|
324
|
+
/**
 * Combines per-chunk embedding vectors into a single document-level embedding.
 *
 * Every chunk gets the same weight, so the result is the component-wise
 * arithmetic mean of the chunk vectors. The previous implementation drew its
 * attention weights from `Math.random()`, which made the embedding of an
 * unchanged document differ from one run to the next; uniform weights are the
 * expected value of those random softmax weights and keep the output
 * reproducible.
 *
 * This is still a naive pooling strategy and can be replaced by a learned
 * attention mechanism (e.g. using tensorflow in a dedicated package).
 *
 * @param chunkEmbeddings - One embedding vector per chunk; all vectors must
 *   have the same length as the first one.
 * @returns The averaged vector, or an empty array when no chunks are given.
 * @throws Error when a chunk vector's length differs from the first chunk's,
 *   since summing it would silently yield `NaN` or truncated components.
 */
function computeAttentionEmbedding(chunkEmbeddings: number[][]): number[] {
    if (chunkEmbeddings.length === 0) return [];

    const start = Date.now();

    // The first chunk fixes the expected dimension for all the others.
    const embeddingDim = chunkEmbeddings[0].length;
    const documentEmbedding = new Array<number>(embeddingDim).fill(0);

    // Accumulate the plain sum first and divide once at the end: this keeps
    // the arithmetic exact for values whose mean is representable.
    for (let i = 0; i < chunkEmbeddings.length; i++) {
        const chunk = chunkEmbeddings[i];
        if (chunk.length !== embeddingDim) {
            throw new Error(
                `Embedding dimension mismatch: chunk ${i} has ${chunk.length} values, expected ${embeddingDim}`,
            );
        }
        for (let j = 0; j < embeddingDim; j++) {
            documentEmbedding[j] += chunk[j];
        }
    }

    for (let j = 0; j < embeddingDim; j++) {
        documentEmbedding[j] /= chunkEmbeddings.length;
    }

    const duration = Date.now() - start;
    console.log(`Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`);

    return documentEmbedding;
}
|