@vertesia/workflow 0.24.0-dev.202601221707
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +13 -0
- package/README.md +65 -0
- package/bin/bundle-workflows.mjs +39 -0
- package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js +33 -0
- package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js.map +1 -0
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +73 -0
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -0
- package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js +19 -0
- package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js.map +1 -0
- package/lib/cjs/activities/chunkDocument.js +85 -0
- package/lib/cjs/activities/chunkDocument.js.map +1 -0
- package/lib/cjs/activities/copyParentArtifacts.js +127 -0
- package/lib/cjs/activities/copyParentArtifacts.js.map +1 -0
- package/lib/cjs/activities/createDocumentFromOther.js +64 -0
- package/lib/cjs/activities/createDocumentFromOther.js.map +1 -0
- package/lib/cjs/activities/executeInteraction.js +194 -0
- package/lib/cjs/activities/executeInteraction.js.map +1 -0
- package/lib/cjs/activities/extractDocumentText.js +156 -0
- package/lib/cjs/activities/extractDocumentText.js.map +1 -0
- package/lib/cjs/activities/generateDocumentProperties.js +83 -0
- package/lib/cjs/activities/generateDocumentProperties.js.map +1 -0
- package/lib/cjs/activities/generateEmbeddings.js +257 -0
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -0
- package/lib/cjs/activities/generateOrAssignContentType.js +125 -0
- package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -0
- package/lib/cjs/activities/getObjectFromStore.js +20 -0
- package/lib/cjs/activities/getObjectFromStore.js.map +1 -0
- package/lib/cjs/activities/handleError.js +22 -0
- package/lib/cjs/activities/handleError.js.map +1 -0
- package/lib/cjs/activities/index-dsl.js +51 -0
- package/lib/cjs/activities/index-dsl.js.map +1 -0
- package/lib/cjs/activities/index.js +21 -0
- package/lib/cjs/activities/index.js.map +1 -0
- package/lib/cjs/activities/media/prepareAudio.js +239 -0
- package/lib/cjs/activities/media/prepareAudio.js.map +1 -0
- package/lib/cjs/activities/media/prepareVideo.js +429 -0
- package/lib/cjs/activities/media/prepareVideo.js.map +1 -0
- package/lib/cjs/activities/media/processPdfWithTextract.js +103 -0
- package/lib/cjs/activities/media/processPdfWithTextract.js.map +1 -0
- package/lib/cjs/activities/media/saveGladiaTranscription.js +81 -0
- package/lib/cjs/activities/media/saveGladiaTranscription.js.map +1 -0
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js +82 -0
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -0
- package/lib/cjs/activities/notifyWebhook.js +158 -0
- package/lib/cjs/activities/notifyWebhook.js.map +1 -0
- package/lib/cjs/activities/rateLimiter.js +30 -0
- package/lib/cjs/activities/rateLimiter.js.map +1 -0
- package/lib/cjs/activities/renditions/generateImageRendition.js +66 -0
- package/lib/cjs/activities/renditions/generateImageRendition.js.map +1 -0
- package/lib/cjs/activities/renditions/generateVideoRendition.js +200 -0
- package/lib/cjs/activities/renditions/generateVideoRendition.js.map +1 -0
- package/lib/cjs/activities/setDocumentStatus.js +15 -0
- package/lib/cjs/activities/setDocumentStatus.js.map +1 -0
- package/lib/cjs/conversion/TextractProcessor.js +417 -0
- package/lib/cjs/conversion/TextractProcessor.js.map +1 -0
- package/lib/cjs/conversion/image.js +149 -0
- package/lib/cjs/conversion/image.js.map +1 -0
- package/lib/cjs/conversion/markitdown.js +42 -0
- package/lib/cjs/conversion/markitdown.js.map +1 -0
- package/lib/cjs/conversion/mutool.js +147 -0
- package/lib/cjs/conversion/mutool.js.map +1 -0
- package/lib/cjs/conversion/pandoc.js +39 -0
- package/lib/cjs/conversion/pandoc.js.map +1 -0
- package/lib/cjs/dsl/conditions.js +81 -0
- package/lib/cjs/dsl/conditions.js.map +1 -0
- package/lib/cjs/dsl/dsl-workflow.js +343 -0
- package/lib/cjs/dsl/dsl-workflow.js.map +1 -0
- package/lib/cjs/dsl/dslProxyActivities.js +23 -0
- package/lib/cjs/dsl/dslProxyActivities.js.map +1 -0
- package/lib/cjs/dsl/projections.js +59 -0
- package/lib/cjs/dsl/projections.js.map +1 -0
- package/lib/cjs/dsl/setup/ActivityContext.js +122 -0
- package/lib/cjs/dsl/setup/ActivityContext.js.map +1 -0
- package/lib/cjs/dsl/setup/fetch/DataProvider.js +51 -0
- package/lib/cjs/dsl/setup/fetch/DataProvider.js.map +1 -0
- package/lib/cjs/dsl/setup/fetch/index.js +16 -0
- package/lib/cjs/dsl/setup/fetch/index.js.map +1 -0
- package/lib/cjs/dsl/setup/fetch/providers.js +67 -0
- package/lib/cjs/dsl/setup/fetch/providers.js.map +1 -0
- package/lib/cjs/dsl/test/test-child-workflow.js +10 -0
- package/lib/cjs/dsl/test/test-child-workflow.js.map +1 -0
- package/lib/cjs/dsl/validation.js +122 -0
- package/lib/cjs/dsl/validation.js.map +1 -0
- package/lib/cjs/dsl/vars.js +341 -0
- package/lib/cjs/dsl/vars.js.map +1 -0
- package/lib/cjs/dsl/walk.js +100 -0
- package/lib/cjs/dsl/walk.js.map +1 -0
- package/lib/cjs/dsl.js +20 -0
- package/lib/cjs/dsl.js.map +1 -0
- package/lib/cjs/errors.js +79 -0
- package/lib/cjs/errors.js.map +1 -0
- package/lib/cjs/index.js +56 -0
- package/lib/cjs/index.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/extractToc.js +47 -0
- package/lib/cjs/iterative-generation/activities/extractToc.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js +72 -0
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/generatePart.js +78 -0
- package/lib/cjs/iterative-generation/activities/generatePart.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/generateToc.js +86 -0
- package/lib/cjs/iterative-generation/activities/generateToc.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/index.js +12 -0
- package/lib/cjs/iterative-generation/activities/index.js.map +1 -0
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +56 -0
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +1 -0
- package/lib/cjs/iterative-generation/types.js +5 -0
- package/lib/cjs/iterative-generation/types.js.map +1 -0
- package/lib/cjs/iterative-generation/utils.js +121 -0
- package/lib/cjs/iterative-generation/utils.js.map +1 -0
- package/lib/cjs/package.json +3 -0
- package/lib/cjs/result-types.js +10 -0
- package/lib/cjs/result-types.js.map +1 -0
- package/lib/cjs/system/notifyWebhookWorkflow.js +53 -0
- package/lib/cjs/system/notifyWebhookWorkflow.js.map +1 -0
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js +33 -0
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js.map +1 -0
- package/lib/cjs/utils/auth.js +15 -0
- package/lib/cjs/utils/auth.js.map +1 -0
- package/lib/cjs/utils/blobs.js +64 -0
- package/lib/cjs/utils/blobs.js.map +1 -0
- package/lib/cjs/utils/chunks.js +14 -0
- package/lib/cjs/utils/chunks.js.map +1 -0
- package/lib/cjs/utils/client.js +31 -0
- package/lib/cjs/utils/client.js.map +1 -0
- package/lib/cjs/utils/expand-vars.js +33 -0
- package/lib/cjs/utils/expand-vars.js.map +1 -0
- package/lib/cjs/utils/memory.js +65 -0
- package/lib/cjs/utils/memory.js.map +1 -0
- package/lib/cjs/utils/renditions.js +88 -0
- package/lib/cjs/utils/renditions.js.map +1 -0
- package/lib/cjs/utils/storage.js +54 -0
- package/lib/cjs/utils/storage.js.map +1 -0
- package/lib/cjs/utils/tokens.js +38 -0
- package/lib/cjs/utils/tokens.js.map +1 -0
- package/lib/cjs/vars.js +20 -0
- package/lib/cjs/vars.js.map +1 -0
- package/lib/cjs/workflows.js +15 -0
- package/lib/cjs/workflows.js.map +1 -0
- package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js +30 -0
- package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js.map +1 -0
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +70 -0
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -0
- package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js +16 -0
- package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js.map +1 -0
- package/lib/esm/activities/chunkDocument.js +82 -0
- package/lib/esm/activities/chunkDocument.js.map +1 -0
- package/lib/esm/activities/copyParentArtifacts.js +124 -0
- package/lib/esm/activities/copyParentArtifacts.js.map +1 -0
- package/lib/esm/activities/createDocumentFromOther.js +58 -0
- package/lib/esm/activities/createDocumentFromOther.js.map +1 -0
- package/lib/esm/activities/executeInteraction.js +190 -0
- package/lib/esm/activities/executeInteraction.js.map +1 -0
- package/lib/esm/activities/extractDocumentText.js +153 -0
- package/lib/esm/activities/extractDocumentText.js.map +1 -0
- package/lib/esm/activities/generateDocumentProperties.js +80 -0
- package/lib/esm/activities/generateDocumentProperties.js.map +1 -0
- package/lib/esm/activities/generateEmbeddings.js +254 -0
- package/lib/esm/activities/generateEmbeddings.js.map +1 -0
- package/lib/esm/activities/generateOrAssignContentType.js +122 -0
- package/lib/esm/activities/generateOrAssignContentType.js.map +1 -0
- package/lib/esm/activities/getObjectFromStore.js +17 -0
- package/lib/esm/activities/getObjectFromStore.js.map +1 -0
- package/lib/esm/activities/handleError.js +19 -0
- package/lib/esm/activities/handleError.js.map +1 -0
- package/lib/esm/activities/index-dsl.js +25 -0
- package/lib/esm/activities/index-dsl.js.map +1 -0
- package/lib/esm/activities/index.js +5 -0
- package/lib/esm/activities/index.js.map +1 -0
- package/lib/esm/activities/media/prepareAudio.js +200 -0
- package/lib/esm/activities/media/prepareAudio.js.map +1 -0
- package/lib/esm/activities/media/prepareVideo.js +390 -0
- package/lib/esm/activities/media/prepareVideo.js.map +1 -0
- package/lib/esm/activities/media/processPdfWithTextract.js +99 -0
- package/lib/esm/activities/media/processPdfWithTextract.js.map +1 -0
- package/lib/esm/activities/media/saveGladiaTranscription.js +78 -0
- package/lib/esm/activities/media/saveGladiaTranscription.js.map +1 -0
- package/lib/esm/activities/media/transcribeMediaWithGladia.js +79 -0
- package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -0
- package/lib/esm/activities/notifyWebhook.js +155 -0
- package/lib/esm/activities/notifyWebhook.js.map +1 -0
- package/lib/esm/activities/rateLimiter.js +27 -0
- package/lib/esm/activities/rateLimiter.js.map +1 -0
- package/lib/esm/activities/renditions/generateImageRendition.js +63 -0
- package/lib/esm/activities/renditions/generateImageRendition.js.map +1 -0
- package/lib/esm/activities/renditions/generateVideoRendition.js +194 -0
- package/lib/esm/activities/renditions/generateVideoRendition.js.map +1 -0
- package/lib/esm/activities/setDocumentStatus.js +12 -0
- package/lib/esm/activities/setDocumentStatus.js.map +1 -0
- package/lib/esm/conversion/TextractProcessor.js +410 -0
- package/lib/esm/conversion/TextractProcessor.js.map +1 -0
- package/lib/esm/conversion/image.js +143 -0
- package/lib/esm/conversion/image.js.map +1 -0
- package/lib/esm/conversion/markitdown.js +36 -0
- package/lib/esm/conversion/markitdown.js.map +1 -0
- package/lib/esm/conversion/mutool.js +139 -0
- package/lib/esm/conversion/mutool.js.map +1 -0
- package/lib/esm/conversion/pandoc.js +36 -0
- package/lib/esm/conversion/pandoc.js.map +1 -0
- package/lib/esm/dsl/conditions.js +75 -0
- package/lib/esm/dsl/conditions.js.map +1 -0
- package/lib/esm/dsl/dsl-workflow.js +336 -0
- package/lib/esm/dsl/dsl-workflow.js.map +1 -0
- package/lib/esm/dsl/dslProxyActivities.js +20 -0
- package/lib/esm/dsl/dslProxyActivities.js.map +1 -0
- package/lib/esm/dsl/projections.js +55 -0
- package/lib/esm/dsl/projections.js.map +1 -0
- package/lib/esm/dsl/setup/ActivityContext.js +117 -0
- package/lib/esm/dsl/setup/ActivityContext.js.map +1 -0
- package/lib/esm/dsl/setup/fetch/DataProvider.js +47 -0
- package/lib/esm/dsl/setup/fetch/DataProvider.js.map +1 -0
- package/lib/esm/dsl/setup/fetch/index.js +12 -0
- package/lib/esm/dsl/setup/fetch/index.js.map +1 -0
- package/lib/esm/dsl/setup/fetch/providers.js +61 -0
- package/lib/esm/dsl/setup/fetch/providers.js.map +1 -0
- package/lib/esm/dsl/test/test-child-workflow.js +5 -0
- package/lib/esm/dsl/test/test-child-workflow.js.map +1 -0
- package/lib/esm/dsl/validation.js +118 -0
- package/lib/esm/dsl/validation.js.map +1 -0
- package/lib/esm/dsl/vars.js +335 -0
- package/lib/esm/dsl/vars.js.map +1 -0
- package/lib/esm/dsl/walk.js +96 -0
- package/lib/esm/dsl/walk.js.map +1 -0
- package/lib/esm/dsl.js +4 -0
- package/lib/esm/dsl.js.map +1 -0
- package/lib/esm/errors.js +69 -0
- package/lib/esm/errors.js.map +1 -0
- package/lib/esm/index.js +38 -0
- package/lib/esm/index.js.map +1 -0
- package/lib/esm/iterative-generation/activities/extractToc.js +44 -0
- package/lib/esm/iterative-generation/activities/extractToc.js.map +1 -0
- package/lib/esm/iterative-generation/activities/finalizeOutput.js +69 -0
- package/lib/esm/iterative-generation/activities/finalizeOutput.js.map +1 -0
- package/lib/esm/iterative-generation/activities/generatePart.js +75 -0
- package/lib/esm/iterative-generation/activities/generatePart.js.map +1 -0
- package/lib/esm/iterative-generation/activities/generateToc.js +83 -0
- package/lib/esm/iterative-generation/activities/generateToc.js.map +1 -0
- package/lib/esm/iterative-generation/activities/index.js +5 -0
- package/lib/esm/iterative-generation/activities/index.js.map +1 -0
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +53 -0
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +1 -0
- package/lib/esm/iterative-generation/types.js +2 -0
- package/lib/esm/iterative-generation/types.js.map +1 -0
- package/lib/esm/iterative-generation/utils.js +112 -0
- package/lib/esm/iterative-generation/utils.js.map +1 -0
- package/lib/esm/result-types.js +7 -0
- package/lib/esm/result-types.js.map +1 -0
- package/lib/esm/system/notifyWebhookWorkflow.js +50 -0
- package/lib/esm/system/notifyWebhookWorkflow.js.map +1 -0
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js +30 -0
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js.map +1 -0
- package/lib/esm/utils/auth.js +8 -0
- package/lib/esm/utils/auth.js.map +1 -0
- package/lib/esm/utils/blobs.js +54 -0
- package/lib/esm/utils/blobs.js.map +1 -0
- package/lib/esm/utils/chunks.js +9 -0
- package/lib/esm/utils/chunks.js.map +1 -0
- package/lib/esm/utils/client.js +27 -0
- package/lib/esm/utils/client.js.map +1 -0
- package/lib/esm/utils/expand-vars.js +30 -0
- package/lib/esm/utils/expand-vars.js.map +1 -0
- package/lib/esm/utils/memory.js +55 -0
- package/lib/esm/utils/memory.js.map +1 -0
- package/lib/esm/utils/renditions.js +80 -0
- package/lib/esm/utils/renditions.js.map +1 -0
- package/lib/esm/utils/storage.js +45 -0
- package/lib/esm/utils/storage.js.map +1 -0
- package/lib/esm/utils/tokens.js +34 -0
- package/lib/esm/utils/tokens.js.map +1 -0
- package/lib/esm/vars.js +4 -0
- package/lib/esm/vars.js.map +1 -0
- package/lib/esm/workflows.js +8 -0
- package/lib/esm/workflows.js.map +1 -0
- package/lib/tsconfig.tsbuildinfo +1 -0
- package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts +17 -0
- package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts.map +1 -0
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts +39 -0
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts.map +1 -0
- package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts +19 -0
- package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts.map +1 -0
- package/lib/types/activities/chunkDocument.d.ts +33 -0
- package/lib/types/activities/chunkDocument.d.ts.map +1 -0
- package/lib/types/activities/copyParentArtifacts.d.ts +19 -0
- package/lib/types/activities/copyParentArtifacts.d.ts.map +1 -0
- package/lib/types/activities/createDocumentFromOther.d.ts +21 -0
- package/lib/types/activities/createDocumentFromOther.d.ts.map +1 -0
- package/lib/types/activities/executeInteraction.d.ts +61 -0
- package/lib/types/activities/executeInteraction.d.ts.map +1 -0
- package/lib/types/activities/extractDocumentText.d.ts +10 -0
- package/lib/types/activities/extractDocumentText.d.ts.map +1 -0
- package/lib/types/activities/generateDocumentProperties.d.ts +32 -0
- package/lib/types/activities/generateDocumentProperties.d.ts.map +1 -0
- package/lib/types/activities/generateEmbeddings.d.ts +53 -0
- package/lib/types/activities/generateEmbeddings.d.ts.map +1 -0
- package/lib/types/activities/generateOrAssignContentType.d.ts +44 -0
- package/lib/types/activities/generateOrAssignContentType.d.ts.map +1 -0
- package/lib/types/activities/getObjectFromStore.d.ts +14 -0
- package/lib/types/activities/getObjectFromStore.d.ts.map +1 -0
- package/lib/types/activities/handleError.d.ts +6 -0
- package/lib/types/activities/handleError.d.ts.map +1 -0
- package/lib/types/activities/index-dsl.d.ts +25 -0
- package/lib/types/activities/index-dsl.d.ts.map +1 -0
- package/lib/types/activities/index.d.ts +5 -0
- package/lib/types/activities/index.d.ts.map +1 -0
- package/lib/types/activities/media/prepareAudio.d.ts +25 -0
- package/lib/types/activities/media/prepareAudio.d.ts.map +1 -0
- package/lib/types/activities/media/prepareVideo.d.ts +30 -0
- package/lib/types/activities/media/prepareVideo.d.ts.map +1 -0
- package/lib/types/activities/media/processPdfWithTextract.d.ts +26 -0
- package/lib/types/activities/media/processPdfWithTextract.d.ts.map +1 -0
- package/lib/types/activities/media/saveGladiaTranscription.d.ts +14 -0
- package/lib/types/activities/media/saveGladiaTranscription.d.ts.map +1 -0
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts +19 -0
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts.map +1 -0
- package/lib/types/activities/notifyWebhook.d.ts +27 -0
- package/lib/types/activities/notifyWebhook.d.ts.map +1 -0
- package/lib/types/activities/rateLimiter.d.ts +11 -0
- package/lib/types/activities/rateLimiter.d.ts.map +1 -0
- package/lib/types/activities/renditions/generateImageRendition.d.ts +14 -0
- package/lib/types/activities/renditions/generateImageRendition.d.ts.map +1 -0
- package/lib/types/activities/renditions/generateVideoRendition.d.ts +15 -0
- package/lib/types/activities/renditions/generateVideoRendition.d.ts.map +1 -0
- package/lib/types/activities/setDocumentStatus.d.ts +15 -0
- package/lib/types/activities/setDocumentStatus.d.ts.map +1 -0
- package/lib/types/conversion/TextractProcessor.d.ts +45 -0
- package/lib/types/conversion/TextractProcessor.d.ts.map +1 -0
- package/lib/types/conversion/image.d.ts +13 -0
- package/lib/types/conversion/image.d.ts.map +1 -0
- package/lib/types/conversion/markitdown.d.ts +2 -0
- package/lib/types/conversion/markitdown.d.ts.map +1 -0
- package/lib/types/conversion/mutool.d.ts +19 -0
- package/lib/types/conversion/mutool.d.ts.map +1 -0
- package/lib/types/conversion/pandoc.d.ts +2 -0
- package/lib/types/conversion/pandoc.d.ts.map +1 -0
- package/lib/types/dsl/conditions.d.ts +2 -0
- package/lib/types/dsl/conditions.d.ts.map +1 -0
- package/lib/types/dsl/dsl-workflow.d.ts +5 -0
- package/lib/types/dsl/dsl-workflow.d.ts.map +1 -0
- package/lib/types/dsl/dslProxyActivities.d.ts +10 -0
- package/lib/types/dsl/dslProxyActivities.d.ts.map +1 -0
- package/lib/types/dsl/projections.d.ts +4 -0
- package/lib/types/dsl/projections.d.ts.map +1 -0
- package/lib/types/dsl/setup/ActivityContext.d.ts +17 -0
- package/lib/types/dsl/setup/ActivityContext.d.ts.map +1 -0
- package/lib/types/dsl/setup/fetch/DataProvider.d.ts +9 -0
- package/lib/types/dsl/setup/fetch/DataProvider.d.ts.map +1 -0
- package/lib/types/dsl/setup/fetch/index.d.ts +6 -0
- package/lib/types/dsl/setup/fetch/index.d.ts.map +1 -0
- package/lib/types/dsl/setup/fetch/providers.d.ts +25 -0
- package/lib/types/dsl/setup/fetch/providers.d.ts.map +1 -0
- package/lib/types/dsl/test/test-child-workflow.d.ts +4 -0
- package/lib/types/dsl/test/test-child-workflow.d.ts.map +1 -0
- package/lib/types/dsl/validation.d.ts +4 -0
- package/lib/types/dsl/validation.d.ts.map +1 -0
- package/lib/types/dsl/vars.d.ts +48 -0
- package/lib/types/dsl/vars.d.ts.map +1 -0
- package/lib/types/dsl/walk.d.ts +18 -0
- package/lib/types/dsl/walk.d.ts.map +1 -0
- package/lib/types/dsl.d.ts +4 -0
- package/lib/types/dsl.d.ts.map +1 -0
- package/lib/types/errors.d.ts +37 -0
- package/lib/types/errors.d.ts.map +1 -0
- package/lib/types/index.d.ts +37 -0
- package/lib/types/index.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/extractToc.d.ts +10 -0
- package/lib/types/iterative-generation/activities/extractToc.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts +3 -0
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/generatePart.d.ts +3 -0
- package/lib/types/iterative-generation/activities/generatePart.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/generateToc.d.ts +4 -0
- package/lib/types/iterative-generation/activities/generateToc.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/index.d.ts +5 -0
- package/lib/types/iterative-generation/activities/index.d.ts.map +1 -0
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts +3 -0
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts.map +1 -0
- package/lib/types/iterative-generation/types.d.ts +79 -0
- package/lib/types/iterative-generation/types.d.ts.map +1 -0
- package/lib/types/iterative-generation/utils.d.ts +26 -0
- package/lib/types/iterative-generation/utils.d.ts.map +1 -0
- package/lib/types/result-types.d.ts +22 -0
- package/lib/types/result-types.d.ts.map +1 -0
- package/lib/types/system/notifyWebhookWorkflow.d.ts +8 -0
- package/lib/types/system/notifyWebhookWorkflow.d.ts.map +1 -0
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +25 -0
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +1 -0
- package/lib/types/utils/auth.d.ts +4 -0
- package/lib/types/utils/auth.d.ts.map +1 -0
- package/lib/types/utils/blobs.d.ts +7 -0
- package/lib/types/utils/blobs.d.ts.map +1 -0
- package/lib/types/utils/chunks.d.ts +9 -0
- package/lib/types/utils/chunks.d.ts.map +1 -0
- package/lib/types/utils/client.d.ts +8 -0
- package/lib/types/utils/client.d.ts.map +1 -0
- package/lib/types/utils/expand-vars.d.ts +8 -0
- package/lib/types/utils/expand-vars.d.ts.map +1 -0
- package/lib/types/utils/memory.d.ts +8 -0
- package/lib/types/utils/memory.d.ts.map +1 -0
- package/lib/types/utils/renditions.d.ts +23 -0
- package/lib/types/utils/renditions.d.ts.map +1 -0
- package/lib/types/utils/storage.d.ts +16 -0
- package/lib/types/utils/storage.d.ts.map +1 -0
- package/lib/types/utils/tokens.d.ts +11 -0
- package/lib/types/utils/tokens.d.ts.map +1 -0
- package/lib/types/vars.d.ts +3 -0
- package/lib/types/vars.d.ts.map +1 -0
- package/lib/types/workflows.d.ts +8 -0
- package/lib/types/workflows.d.ts.map +1 -0
- package/lib/workflows-bundle.js +17213 -0
- package/package.json +146 -0
- package/src/activities/advanced/createDocumentTypeFromInteractionRun.ts +55 -0
- package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +119 -0
- package/src/activities/advanced/updateDocumentFromInteractionRun.ts +35 -0
- package/src/activities/chunkDocument.ts +146 -0
- package/src/activities/copyParentArtifacts.ts +162 -0
- package/src/activities/createDocumentFromOther.ts +92 -0
- package/src/activities/executeInteraction.ts +300 -0
- package/src/activities/extractDocumentText.ts +205 -0
- package/src/activities/generateDocumentProperties.ts +120 -0
- package/src/activities/generateEmbeddings.ts +387 -0
- package/src/activities/generateOrAssignContentType.ts +218 -0
- package/src/activities/getObjectFromStore.ts +31 -0
- package/src/activities/handleError.ts +25 -0
- package/src/activities/index-dsl.ts +25 -0
- package/src/activities/index.ts +4 -0
- package/src/activities/media/prepareAudio.ts +334 -0
- package/src/activities/media/prepareVideo.ts +622 -0
- package/src/activities/media/processPdfWithTextract.ts +141 -0
- package/src/activities/media/saveGladiaTranscription.ts +128 -0
- package/src/activities/media/transcribeMediaWithGladia.ts +117 -0
- package/src/activities/notifyWebhook.test.ts +134 -0
- package/src/activities/notifyWebhook.ts +199 -0
- package/src/activities/rateLimiter.ts +41 -0
- package/src/activities/renditions/generateImageRendition.ts +111 -0
- package/src/activities/renditions/generateVideoRendition.ts +293 -0
- package/src/activities/setDocumentStatus.ts +25 -0
- package/src/conversion/TextractProcessor.ts +506 -0
- package/src/conversion/image.test.ts +118 -0
- package/src/conversion/image.ts +168 -0
- package/src/conversion/markitdown.ts +41 -0
- package/src/conversion/mutool.test.ts +74 -0
- package/src/conversion/mutool.ts +180 -0
- package/src/conversion/pandoc.test.ts +24 -0
- package/src/conversion/pandoc.ts +40 -0
- package/src/dsl/conditions.ts +76 -0
- package/src/dsl/dsl-workflow.test.ts +58 -0
- package/src/dsl/dsl-workflow.ts +397 -0
- package/src/dsl/dslProxyActivities.ts +38 -0
- package/src/dsl/ms.d.ts +11 -0
- package/src/dsl/projections.test.ts +159 -0
- package/src/dsl/projections.ts +72 -0
- package/src/dsl/setup/ActivityContext.ts +178 -0
- package/src/dsl/setup/fetch/DataProvider.ts +45 -0
- package/src/dsl/setup/fetch/index.ts +19 -0
- package/src/dsl/setup/fetch/providers.ts +67 -0
- package/src/dsl/test/test-child-workflow.ts +6 -0
- package/src/dsl/validation.test.ts +257 -0
- package/src/dsl/validation.ts +125 -0
- package/src/dsl/vars.test.ts +245 -0
- package/src/dsl/vars.ts +340 -0
- package/src/dsl/walk.test.ts +81 -0
- package/src/dsl/walk.ts +103 -0
- package/src/dsl/workflow-exec-child.test.ts +273 -0
- package/src/dsl/workflow-fetch.test.ts +138 -0
- package/src/dsl/workflow-import.test.ts +89 -0
- package/src/dsl/workflow.test.ts +122 -0
- package/src/dsl.ts +3 -0
- package/src/errors.ts +101 -0
- package/src/index.ts +41 -0
- package/src/iterative-generation/activities/extractToc.ts +63 -0
- package/src/iterative-generation/activities/finalizeOutput.ts +100 -0
- package/src/iterative-generation/activities/generatePart.ts +123 -0
- package/src/iterative-generation/activities/generateToc.ts +116 -0
- package/src/iterative-generation/activities/index.ts +4 -0
- package/src/iterative-generation/iterativeGenerationWorkflow.ts +68 -0
- package/src/iterative-generation/types.ts +99 -0
- package/src/iterative-generation/utils.ts +126 -0
- package/src/result-types.ts +25 -0
- package/src/system/notifyWebhookWorkflow.ts +70 -0
- package/src/system/recalculateEmbeddingsWorkflow.ts +41 -0
- package/src/utils/auth.ts +10 -0
- package/src/utils/blobs.ts +59 -0
- package/src/utils/chunks.ts +17 -0
- package/src/utils/client.ts +46 -0
- package/src/utils/expand-vars.ts +31 -0
- package/src/utils/memory.ts +61 -0
- package/src/utils/renditions.ts +127 -0
- package/src/utils/storage.ts +60 -0
- package/src/utils/tokens.ts +44 -0
- package/src/vars.ts +3 -0
- package/src/workflows.ts +7 -0
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
import { PutObjectCommand, S3Client } from "@aws-sdk/client-s3";
|
|
2
|
+
import type { Block } from "@aws-sdk/client-textract";
|
|
3
|
+
import {
|
|
4
|
+
GetDocumentAnalysisCommand,
|
|
5
|
+
StartDocumentAnalysisCommand,
|
|
6
|
+
TextractClient
|
|
7
|
+
} from "@aws-sdk/client-textract";
|
|
8
|
+
import type { AwsCredentialIdentityProvider } from "@smithy/types";
|
|
9
|
+
import Papa from 'papaparse';
|
|
10
|
+
|
|
11
|
+
/**
 * Dictionary of Textract blocks keyed by string id; the class looks entries
 * up using the ids listed in block relationships.
 */
interface BlocksMap {
    [blockId: string]: Block;
}
|
|
14
|
+
|
|
15
|
+
/**
 * A single unit of extracted content: text, a table, or an image.
 */
interface ContentBlock {
    /** Discriminates the kind of content carried by this block. */
    type:
        | 'text'
        | 'table'
        | 'image';
    /** The content itself, as a string. */
    content: string;
    /** Optional confidence value associated with the content. */
    confidence?: number;

    // Optional geometry, populated when the block is an image.
    left?: number;
    top?: number;
    width?: number;
    height?: number;
}
|
|
25
|
+
|
|
26
|
+
/**
 * The content blocks belonging to one page, together with that page's number.
 */
interface PageContent {
    pageNumber: number;
    blocks: Array<ContentBlock>;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Construction options for TextractProcessor.
 */
interface TextractProcessorOptions {
    /** File key stored on the processor. */
    fileKey: string;
    /** AWS region handed to both the Textract and S3 clients. */
    region: string;
    /** Bucket name stored on the processor. */
    bucket: string;
    /** Optional credentials provider handed to both AWS clients. */
    credentials?: AwsCredentialIdentityProvider;
    /** Optional logger-like object (untyped). */
    log?: any;
    /** Image detection flag; the constructor defaults it to false. */
    detectImages?: boolean;
    /**
     * If true, includes cell-confidence information in the table CSV.
     * The constructor defaults it to false.
     */
    includeConfidenceInTables?: boolean;
}
|
|
43
|
+
|
|
44
|
+
export class TextractProcessor {
|
|
45
|
+
private textractClient: TextractClient;
|
|
46
|
+
private s3Client: S3Client;
|
|
47
|
+
private fileKey: string;
|
|
48
|
+
private bucket: string;
|
|
49
|
+
private log: any;
|
|
50
|
+
private detectImages: boolean;
|
|
51
|
+
/**
|
|
52
|
+
* Whether or not to include confidence values in CSV output for tables.
|
|
53
|
+
*/
|
|
54
|
+
private includeConfidenceInTables: boolean;
|
|
55
|
+
|
|
56
|
+
/**
 * Stores the processing options on the instance and creates the Textract
 * and S3 clients, both configured with the same region and credentials.
 *
 * `detectImages` and `includeConfidenceInTables` fall back to `false`
 * when they are left undefined.
 */
constructor(options: TextractProcessorOptions) {
    const {
        fileKey,
        region,
        bucket,
        credentials,
        log,
        detectImages = false,
        includeConfidenceInTables = false,
    } = options;

    // Plain option values.
    this.fileKey = fileKey;
    this.bucket = bucket;
    this.log = log;

    // Feature flags (already defaulted above).
    this.detectImages = detectImages;
    this.includeConfidenceInTables = includeConfidenceInTables;

    // AWS clients: each receives its own config literal.
    this.textractClient = new TextractClient({ region, credentials });
    this.s3Client = new S3Client({ region, credentials });
}
|
|
80
|
+
|
|
81
|
+
/**
 * Builds the text of a block from its CHILD relationships.
 *
 * - WORD children contribute their `Text` (empty string when absent).
 * - SELECTION_ELEMENT children whose status is SELECTED contribute "X".
 * - Every contribution is followed by a space; the final string is trimmed.
 *
 * A word made only of digits and commas that contains at least one comma
 * (e.g. "1,234" or "1,234,567") is wrapped in double quotes so the commas
 * stay inside a single token.
 *
 * @param result    The block whose child text should be assembled.
 * @param blocksMap Lookup of blocks by id, used to resolve child ids.
 * @returns The assembled, trimmed text; empty string when there are no
 *          matching children.
 */
private getText(result: Block, blocksMap: BlocksMap): string {
    let text = '';
    for (const relationship of result.Relationships || []) {
        if (relationship.Type !== 'CHILD') {
            continue;
        }
        for (const childId of relationship.Ids || []) {
            const child = blocksMap[childId];
            if (!child) {
                // Child id not present in the map: skip it instead of
                // throwing on a property access of undefined.
                continue;
            }
            if (child.BlockType === 'WORD') {
                const wordText = child.Text || '';
                // Strip ALL commas before the digits-only check; a string
                // pattern in replace() would only remove the first comma,
                // so "1,234,567" would wrongly be left unquoted.
                const isCommaNumber =
                    wordText.includes(',') &&
                    /^\d+$/.test(wordText.replace(/,/g, ''));
                text += isCommaNumber ? `"${wordText}" ` : `${wordText} `;
            }
            if (
                child.BlockType === 'SELECTION_ELEMENT' &&
                child.SelectionStatus === 'SELECTED'
            ) {
                text += 'X ';
            }
        }
    }
    return text.trim();
}
|
|
110
|
+
|
|
111
|
+
/**
 * Tells whether a LINE block has at least one child that sits inside a
 * table cell, as decided by `isWordInTableCell`.
 *
 * @param block     Block to test; anything other than a LINE yields false.
 * @param blocksMap Lookup of blocks by id.
 * @returns true as soon as one child is found in a table cell, else false.
 */
private isBlockInTable(block: Block, blocksMap: BlocksMap): boolean {
    if (block.BlockType !== 'LINE' || !block.Relationships) {
        return false;
    }
    // `some` stops at the first hit, like an early return from nested loops.
    return block.Relationships.some(
        rel =>
            rel.Type === 'CHILD' &&
            (rel.Ids || []).some(id =>
                this.isWordInTableCell(blocksMap[id], blocksMap)
            )
    );
}
|
|
129
|
+
|
|
130
|
+
/**
 * Tells whether `wordBlock` is listed as a direct child of any CELL that is
 * itself a child of a TABLE block in `blocksMap`.
 *
 * Note: every call walks the entries of `blocksMap` looking for TABLE
 * blocks, so the cost grows with the size of the map.
 *
 * @param wordBlock Block whose `Id` is searched for in the cells' child ids.
 * @param blocksMap Lookup of blocks by id.
 * @returns true when a matching cell is found, false otherwise.
 */
private isWordInTableCell(wordBlock: Block, blocksMap: BlocksMap): boolean {
    for (const id in blocksMap) {
        const candidate = blocksMap[id];
        if (candidate.BlockType !== 'TABLE' || !candidate.Relationships) {
            continue;
        }
        for (const tableRel of candidate.Relationships) {
            if (tableRel.Type !== 'CHILD') {
                continue;
            }
            for (const cellId of tableRel.Ids || []) {
                const cell = blocksMap[cellId];
                if (cell.BlockType !== 'CELL' || !cell.Relationships) {
                    continue;
                }
                const holdsWord = cell.Relationships.some(
                    cellRel =>
                        cellRel.Type === 'CHILD' &&
                        cellRel.Ids?.includes(wordBlock.Id!)
                );
                if (holdsWord) {
                    return true;
                }
            }
        }
    }
    return false;
}
|
|
156
|
+
|
|
157
|
+
/**
 * Lays out the CELL children of a table block as a dense, row-major grid.
 *
 * Each cell is placed at its 1-based `RowIndex` / `ColumnIndex` (a missing
 * index is treated as 1). Column positions that precede a placed cell and have
 * no cell of their own hold the placeholder `{ text: '', confidence: 0 }`.
 * Rows that precede a placed row and have no cell of their own are empty
 * arrays, so `rows` never contains holes and is safe to iterate. Child ids
 * that are missing from `blocksMap` are skipped.
 *
 * @param tableResult - TABLE block whose CHILD relationships list the cells.
 * @param blocksMap - Lookup of blocks keyed by block id.
 * @returns The grid, each entry carrying the cell text and its confidence.
 */
private getRowsColumnsMap(
    tableResult: Block,
    blocksMap: BlocksMap
): {
    rows: Array<Array<{ text: string; confidence: number }>>;
} {
    const rows: Array<Array<{ text: string; confidence: number }>> = [];

    for (const relationship of tableResult.Relationships || []) {
        if (relationship.Type !== 'CHILD') {
            continue;
        }
        for (const childId of relationship.Ids || []) {
            const cell = blocksMap[childId];
            if (!cell || cell.BlockType !== 'CELL') {
                continue;
            }
            const rowIndex = cell.RowIndex || 1;
            const colIndex = cell.ColumnIndex || 1;

            // Create this row and any missing rows before it, so the outer
            // array stays dense even when cells arrive out of order.
            while (rows.length < rowIndex) {
                rows.push([]);
            }
            const row = rows[rowIndex - 1];

            // Pad skipped columns so the cell can be stored at colIndex - 1.
            while (row.length < colIndex - 1) {
                row.push({ text: '', confidence: 0 });
            }
            row[colIndex - 1] = {
                text: this.getText(cell, blocksMap),
                confidence: cell.Confidence || 0
            };
        }
    }

    return { rows };
}
|
|
198
|
+
|
|
199
|
+
/**
 * Renders a table block as CSV and computes its average cell confidence.
 *
 * @param tableResult - TABLE block to render.
 * @param blocksMap - Lookup of blocks keyed by block id.
 * @param _tableIndex - Unused.
 * @param _pageNumber - Unused.
 * @returns The CSV text (every field quoted, `\n` line endings) and the mean
 *          confidence over all grid entries, or 0 when the grid is empty.
 */
private generateTableCSV(
    tableResult: Block,
    blocksMap: BlocksMap,
    _tableIndex: number,
    _pageNumber: number
): { csv: string; tableConfidence: number } {
    const { rows } = this.getRowsColumnsMap(tableResult, blocksMap);

    const csvRows: string[][] = [];
    let confidenceSum = 0;
    let cellTotal = 0;

    for (const row of rows) {
        const line: string[] = [];
        for (const { text, confidence } of row) {
            line.push(text.trim());
            confidenceSum += confidence;
            cellTotal += 1;
        }
        csvRows.push(line);
    }

    // Plain arithmetic mean; an empty grid reports 0 rather than NaN.
    const tableConfidence = cellTotal === 0 ? 0 : confidenceSum / cellTotal;

    return {
        csv: Papa.unparse(csvRows, {
            delimiter: ',',
            quotes: true,
            quoteChar: '"',
            escapeChar: '"',
            header: false,
            newline: '\n',
            skipEmptyLines: false
        }),
        tableConfidence
    };
}
|
|
240
|
+
|
|
241
|
+
/**
 * Stores the given buffer in S3 under this instance's bucket and file key.
 *
 * @param fileBuf - File content to upload.
 */
async upload(fileBuf: Buffer): Promise<void> {
    this.log.info('Uploading file to S3', { fileKey: this.fileKey });
    const putRequest = new PutObjectCommand({
        Bucket: this.bucket,
        Key: this.fileKey,
        Body: fileBuf,
    });
    await this.s3Client.send(putRequest);
}
|
|
250
|
+
|
|
251
|
+
/**
 * Starts an asynchronous Textract document analysis with the TABLES feature
 * for an object in this instance's bucket.
 *
 * @param s3Key - Key of the object to analyse.
 * @returns The job id reported by the service.
 */
async startAnalysis(s3Key: string): Promise<string> {
    const { JobId } = await this.textractClient.send(
        new StartDocumentAnalysisCommand({
            DocumentLocation: {
                S3Object: { Bucket: this.bucket, Name: s3Key }
            },
            FeatureTypes: ["TABLES"]
        })
    );
    return JobId!;
}
|
|
264
|
+
|
|
265
|
+
/**
 * Fetches the current status of a Textract analysis job.
 *
 * @param jobId - Id of the job to query.
 * @returns The `JobStatus` value from the response.
 */
async checkJobStatus(jobId: string): Promise<string> {
    const { JobStatus } = await this.textractClient.send(
        new GetDocumentAnalysisCommand({ JobId: jobId })
    );
    return JobStatus!;
}
|
|
270
|
+
|
|
271
|
+
/**
 * Builds an `[IMAGE_<position>]` placeholder line from a block's bounding box.
 *
 * Returns an empty string when the block has no bounding box or when
 * width * height is below 0.05. The position is an optional `TOP_` / `BOTTOM_`
 * prefix (Top below 0.3 / above 0.7) followed by `LEFT`, `RIGHT` or `CENTER`
 * (Left below 0.3 / above 0.7 / otherwise).
 *
 * @param block - Block whose geometry is inspected.
 * @returns The placeholder terminated by a newline, or `''`.
 */
private getImagePlaceholder(block: Block): string {
    const box = block.Geometry?.BoundingBox;
    if (!box) {
        return '';
    }

    const coverage = (box.Width || 0) * (box.Height || 0);
    if (coverage < 0.05) {
        // Too small to be worth a placeholder.
        return '';
    }

    const top = box.Top || 0;
    const left = box.Left || 0;

    const vertical = top < 0.3 ? 'TOP_' : top > 0.7 ? 'BOTTOM_' : '';
    const horizontal = left < 0.3 ? 'LEFT' : left > 0.7 ? 'RIGHT' : 'CENTER';

    return `[IMAGE_${vertical}${horizontal}]\n`;
}
|
|
290
|
+
|
|
291
|
+
/**
 * Maps a block's bounding-box `Left` value to an indentation level.
 *
 * @param block - Block whose geometry is inspected; a missing `Left` counts as 0.
 * @returns 0 when Left is below 0.15, 1 when below 0.25, otherwise 2.
 */
private getIndentationLevel(block: Block): number {
    const leftEdge = block.Geometry?.BoundingBox?.Left || 0;
    return leftEdge < 0.15 ? 0 : leftEdge < 0.25 ? 1 : 2;
}
|
|
297
|
+
|
|
298
|
+
/**
 * Heuristic header test based on the vertical gap to the previous block.
 *
 * @param block - Block being classified.
 * @param prevBlock - Block that precedes it, or `null` when there is none.
 * @returns `true` when there is no previous block, or when this block's Top
 *          lies more than 0.03 below the previous block's Top + Height.
 */
private isLikelyHeader(block: Block, prevBlock: Block | null): boolean {
    if (!prevBlock) {
        return true;
    }
    const prevBox = prevBlock.Geometry?.BoundingBox;
    const prevBottom = (prevBox?.Top || 0) + (prevBox?.Height || 0);
    const currentTop = block.Geometry?.BoundingBox?.Top || 0;
    return currentTop - prevBottom > 0.03;
}
|
|
305
|
+
|
|
306
|
+
/**
 * Formats a block's text as one indented, newline-terminated line.
 *
 * The indent is the indent string repeated once per indentation level. When
 * the block is classified as a likely header, an extra newline is put in
 * front of the line.
 *
 * @param block - Block to format; a missing `Text` is rendered as empty.
 * @param prevBlock - Preceding block used for the header heuristic, or `null`.
 * @returns The formatted line.
 */
private formatTextBlock(block: Block, prevBlock: Block | null): string {
    const indent = ' '.repeat(this.getIndentationLevel(block));
    const line = `${indent}${block.Text || ''}\n`;
    return this.isLikelyHeader(block, prevBlock) ? `\n${line}` : line;
}
|
|
316
|
+
|
|
317
|
+
/**
 * Decides whether two consecutive lines are close enough to be merged.
 *
 * @param prev - Earlier line.
 * @param current - Later line.
 * @returns `true` when current's Top minus prev's Top + Height is below 0.02.
 */
private shouldMergeLines(prev: Block, current: Block): boolean {
    const prevBox = prev.Geometry?.BoundingBox;
    const prevBottom = (prevBox?.Top || 0) + (prevBox?.Height || 0);
    const currentTop = current.Geometry?.BoundingBox?.Top || 0;
    // Lines separated by less than this gap are treated as contiguous.
    return currentTop - prevBottom < 0.02;
}
|
|
329
|
+
|
|
330
|
+
/**
 * Downloads every result page of a Textract analysis job and renders the
 * document as tagged text.
 *
 * Steps:
 *  1. Follow `NextToken` until all blocks have been collected.
 *  2. Index the blocks by id, then sort them by page and by bounding-box Top.
 *  3. Walk the sorted blocks. A PAGE block opens a new page. Within a page,
 *     TABLE blocks become CSV, LINE blocks outside tables are accumulated
 *     into text runs (merged when vertically close), and, when
 *     `detectImages` is set, any other block that yields an image placeholder
 *     becomes an image entry.
 *  4. Serialise the pages as `<page>` elements holding `<text>`, `<table>`
 *     and `<image>` elements. Tables and images are numbered across the
 *     whole document.
 *
 * @param jobId - Id of the Textract analysis job.
 * @returns The serialised document.
 */
async processResults(jobId: string): Promise<string> {
    // 1. Drain the paginated responses.
    const allBlocks: Block[] = [];
    let nextToken: string | undefined;
    do {
        const response = await this.textractClient.send(
            new GetDocumentAnalysisCommand({
                JobId: jobId,
                NextToken: nextToken
            })
        );
        for (const fetched of response.Blocks || []) {
            allBlocks.push(fetched);
        }
        nextToken = response.NextToken;
    } while (nextToken);

    // 2. Index by id, then order by page and vertical position.
    const blocksMap: BlocksMap = {};
    for (const block of allBlocks) {
        blocksMap[block.Id!] = block;
    }

    allBlocks.sort((a, b) => {
        if (a.Page !== b.Page) return (a.Page || 0) - (b.Page || 0);
        return (a.Geometry?.BoundingBox?.Top || 0) - (b.Geometry?.BoundingBox?.Top || 0);
    });

    // 3. Group blocks into pages.
    const pageContents: PageContent[] = [];
    let currentPage: PageContent | null = null;
    let currentTextContent = "";
    let prevLineBlock: Block | null = null;

    // Emits the text run being built (if it holds anything besides
    // whitespace) onto `page` and starts a fresh run.
    const flushText = (page: PageContent): void => {
        if (currentTextContent.trim().length > 0) {
            page.blocks.push({
                type: 'text',
                content: currentTextContent
            });
            currentTextContent = "";
        }
    };

    for (const block of allBlocks) {
        if (block.BlockType === 'PAGE') {
            // Close the previous page before opening the next one.
            if (currentPage) {
                flushText(currentPage);
                pageContents.push(currentPage);
            }
            currentPage = {
                pageNumber: block.Page || 0,
                blocks: []
            };
            currentTextContent = "";
            prevLineBlock = null;
            continue;
        }

        // Ignore blocks seen before the first PAGE block or belonging to another page.
        if (!currentPage || block.Page !== currentPage.pageNumber) {
            continue;
        }

        if (block.BlockType === 'TABLE') {
            flushText(currentPage);
            const tableIndex = currentPage.blocks.filter(b => b.type === 'table').length + 1;
            const { csv, tableConfidence } = this.generateTableCSV(
                block,
                blocksMap,
                tableIndex,
                currentPage.pageNumber
            );
            currentPage.blocks.push({
                type: 'table',
                content: csv,
                confidence: tableConfidence
            });
            // A table breaks the paragraph: the next line starts a new run.
            prevLineBlock = null;
        } else if (block.BlockType === 'LINE' && !this.isBlockInTable(block, blocksMap)) {
            if (prevLineBlock && this.shouldMergeLines(prevLineBlock, block)) {
                // Same paragraph: drop any leading newline added by the header
                // heuristic and append the line to the current run.
                const formatted = this.formatTextBlock(block, prevLineBlock);
                const mergedText = formatted.replace(/^\s*\n/, " ");
                currentTextContent += " " + mergedText.trim();
            } else {
                flushText(currentPage);
                currentTextContent = this.formatTextBlock(block, prevLineBlock).trim();
            }
            prevLineBlock = block;
        } else if (this.detectImages) {
            const geometry = block.Geometry?.BoundingBox;
            if (geometry && geometry.Width && geometry.Height) {
                const imagePlaceholder = this.getImagePlaceholder(block);
                if (imagePlaceholder) {
                    flushText(currentPage);
                    currentPage.blocks.push({
                        type: 'image',
                        content: imagePlaceholder,
                        left: geometry.Left,
                        top: geometry.Top,
                        width: geometry.Width,
                        height: geometry.Height
                    });
                }
            }
            // prevLineBlock is intentionally left untouched here.
        }
    }

    // Close the last page.
    if (currentPage) {
        flushText(currentPage);
        pageContents.push(currentPage);
    }

    // 4. Serialise.
    // Formats an optional geometry value as ` name="0.1234"`. The check is
    // against undefined/null rather than truthiness, so a coordinate of
    // exactly 0 is still written out.
    const geometryAttr = (name: string, value: number | undefined): string =>
        value !== undefined && value !== null ? ` ${name}="${value.toFixed(4)}"` : '';

    let fullText = '';
    let imgNumber = 1;
    let tableNumber = 1;
    for (const page of pageContents) {
        fullText += `<page number="${page.pageNumber}">\n`;
        for (const block of page.blocks) {
            if (block.type === 'text') {
                fullText += `<text>\n${block.content}\n</text>\n\n`;
            } else if (block.type === 'table') {
                const confidenceAttr = block.confidence !== undefined && this.includeConfidenceInTables
                    ? ` confidence="${block.confidence.toFixed(2)}"`
                    : '';
                // The tag layout (unquoted number, spacing) is kept exactly as
                // before so existing consumers of this output are unaffected.
                fullText += `<table number=${tableNumber++} type="csv" ${confidenceAttr}>\n`;
                fullText += `${block.content}\n`;
                fullText += `</table>\n\n`;
            } else if (block.type === 'image') {
                const leftAttr = geometryAttr('left', block.left);
                const topAttr = geometryAttr('top', block.top);
                const widthAttr = geometryAttr('width', block.width);
                const heightAttr = geometryAttr('height', block.height);

                fullText += `<image id="${imgNumber++}" ${leftAttr}${topAttr}${widthAttr}${heightAttr}>\n${block.content.trim()}\n</image>\n\n`;
            }
        }
        fullText += `</page>\n\n`;
    }

    return fullText;
}
|
|
505
|
+
|
|
506
|
+
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import { exec } from "child_process";
|
|
4
|
+
import { promisify } from "util";
|
|
5
|
+
import { expect, test, vi, describe } from "vitest";
|
|
6
|
+
|
|
7
|
+
// Mock Temporal activity context
|
|
8
|
+
vi.mock("@temporalio/activity", () => ({
|
|
9
|
+
log: {
|
|
10
|
+
info: vi.fn(),
|
|
11
|
+
warn: vi.fn(),
|
|
12
|
+
error: vi.fn(),
|
|
13
|
+
},
|
|
14
|
+
}));
|
|
15
|
+
|
|
16
|
+
// Import after mocking
|
|
17
|
+
import { imageResizer } from "../conversion/image";
|
|
18
|
+
|
|
19
|
+
const execAsync = promisify(exec);
|
|
20
|
+
|
|
21
|
+
describe("ImageMagick image resizing", () => {
    // Fixture shared by every test that needs a real image on disk.
    const fixtureImagePath = path.join(__dirname, "../../fixtures", "cat-picture.jpg");

    /**
     * Runs ImageMagick `identify` on a file with the given `-format` string
     * and returns its standard output with surrounding whitespace removed.
     */
    const identify = async (formatSpec: string, imagePath: string): Promise<string> => {
        const { stdout } = await execAsync(`identify -format "${formatSpec}" "${imagePath}"`);
        return stdout.trim();
    };

    test("should resize an image to a maximum height or width using ImageMagick", async () => {
        const max_hw = 1596;
        const format = "jpeg";

        // Make sure the input file exists
        expect(fs.existsSync(fixtureImagePath)).toBe(true);

        // Call the imageResizer function with a file path
        const resizedImagePath = await imageResizer(fixtureImagePath, max_hw, format);

        // Make sure the output file exists
        expect(fs.existsSync(resizedImagePath)).toBe(true);

        // Read width, height and format of the resized image
        const [width, height, imageFormat] = (await identify("%w %h %m", resizedImagePath)).split(" ");

        console.log({ width, height, imageFormat });

        // Check dimensions (explicit radix so the values are always read as decimal)
        expect(Number.parseInt(width, 10)).to.be.lessThanOrEqual(max_hw);
        expect(Number.parseInt(height, 10)).to.be.lessThanOrEqual(max_hw);

        // Check format (JPEG)
        expect(imageFormat.toLowerCase()).to.equal("jpeg");
    });

    test("should throw an error for non-existent input file", async () => {
        const max_hw = 1596;
        const format = "jpeg";
        const nonExistentPath = path.join(__dirname, "non-existent-image.jpg");

        // Verify file doesn't exist
        expect(fs.existsSync(nonExistentPath)).toBe(false);

        // Expect the function to throw an error
        await expect(imageResizer(nonExistentPath, max_hw, format)).rejects.toThrow("Input file does not exist");
    });

    test("should throw error with empty format", async () => {
        const max_hw = 1596;
        const format = "";

        // Test for empty format validation
        await expect(imageResizer(fixtureImagePath, max_hw, format)).rejects.toThrow("Invalid format");
    });

    test("should create progressive/interlaced image when enabled", async () => {
        const max_hw = 800;
        const format = "jpeg";

        // Make sure the input file exists
        expect(fs.existsSync(fixtureImagePath)).toBe(true);

        // Call the imageResizer function with progressive=true
        const resizedImagePath = await imageResizer(fixtureImagePath, max_hw, format, true);

        // Make sure the output file exists
        expect(fs.existsSync(resizedImagePath)).toBe(true);

        // Ask identify for the interlace mode of the resized image
        const interlaceMode = await identify("%[interlace]", resizedImagePath);

        console.log({ interlaceMode });

        // Interlacing counts as enabled when the mode is 'JPEG', 'Line' or 'Plane'
        expect(["JPEG", "Line", "Plane"]).to.include(interlaceMode);
    });

    test("should create non-interlaced image when progressive is disabled", async () => {
        const max_hw = 800;
        const format = "jpeg";

        // Make sure the input file exists
        expect(fs.existsSync(fixtureImagePath)).toBe(true);

        // Call the imageResizer function with progressive=false
        const resizedImagePath = await imageResizer(fixtureImagePath, max_hw, format, false);

        // Make sure the output file exists
        expect(fs.existsSync(resizedImagePath)).toBe(true);

        // Ask identify for the interlace mode of the resized image
        const interlaceMode = (await identify("%[interlace]", resizedImagePath)).toLowerCase();

        console.log({ interlaceMode });

        // Interlacing counts as disabled when the mode is 'none' or an empty string
        expect(["none", ""]).to.include(interlaceMode);
    });
});
|