@vertesia/workflow 0.42.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +13 -0
- package/README.md +24 -0
- package/bin/bundle-workflows.mjs +26 -0
- package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js +32 -0
- package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js.map +1 -0
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +66 -0
- package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -0
- package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js +18 -0
- package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js.map +1 -0
- package/lib/cjs/activities/chunkDocument.js +79 -0
- package/lib/cjs/activities/chunkDocument.js.map +1 -0
- package/lib/cjs/activities/createDocumentFromOther.js +64 -0
- package/lib/cjs/activities/createDocumentFromOther.js.map +1 -0
- package/lib/cjs/activities/executeInteraction.js +135 -0
- package/lib/cjs/activities/executeInteraction.js.map +1 -0
- package/lib/cjs/activities/extractDocumentText.js +140 -0
- package/lib/cjs/activities/extractDocumentText.js.map +1 -0
- package/lib/cjs/activities/generateDocumentProperties.js +59 -0
- package/lib/cjs/activities/generateDocumentProperties.js.map +1 -0
- package/lib/cjs/activities/generateEmbeddings.js +292 -0
- package/lib/cjs/activities/generateEmbeddings.js.map +1 -0
- package/lib/cjs/activities/generateImageRendition.js +104 -0
- package/lib/cjs/activities/generateImageRendition.js.map +1 -0
- package/lib/cjs/activities/generateOrAssignContentType.js +103 -0
- package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -0
- package/lib/cjs/activities/getObjectFromStore.js +20 -0
- package/lib/cjs/activities/getObjectFromStore.js.map +1 -0
- package/lib/cjs/activities/index.js +54 -0
- package/lib/cjs/activities/index.js.map +1 -0
- package/lib/cjs/activities/media/processPdfWithTextract.js +102 -0
- package/lib/cjs/activities/media/processPdfWithTextract.js.map +1 -0
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js +51 -0
- package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -0
- package/lib/cjs/activities/notifyWebhook.js +34 -0
- package/lib/cjs/activities/notifyWebhook.js.map +1 -0
- package/lib/cjs/activities/setDocumentStatus.js +15 -0
- package/lib/cjs/activities/setDocumentStatus.js.map +1 -0
- package/lib/cjs/conversion/TextractProcessor.js +416 -0
- package/lib/cjs/conversion/TextractProcessor.js.map +1 -0
- package/lib/cjs/conversion/image.js +22 -0
- package/lib/cjs/conversion/image.js.map +1 -0
- package/lib/cjs/conversion/mutool.js +147 -0
- package/lib/cjs/conversion/mutool.js.map +1 -0
- package/lib/cjs/conversion/pandoc.js +39 -0
- package/lib/cjs/conversion/pandoc.js.map +1 -0
- package/lib/cjs/conversion/pdf.js +13 -0
- package/lib/cjs/conversion/pdf.js.map +1 -0
- package/lib/cjs/dsl/conditions.js +81 -0
- package/lib/cjs/dsl/conditions.js.map +1 -0
- package/lib/cjs/dsl/dsl-workflow.js +223 -0
- package/lib/cjs/dsl/dsl-workflow.js.map +1 -0
- package/lib/cjs/dsl/projections.js +59 -0
- package/lib/cjs/dsl/projections.js.map +1 -0
- package/lib/cjs/dsl/setup/ActivityContext.js +96 -0
- package/lib/cjs/dsl/setup/ActivityContext.js.map +1 -0
- package/lib/cjs/dsl/setup/fetch/DataProvider.js +51 -0
- package/lib/cjs/dsl/setup/fetch/DataProvider.js.map +1 -0
- package/lib/cjs/dsl/setup/fetch/index.js +16 -0
- package/lib/cjs/dsl/setup/fetch/index.js.map +1 -0
- package/lib/cjs/dsl/setup/fetch/providers.js +67 -0
- package/lib/cjs/dsl/setup/fetch/providers.js.map +1 -0
- package/lib/cjs/dsl/test/test-child-workflow.js +10 -0
- package/lib/cjs/dsl/test/test-child-workflow.js.map +1 -0
- package/lib/cjs/dsl/validation.js +122 -0
- package/lib/cjs/dsl/validation.js.map +1 -0
- package/lib/cjs/dsl/vars.js +341 -0
- package/lib/cjs/dsl/vars.js.map +1 -0
- package/lib/cjs/dsl/walk.js +100 -0
- package/lib/cjs/dsl/walk.js.map +1 -0
- package/lib/cjs/errors.js +36 -0
- package/lib/cjs/errors.js.map +1 -0
- package/lib/cjs/index.js +43 -0
- package/lib/cjs/index.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/extractToc.js +47 -0
- package/lib/cjs/iterative-generation/activities/extractToc.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js +69 -0
- package/lib/cjs/iterative-generation/activities/finalizeOutput.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/generatePart.js +73 -0
- package/lib/cjs/iterative-generation/activities/generatePart.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/generateToc.js +91 -0
- package/lib/cjs/iterative-generation/activities/generateToc.js.map +1 -0
- package/lib/cjs/iterative-generation/activities/index.js +12 -0
- package/lib/cjs/iterative-generation/activities/index.js.map +1 -0
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +55 -0
- package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +1 -0
- package/lib/cjs/iterative-generation/types.js +5 -0
- package/lib/cjs/iterative-generation/types.js.map +1 -0
- package/lib/cjs/iterative-generation/utils.js +118 -0
- package/lib/cjs/iterative-generation/utils.js.map +1 -0
- package/lib/cjs/package.json +3 -0
- package/lib/cjs/result-types.js +10 -0
- package/lib/cjs/result-types.js.map +1 -0
- package/lib/cjs/system/generateObjectText.js +89 -0
- package/lib/cjs/system/generateObjectText.js.map +1 -0
- package/lib/cjs/system/notifyWebhookWorkflow.js +52 -0
- package/lib/cjs/system/notifyWebhookWorkflow.js.map +1 -0
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js +37 -0
- package/lib/cjs/system/recalculateEmbeddingsWorkflow.js.map +1 -0
- package/lib/cjs/utils/auth.js +15 -0
- package/lib/cjs/utils/auth.js.map +1 -0
- package/lib/cjs/utils/blobs.js +63 -0
- package/lib/cjs/utils/blobs.js.map +1 -0
- package/lib/cjs/utils/client.js +25 -0
- package/lib/cjs/utils/client.js.map +1 -0
- package/lib/cjs/utils/expand-vars.js +33 -0
- package/lib/cjs/utils/expand-vars.js.map +1 -0
- package/lib/cjs/utils/memory.js +72 -0
- package/lib/cjs/utils/memory.js.map +1 -0
- package/lib/cjs/utils/tokens.js +38 -0
- package/lib/cjs/utils/tokens.js.map +1 -0
- package/lib/cjs/vars.js +20 -0
- package/lib/cjs/vars.js.map +1 -0
- package/lib/cjs/workflows.js +17 -0
- package/lib/cjs/workflows.js.map +1 -0
- package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js +29 -0
- package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js.map +1 -0
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +63 -0
- package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -0
- package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js +15 -0
- package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js.map +1 -0
- package/lib/esm/activities/chunkDocument.js +76 -0
- package/lib/esm/activities/chunkDocument.js.map +1 -0
- package/lib/esm/activities/createDocumentFromOther.js +58 -0
- package/lib/esm/activities/createDocumentFromOther.js.map +1 -0
- package/lib/esm/activities/executeInteraction.js +131 -0
- package/lib/esm/activities/executeInteraction.js.map +1 -0
- package/lib/esm/activities/extractDocumentText.js +137 -0
- package/lib/esm/activities/extractDocumentText.js.map +1 -0
- package/lib/esm/activities/generateDocumentProperties.js +56 -0
- package/lib/esm/activities/generateDocumentProperties.js.map +1 -0
- package/lib/esm/activities/generateEmbeddings.js +256 -0
- package/lib/esm/activities/generateEmbeddings.js.map +1 -0
- package/lib/esm/activities/generateImageRendition.js +98 -0
- package/lib/esm/activities/generateImageRendition.js.map +1 -0
- package/lib/esm/activities/generateOrAssignContentType.js +100 -0
- package/lib/esm/activities/generateOrAssignContentType.js.map +1 -0
- package/lib/esm/activities/getObjectFromStore.js +17 -0
- package/lib/esm/activities/getObjectFromStore.js.map +1 -0
- package/lib/esm/activities/index.js +21 -0
- package/lib/esm/activities/index.js.map +1 -0
- package/lib/esm/activities/media/processPdfWithTextract.js +98 -0
- package/lib/esm/activities/media/processPdfWithTextract.js.map +1 -0
- package/lib/esm/activities/media/transcribeMediaWithGladia.js +48 -0
- package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -0
- package/lib/esm/activities/notifyWebhook.js +31 -0
- package/lib/esm/activities/notifyWebhook.js.map +1 -0
- package/lib/esm/activities/setDocumentStatus.js +12 -0
- package/lib/esm/activities/setDocumentStatus.js.map +1 -0
- package/lib/esm/conversion/TextractProcessor.js +409 -0
- package/lib/esm/conversion/TextractProcessor.js.map +1 -0
- package/lib/esm/conversion/image.js +16 -0
- package/lib/esm/conversion/image.js.map +1 -0
- package/lib/esm/conversion/mutool.js +139 -0
- package/lib/esm/conversion/mutool.js.map +1 -0
- package/lib/esm/conversion/pandoc.js +36 -0
- package/lib/esm/conversion/pandoc.js.map +1 -0
- package/lib/esm/conversion/pdf.js +7 -0
- package/lib/esm/conversion/pdf.js.map +1 -0
- package/lib/esm/dsl/conditions.js +75 -0
- package/lib/esm/dsl/conditions.js.map +1 -0
- package/lib/esm/dsl/dsl-workflow.js +216 -0
- package/lib/esm/dsl/dsl-workflow.js.map +1 -0
- package/lib/esm/dsl/projections.js +55 -0
- package/lib/esm/dsl/projections.js.map +1 -0
- package/lib/esm/dsl/setup/ActivityContext.js +91 -0
- package/lib/esm/dsl/setup/ActivityContext.js.map +1 -0
- package/lib/esm/dsl/setup/fetch/DataProvider.js +47 -0
- package/lib/esm/dsl/setup/fetch/DataProvider.js.map +1 -0
- package/lib/esm/dsl/setup/fetch/index.js +12 -0
- package/lib/esm/dsl/setup/fetch/index.js.map +1 -0
- package/lib/esm/dsl/setup/fetch/providers.js +61 -0
- package/lib/esm/dsl/setup/fetch/providers.js.map +1 -0
- package/lib/esm/dsl/test/test-child-workflow.js +5 -0
- package/lib/esm/dsl/test/test-child-workflow.js.map +1 -0
- package/lib/esm/dsl/validation.js +118 -0
- package/lib/esm/dsl/validation.js.map +1 -0
- package/lib/esm/dsl/vars.js +335 -0
- package/lib/esm/dsl/vars.js.map +1 -0
- package/lib/esm/dsl/walk.js +96 -0
- package/lib/esm/dsl/walk.js.map +1 -0
- package/lib/esm/errors.js +30 -0
- package/lib/esm/errors.js.map +1 -0
- package/lib/esm/index.js +25 -0
- package/lib/esm/index.js.map +1 -0
- package/lib/esm/iterative-generation/activities/extractToc.js +44 -0
- package/lib/esm/iterative-generation/activities/extractToc.js.map +1 -0
- package/lib/esm/iterative-generation/activities/finalizeOutput.js +66 -0
- package/lib/esm/iterative-generation/activities/finalizeOutput.js.map +1 -0
- package/lib/esm/iterative-generation/activities/generatePart.js +70 -0
- package/lib/esm/iterative-generation/activities/generatePart.js.map +1 -0
- package/lib/esm/iterative-generation/activities/generateToc.js +88 -0
- package/lib/esm/iterative-generation/activities/generateToc.js.map +1 -0
- package/lib/esm/iterative-generation/activities/index.js +5 -0
- package/lib/esm/iterative-generation/activities/index.js.map +1 -0
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +52 -0
- package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +1 -0
- package/lib/esm/iterative-generation/types.js +2 -0
- package/lib/esm/iterative-generation/types.js.map +1 -0
- package/lib/esm/iterative-generation/utils.js +109 -0
- package/lib/esm/iterative-generation/utils.js.map +1 -0
- package/lib/esm/result-types.js +7 -0
- package/lib/esm/result-types.js.map +1 -0
- package/lib/esm/system/generateObjectText.js +86 -0
- package/lib/esm/system/generateObjectText.js.map +1 -0
- package/lib/esm/system/notifyWebhookWorkflow.js +49 -0
- package/lib/esm/system/notifyWebhookWorkflow.js.map +1 -0
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js +34 -0
- package/lib/esm/system/recalculateEmbeddingsWorkflow.js.map +1 -0
- package/lib/esm/utils/auth.js +8 -0
- package/lib/esm/utils/auth.js.map +1 -0
- package/lib/esm/utils/blobs.js +52 -0
- package/lib/esm/utils/blobs.js.map +1 -0
- package/lib/esm/utils/client.js +22 -0
- package/lib/esm/utils/client.js.map +1 -0
- package/lib/esm/utils/expand-vars.js +30 -0
- package/lib/esm/utils/expand-vars.js.map +1 -0
- package/lib/esm/utils/memory.js +60 -0
- package/lib/esm/utils/memory.js.map +1 -0
- package/lib/esm/utils/tokens.js +34 -0
- package/lib/esm/utils/tokens.js.map +1 -0
- package/lib/esm/vars.js +4 -0
- package/lib/esm/vars.js.map +1 -0
- package/lib/esm/workflows.js +9 -0
- package/lib/esm/workflows.js.map +1 -0
- package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts +17 -0
- package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts.map +1 -0
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts +29 -0
- package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts.map +1 -0
- package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts +19 -0
- package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts.map +1 -0
- package/lib/types/activities/chunkDocument.d.ts +18 -0
- package/lib/types/activities/chunkDocument.d.ts.map +1 -0
- package/lib/types/activities/createDocumentFromOther.d.ts +21 -0
- package/lib/types/activities/createDocumentFromOther.d.ts.map +1 -0
- package/lib/types/activities/executeInteraction.d.ts +40 -0
- package/lib/types/activities/executeInteraction.d.ts.map +1 -0
- package/lib/types/activities/extractDocumentText.d.ts +9 -0
- package/lib/types/activities/extractDocumentText.d.ts.map +1 -0
- package/lib/types/activities/generateDocumentProperties.d.ts +32 -0
- package/lib/types/activities/generateDocumentProperties.d.ts.map +1 -0
- package/lib/types/activities/generateEmbeddings.d.ts +49 -0
- package/lib/types/activities/generateEmbeddings.d.ts.map +1 -0
- package/lib/types/activities/generateImageRendition.d.ts +17 -0
- package/lib/types/activities/generateImageRendition.d.ts.map +1 -0
- package/lib/types/activities/generateOrAssignContentType.d.ts +44 -0
- package/lib/types/activities/generateOrAssignContentType.d.ts.map +1 -0
- package/lib/types/activities/getObjectFromStore.d.ts +14 -0
- package/lib/types/activities/getObjectFromStore.d.ts.map +1 -0
- package/lib/types/activities/index.d.ts +21 -0
- package/lib/types/activities/index.d.ts.map +1 -0
- package/lib/types/activities/media/processPdfWithTextract.d.ts +26 -0
- package/lib/types/activities/media/processPdfWithTextract.d.ts.map +1 -0
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts +14 -0
- package/lib/types/activities/media/transcribeMediaWithGladia.d.ts.map +1 -0
- package/lib/types/activities/notifyWebhook.d.ts +17 -0
- package/lib/types/activities/notifyWebhook.d.ts.map +1 -0
- package/lib/types/activities/setDocumentStatus.d.ts +15 -0
- package/lib/types/activities/setDocumentStatus.d.ts.map +1 -0
- package/lib/types/conversion/TextractProcessor.d.ts +45 -0
- package/lib/types/conversion/TextractProcessor.d.ts.map +1 -0
- package/lib/types/conversion/image.d.ts +9 -0
- package/lib/types/conversion/image.d.ts.map +1 -0
- package/lib/types/conversion/mutool.d.ts +19 -0
- package/lib/types/conversion/mutool.d.ts.map +1 -0
- package/lib/types/conversion/pandoc.d.ts +2 -0
- package/lib/types/conversion/pandoc.d.ts.map +1 -0
- package/lib/types/conversion/pdf.d.ts +2 -0
- package/lib/types/conversion/pdf.d.ts.map +1 -0
- package/lib/types/dsl/conditions.d.ts +2 -0
- package/lib/types/dsl/conditions.d.ts.map +1 -0
- package/lib/types/dsl/dsl-workflow.d.ts +5 -0
- package/lib/types/dsl/dsl-workflow.d.ts.map +1 -0
- package/lib/types/dsl/projections.d.ts +4 -0
- package/lib/types/dsl/projections.d.ts.map +1 -0
- package/lib/types/dsl/setup/ActivityContext.d.ts +14 -0
- package/lib/types/dsl/setup/ActivityContext.d.ts.map +1 -0
- package/lib/types/dsl/setup/fetch/DataProvider.d.ts +9 -0
- package/lib/types/dsl/setup/fetch/DataProvider.d.ts.map +1 -0
- package/lib/types/dsl/setup/fetch/index.d.ts +6 -0
- package/lib/types/dsl/setup/fetch/index.d.ts.map +1 -0
- package/lib/types/dsl/setup/fetch/providers.d.ts +25 -0
- package/lib/types/dsl/setup/fetch/providers.d.ts.map +1 -0
- package/lib/types/dsl/test/test-child-workflow.d.ts +4 -0
- package/lib/types/dsl/test/test-child-workflow.d.ts.map +1 -0
- package/lib/types/dsl/validation.d.ts +4 -0
- package/lib/types/dsl/validation.d.ts.map +1 -0
- package/lib/types/dsl/vars.d.ts +48 -0
- package/lib/types/dsl/vars.d.ts.map +1 -0
- package/lib/types/dsl/walk.d.ts +18 -0
- package/lib/types/dsl/walk.d.ts.map +1 -0
- package/lib/types/errors.d.ts +16 -0
- package/lib/types/errors.d.ts.map +1 -0
- package/lib/types/index.d.ts +24 -0
- package/lib/types/index.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/extractToc.d.ts +10 -0
- package/lib/types/iterative-generation/activities/extractToc.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts +3 -0
- package/lib/types/iterative-generation/activities/finalizeOutput.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/generatePart.d.ts +3 -0
- package/lib/types/iterative-generation/activities/generatePart.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/generateToc.d.ts +4 -0
- package/lib/types/iterative-generation/activities/generateToc.d.ts.map +1 -0
- package/lib/types/iterative-generation/activities/index.d.ts +5 -0
- package/lib/types/iterative-generation/activities/index.d.ts.map +1 -0
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts +3 -0
- package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts.map +1 -0
- package/lib/types/iterative-generation/types.d.ts +79 -0
- package/lib/types/iterative-generation/types.d.ts.map +1 -0
- package/lib/types/iterative-generation/utils.d.ts +27 -0
- package/lib/types/iterative-generation/utils.d.ts.map +1 -0
- package/lib/types/result-types.d.ts +22 -0
- package/lib/types/result-types.d.ts.map +1 -0
- package/lib/types/system/generateObjectText.d.ts +4 -0
- package/lib/types/system/generateObjectText.d.ts.map +1 -0
- package/lib/types/system/notifyWebhookWorkflow.d.ts +6 -0
- package/lib/types/system/notifyWebhookWorkflow.d.ts.map +1 -0
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +40 -0
- package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +1 -0
- package/lib/types/utils/auth.d.ts +4 -0
- package/lib/types/utils/auth.d.ts.map +1 -0
- package/lib/types/utils/blobs.d.ts +8 -0
- package/lib/types/utils/blobs.d.ts.map +1 -0
- package/lib/types/utils/client.d.ts +7 -0
- package/lib/types/utils/client.d.ts.map +1 -0
- package/lib/types/utils/expand-vars.d.ts +8 -0
- package/lib/types/utils/expand-vars.d.ts.map +1 -0
- package/lib/types/utils/memory.d.ts +12 -0
- package/lib/types/utils/memory.d.ts.map +1 -0
- package/lib/types/utils/tokens.d.ts +11 -0
- package/lib/types/utils/tokens.d.ts.map +1 -0
- package/lib/types/vars.d.ts +3 -0
- package/lib/types/vars.d.ts.map +1 -0
- package/lib/types/workflows.d.ts +9 -0
- package/lib/types/workflows.d.ts.map +1 -0
- package/lib/workflows-bundle.js +18394 -0
- package/package.json +109 -0
- package/src/activities/advanced/createDocumentTypeFromInteractionRun.ts +54 -0
- package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +97 -0
- package/src/activities/advanced/updateDocumentFromInteractionRun.ts +34 -0
- package/src/activities/chunkDocument.ts +124 -0
- package/src/activities/createDocumentFromOther.ts +92 -0
- package/src/activities/executeInteraction.ts +191 -0
- package/src/activities/extractDocumentText.ts +174 -0
- package/src/activities/generateDocumentProperties.ts +93 -0
- package/src/activities/generateEmbeddings.ts +345 -0
- package/src/activities/generateImageRendition.ts +134 -0
- package/src/activities/generateOrAssignContentType.ts +152 -0
- package/src/activities/getObjectFromStore.ts +31 -0
- package/src/activities/index.ts +21 -0
- package/src/activities/media/processPdfWithTextract.ts +141 -0
- package/src/activities/media/transcribeMediaWithGladia.ts +83 -0
- package/src/activities/notifyWebhook.test.ts +32 -0
- package/src/activities/notifyWebhook.ts +51 -0
- package/src/activities/setDocumentStatus.ts +25 -0
- package/src/conversion/TextractProcessor.ts +505 -0
- package/src/conversion/image.test.ts +26 -0
- package/src/conversion/image.ts +22 -0
- package/src/conversion/mutool.test.ts +74 -0
- package/src/conversion/mutool.ts +180 -0
- package/src/conversion/pandoc.test.ts +22 -0
- package/src/conversion/pandoc.ts +44 -0
- package/src/conversion/pdf.test.ts +35 -0
- package/src/conversion/pdf.ts +8 -0
- package/src/dsl/conditions.ts +76 -0
- package/src/dsl/dsl-workflow.test.ts +58 -0
- package/src/dsl/dsl-workflow.ts +235 -0
- package/src/dsl/ms.d.ts +11 -0
- package/src/dsl/projections.test.ts +159 -0
- package/src/dsl/projections.ts +72 -0
- package/src/dsl/setup/ActivityContext.ts +106 -0
- package/src/dsl/setup/fetch/DataProvider.ts +45 -0
- package/src/dsl/setup/fetch/index.ts +19 -0
- package/src/dsl/setup/fetch/providers.ts +67 -0
- package/src/dsl/test/test-child-workflow.ts +6 -0
- package/src/dsl/validation.test.ts +257 -0
- package/src/dsl/validation.ts +125 -0
- package/src/dsl/vars.test.ts +245 -0
- package/src/dsl/vars.ts +340 -0
- package/src/dsl/walk.test.ts +81 -0
- package/src/dsl/walk.ts +103 -0
- package/src/dsl/workflow-exec-child.test.ts +182 -0
- package/src/dsl/workflow-fetch.test.ts +135 -0
- package/src/dsl/workflow-import.test.ts +89 -0
- package/src/dsl/workflow.test.ts +110 -0
- package/src/errors.ts +24 -0
- package/src/index.ts +27 -0
- package/src/iterative-generation/activities/extractToc.ts +49 -0
- package/src/iterative-generation/activities/finalizeOutput.ts +77 -0
- package/src/iterative-generation/activities/generatePart.ts +82 -0
- package/src/iterative-generation/activities/generateToc.ts +98 -0
- package/src/iterative-generation/activities/index.ts +4 -0
- package/src/iterative-generation/iterativeGenerationWorkflow.ts +67 -0
- package/src/iterative-generation/types.ts +99 -0
- package/src/iterative-generation/utils.ts +123 -0
- package/src/result-types.ts +25 -0
- package/src/system/generateObjectText.ts +109 -0
- package/src/system/notifyWebhookWorkflow.ts +64 -0
- package/src/system/recalculateEmbeddingsWorkflow.ts +46 -0
- package/src/utils/auth.ts +10 -0
- package/src/utils/blobs.ts +58 -0
- package/src/utils/client.ts +31 -0
- package/src/utils/expand-vars.ts +31 -0
- package/src/utils/memory.ts +66 -0
- package/src/utils/tokens.ts +44 -0
- package/src/vars.ts +3 -0
- package/src/workflows.ts +9 -0
@@ -0,0 +1,505 @@
|
|
1
|
+
import { PutObjectCommand, S3Client } from "@aws-sdk/client-s3";
|
2
|
+
import type { Block } from "@aws-sdk/client-textract";
|
3
|
+
import {
|
4
|
+
GetDocumentAnalysisCommand,
|
5
|
+
StartDocumentAnalysisCommand,
|
6
|
+
TextractClient
|
7
|
+
} from "@aws-sdk/client-textract";
|
8
|
+
import type { AwsCredentialIdentityProvider } from "@smithy/types";
|
9
|
+
import Papa from 'papaparse';
|
10
|
+
|
11
|
+
/**
 * Dictionary of Textract blocks keyed by a string identifier.
 * Within this file it is indexed with the ids listed in a block's
 * relationships in order to resolve child blocks.
 */
interface BlocksMap {
    [blockId: string]: Block;
}
|
14
|
+
|
15
|
+
/**
 * One piece of extracted page content.
 */
interface ContentBlock {
    /** Kind of content carried by this block. */
    type: 'text' | 'table' | 'image';
    /** The content itself, as a string. */
    content: string;
    /** Confidence value associated with the content, when available. */
    confidence?: number;

    // Bounding-box geometry; optional, provided when the block is an image.
    left?: number;
    top?: number;
    width?: number;
    height?: number;
}
|
25
|
+
|
26
|
+
/**
 * The content blocks collected for a single page.
 */
interface PageContent {
    /** Number of the page these blocks belong to. */
    pageNumber: number;
    /** Content blocks of the page. */
    blocks: ContentBlock[];
}
|
30
|
+
|
31
|
+
/**
 * Construction options for {@link TextractProcessor}.
 */
interface TextractProcessorOptions {
    /** S3 object key used when uploading the file. */
    fileKey: string;
    /** AWS region handed to both the Textract and the S3 client. */
    region: string;
    /** S3 bucket used for the upload and as the Textract document location. */
    bucket: string;
    /** Optional credentials provider handed to both AWS clients. */
    credentials?: AwsCredentialIdentityProvider;
    /** Optional logger; the processor calls its `info` method. */
    log?: any;
    /** Image detection flag; defaults to false. */
    detectImages?: boolean;
    /**
     * If true, includes cell-confidence information in the table CSV.
     * Defaults to false.
     */
    includeConfidenceInTables?: boolean;
}
|
43
|
+
|
44
|
+
export class TextractProcessor {
|
45
|
+
private textractClient: TextractClient;
|
46
|
+
private s3Client: S3Client;
|
47
|
+
private fileKey: string;
|
48
|
+
private bucket: string;
|
49
|
+
private log: any;
|
50
|
+
private detectImages: boolean;
|
51
|
+
/**
|
52
|
+
* Whether or not to include confidence values in CSV output for tables.
|
53
|
+
*/
|
54
|
+
private includeConfidenceInTables: boolean;
|
55
|
+
|
56
|
+
/**
 * Creates the Textract and S3 clients and records the processing options.
 *
 * `detectImages` and `includeConfidenceInTables` both fall back to `false`
 * when omitted. `region` and `credentials` are passed to both AWS clients.
 */
constructor({
    fileKey,
    region,
    bucket,
    credentials,
    log,
    detectImages = false,
    includeConfidenceInTables = false
}: TextractProcessorOptions) {
    // Both clients are configured with the same region/credentials pair;
    // each one receives its own copy of the settings object.
    const awsSettings = { region, credentials };

    // Target object and destination bucket.
    this.fileKey = fileKey;
    this.bucket = bucket;

    // Behavior switches and logger.
    this.log = log;
    this.detectImages = detectImages;
    this.includeConfidenceInTables = includeConfidenceInTables;

    this.textractClient = new TextractClient({ ...awsSettings });
    this.s3Client = new S3Client({ ...awsSettings });
}
|
80
|
+
|
81
|
+
/**
 * Assembles the text of a block from its CHILD blocks.
 *
 * WORD children contribute their text; a word that is a comma-grouped
 * number (only digits once every comma is removed) is wrapped in double
 * quotes. SELECTION_ELEMENT children whose status is SELECTED contribute
 * an `X`. Pieces are separated by a single space and the result is trimmed.
 *
 * @param result - Block whose CHILD relationships are walked.
 * @param blocksMap - Lookup used to resolve child ids to blocks.
 * @returns The assembled text, or an empty string when nothing contributes.
 */
private getText(result: Block, blocksMap: BlocksMap): string {
    const pieces: string[] = [];

    for (const relationship of result.Relationships ?? []) {
        if (relationship.Type !== 'CHILD') continue;

        for (const childId of relationship.Ids ?? []) {
            const child = blocksMap[childId];
            // Defensive: an id that cannot be resolved in the map is skipped
            // instead of throwing on `child.BlockType`.
            if (!child) continue;

            if (child.BlockType === 'WORD') {
                const wordText = child.Text ?? '';
                // Strip *all* commas before the digit test; a plain string
                // pattern in replace() would only remove the first one, so
                // values like "1,234,567" would never be quoted.
                const isGroupedNumber =
                    wordText.includes(',') && /^\d+$/.test(wordText.replace(/,/g, ''));
                pieces.push(isGroupedNumber ? `"${wordText}"` : wordText);
            } else if (
                child.BlockType === 'SELECTION_ELEMENT' &&
                child.SelectionStatus === 'SELECTED'
            ) {
                pieces.push('X');
            }
        }
    }

    return pieces.join(' ').trim();
}
|
110
|
+
|
111
|
+
/**
 * Tells whether a LINE block has at least one child that sits inside a
 * table cell.
 *
 * @param block - Block to test; anything other than a LINE yields false.
 * @param blocksMap - Lookup used to resolve child ids to blocks.
 * @returns True as soon as one resolvable child is found in a table cell.
 */
private isBlockInTable(block: Block, blocksMap: BlocksMap): boolean {
    if (block.BlockType !== 'LINE') {
        return false;
    }

    return (block.Relationships ?? []).some(relationship =>
        relationship.Type === 'CHILD' &&
        (relationship.Ids ?? []).some(childId => {
            const wordBlock = blocksMap[childId];
            // Defensive: skip ids that cannot be resolved rather than passing
            // undefined on, which would throw when its Id is read.
            return wordBlock !== undefined && this.isWordInTableCell(wordBlock, blocksMap);
        })
    );
}
|
129
|
+
|
130
|
+
/**
 * Checks whether a block is listed as a CHILD of any CELL that is itself
 * a CHILD of a TABLE block in the map.
 *
 * Every block in `blocksMap` is inspected on each call.
 *
 * @param wordBlock - Block whose id is searched for.
 * @param blocksMap - All known blocks, used both to find tables and to
 *   resolve cell ids.
 * @returns True when a table cell references the block's id.
 */
private isWordInTableCell(wordBlock: Block, blocksMap: BlocksMap): boolean {
    const wordId = wordBlock.Id;
    // A block without an id cannot be referenced by any cell.
    if (wordId === undefined) {
        return false;
    }

    for (const candidate of Object.values(blocksMap)) {
        if (candidate.BlockType !== 'TABLE') continue;

        for (const tableRel of candidate.Relationships ?? []) {
            if (tableRel.Type !== 'CHILD') continue;

            for (const cellId of tableRel.Ids ?? []) {
                const cell = blocksMap[cellId];
                // Defensive: ignore ids that cannot be resolved instead of
                // throwing on `cell.BlockType`.
                if (!cell || cell.BlockType !== 'CELL') continue;

                const referencesWord = (cell.Relationships ?? []).some(
                    cellRel => cellRel.Type === 'CHILD' && (cellRel.Ids?.includes(wordId) ?? false)
                );
                if (referencesWord) {
                    return true;
                }
            }
        }
    }

    return false;
}
|
156
|
+
|
157
|
+
/**
 * Builds a row-major grid of cell text and confidence for a table block.
 *
 * Each CELL child of `tableResult` is placed at
 * `rows[RowIndex - 1][ColumnIndex - 1]` (a missing index is treated as 1).
 * Rows and columns that no cell has filled yet are padded, rows with empty
 * arrays and columns with `{ text: '', confidence: 0 }`, so the returned
 * grid never contains holes.
 *
 * @param tableResult - Table block whose CHILD relationships list its cells.
 * @param blocksMap - Lookup used to resolve cell ids and cell contents.
 * @returns An object holding the populated `rows` grid.
 */
private getRowsColumnsMap(
    tableResult: Block,
    blocksMap: BlocksMap
): {
    rows: Array<Array<{ text: string; confidence: number }>>;
} {
    const rows: Array<Array<{ text: string; confidence: number }>> = [];

    for (const relationship of tableResult.Relationships ?? []) {
        if (relationship.Type !== 'CHILD') continue;

        for (const childId of relationship.Ids ?? []) {
            const cell = blocksMap[childId];
            // Defensive: ids that cannot be resolved are skipped.
            if (!cell || cell.BlockType !== 'CELL') continue;

            const rowIndex = cell.RowIndex || 1;
            const colIndex = cell.ColumnIndex || 1;

            // Create every row up to the target one. Assigning by index alone
            // would leave holes when a later row is seen first, and iterating
            // such a hole yields undefined for the consumer of the grid.
            while (rows.length < rowIndex) {
                rows.push([]);
            }
            const row = rows[rowIndex - 1];

            // Pad skipped columns so the cell lands at colIndex - 1.
            while (row.length < colIndex - 1) {
                row.push({ text: '', confidence: 0 });
            }

            row[colIndex - 1] = {
                text: this.getText(cell, blocksMap),
                confidence: cell.Confidence || 0
            };
        }
    }

    return { rows };
}
|
198
|
+
|
199
|
+
/**
 * Renders a table block as CSV and reports its average cell confidence.
 *
 * The grid comes from `getRowsColumnsMap`; every cell text is trimmed and
 * every grid entry counts towards the average. An empty grid yields a
 * confidence of 0.
 *
 * @param tableResult - Table block to render.
 * @param blocksMap - Lookup used to resolve the table's cells.
 * @param _tableIndex - Not used by this method.
 * @param _pageNumber - Not used by this method.
 * @returns The CSV text and the mean confidence over all grid entries.
 */
private generateTableCSV(
    tableResult: Block,
    blocksMap: BlocksMap,
    _tableIndex: number,
    _pageNumber: number
): { csv: string; tableConfidence: number } {
    const grid = this.getRowsColumnsMap(tableResult, blocksMap).rows;

    const csvData: string[][] = [];
    let confidenceSum = 0;
    let entryCount = 0;

    for (let r = 0; r < grid.length; r++) {
        const line: string[] = [];
        for (const entry of grid[r]) {
            line.push(entry.text.trim());
            confidenceSum += entry.confidence;
            entryCount += 1;
        }
        csvData.push(line);
    }

    // Mean confidence across all entries; 0 when the table has none.
    let tableConfidence = 0;
    if (entryCount > 0) {
        tableConfidence = confidenceSum / entryCount;
    }

    const csv = Papa.unparse(csvData, {
        delimiter: ',',
        quotes: true,
        quoteChar: '"',
        escapeChar: '"',
        header: false,
        newline: '\n',
        skipEmptyLines: false
    });

    return { csv, tableConfidence };
}
|
240
|
+
|
241
|
+
/**
 * Uploads a file buffer to S3 under the configured bucket and file key.
 *
 * @param fileBuf - Contents to store as the object body.
 * @returns A promise that settles once the PutObject request has completed;
 *   it rejects if the request fails.
 */
async upload(fileBuf: Buffer): Promise<void> {
    // The logger is optional in the constructor options, so guard the call
    // instead of throwing when none was supplied.
    this.log?.info('Uploading file to S3', { fileKey: this.fileKey });

    const command = new PutObjectCommand({
        Bucket: this.bucket,
        Key: this.fileKey,
        Body: fileBuf,
    });
    await this.s3Client.send(command);
}
|
250
|
+
|
251
|
+
/**
 * Starts a Textract document analysis (TABLES feature) for an object in
 * the configured bucket.
 *
 * @param s3Key - Key of the S3 object to analyse.
 * @returns The id of the started job.
 * @throws Error when the response carries no job id.
 */
async startAnalysis(s3Key: string): Promise<string> {
    const command = new StartDocumentAnalysisCommand({
        DocumentLocation: {
            S3Object: {
                Bucket: this.bucket,
                Name: s3Key
            }
        },
        FeatureTypes: ["TABLES"]
    });
    const response = await this.textractClient.send(command);

    // JobId is optional on the response type; fail loudly here rather than
    // handing `undefined` to callers that expect a string.
    if (!response.JobId) {
        throw new Error('Textract StartDocumentAnalysis response did not include a JobId');
    }
    return response.JobId;
}
|
264
|
+
|
265
|
+
/**
 * Fetches the current status of a Textract analysis job.
 *
 * @param jobId - Id of the job to query.
 * @returns The `JobStatus` value reported in the response.
 */
async checkJobStatus(jobId: string): Promise<string> {
    const { JobStatus } = await this.textractClient.send(
        new GetDocumentAnalysisCommand({ JobId: jobId })
    );
    return JobStatus!;
}
|
270
|
+
|
271
|
+
/**
 * Produces a textual placeholder such as `[IMAGE_TOP_LEFT]` for a block,
 * derived from its bounding box.
 *
 * Returns an empty string when the block has no bounding box or when
 * width * height is below 0.05. The vertical part is `TOP_` (top < 0.3),
 * `BOTTOM_` (top > 0.7) or omitted; the horizontal part is `LEFT`
 * (left < 0.3), `RIGHT` (left > 0.7) or `CENTER`.
 *
 * @param block - Block whose geometry is inspected.
 * @returns The placeholder followed by a newline, or an empty string.
 */
private getImagePlaceholder(block: Block): string {
    const box = block.Geometry?.BoundingBox;
    if (!box) {
        return '';
    }

    // Skip small images.
    if ((box.Width || 0) * (box.Height || 0) < 0.05) {
        return '';
    }

    const y = box.Top || 0;
    const x = box.Left || 0;

    const vertical = y < 0.3 ? 'TOP_' : y > 0.7 ? 'BOTTOM_' : '';
    const horizontal = x < 0.3 ? 'LEFT' : x > 0.7 ? 'RIGHT' : 'CENTER';

    return `[IMAGE_${vertical}${horizontal}]\n`;
}
|
290
|
+
|
291
|
+
/**
 * Maps a block's left offset to an indentation level.
 *
 * @param block Block whose bounding box left offset is read (0 when missing).
 * @returns 0 when the offset is below 0.15, 1 when below 0.25, otherwise 2.
 */
private getIndentationLevel(block: Block): number {
    const leftOffset = block.Geometry?.BoundingBox?.Left || 0;
    return leftOffset < 0.15 ? 0 : leftOffset < 0.25 ? 1 : 2;
}
|
297
|
+
|
298
|
+
/**
 * Heuristic that decides whether a block should be treated as a header.
 *
 * A block with no predecessor always counts as a header. Otherwise it counts
 * as one when the vertical gap between the bottom edge of the previous block
 * and the top edge of this block exceeds 0.03.
 *
 * @param block Block being classified.
 * @param prevBlock Block that precedes it, or null when there is none.
 * @returns true when the block is considered a header.
 */
private isLikelyHeader(block: Block, prevBlock: Block | null): boolean {
    if (!prevBlock) {
        return true;
    }

    const currentTop = block.Geometry?.BoundingBox?.Top || 0;
    const prevBox = prevBlock.Geometry?.BoundingBox;
    const prevBottom = (prevBox?.Top || 0) + (prevBox?.Height || 0);

    return currentTop - prevBottom > 0.03;
}
|
305
|
+
|
306
|
+
/**
 * Renders a block's text as one indented line terminated by a newline.
 *
 * The indent is one repetition of the indent string per indentation level.
 * When the block is classified as a header, the line is additionally preceded
 * by a newline.
 *
 * @param block Block whose text is rendered (empty string when it has none).
 * @param prevBlock Preceding block used for the header heuristic, or null.
 * @returns The formatted line.
 */
private formatTextBlock(block: Block, prevBlock: Block | null): string {
    const content = block.Text || '';
    const padding = ' '.repeat(this.getIndentationLevel(block));
    const line = `${padding}${content}\n`;

    return this.isLikelyHeader(block, prevBlock) ? `\n${line}` : line;
}
|
316
|
+
|
317
|
+
/**
 * Decides whether two consecutive blocks are close enough vertically to be
 * merged into one paragraph.
 *
 * @param prev The earlier block.
 * @param current The block that follows it.
 * @returns true when the gap between the bottom of `prev` and the top of
 *          `current` is smaller than 0.02.
 */
private shouldMergeLines(prev: Block, current: Block): boolean {
    const prevBox = prev.Geometry?.BoundingBox;
    const prevBottom = (prevBox?.Top || 0) + (prevBox?.Height || 0);
    const currentTop = current.Geometry?.BoundingBox?.Top || 0;

    // Lines separated by less than 0.02 are treated as contiguous.
    return currentTop - prevBottom < 0.02;
}
|
329
|
+
|
330
|
+
/**
 * Downloads every result page of a Textract analysis job and renders the
 * blocks as tagged text.
 *
 * The output contains one `<page number="…">` element per PAGE block. Inside
 * a page, content is emitted in top-to-bottom order as:
 *  - `<text>` elements built from LINE blocks that are not part of a table
 *    (consecutive lines are merged when `shouldMergeLines` says so),
 *  - `<table type="csv">` elements produced by `generateTableCSV`, with a
 *    `confidence` attribute when `includeConfidenceInTables` is set,
 *  - `<image>` elements, only when `detectImages` is set.
 *
 * @param jobId Identifier of the Textract analysis job to read.
 * @returns The rendered document text.
 */
async processResults(jobId: string): Promise<string> {
    // Page through the job results until no continuation token is returned.
    let nextToken: string | undefined;
    let allBlocks: Block[] = [];
    do {
        const command = new GetDocumentAnalysisCommand({
            JobId: jobId,
            NextToken: nextToken
        });
        const response = await this.textractClient.send(command);
        allBlocks = allBlocks.concat(response.Blocks || []);
        nextToken = response.NextToken;
    } while (nextToken);

    // Index blocks by id so table/cell relationships can be resolved.
    const blocksMap: BlocksMap = {};
    for (const block of allBlocks) {
        blocksMap[block.Id!] = block;
    }

    const pageContents: PageContent[] = [];
    let currentPage: PageContent | null = null;

    // Text of the paragraph currently being assembled, and the last LINE added to it.
    let currentTextContent = "";
    let prevLineBlock: Block | null = null;

    // Emits the pending paragraph (if it has any non-blank text) and resets it.
    const flushPendingText = (page: PageContent | null): void => {
        if (page && currentTextContent.trim().length > 0) {
            page.blocks.push({
                type: 'text',
                content: currentTextContent
            });
        }
        currentTextContent = "";
    };

    // Sort by page, then vertical position. Within a page the PAGE block is
    // forced first: the loop below ignores anything seen before its PAGE block,
    // so content sorted ahead of it would otherwise be silently dropped.
    allBlocks.sort((a, b) => {
        if (a.Page !== b.Page) return (a.Page || 0) - (b.Page || 0);
        const aIsPage = a.BlockType === 'PAGE';
        const bIsPage = b.BlockType === 'PAGE';
        if (aIsPage !== bIsPage) return aIsPage ? -1 : 1;
        return (a.Geometry?.BoundingBox?.Top || 0) - (b.Geometry?.BoundingBox?.Top || 0);
    });

    for (const block of allBlocks) {
        if (block.BlockType === 'PAGE') {
            // Close the previous page before starting a new one.
            flushPendingText(currentPage);
            if (currentPage) {
                pageContents.push(currentPage);
            }
            currentPage = {
                pageNumber: block.Page || 0,
                blocks: []
            };
            prevLineBlock = null;
        }
        else if (currentPage && block.Page === currentPage.pageNumber) {
            if (block.BlockType === 'TABLE') {
                // A table ends the paragraph being built.
                flushPendingText(currentPage);
                const { csv, tableConfidence } = this.generateTableCSV(
                    block,
                    blocksMap,
                    currentPage.blocks.filter(b => b.type === 'table').length + 1,
                    currentPage.pageNumber
                );
                currentPage.blocks.push({
                    type: 'table',
                    content: csv,
                    confidence: tableConfidence
                });
                prevLineBlock = null;
            }
            else if (block.BlockType === 'LINE' && !this.isBlockInTable(block, blocksMap)) {
                if (prevLineBlock && this.shouldMergeLines(prevLineBlock, block)) {
                    // Same paragraph: append the text. formatTextBlock may add a
                    // leading newline for headers, which is stripped so the
                    // lines are joined by a single space.
                    const formatted = this.formatTextBlock(block, prevLineBlock);
                    const mergedText = formatted.replace(/^\s*\n/, " ");
                    currentTextContent += " " + mergedText.trim();
                } else {
                    // New paragraph: emit the previous one and start over.
                    flushPendingText(currentPage);
                    currentTextContent = this.formatTextBlock(block, prevLineBlock).trim();
                }
                prevLineBlock = block;
            }
            else if (this.detectImages) {
                const geometry = block.Geometry?.BoundingBox;
                if (geometry && geometry.Width && geometry.Height) {
                    const imagePlaceholder = this.getImagePlaceholder(block);
                    if (imagePlaceholder) {
                        // An image ends the paragraph being built.
                        flushPendingText(currentPage);
                        currentPage.blocks.push({
                            type: 'image',
                            content: imagePlaceholder,
                            left: geometry.Left,
                            top: geometry.Top,
                            width: geometry.Width,
                            height: geometry.Height
                        });
                    }
                }
                // prevLineBlock is intentionally left untouched here.
            }
        }
    }

    // Close the last page.
    if (currentPage) {
        flushPendingText(currentPage);
        pageContents.push(currentPage);
    }

    // Renders an optional numeric attribute. Uses a null check rather than
    // truthiness so that a legitimate coordinate of 0 is still emitted.
    const numericAttr = (name: string, value: number | null | undefined): string =>
        value != null ? ` ${name}="${value.toFixed(4)}"` : '';

    let fulltext = '';
    let imgNumber = 1;
    for (const page of pageContents) {
        fulltext += `<page number="${page.pageNumber}">\n`;
        for (const block of page.blocks) {
            if (block.type === 'text') {
                fulltext += `<text>\n${block.content}\n</text>\n\n`;
            } else if (block.type === 'table') {
                const confidenceAttr = block.confidence !== undefined && this.includeConfidenceInTables
                    ? ` confidence="${block.confidence.toFixed(2)}"`
                    : '';
                fulltext += `<table type="csv"${confidenceAttr}>\n`;
                fulltext += `${block.content}\n`;
                fulltext += `</table>\n\n`;
            } else if (block.type === 'image') {
                const leftAttr = numericAttr('left', block.left);
                const topAttr = numericAttr('top', block.top);
                const widthAttr = numericAttr('width', block.width);
                const heightAttr = numericAttr('height', block.height);

                fulltext += `<image id="${imgNumber++}" ${leftAttr}${topAttr}${widthAttr}${heightAttr}>\n${block.content.trim()}\n</image>\n\n`;
            }
        }
        fulltext += `</page>\n\n`;
    }

    return fulltext;
}
|
504
|
+
|
505
|
+
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
import fs from 'fs';
|
2
|
+
import path from 'path';
|
3
|
+
import sharp from 'sharp';
|
4
|
+
import { expect, test } from 'vitest';
|
5
|
+
import { imageResizer } from '../conversion/image';
|
6
|
+
|
7
|
+
|
8
|
+
// Verifies that imageResizer bounds both dimensions by max_hw and converts to
// the requested output format.
test('should resize an image to a maximum height or width', async () => {
    const max_hw = 1024;
    const format: keyof sharp.FormatEnum = 'jpeg';
    const imageFile = fs.readFileSync(path.join(__dirname, '../../fixtures', 'cat-picture.jpg'));

    const resizer = imageResizer(max_hw, format);

    const resized = sharp(imageFile).pipe(resizer);
    const buffer = await resized.toBuffer();
    const metadata = await sharp(buffer).metadata();

    console.log(metadata);
    // Write the already-produced buffer and await it: the original call was a
    // floating promise on a pipeline whose output had already been consumed,
    // so a failure would surface as an unhandled rejection outside the test.
    await sharp(buffer).toFile('/tmp/cat-picture.jpg');

    expect(metadata.width).to.be.lessThanOrEqual(max_hw);
    expect(metadata.height).to.be.lessThanOrEqual(max_hw);
    expect(metadata.format).to.equal(format);
});
|
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
import sharp from "sharp";
|
3
|
+
|
4
|
+
|
5
|
+
/**
 * Creates a sharp pipeline that scales an image to fit inside a square of
 * `max_hw` x `max_hw` pixels and re-encodes it in the given format.
 *
 * The resize uses sharp's `inside` fit with `withoutEnlargement` enabled.
 *
 * @param max_hw Maximum width and height, in pixels, of the output image.
 * @param format Output format passed to sharp's `toFormat`.
 * @returns The configured sharp instance, ready to be piped into.
 */
export function imageResizer(max_hw: number, format: keyof sharp.FormatEnum) {
    const pipeline = sharp();

    const bounded = pipeline.resize({
        width: max_hw,
        height: max_hw,
        fit: sharp.fit.inside,
        withoutEnlargement: true,
    });

    return bounded.toFormat(format);
}
|
22
|
+
|
@@ -0,0 +1,74 @@
|
|
1
|
+
import { MockActivityEnvironment, TestWorkflowEnvironment } from '@temporalio/testing';
import fs from 'fs';
import path from 'path';
import { afterAll, beforeAll, expect, test } from 'vitest';
import { mutoolPdfToText, pdfExtractPages, pdfToImages } from './mutool.js';
|
6
|
+
|
7
|
+
|
8
|
+
// Shared fixtures: a local Temporal test environment and a mock activity
// context in which the mutool activities are executed.
let testEnv: TestWorkflowEnvironment;
let activityContext: MockActivityEnvironment;

beforeAll(async () => {
    testEnv = await TestWorkflowEnvironment.createLocal();
    activityContext = new MockActivityEnvironment();
});

// Release the test environment created above; without this it is left
// running after the suite finishes.
afterAll(async () => {
    await testEnv?.teardown();
});

// Per-test timeout in milliseconds.
const TIMEOUT = 10000;
|
17
|
+
|
18
|
+
// Runs mutoolPdfToText on the fixture PDF and checks for a known sentence.
test('[mutool] should convert pdf to text', async () => {
    const fixturePath = path.join(__dirname, '../../fixtures', 'test-pdf1.pdf');
    const pdfBytes = fs.readFileSync(fixturePath);
    const input = Buffer.from(pdfBytes);

    console.log("Running mutoolPdfToText")
    const text = await activityContext.run(mutoolPdfToText, input);

    expect(text).toContain('VF primarily uses foreign currency exchange');
}, TIMEOUT);
|
26
|
+
|
27
|
+
// Converts the whole fixture PDF to images and expects 119 results.
test('[mutool] should convert pdf to images', async () => {
    const fixturePath = path.join(__dirname, '../../fixtures', 'test-pdf1.pdf');

    console.log("Running pdfToImages")
    const images = await activityContext.run(pdfToImages, fixturePath);
    console.log(images);

    expect(images).toBeInstanceOf(Array);
    const imageList = images as string[];
    expect(imageList.length).toBe(119);
}, TIMEOUT);
|
38
|
+
|
39
|
+
// Converts only the requested pages and expects one result per page.
test('[mutool] should convert pdf to images with pages', async () => {
    const fixturePath = path.join(__dirname, '../../fixtures', 'test-pdf1.pdf');
    const selectedPages = [7, 8, 9];

    console.log("Running pdfToImages with pages")
    const images = await activityContext.run(pdfToImages, fixturePath, selectedPages);
    console.log(images);

    expect(images).toBeInstanceOf(Array);
    const imageList = images as string[];
    expect(imageList.length).toBe(3);
}, TIMEOUT);
|
51
|
+
|
52
|
+
// Extracts three pages into a new PDF and checks that the result contains ".pdf".
test('[mutool] should extract 3 pages from PDF into new PDF', async () => {
    const filename = path.join(__dirname, '../../fixtures', 'test-pdf1.pdf');
    const pages = [7, 8, 9];

    // Log the name of the activity that is actually invoked (the message
    // previously referred to "pdfGetPages", which this test does not call).
    console.log("Running pdfExtractPages")
    const result = await activityContext.run(pdfExtractPages, filename, pages);
    console.log(result);

    expect(result).toContain(".pdf");
}, TIMEOUT);
|
63
|
+
|
64
|
+
// Extracts a single page into a new PDF and checks that the result contains ".pdf".
test('[mutool] should extract 1 pages from PDF into new PDF', async () => {
    const filename = path.join(__dirname, '../../fixtures', 'test-pdf1.pdf');
    const pages = [12];

    // Log the name of the activity that is actually invoked (the message
    // previously referred to "pdfGetPages", which this test does not call).
    console.log("Running pdfExtractPages")
    const result = await activityContext.run(pdfExtractPages, filename, pages);
    console.log(result);

    expect(result).toContain(".pdf");
}, TIMEOUT);
|