@vertesia/workflow 0.24.0-dev.202601221707

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (490)
  1. package/LICENSE +13 -0
  2. package/README.md +65 -0
  3. package/bin/bundle-workflows.mjs +39 -0
  4. package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js +33 -0
  5. package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js.map +1 -0
  6. package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +73 -0
  7. package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -0
  8. package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js +19 -0
  9. package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js.map +1 -0
  10. package/lib/cjs/activities/chunkDocument.js +85 -0
  11. package/lib/cjs/activities/chunkDocument.js.map +1 -0
  12. package/lib/cjs/activities/copyParentArtifacts.js +127 -0
  13. package/lib/cjs/activities/copyParentArtifacts.js.map +1 -0
  14. package/lib/cjs/activities/createDocumentFromOther.js +64 -0
  15. package/lib/cjs/activities/createDocumentFromOther.js.map +1 -0
  16. package/lib/cjs/activities/executeInteraction.js +194 -0
  17. package/lib/cjs/activities/executeInteraction.js.map +1 -0
  18. package/lib/cjs/activities/extractDocumentText.js +156 -0
  19. package/lib/cjs/activities/extractDocumentText.js.map +1 -0
  20. package/lib/cjs/activities/generateDocumentProperties.js +83 -0
  21. package/lib/cjs/activities/generateDocumentProperties.js.map +1 -0
  22. package/lib/cjs/activities/generateEmbeddings.js +257 -0
  23. package/lib/cjs/activities/generateEmbeddings.js.map +1 -0
  24. package/lib/cjs/activities/generateOrAssignContentType.js +125 -0
  25. package/lib/cjs/activities/generateOrAssignContentType.js.map +1 -0
  26. package/lib/cjs/activities/getObjectFromStore.js +20 -0
  27. package/lib/cjs/activities/getObjectFromStore.js.map +1 -0
  28. package/lib/cjs/activities/handleError.js +22 -0
  29. package/lib/cjs/activities/handleError.js.map +1 -0
  30. package/lib/cjs/activities/index-dsl.js +51 -0
  31. package/lib/cjs/activities/index-dsl.js.map +1 -0
  32. package/lib/cjs/activities/index.js +21 -0
  33. package/lib/cjs/activities/index.js.map +1 -0
  34. package/lib/cjs/activities/media/prepareAudio.js +239 -0
  35. package/lib/cjs/activities/media/prepareAudio.js.map +1 -0
  36. package/lib/cjs/activities/media/prepareVideo.js +429 -0
  37. package/lib/cjs/activities/media/prepareVideo.js.map +1 -0
  38. package/lib/cjs/activities/media/processPdfWithTextract.js +103 -0
  39. package/lib/cjs/activities/media/processPdfWithTextract.js.map +1 -0
  40. package/lib/cjs/activities/media/saveGladiaTranscription.js +81 -0
  41. package/lib/cjs/activities/media/saveGladiaTranscription.js.map +1 -0
  42. package/lib/cjs/activities/media/transcribeMediaWithGladia.js +82 -0
  43. package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +1 -0
  44. package/lib/cjs/activities/notifyWebhook.js +158 -0
  45. package/lib/cjs/activities/notifyWebhook.js.map +1 -0
  46. package/lib/cjs/activities/rateLimiter.js +30 -0
  47. package/lib/cjs/activities/rateLimiter.js.map +1 -0
  48. package/lib/cjs/activities/renditions/generateImageRendition.js +66 -0
  49. package/lib/cjs/activities/renditions/generateImageRendition.js.map +1 -0
  50. package/lib/cjs/activities/renditions/generateVideoRendition.js +200 -0
  51. package/lib/cjs/activities/renditions/generateVideoRendition.js.map +1 -0
  52. package/lib/cjs/activities/setDocumentStatus.js +15 -0
  53. package/lib/cjs/activities/setDocumentStatus.js.map +1 -0
  54. package/lib/cjs/conversion/TextractProcessor.js +417 -0
  55. package/lib/cjs/conversion/TextractProcessor.js.map +1 -0
  56. package/lib/cjs/conversion/image.js +149 -0
  57. package/lib/cjs/conversion/image.js.map +1 -0
  58. package/lib/cjs/conversion/markitdown.js +42 -0
  59. package/lib/cjs/conversion/markitdown.js.map +1 -0
  60. package/lib/cjs/conversion/mutool.js +147 -0
  61. package/lib/cjs/conversion/mutool.js.map +1 -0
  62. package/lib/cjs/conversion/pandoc.js +39 -0
  63. package/lib/cjs/conversion/pandoc.js.map +1 -0
  64. package/lib/cjs/dsl/conditions.js +81 -0
  65. package/lib/cjs/dsl/conditions.js.map +1 -0
  66. package/lib/cjs/dsl/dsl-workflow.js +343 -0
  67. package/lib/cjs/dsl/dsl-workflow.js.map +1 -0
  68. package/lib/cjs/dsl/dslProxyActivities.js +23 -0
  69. package/lib/cjs/dsl/dslProxyActivities.js.map +1 -0
  70. package/lib/cjs/dsl/projections.js +59 -0
  71. package/lib/cjs/dsl/projections.js.map +1 -0
  72. package/lib/cjs/dsl/setup/ActivityContext.js +122 -0
  73. package/lib/cjs/dsl/setup/ActivityContext.js.map +1 -0
  74. package/lib/cjs/dsl/setup/fetch/DataProvider.js +51 -0
  75. package/lib/cjs/dsl/setup/fetch/DataProvider.js.map +1 -0
  76. package/lib/cjs/dsl/setup/fetch/index.js +16 -0
  77. package/lib/cjs/dsl/setup/fetch/index.js.map +1 -0
  78. package/lib/cjs/dsl/setup/fetch/providers.js +67 -0
  79. package/lib/cjs/dsl/setup/fetch/providers.js.map +1 -0
  80. package/lib/cjs/dsl/test/test-child-workflow.js +10 -0
  81. package/lib/cjs/dsl/test/test-child-workflow.js.map +1 -0
  82. package/lib/cjs/dsl/validation.js +122 -0
  83. package/lib/cjs/dsl/validation.js.map +1 -0
  84. package/lib/cjs/dsl/vars.js +341 -0
  85. package/lib/cjs/dsl/vars.js.map +1 -0
  86. package/lib/cjs/dsl/walk.js +100 -0
  87. package/lib/cjs/dsl/walk.js.map +1 -0
  88. package/lib/cjs/dsl.js +20 -0
  89. package/lib/cjs/dsl.js.map +1 -0
  90. package/lib/cjs/errors.js +79 -0
  91. package/lib/cjs/errors.js.map +1 -0
  92. package/lib/cjs/index.js +56 -0
  93. package/lib/cjs/index.js.map +1 -0
  94. package/lib/cjs/iterative-generation/activities/extractToc.js +47 -0
  95. package/lib/cjs/iterative-generation/activities/extractToc.js.map +1 -0
  96. package/lib/cjs/iterative-generation/activities/finalizeOutput.js +72 -0
  97. package/lib/cjs/iterative-generation/activities/finalizeOutput.js.map +1 -0
  98. package/lib/cjs/iterative-generation/activities/generatePart.js +78 -0
  99. package/lib/cjs/iterative-generation/activities/generatePart.js.map +1 -0
  100. package/lib/cjs/iterative-generation/activities/generateToc.js +86 -0
  101. package/lib/cjs/iterative-generation/activities/generateToc.js.map +1 -0
  102. package/lib/cjs/iterative-generation/activities/index.js +12 -0
  103. package/lib/cjs/iterative-generation/activities/index.js.map +1 -0
  104. package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +56 -0
  105. package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +1 -0
  106. package/lib/cjs/iterative-generation/types.js +5 -0
  107. package/lib/cjs/iterative-generation/types.js.map +1 -0
  108. package/lib/cjs/iterative-generation/utils.js +121 -0
  109. package/lib/cjs/iterative-generation/utils.js.map +1 -0
  110. package/lib/cjs/package.json +3 -0
  111. package/lib/cjs/result-types.js +10 -0
  112. package/lib/cjs/result-types.js.map +1 -0
  113. package/lib/cjs/system/notifyWebhookWorkflow.js +53 -0
  114. package/lib/cjs/system/notifyWebhookWorkflow.js.map +1 -0
  115. package/lib/cjs/system/recalculateEmbeddingsWorkflow.js +33 -0
  116. package/lib/cjs/system/recalculateEmbeddingsWorkflow.js.map +1 -0
  117. package/lib/cjs/utils/auth.js +15 -0
  118. package/lib/cjs/utils/auth.js.map +1 -0
  119. package/lib/cjs/utils/blobs.js +64 -0
  120. package/lib/cjs/utils/blobs.js.map +1 -0
  121. package/lib/cjs/utils/chunks.js +14 -0
  122. package/lib/cjs/utils/chunks.js.map +1 -0
  123. package/lib/cjs/utils/client.js +31 -0
  124. package/lib/cjs/utils/client.js.map +1 -0
  125. package/lib/cjs/utils/expand-vars.js +33 -0
  126. package/lib/cjs/utils/expand-vars.js.map +1 -0
  127. package/lib/cjs/utils/memory.js +65 -0
  128. package/lib/cjs/utils/memory.js.map +1 -0
  129. package/lib/cjs/utils/renditions.js +88 -0
  130. package/lib/cjs/utils/renditions.js.map +1 -0
  131. package/lib/cjs/utils/storage.js +54 -0
  132. package/lib/cjs/utils/storage.js.map +1 -0
  133. package/lib/cjs/utils/tokens.js +38 -0
  134. package/lib/cjs/utils/tokens.js.map +1 -0
  135. package/lib/cjs/vars.js +20 -0
  136. package/lib/cjs/vars.js.map +1 -0
  137. package/lib/cjs/workflows.js +15 -0
  138. package/lib/cjs/workflows.js.map +1 -0
  139. package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js +30 -0
  140. package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js.map +1 -0
  141. package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +70 -0
  142. package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +1 -0
  143. package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js +16 -0
  144. package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js.map +1 -0
  145. package/lib/esm/activities/chunkDocument.js +82 -0
  146. package/lib/esm/activities/chunkDocument.js.map +1 -0
  147. package/lib/esm/activities/copyParentArtifacts.js +124 -0
  148. package/lib/esm/activities/copyParentArtifacts.js.map +1 -0
  149. package/lib/esm/activities/createDocumentFromOther.js +58 -0
  150. package/lib/esm/activities/createDocumentFromOther.js.map +1 -0
  151. package/lib/esm/activities/executeInteraction.js +190 -0
  152. package/lib/esm/activities/executeInteraction.js.map +1 -0
  153. package/lib/esm/activities/extractDocumentText.js +153 -0
  154. package/lib/esm/activities/extractDocumentText.js.map +1 -0
  155. package/lib/esm/activities/generateDocumentProperties.js +80 -0
  156. package/lib/esm/activities/generateDocumentProperties.js.map +1 -0
  157. package/lib/esm/activities/generateEmbeddings.js +254 -0
  158. package/lib/esm/activities/generateEmbeddings.js.map +1 -0
  159. package/lib/esm/activities/generateOrAssignContentType.js +122 -0
  160. package/lib/esm/activities/generateOrAssignContentType.js.map +1 -0
  161. package/lib/esm/activities/getObjectFromStore.js +17 -0
  162. package/lib/esm/activities/getObjectFromStore.js.map +1 -0
  163. package/lib/esm/activities/handleError.js +19 -0
  164. package/lib/esm/activities/handleError.js.map +1 -0
  165. package/lib/esm/activities/index-dsl.js +25 -0
  166. package/lib/esm/activities/index-dsl.js.map +1 -0
  167. package/lib/esm/activities/index.js +5 -0
  168. package/lib/esm/activities/index.js.map +1 -0
  169. package/lib/esm/activities/media/prepareAudio.js +200 -0
  170. package/lib/esm/activities/media/prepareAudio.js.map +1 -0
  171. package/lib/esm/activities/media/prepareVideo.js +390 -0
  172. package/lib/esm/activities/media/prepareVideo.js.map +1 -0
  173. package/lib/esm/activities/media/processPdfWithTextract.js +99 -0
  174. package/lib/esm/activities/media/processPdfWithTextract.js.map +1 -0
  175. package/lib/esm/activities/media/saveGladiaTranscription.js +78 -0
  176. package/lib/esm/activities/media/saveGladiaTranscription.js.map +1 -0
  177. package/lib/esm/activities/media/transcribeMediaWithGladia.js +79 -0
  178. package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +1 -0
  179. package/lib/esm/activities/notifyWebhook.js +155 -0
  180. package/lib/esm/activities/notifyWebhook.js.map +1 -0
  181. package/lib/esm/activities/rateLimiter.js +27 -0
  182. package/lib/esm/activities/rateLimiter.js.map +1 -0
  183. package/lib/esm/activities/renditions/generateImageRendition.js +63 -0
  184. package/lib/esm/activities/renditions/generateImageRendition.js.map +1 -0
  185. package/lib/esm/activities/renditions/generateVideoRendition.js +194 -0
  186. package/lib/esm/activities/renditions/generateVideoRendition.js.map +1 -0
  187. package/lib/esm/activities/setDocumentStatus.js +12 -0
  188. package/lib/esm/activities/setDocumentStatus.js.map +1 -0
  189. package/lib/esm/conversion/TextractProcessor.js +410 -0
  190. package/lib/esm/conversion/TextractProcessor.js.map +1 -0
  191. package/lib/esm/conversion/image.js +143 -0
  192. package/lib/esm/conversion/image.js.map +1 -0
  193. package/lib/esm/conversion/markitdown.js +36 -0
  194. package/lib/esm/conversion/markitdown.js.map +1 -0
  195. package/lib/esm/conversion/mutool.js +139 -0
  196. package/lib/esm/conversion/mutool.js.map +1 -0
  197. package/lib/esm/conversion/pandoc.js +36 -0
  198. package/lib/esm/conversion/pandoc.js.map +1 -0
  199. package/lib/esm/dsl/conditions.js +75 -0
  200. package/lib/esm/dsl/conditions.js.map +1 -0
  201. package/lib/esm/dsl/dsl-workflow.js +336 -0
  202. package/lib/esm/dsl/dsl-workflow.js.map +1 -0
  203. package/lib/esm/dsl/dslProxyActivities.js +20 -0
  204. package/lib/esm/dsl/dslProxyActivities.js.map +1 -0
  205. package/lib/esm/dsl/projections.js +55 -0
  206. package/lib/esm/dsl/projections.js.map +1 -0
  207. package/lib/esm/dsl/setup/ActivityContext.js +117 -0
  208. package/lib/esm/dsl/setup/ActivityContext.js.map +1 -0
  209. package/lib/esm/dsl/setup/fetch/DataProvider.js +47 -0
  210. package/lib/esm/dsl/setup/fetch/DataProvider.js.map +1 -0
  211. package/lib/esm/dsl/setup/fetch/index.js +12 -0
  212. package/lib/esm/dsl/setup/fetch/index.js.map +1 -0
  213. package/lib/esm/dsl/setup/fetch/providers.js +61 -0
  214. package/lib/esm/dsl/setup/fetch/providers.js.map +1 -0
  215. package/lib/esm/dsl/test/test-child-workflow.js +5 -0
  216. package/lib/esm/dsl/test/test-child-workflow.js.map +1 -0
  217. package/lib/esm/dsl/validation.js +118 -0
  218. package/lib/esm/dsl/validation.js.map +1 -0
  219. package/lib/esm/dsl/vars.js +335 -0
  220. package/lib/esm/dsl/vars.js.map +1 -0
  221. package/lib/esm/dsl/walk.js +96 -0
  222. package/lib/esm/dsl/walk.js.map +1 -0
  223. package/lib/esm/dsl.js +4 -0
  224. package/lib/esm/dsl.js.map +1 -0
  225. package/lib/esm/errors.js +69 -0
  226. package/lib/esm/errors.js.map +1 -0
  227. package/lib/esm/index.js +38 -0
  228. package/lib/esm/index.js.map +1 -0
  229. package/lib/esm/iterative-generation/activities/extractToc.js +44 -0
  230. package/lib/esm/iterative-generation/activities/extractToc.js.map +1 -0
  231. package/lib/esm/iterative-generation/activities/finalizeOutput.js +69 -0
  232. package/lib/esm/iterative-generation/activities/finalizeOutput.js.map +1 -0
  233. package/lib/esm/iterative-generation/activities/generatePart.js +75 -0
  234. package/lib/esm/iterative-generation/activities/generatePart.js.map +1 -0
  235. package/lib/esm/iterative-generation/activities/generateToc.js +83 -0
  236. package/lib/esm/iterative-generation/activities/generateToc.js.map +1 -0
  237. package/lib/esm/iterative-generation/activities/index.js +5 -0
  238. package/lib/esm/iterative-generation/activities/index.js.map +1 -0
  239. package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +53 -0
  240. package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +1 -0
  241. package/lib/esm/iterative-generation/types.js +2 -0
  242. package/lib/esm/iterative-generation/types.js.map +1 -0
  243. package/lib/esm/iterative-generation/utils.js +112 -0
  244. package/lib/esm/iterative-generation/utils.js.map +1 -0
  245. package/lib/esm/result-types.js +7 -0
  246. package/lib/esm/result-types.js.map +1 -0
  247. package/lib/esm/system/notifyWebhookWorkflow.js +50 -0
  248. package/lib/esm/system/notifyWebhookWorkflow.js.map +1 -0
  249. package/lib/esm/system/recalculateEmbeddingsWorkflow.js +30 -0
  250. package/lib/esm/system/recalculateEmbeddingsWorkflow.js.map +1 -0
  251. package/lib/esm/utils/auth.js +8 -0
  252. package/lib/esm/utils/auth.js.map +1 -0
  253. package/lib/esm/utils/blobs.js +54 -0
  254. package/lib/esm/utils/blobs.js.map +1 -0
  255. package/lib/esm/utils/chunks.js +9 -0
  256. package/lib/esm/utils/chunks.js.map +1 -0
  257. package/lib/esm/utils/client.js +27 -0
  258. package/lib/esm/utils/client.js.map +1 -0
  259. package/lib/esm/utils/expand-vars.js +30 -0
  260. package/lib/esm/utils/expand-vars.js.map +1 -0
  261. package/lib/esm/utils/memory.js +55 -0
  262. package/lib/esm/utils/memory.js.map +1 -0
  263. package/lib/esm/utils/renditions.js +80 -0
  264. package/lib/esm/utils/renditions.js.map +1 -0
  265. package/lib/esm/utils/storage.js +45 -0
  266. package/lib/esm/utils/storage.js.map +1 -0
  267. package/lib/esm/utils/tokens.js +34 -0
  268. package/lib/esm/utils/tokens.js.map +1 -0
  269. package/lib/esm/vars.js +4 -0
  270. package/lib/esm/vars.js.map +1 -0
  271. package/lib/esm/workflows.js +8 -0
  272. package/lib/esm/workflows.js.map +1 -0
  273. package/lib/tsconfig.tsbuildinfo +1 -0
  274. package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts +17 -0
  275. package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts.map +1 -0
  276. package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts +39 -0
  277. package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts.map +1 -0
  278. package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts +19 -0
  279. package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts.map +1 -0
  280. package/lib/types/activities/chunkDocument.d.ts +33 -0
  281. package/lib/types/activities/chunkDocument.d.ts.map +1 -0
  282. package/lib/types/activities/copyParentArtifacts.d.ts +19 -0
  283. package/lib/types/activities/copyParentArtifacts.d.ts.map +1 -0
  284. package/lib/types/activities/createDocumentFromOther.d.ts +21 -0
  285. package/lib/types/activities/createDocumentFromOther.d.ts.map +1 -0
  286. package/lib/types/activities/executeInteraction.d.ts +61 -0
  287. package/lib/types/activities/executeInteraction.d.ts.map +1 -0
  288. package/lib/types/activities/extractDocumentText.d.ts +10 -0
  289. package/lib/types/activities/extractDocumentText.d.ts.map +1 -0
  290. package/lib/types/activities/generateDocumentProperties.d.ts +32 -0
  291. package/lib/types/activities/generateDocumentProperties.d.ts.map +1 -0
  292. package/lib/types/activities/generateEmbeddings.d.ts +53 -0
  293. package/lib/types/activities/generateEmbeddings.d.ts.map +1 -0
  294. package/lib/types/activities/generateOrAssignContentType.d.ts +44 -0
  295. package/lib/types/activities/generateOrAssignContentType.d.ts.map +1 -0
  296. package/lib/types/activities/getObjectFromStore.d.ts +14 -0
  297. package/lib/types/activities/getObjectFromStore.d.ts.map +1 -0
  298. package/lib/types/activities/handleError.d.ts +6 -0
  299. package/lib/types/activities/handleError.d.ts.map +1 -0
  300. package/lib/types/activities/index-dsl.d.ts +25 -0
  301. package/lib/types/activities/index-dsl.d.ts.map +1 -0
  302. package/lib/types/activities/index.d.ts +5 -0
  303. package/lib/types/activities/index.d.ts.map +1 -0
  304. package/lib/types/activities/media/prepareAudio.d.ts +25 -0
  305. package/lib/types/activities/media/prepareAudio.d.ts.map +1 -0
  306. package/lib/types/activities/media/prepareVideo.d.ts +30 -0
  307. package/lib/types/activities/media/prepareVideo.d.ts.map +1 -0
  308. package/lib/types/activities/media/processPdfWithTextract.d.ts +26 -0
  309. package/lib/types/activities/media/processPdfWithTextract.d.ts.map +1 -0
  310. package/lib/types/activities/media/saveGladiaTranscription.d.ts +14 -0
  311. package/lib/types/activities/media/saveGladiaTranscription.d.ts.map +1 -0
  312. package/lib/types/activities/media/transcribeMediaWithGladia.d.ts +19 -0
  313. package/lib/types/activities/media/transcribeMediaWithGladia.d.ts.map +1 -0
  314. package/lib/types/activities/notifyWebhook.d.ts +27 -0
  315. package/lib/types/activities/notifyWebhook.d.ts.map +1 -0
  316. package/lib/types/activities/rateLimiter.d.ts +11 -0
  317. package/lib/types/activities/rateLimiter.d.ts.map +1 -0
  318. package/lib/types/activities/renditions/generateImageRendition.d.ts +14 -0
  319. package/lib/types/activities/renditions/generateImageRendition.d.ts.map +1 -0
  320. package/lib/types/activities/renditions/generateVideoRendition.d.ts +15 -0
  321. package/lib/types/activities/renditions/generateVideoRendition.d.ts.map +1 -0
  322. package/lib/types/activities/setDocumentStatus.d.ts +15 -0
  323. package/lib/types/activities/setDocumentStatus.d.ts.map +1 -0
  324. package/lib/types/conversion/TextractProcessor.d.ts +45 -0
  325. package/lib/types/conversion/TextractProcessor.d.ts.map +1 -0
  326. package/lib/types/conversion/image.d.ts +13 -0
  327. package/lib/types/conversion/image.d.ts.map +1 -0
  328. package/lib/types/conversion/markitdown.d.ts +2 -0
  329. package/lib/types/conversion/markitdown.d.ts.map +1 -0
  330. package/lib/types/conversion/mutool.d.ts +19 -0
  331. package/lib/types/conversion/mutool.d.ts.map +1 -0
  332. package/lib/types/conversion/pandoc.d.ts +2 -0
  333. package/lib/types/conversion/pandoc.d.ts.map +1 -0
  334. package/lib/types/dsl/conditions.d.ts +2 -0
  335. package/lib/types/dsl/conditions.d.ts.map +1 -0
  336. package/lib/types/dsl/dsl-workflow.d.ts +5 -0
  337. package/lib/types/dsl/dsl-workflow.d.ts.map +1 -0
  338. package/lib/types/dsl/dslProxyActivities.d.ts +10 -0
  339. package/lib/types/dsl/dslProxyActivities.d.ts.map +1 -0
  340. package/lib/types/dsl/projections.d.ts +4 -0
  341. package/lib/types/dsl/projections.d.ts.map +1 -0
  342. package/lib/types/dsl/setup/ActivityContext.d.ts +17 -0
  343. package/lib/types/dsl/setup/ActivityContext.d.ts.map +1 -0
  344. package/lib/types/dsl/setup/fetch/DataProvider.d.ts +9 -0
  345. package/lib/types/dsl/setup/fetch/DataProvider.d.ts.map +1 -0
  346. package/lib/types/dsl/setup/fetch/index.d.ts +6 -0
  347. package/lib/types/dsl/setup/fetch/index.d.ts.map +1 -0
  348. package/lib/types/dsl/setup/fetch/providers.d.ts +25 -0
  349. package/lib/types/dsl/setup/fetch/providers.d.ts.map +1 -0
  350. package/lib/types/dsl/test/test-child-workflow.d.ts +4 -0
  351. package/lib/types/dsl/test/test-child-workflow.d.ts.map +1 -0
  352. package/lib/types/dsl/validation.d.ts +4 -0
  353. package/lib/types/dsl/validation.d.ts.map +1 -0
  354. package/lib/types/dsl/vars.d.ts +48 -0
  355. package/lib/types/dsl/vars.d.ts.map +1 -0
  356. package/lib/types/dsl/walk.d.ts +18 -0
  357. package/lib/types/dsl/walk.d.ts.map +1 -0
  358. package/lib/types/dsl.d.ts +4 -0
  359. package/lib/types/dsl.d.ts.map +1 -0
  360. package/lib/types/errors.d.ts +37 -0
  361. package/lib/types/errors.d.ts.map +1 -0
  362. package/lib/types/index.d.ts +37 -0
  363. package/lib/types/index.d.ts.map +1 -0
  364. package/lib/types/iterative-generation/activities/extractToc.d.ts +10 -0
  365. package/lib/types/iterative-generation/activities/extractToc.d.ts.map +1 -0
  366. package/lib/types/iterative-generation/activities/finalizeOutput.d.ts +3 -0
  367. package/lib/types/iterative-generation/activities/finalizeOutput.d.ts.map +1 -0
  368. package/lib/types/iterative-generation/activities/generatePart.d.ts +3 -0
  369. package/lib/types/iterative-generation/activities/generatePart.d.ts.map +1 -0
  370. package/lib/types/iterative-generation/activities/generateToc.d.ts +4 -0
  371. package/lib/types/iterative-generation/activities/generateToc.d.ts.map +1 -0
  372. package/lib/types/iterative-generation/activities/index.d.ts +5 -0
  373. package/lib/types/iterative-generation/activities/index.d.ts.map +1 -0
  374. package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts +3 -0
  375. package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts.map +1 -0
  376. package/lib/types/iterative-generation/types.d.ts +79 -0
  377. package/lib/types/iterative-generation/types.d.ts.map +1 -0
  378. package/lib/types/iterative-generation/utils.d.ts +26 -0
  379. package/lib/types/iterative-generation/utils.d.ts.map +1 -0
  380. package/lib/types/result-types.d.ts +22 -0
  381. package/lib/types/result-types.d.ts.map +1 -0
  382. package/lib/types/system/notifyWebhookWorkflow.d.ts +8 -0
  383. package/lib/types/system/notifyWebhookWorkflow.d.ts.map +1 -0
  384. package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +25 -0
  385. package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +1 -0
  386. package/lib/types/utils/auth.d.ts +4 -0
  387. package/lib/types/utils/auth.d.ts.map +1 -0
  388. package/lib/types/utils/blobs.d.ts +7 -0
  389. package/lib/types/utils/blobs.d.ts.map +1 -0
  390. package/lib/types/utils/chunks.d.ts +9 -0
  391. package/lib/types/utils/chunks.d.ts.map +1 -0
  392. package/lib/types/utils/client.d.ts +8 -0
  393. package/lib/types/utils/client.d.ts.map +1 -0
  394. package/lib/types/utils/expand-vars.d.ts +8 -0
  395. package/lib/types/utils/expand-vars.d.ts.map +1 -0
  396. package/lib/types/utils/memory.d.ts +8 -0
  397. package/lib/types/utils/memory.d.ts.map +1 -0
  398. package/lib/types/utils/renditions.d.ts +23 -0
  399. package/lib/types/utils/renditions.d.ts.map +1 -0
  400. package/lib/types/utils/storage.d.ts +16 -0
  401. package/lib/types/utils/storage.d.ts.map +1 -0
  402. package/lib/types/utils/tokens.d.ts +11 -0
  403. package/lib/types/utils/tokens.d.ts.map +1 -0
  404. package/lib/types/vars.d.ts +3 -0
  405. package/lib/types/vars.d.ts.map +1 -0
  406. package/lib/types/workflows.d.ts +8 -0
  407. package/lib/types/workflows.d.ts.map +1 -0
  408. package/lib/workflows-bundle.js +17213 -0
  409. package/package.json +146 -0
  410. package/src/activities/advanced/createDocumentTypeFromInteractionRun.ts +55 -0
  411. package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +119 -0
  412. package/src/activities/advanced/updateDocumentFromInteractionRun.ts +35 -0
  413. package/src/activities/chunkDocument.ts +146 -0
  414. package/src/activities/copyParentArtifacts.ts +162 -0
  415. package/src/activities/createDocumentFromOther.ts +92 -0
  416. package/src/activities/executeInteraction.ts +300 -0
  417. package/src/activities/extractDocumentText.ts +205 -0
  418. package/src/activities/generateDocumentProperties.ts +120 -0
  419. package/src/activities/generateEmbeddings.ts +387 -0
  420. package/src/activities/generateOrAssignContentType.ts +218 -0
  421. package/src/activities/getObjectFromStore.ts +31 -0
  422. package/src/activities/handleError.ts +25 -0
  423. package/src/activities/index-dsl.ts +25 -0
  424. package/src/activities/index.ts +4 -0
  425. package/src/activities/media/prepareAudio.ts +334 -0
  426. package/src/activities/media/prepareVideo.ts +622 -0
  427. package/src/activities/media/processPdfWithTextract.ts +141 -0
  428. package/src/activities/media/saveGladiaTranscription.ts +128 -0
  429. package/src/activities/media/transcribeMediaWithGladia.ts +117 -0
  430. package/src/activities/notifyWebhook.test.ts +134 -0
  431. package/src/activities/notifyWebhook.ts +199 -0
  432. package/src/activities/rateLimiter.ts +41 -0
  433. package/src/activities/renditions/generateImageRendition.ts +111 -0
  434. package/src/activities/renditions/generateVideoRendition.ts +293 -0
  435. package/src/activities/setDocumentStatus.ts +25 -0
  436. package/src/conversion/TextractProcessor.ts +506 -0
  437. package/src/conversion/image.test.ts +118 -0
  438. package/src/conversion/image.ts +168 -0
  439. package/src/conversion/markitdown.ts +41 -0
  440. package/src/conversion/mutool.test.ts +74 -0
  441. package/src/conversion/mutool.ts +180 -0
  442. package/src/conversion/pandoc.test.ts +24 -0
  443. package/src/conversion/pandoc.ts +40 -0
  444. package/src/dsl/conditions.ts +76 -0
  445. package/src/dsl/dsl-workflow.test.ts +58 -0
  446. package/src/dsl/dsl-workflow.ts +397 -0
  447. package/src/dsl/dslProxyActivities.ts +38 -0
  448. package/src/dsl/ms.d.ts +11 -0
  449. package/src/dsl/projections.test.ts +159 -0
  450. package/src/dsl/projections.ts +72 -0
  451. package/src/dsl/setup/ActivityContext.ts +178 -0
  452. package/src/dsl/setup/fetch/DataProvider.ts +45 -0
  453. package/src/dsl/setup/fetch/index.ts +19 -0
  454. package/src/dsl/setup/fetch/providers.ts +67 -0
  455. package/src/dsl/test/test-child-workflow.ts +6 -0
  456. package/src/dsl/validation.test.ts +257 -0
  457. package/src/dsl/validation.ts +125 -0
  458. package/src/dsl/vars.test.ts +245 -0
  459. package/src/dsl/vars.ts +340 -0
  460. package/src/dsl/walk.test.ts +81 -0
  461. package/src/dsl/walk.ts +103 -0
  462. package/src/dsl/workflow-exec-child.test.ts +273 -0
  463. package/src/dsl/workflow-fetch.test.ts +138 -0
  464. package/src/dsl/workflow-import.test.ts +89 -0
  465. package/src/dsl/workflow.test.ts +122 -0
  466. package/src/dsl.ts +3 -0
  467. package/src/errors.ts +101 -0
  468. package/src/index.ts +41 -0
  469. package/src/iterative-generation/activities/extractToc.ts +63 -0
  470. package/src/iterative-generation/activities/finalizeOutput.ts +100 -0
  471. package/src/iterative-generation/activities/generatePart.ts +123 -0
  472. package/src/iterative-generation/activities/generateToc.ts +116 -0
  473. package/src/iterative-generation/activities/index.ts +4 -0
  474. package/src/iterative-generation/iterativeGenerationWorkflow.ts +68 -0
  475. package/src/iterative-generation/types.ts +99 -0
  476. package/src/iterative-generation/utils.ts +126 -0
  477. package/src/result-types.ts +25 -0
  478. package/src/system/notifyWebhookWorkflow.ts +70 -0
  479. package/src/system/recalculateEmbeddingsWorkflow.ts +41 -0
  480. package/src/utils/auth.ts +10 -0
  481. package/src/utils/blobs.ts +59 -0
  482. package/src/utils/chunks.ts +17 -0
  483. package/src/utils/client.ts +46 -0
  484. package/src/utils/expand-vars.ts +31 -0
  485. package/src/utils/memory.ts +61 -0
  486. package/src/utils/renditions.ts +127 -0
  487. package/src/utils/storage.ts +60 -0
  488. package/src/utils/tokens.ts +44 -0
  489. package/src/vars.ts +3 -0
  490. package/src/workflows.ts +7 -0
@@ -0,0 +1,506 @@
1
+ import { PutObjectCommand, S3Client } from "@aws-sdk/client-s3";
2
+ import type { Block } from "@aws-sdk/client-textract";
3
+ import {
4
+ GetDocumentAnalysisCommand,
5
+ StartDocumentAnalysisCommand,
6
+ TextractClient
7
+ } from "@aws-sdk/client-textract";
8
+ import type { AwsCredentialIdentityProvider } from "@smithy/types";
9
+ import Papa from 'papaparse';
10
+
11
/**
 * Lookup table from a Textract block id to the block itself.
 * Used to resolve the ids listed in a block's `Relationships`.
 */
interface BlocksMap {
    [blockId: string]: Block;
}
14
+
15
/**
 * One piece of page content in reading order: a run of text, a table
 * rendered as CSV, or an image placeholder.
 */
interface ContentBlock {
    /** Discriminator selecting how `content` is rendered in the final output. */
    type: 'text' | 'table' | 'image';
    /** Text, CSV or placeholder string, depending on `type`. */
    content: string;
    /** Average cell confidence; only set for `table` blocks. */
    confidence?: number;

    // Bounding-box geometry; only set for `image` blocks.
    left?: number;
    top?: number;
    width?: number;
    height?: number;
}
25
+
26
/**
 * The content blocks collected for a single page.
 */
interface PageContent {
    /** Page number as reported by the Textract PAGE block (0 when absent). */
    pageNumber: number;
    /** Content blocks in the order they were encountered on the page. */
    blocks: ContentBlock[];
}
30
+
31
/**
 * Construction options for {@link TextractProcessor}.
 */
interface TextractProcessorOptions {
    /** S3 object key used by `upload()` when storing the file. */
    fileKey: string;
    /** AWS region handed to both the Textract and the S3 client. */
    region: string;
    /** S3 bucket used for uploads and as the Textract document location. */
    bucket: string;
    /** Optional credential provider handed to both AWS clients. */
    credentials?: AwsCredentialIdentityProvider;
    /** Optional logger; `upload()` calls its `info(message, data)` method. */
    log?: any;
    /** Enables the image-placeholder heuristic in `processResults()`. Defaults to false. */
    detectImages?: boolean;
    /**
     * When true, each `<table>` tag in the output carries a `confidence`
     * attribute holding the table's average cell confidence.
     * The CSV content itself is not affected. Defaults to false.
     */
    includeConfidenceInTables?: boolean;
}
43
+
44
/**
 * Runs an asynchronous AWS Textract table analysis on a document stored in
 * S3 and converts the resulting blocks into a tagged text representation
 * (`<page>`, `<text>`, `<table>`, `<image>`).
 *
 * Typical flow: `upload()` -> `startAnalysis()` -> poll `checkJobStatus()`
 * -> `processResults()`.
 */
export class TextractProcessor {
    private textractClient: TextractClient;
    private s3Client: S3Client;
    private fileKey: string;
    private bucket: string;
    private log: any;
    private detectImages: boolean;
    /**
     * Whether the `<table>` tag carries a `confidence` attribute.
     */
    private includeConfidenceInTables: boolean;
    /**
     * Per-blocksMap cache of the ids that are children of a table CELL.
     * Avoids rescanning every block for every word. The cache assumes a
     * blocks map is not mutated after its first lookup.
     */
    private tableWordIdsCache = new WeakMap<BlocksMap, Set<string>>();

    constructor({
        fileKey,
        region,
        bucket,
        credentials,
        log,
        detectImages = false,
        includeConfidenceInTables = false
    }: TextractProcessorOptions) {
        this.fileKey = fileKey;
        this.bucket = bucket;
        this.log = log;
        this.detectImages = detectImages;
        this.includeConfidenceInTables = includeConfidenceInTables;

        this.textractClient = new TextractClient({
            region,
            credentials
        });
        this.s3Client = new S3Client({
            region,
            credentials
        });
    }

    /**
     * Concatenates the text of a block's CHILD words, separated by spaces.
     * Selected SELECTION_ELEMENT children are rendered as `X`.
     * Words made only of digits and commas (e.g. `1,000,000`) are wrapped in
     * double quotes.
     *
     * @param result block whose children are rendered (e.g. a CELL)
     * @param blocksMap id -> block lookup used to resolve child ids
     * @returns the trimmed text, or an empty string when there are no children
     */
    private getText(result: Block, blocksMap: BlocksMap): string {
        let text = '';
        for (const relationship of result.Relationships ?? []) {
            if (relationship.Type !== 'CHILD') {
                continue;
            }
            for (const childId of relationship.Ids ?? []) {
                const child = blocksMap[childId];
                if (!child) {
                    // Referenced block is missing from the map; nothing to render.
                    continue;
                }
                if (child.BlockType === 'WORD') {
                    const wordText = child.Text || '';
                    // Strip *every* comma before the digit test; a string
                    // pattern passed to replace() only removes the first one.
                    const isNumberWithCommas = wordText.includes(',')
                        && /^\d+$/.test(wordText.replace(/,/g, ''));
                    text += isNumberWithCommas ? `"${wordText}" ` : `${wordText} `;
                } else if (
                    child.BlockType === 'SELECTION_ELEMENT' &&
                    child.SelectionStatus === 'SELECTED'
                ) {
                    text += 'X ';
                }
            }
        }
        return text.trim();
    }

    /**
     * Tells whether a LINE block has at least one child that belongs to a
     * table cell. Non-LINE blocks always yield false.
     */
    private isBlockInTable(block: Block, blocksMap: BlocksMap): boolean {
        if (block.BlockType !== 'LINE') {
            return false;
        }
        for (const relationship of block.Relationships ?? []) {
            if (relationship.Type !== 'CHILD') {
                continue;
            }
            for (const childId of relationship.Ids ?? []) {
                if (this.isWordInTableCell(blocksMap[childId], blocksMap)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Tells whether the given block is listed as a CHILD of a CELL that is
     * itself a CHILD of a TABLE block. Missing blocks yield false.
     */
    private isWordInTableCell(wordBlock: Block, blocksMap: BlocksMap): boolean {
        const id = wordBlock?.Id;
        if (id === undefined) {
            return false;
        }
        return this.getTableWordIds(blocksMap).has(id);
    }

    /**
     * Collects (once per blocks map) the ids of all blocks that are children
     * of a TABLE -> CELL chain.
     */
    private getTableWordIds(blocksMap: BlocksMap): Set<string> {
        const cached = this.tableWordIdsCache.get(blocksMap);
        if (cached) {
            return cached;
        }
        const ids = new Set<string>();
        for (const table of Object.values(blocksMap)) {
            if (table.BlockType !== 'TABLE') {
                continue;
            }
            for (const tableRel of table.Relationships ?? []) {
                if (tableRel.Type !== 'CHILD') {
                    continue;
                }
                for (const cellId of tableRel.Ids ?? []) {
                    const cell = blocksMap[cellId];
                    if (!cell || cell.BlockType !== 'CELL') {
                        continue;
                    }
                    for (const cellRel of cell.Relationships ?? []) {
                        if (cellRel.Type !== 'CHILD') {
                            continue;
                        }
                        for (const childId of cellRel.Ids ?? []) {
                            ids.add(childId);
                        }
                    }
                }
            }
        }
        this.tableWordIdsCache.set(blocksMap, ids);
        return ids;
    }

    /**
     * Arranges the CELL children of a TABLE block into a dense row/column
     * grid. Positions without a cell are filled with an empty placeholder
     * (`text: ''`, `confidence: 0`); rows without any cell become `[]`.
     *
     * @param tableResult the TABLE block
     * @param blocksMap id -> block lookup used to resolve cell ids
     * @returns the grid, indexed `[row][column]`, zero-based
     */
    private getRowsColumnsMap(
        tableResult: Block,
        blocksMap: BlocksMap
    ): {
        rows: Array<Array<{ text: string; confidence: number }>>;
    } {
        const rows: Array<Array<{ text: string; confidence: number }>> = [];

        for (const relationship of tableResult.Relationships ?? []) {
            if (relationship.Type !== 'CHILD') {
                continue;
            }
            for (const childId of relationship.Ids ?? []) {
                const cell = blocksMap[childId];
                if (!cell || cell.BlockType !== 'CELL') {
                    continue;
                }
                // `||` (not `??`) on purpose: an index of 0 must also fall back to 1.
                const rowIndex = cell.RowIndex || 1;
                const colIndex = cell.ColumnIndex || 1;

                const row = rows[rowIndex - 1] ?? (rows[rowIndex - 1] = []);

                // Pad skipped columns so the assignment below never creates holes.
                while (row.length < colIndex - 1) {
                    row.push({ text: '', confidence: 0 });
                }
                row[colIndex - 1] = {
                    text: this.getText(cell, blocksMap),
                    confidence: cell.Confidence ?? 0
                };
            }
        }

        // Rows that never received a cell would otherwise stay as array holes
        // and break consumers that iterate over every row.
        for (let r = 0; r < rows.length; r++) {
            if (!rows[r]) {
                rows[r] = [];
            }
        }

        return { rows };
    }

    /**
     * Renders a TABLE block as CSV (every field quoted) and computes the
     * average confidence over all grid positions. Placeholder positions
     * count with confidence 0.
     *
     * @param tableResult the TABLE block
     * @param blocksMap id -> block lookup
     * @param _tableIndex unused; kept for signature compatibility
     * @param _pageNumber unused; kept for signature compatibility
     */
    private generateTableCSV(
        tableResult: Block,
        blocksMap: BlocksMap,
        _tableIndex: number,
        _pageNumber: number
    ): { csv: string; tableConfidence: number } {
        const { rows } = this.getRowsColumnsMap(tableResult, blocksMap);

        let totalConfidence = 0;
        let cellCount = 0;

        const csvData: string[][] = [];
        for (const row of rows) {
            const rowData: string[] = [];
            for (const cell of row ?? []) {
                rowData.push(cell.text.trim());
                totalConfidence += cell.confidence;
                cellCount++;
            }
            csvData.push(rowData);
        }

        const tableConfidence = cellCount > 0 ? (totalConfidence / cellCount) : 0;

        const csv = Papa.unparse(csvData, {
            delimiter: ',',
            quotes: true,
            quoteChar: '"',
            escapeChar: '"',
            header: false,
            newline: '\n',
            skipEmptyLines: false
        });

        return { csv, tableConfidence };
    }

    /**
     * Uploads the given buffer to `bucket` under `fileKey`.
     */
    async upload(fileBuf: Buffer): Promise<void> {
        // `log` is optional in the constructor options, so guard the call.
        this.log?.info('Uploading file to S3', { fileKey: this.fileKey });
        const command = new PutObjectCommand({
            Bucket: this.bucket,
            Key: this.fileKey,
            Body: fileBuf,
        });
        await this.s3Client.send(command);
    }

    /**
     * Starts an asynchronous Textract document analysis (TABLES feature) for
     * an object in the configured bucket.
     *
     * @param s3Key key of the document inside `bucket`
     * @returns the Textract job id
     * @throws Error when Textract does not return a job id
     */
    async startAnalysis(s3Key: string): Promise<string> {
        const command = new StartDocumentAnalysisCommand({
            DocumentLocation: {
                S3Object: {
                    Bucket: this.bucket,
                    Name: s3Key
                }
            },
            FeatureTypes: ["TABLES"]
        });
        const response = await this.textractClient.send(command);
        if (!response.JobId) {
            throw new Error(`Textract did not return a JobId for ${s3Key}`);
        }
        return response.JobId;
    }

    /**
     * Fetches the current status of an analysis job.
     */
    async checkJobStatus(jobId: string): Promise<string> {
        const command = new GetDocumentAnalysisCommand({ JobId: jobId });
        const response = await this.textractClient.send(command);
        return response.JobStatus!;
    }

    /**
     * Builds an `[IMAGE_<position>]` placeholder from a block's bounding box.
     * Returns an empty string when there is no geometry or when the box
     * covers less than 5% of the page area.
     */
    private getImagePlaceholder(block: Block): string {
        const geometry = block.Geometry?.BoundingBox;
        if (!geometry) return '';
        const area = (geometry.Width || 0) * (geometry.Height || 0);
        if (area < 0.05) return ''; // skip small images

        const top = geometry.Top || 0;
        const left = geometry.Left || 0;

        let position = '';
        if (top < 0.3) position += 'TOP_';
        else if (top > 0.7) position += 'BOTTOM_';

        if (left < 0.3) position += 'LEFT';
        else if (left > 0.7) position += 'RIGHT';
        else position += 'CENTER';

        return `[IMAGE_${position}]\n`;
    }

    /**
     * Maps the left edge of a block to an indentation level (0, 1 or 2).
     */
    private getIndentationLevel(block: Block): number {
        const left = block.Geometry?.BoundingBox?.Left || 0;
        if (left < 0.15) return 0;
        if (left < 0.25) return 1;
        return 2;
    }

    /**
     * A block is treated as a header when it is the first line, or when the
     * vertical gap to the previous line exceeds 0.03 of the page height.
     */
    private isLikelyHeader(block: Block, prevBlock: Block | null): boolean {
        if (!prevBlock) return true;
        const prevBox = prevBlock.Geometry?.BoundingBox;
        const prevBottom = (prevBox?.Top || 0) + (prevBox?.Height || 0);
        const gap = (block.Geometry?.BoundingBox?.Top || 0) - prevBottom;
        return gap > 0.03;
    }

    /**
     * Formats a LINE block with indentation, a trailing newline, and a
     * leading newline when it looks like a header.
     */
    private formatTextBlock(block: Block, prevBlock: Block | null): string {
        const text = block.Text || '';
        const indent = ' '.repeat(this.getIndentationLevel(block));

        if (this.isLikelyHeader(block, prevBlock)) {
            return `\n${indent}${text}\n`;
        }
        return `${indent}${text}\n`;
    }

    /**
     * Two lines are merged into one text block when the vertical gap between
     * them is below 0.02 of the page height (overlapping lines included).
     */
    private shouldMergeLines(prev: Block, current: Block): boolean {
        const prevBox = prev.Geometry?.BoundingBox;
        const prevBottom = (prevBox?.Top || 0) + (prevBox?.Height || 0);
        const currentTop = current.Geometry?.BoundingBox?.Top || 0;
        return currentTop - prevBottom < 0.02;
    }

    /**
     * Downloads all result blocks of an analysis job and renders them as
     * tagged text, page by page.
     *
     * The job status is not checked here.
     * NOTE(review): callers presumably poll `checkJobStatus()` until the job
     * has succeeded before calling this — confirm against the caller.
     *
     * @param jobId id returned by `startAnalysis()`
     * @returns the document as `<page>` sections containing `<text>`,
     *          `<table>` and (optionally) `<image>` elements
     */
    async processResults(jobId: string): Promise<string> {
        const allBlocks = await this.fetchAllBlocks(jobId);

        const blocksMap: BlocksMap = {};
        for (const block of allBlocks) {
            if (block.Id !== undefined) {
                blocksMap[block.Id] = block;
            }
        }

        const pages = this.buildPageContents(allBlocks, blocksMap);
        return this.renderPages(pages);
    }

    /** Follows `NextToken` pagination until every result block is fetched. */
    private async fetchAllBlocks(jobId: string): Promise<Block[]> {
        const blocks: Block[] = [];
        let nextToken: string | undefined;
        do {
            const response = await this.textractClient.send(
                new GetDocumentAnalysisCommand({
                    JobId: jobId,
                    NextToken: nextToken
                })
            );
            blocks.push(...(response.Blocks ?? []));
            nextToken = response.NextToken;
        } while (nextToken);
        return blocks;
    }

    /** Groups blocks into per-page text, table and image content blocks. */
    private buildPageContents(allBlocks: Block[], blocksMap: BlocksMap): PageContent[] {
        const pageOf = (b: Block): number => b.Page ?? 0;
        const topOf = (b: Block): number => b.Geometry?.BoundingBox?.Top ?? 0;

        // Order by page, then PAGE block first, then top-to-bottom. Putting
        // the PAGE block first guarantees the page exists before its content
        // is visited, even if some block shares or precedes its Top value.
        const ordered = [...allBlocks].sort((a, b) => {
            const pageDelta = pageOf(a) - pageOf(b);
            if (pageDelta !== 0) return pageDelta;
            const aIsPage = a.BlockType === 'PAGE';
            const bIsPage = b.BlockType === 'PAGE';
            if (aIsPage !== bIsPage) return aIsPage ? -1 : 1;
            return topOf(a) - topOf(b);
        });

        const pages: PageContent[] = [];
        let currentPage: PageContent | null = null;
        let pendingText = '';
        let prevLineBlock: Block | null = null;

        // Moves the text accumulated so far into the given page.
        const flushPendingText = (page: PageContent | null): void => {
            if (page && pendingText.trim().length > 0) {
                page.blocks.push({ type: 'text', content: pendingText });
            }
            pendingText = '';
        };

        for (const block of ordered) {
            if (block.BlockType === 'PAGE') {
                flushPendingText(currentPage);
                if (currentPage) {
                    pages.push(currentPage);
                }
                currentPage = { pageNumber: pageOf(block), blocks: [] };
                prevLineBlock = null;
                continue;
            }

            // Compare normalized page numbers so that a missing `Page` matches
            // a page whose number was defaulted to 0.
            if (!currentPage || pageOf(block) !== currentPage.pageNumber) {
                continue;
            }
            const page: PageContent = currentPage;

            if (block.BlockType === 'TABLE') {
                flushPendingText(page);
                const { csv, tableConfidence } = this.generateTableCSV(
                    block,
                    blocksMap,
                    page.blocks.filter(b => b.type === 'table').length + 1,
                    page.pageNumber
                );
                page.blocks.push({
                    type: 'table',
                    content: csv,
                    confidence: tableConfidence
                });
                prevLineBlock = null;
            } else if (block.BlockType === 'LINE' && !this.isBlockInTable(block, blocksMap)) {
                // Both paths trim the formatted line, so the indentation and
                // header newline from formatTextBlock never reach the output.
                const lineText = this.formatTextBlock(block, prevLineBlock).trim();
                if (prevLineBlock && this.shouldMergeLines(prevLineBlock, block)) {
                    pendingText += " " + lineText;
                } else {
                    flushPendingText(page);
                    pendingText = lineText;
                }
                prevLineBlock = block;
            } else if (this.detectImages) {
                // NOTE(review): this branch sees every block that is neither a
                // TABLE nor a free-standing LINE (e.g. WORD or CELL blocks), so
                // any sufficiently large one yields an image placeholder —
                // confirm this heuristic is intended.
                const geometry = block.Geometry?.BoundingBox;
                if (geometry && geometry.Width && geometry.Height) {
                    const imagePlaceholder = this.getImagePlaceholder(block);
                    if (imagePlaceholder) {
                        flushPendingText(page);
                        page.blocks.push({
                            type: 'image',
                            content: imagePlaceholder,
                            left: geometry.Left,
                            top: geometry.Top,
                            width: geometry.Width,
                            height: geometry.Height
                        });
                    }
                }
                // prevLineBlock is intentionally left untouched here.
            }
        }

        if (currentPage) {
            flushPendingText(currentPage);
            pages.push(currentPage);
        }
        return pages;
    }

    /** Serializes the collected pages into the tagged output string. */
    private renderPages(pages: PageContent[]): string {
        // A coordinate of exactly 0 is valid, so test for undefined rather
        // than truthiness.
        const geometryAttr = (name: string, value?: number): string =>
            value !== undefined ? ` ${name}="${value.toFixed(4)}"` : '';

        let fullText = '';
        let imgNumber = 1;
        let tableNumber = 1;
        for (const page of pages) {
            fullText += `<page number="${page.pageNumber}">\n`;
            for (const block of page.blocks) {
                if (block.type === 'text') {
                    fullText += `<text>\n${block.content}\n</text>\n\n`;
                } else if (block.type === 'table') {
                    const confidenceAttr = block.confidence !== undefined && this.includeConfidenceInTables
                        ? ` confidence="${block.confidence.toFixed(2)}"`
                        : '';
                    // Tag format (unquoted number, spacing) kept as-is for
                    // compatibility with existing consumers of the output.
                    fullText += `<table number=${tableNumber++} type="csv" ${confidenceAttr}>\n`;
                    fullText += `${block.content}\n`;
                    fullText += `</table>\n\n`;
                } else if (block.type === 'image') {
                    const leftAttr = geometryAttr('left', block.left);
                    const topAttr = geometryAttr('top', block.top);
                    const widthAttr = geometryAttr('width', block.width);
                    const heightAttr = geometryAttr('height', block.height);

                    fullText += `<image id="${imgNumber++}" ${leftAttr}${topAttr}${widthAttr}${heightAttr}>\n${block.content.trim()}\n</image>\n\n`;
                }
            }
            fullText += `</page>\n\n`;
        }

        return fullText;
    }

}
@@ -0,0 +1,118 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import { exec } from "child_process";
4
+ import { promisify } from "util";
5
+ import { expect, test, vi, describe } from "vitest";
6
+
7
+ // Mock Temporal activity context
8
// Replace the Temporal activity module with a stub logger so the code under
// test can log without a running activity context.
vi.mock("@temporalio/activity", () => {
    const log = {
        info: vi.fn(),
        warn: vi.fn(),
        error: vi.fn(),
    };
    return { log };
});
15
+
16
+ // Import after mocking
17
+ import { imageResizer } from "../conversion/image";
18
+
19
const execAsync = promisify(exec);

// Fixture shared by every test that needs a real input image.
const FIXTURE_IMAGE = path.join(__dirname, "../../fixtures", "cat-picture.jpg");

/**
 * Runs ImageMagick `identify` on a file and returns its trimmed output.
 *
 * @param imagePath file to inspect
 * @param format `identify -format` string, e.g. "%w %h %m"
 */
async function identify(imagePath: string, format: string): Promise<string> {
    const { stdout } = await execAsync(`identify -format "${format}" "${imagePath}"`);
    return stdout.trim();
}

describe("ImageMagick image resizing", () => {
    test("should resize an image to a maximum height or width using ImageMagick", async () => {
        const max_hw = 1596;
        const format = "jpeg";

        // Make sure the input file exists
        expect(fs.existsSync(FIXTURE_IMAGE)).toBe(true);

        const resizedImagePath = await imageResizer(FIXTURE_IMAGE, max_hw, format);

        // Make sure the output file exists
        expect(fs.existsSync(resizedImagePath)).toBe(true);

        const [width, height, imageFormat] = (await identify(resizedImagePath, "%w %h %m")).split(" ");

        console.log({ width, height, imageFormat });

        // Neither dimension may exceed the requested maximum
        expect(Number.parseInt(width, 10)).to.be.lessThanOrEqual(max_hw);
        expect(Number.parseInt(height, 10)).to.be.lessThanOrEqual(max_hw);

        // Check format (JPEG)
        expect(imageFormat.toLowerCase()).to.equal("jpeg");
    });

    test("should throw an error for non-existent input file", async () => {
        const max_hw = 1596;
        const format = "jpeg";
        const nonExistentPath = path.join(__dirname, "non-existent-image.jpg");

        // Verify file doesn't exist
        expect(fs.existsSync(nonExistentPath)).toBe(false);

        await expect(imageResizer(nonExistentPath, max_hw, format)).rejects.toThrow("Input file does not exist");
    });

    test("should throw error with empty format", async () => {
        const max_hw = 1596;
        const format = "";

        // An empty format string must be rejected by validation
        await expect(imageResizer(FIXTURE_IMAGE, max_hw, format)).rejects.toThrow("Invalid format");
    });

    test("should create progressive/interlaced image when enabled", async () => {
        const max_hw = 800;
        const format = "jpeg";

        // Make sure the input file exists
        expect(fs.existsSync(FIXTURE_IMAGE)).toBe(true);

        // progressive = true
        const resizedImagePath = await imageResizer(FIXTURE_IMAGE, max_hw, format, true);

        // Make sure the output file exists
        expect(fs.existsSync(resizedImagePath)).toBe(true);

        const interlaceMode = await identify(resizedImagePath, "%[interlace]");

        console.log({ interlaceMode });

        // Any of the interlace modes 'JPEG', 'Line' or 'Plane' counts as progressive
        expect(["JPEG", "Line", "Plane"]).to.include(interlaceMode);
    });

    test("should create non-interlaced image when progressive is disabled", async () => {
        const max_hw = 800;
        const format = "jpeg";

        // Make sure the input file exists
        expect(fs.existsSync(FIXTURE_IMAGE)).toBe(true);

        // progressive = false
        const resizedImagePath = await imageResizer(FIXTURE_IMAGE, max_hw, format, false);

        // Make sure the output file exists
        expect(fs.existsSync(resizedImagePath)).toBe(true);

        const interlaceMode = (await identify(resizedImagePath, "%[interlace]")).toLowerCase();

        console.log({ interlaceMode });

        // Check that interlace is disabled (should be 'none' or empty string)
        expect(["none", ""]).to.include(interlaceMode);
    });
});