@vertesia/workflow 0.51.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (387)
  1. package/package.json +9 -6
  2. package/src/activities/advanced/createOrUpdateDocumentFromInteractionRun.ts +20 -1
  3. package/src/activities/chunkDocument.ts +62 -42
  4. package/src/activities/createDocumentFromOther.ts +2 -2
  5. package/src/activities/executeInteraction.ts +92 -47
  6. package/src/activities/extractDocumentText.ts +91 -54
  7. package/src/activities/generateDocumentProperties.ts +37 -16
  8. package/src/activities/generateEmbeddings.ts +91 -79
  9. package/src/activities/generateImageRendition.ts +127 -59
  10. package/src/activities/generateOrAssignContentType.ts +52 -32
  11. package/src/activities/getObjectFromStore.ts +1 -1
  12. package/src/activities/handleError.ts +25 -0
  13. package/src/activities/index-dsl.ts +1 -0
  14. package/src/activities/index.ts +0 -1
  15. package/src/activities/media/processPdfWithTextract.ts +4 -4
  16. package/src/activities/media/transcribeMediaWithGladia.ts +1 -1
  17. package/src/activities/notifyWebhook.ts +2 -2
  18. package/src/activities/setDocumentStatus.ts +1 -1
  19. package/src/conversion/TextractProcessor.ts +9 -9
  20. package/src/conversion/image.test.ts +110 -18
  21. package/src/conversion/image.ts +96 -15
  22. package/src/conversion/markitdown.ts +41 -0
  23. package/src/conversion/mutool.ts +1 -1
  24. package/src/conversion/pandoc.test.ts +8 -6
  25. package/src/conversion/pandoc.ts +38 -42
  26. package/src/dsl/dsl-workflow.ts +80 -12
  27. package/src/dsl/setup/ActivityContext.ts +57 -16
  28. package/src/dsl/validation.test.ts +2 -2
  29. package/src/dsl/vars.test.ts +1 -1
  30. package/src/dsl/vars.ts +6 -6
  31. package/src/dsl/workflow-exec-child.test.ts +14 -4
  32. package/src/dsl/workflow-fetch.test.ts +1 -1
  33. package/src/dsl/workflow-import.test.ts +1 -1
  34. package/src/dsl/workflow.test.ts +12 -2
  35. package/src/dsl.ts +1 -1
  36. package/src/errors.ts +27 -6
  37. package/src/index.ts +1 -1
  38. package/src/iterative-generation/activities/extractToc.ts +1 -1
  39. package/src/iterative-generation/activities/generatePart.ts +2 -2
  40. package/src/iterative-generation/activities/generateToc.ts +1 -1
  41. package/src/iterative-generation/iterativeGenerationWorkflow.ts +3 -2
  42. package/src/iterative-generation/types.ts +4 -4
  43. package/src/iterative-generation/utils.ts +4 -4
  44. package/src/system/notifyWebhookWorkflow.ts +2 -1
  45. package/src/system/recalculateEmbeddingsWorkflow.ts +2 -2
  46. package/src/utils/blobs.ts +11 -6
  47. package/src/utils/chunks.ts +17 -0
  48. package/src/utils/client.ts +4 -3
  49. package/src/utils/memory.ts +3 -8
  50. package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js +0 -32
  51. package/lib/cjs/activities/advanced/createDocumentTypeFromInteractionRun.js.map +0 -1
  52. package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +0 -66
  53. package/lib/cjs/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +0 -1
  54. package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js +0 -18
  55. package/lib/cjs/activities/advanced/updateDocumentFromInteractionRun.js.map +0 -1
  56. package/lib/cjs/activities/chunkDocument.js +0 -79
  57. package/lib/cjs/activities/chunkDocument.js.map +0 -1
  58. package/lib/cjs/activities/createDocumentFromOther.js +0 -64
  59. package/lib/cjs/activities/createDocumentFromOther.js.map +0 -1
  60. package/lib/cjs/activities/executeInteraction.js +0 -134
  61. package/lib/cjs/activities/executeInteraction.js.map +0 -1
  62. package/lib/cjs/activities/extractDocumentText.js +0 -135
  63. package/lib/cjs/activities/extractDocumentText.js.map +0 -1
  64. package/lib/cjs/activities/generateDocumentProperties.js +0 -59
  65. package/lib/cjs/activities/generateDocumentProperties.js.map +0 -1
  66. package/lib/cjs/activities/generateEmbeddings.js +0 -292
  67. package/lib/cjs/activities/generateEmbeddings.js.map +0 -1
  68. package/lib/cjs/activities/generateImageRendition.js +0 -104
  69. package/lib/cjs/activities/generateImageRendition.js.map +0 -1
  70. package/lib/cjs/activities/generateOrAssignContentType.js +0 -103
  71. package/lib/cjs/activities/generateOrAssignContentType.js.map +0 -1
  72. package/lib/cjs/activities/getObjectFromStore.js +0 -20
  73. package/lib/cjs/activities/getObjectFromStore.js.map +0 -1
  74. package/lib/cjs/activities/index-dsl.js +0 -37
  75. package/lib/cjs/activities/index-dsl.js.map +0 -1
  76. package/lib/cjs/activities/index.js +0 -22
  77. package/lib/cjs/activities/index.js.map +0 -1
  78. package/lib/cjs/activities/media/processPdfWithTextract.js +0 -102
  79. package/lib/cjs/activities/media/processPdfWithTextract.js.map +0 -1
  80. package/lib/cjs/activities/media/transcribeMediaWithGladia.js +0 -51
  81. package/lib/cjs/activities/media/transcribeMediaWithGladia.js.map +0 -1
  82. package/lib/cjs/activities/notifyWebhook.js +0 -34
  83. package/lib/cjs/activities/notifyWebhook.js.map +0 -1
  84. package/lib/cjs/activities/setDocumentStatus.js +0 -15
  85. package/lib/cjs/activities/setDocumentStatus.js.map +0 -1
  86. package/lib/cjs/conversion/TextractProcessor.js +0 -417
  87. package/lib/cjs/conversion/TextractProcessor.js.map +0 -1
  88. package/lib/cjs/conversion/image.js +0 -22
  89. package/lib/cjs/conversion/image.js.map +0 -1
  90. package/lib/cjs/conversion/mutool.js +0 -147
  91. package/lib/cjs/conversion/mutool.js.map +0 -1
  92. package/lib/cjs/conversion/pandoc.js +0 -39
  93. package/lib/cjs/conversion/pandoc.js.map +0 -1
  94. package/lib/cjs/dsl/conditions.js +0 -81
  95. package/lib/cjs/dsl/conditions.js.map +0 -1
  96. package/lib/cjs/dsl/dsl-workflow.js +0 -223
  97. package/lib/cjs/dsl/dsl-workflow.js.map +0 -1
  98. package/lib/cjs/dsl/dslProxyActivities.js +0 -23
  99. package/lib/cjs/dsl/dslProxyActivities.js.map +0 -1
  100. package/lib/cjs/dsl/projections.js +0 -59
  101. package/lib/cjs/dsl/projections.js.map +0 -1
  102. package/lib/cjs/dsl/setup/ActivityContext.js +0 -96
  103. package/lib/cjs/dsl/setup/ActivityContext.js.map +0 -1
  104. package/lib/cjs/dsl/setup/fetch/DataProvider.js +0 -51
  105. package/lib/cjs/dsl/setup/fetch/DataProvider.js.map +0 -1
  106. package/lib/cjs/dsl/setup/fetch/index.js +0 -16
  107. package/lib/cjs/dsl/setup/fetch/index.js.map +0 -1
  108. package/lib/cjs/dsl/setup/fetch/providers.js +0 -67
  109. package/lib/cjs/dsl/setup/fetch/providers.js.map +0 -1
  110. package/lib/cjs/dsl/test/test-child-workflow.js +0 -10
  111. package/lib/cjs/dsl/test/test-child-workflow.js.map +0 -1
  112. package/lib/cjs/dsl/validation.js +0 -122
  113. package/lib/cjs/dsl/validation.js.map +0 -1
  114. package/lib/cjs/dsl/vars.js +0 -341
  115. package/lib/cjs/dsl/vars.js.map +0 -1
  116. package/lib/cjs/dsl/walk.js +0 -100
  117. package/lib/cjs/dsl/walk.js.map +0 -1
  118. package/lib/cjs/dsl.js +0 -20
  119. package/lib/cjs/dsl.js.map +0 -1
  120. package/lib/cjs/errors.js +0 -36
  121. package/lib/cjs/errors.js.map +0 -1
  122. package/lib/cjs/index.js +0 -50
  123. package/lib/cjs/index.js.map +0 -1
  124. package/lib/cjs/iterative-generation/activities/extractToc.js +0 -47
  125. package/lib/cjs/iterative-generation/activities/extractToc.js.map +0 -1
  126. package/lib/cjs/iterative-generation/activities/finalizeOutput.js +0 -69
  127. package/lib/cjs/iterative-generation/activities/finalizeOutput.js.map +0 -1
  128. package/lib/cjs/iterative-generation/activities/generatePart.js +0 -73
  129. package/lib/cjs/iterative-generation/activities/generatePart.js.map +0 -1
  130. package/lib/cjs/iterative-generation/activities/generateToc.js +0 -91
  131. package/lib/cjs/iterative-generation/activities/generateToc.js.map +0 -1
  132. package/lib/cjs/iterative-generation/activities/index.js +0 -12
  133. package/lib/cjs/iterative-generation/activities/index.js.map +0 -1
  134. package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js +0 -55
  135. package/lib/cjs/iterative-generation/iterativeGenerationWorkflow.js.map +0 -1
  136. package/lib/cjs/iterative-generation/types.js +0 -5
  137. package/lib/cjs/iterative-generation/types.js.map +0 -1
  138. package/lib/cjs/iterative-generation/utils.js +0 -121
  139. package/lib/cjs/iterative-generation/utils.js.map +0 -1
  140. package/lib/cjs/package.json +0 -3
  141. package/lib/cjs/result-types.js +0 -10
  142. package/lib/cjs/result-types.js.map +0 -1
  143. package/lib/cjs/system/notifyWebhookWorkflow.js +0 -46
  144. package/lib/cjs/system/notifyWebhookWorkflow.js.map +0 -1
  145. package/lib/cjs/system/recalculateEmbeddingsWorkflow.js +0 -28
  146. package/lib/cjs/system/recalculateEmbeddingsWorkflow.js.map +0 -1
  147. package/lib/cjs/utils/auth.js +0 -15
  148. package/lib/cjs/utils/auth.js.map +0 -1
  149. package/lib/cjs/utils/blobs.js +0 -63
  150. package/lib/cjs/utils/blobs.js.map +0 -1
  151. package/lib/cjs/utils/client.js +0 -25
  152. package/lib/cjs/utils/client.js.map +0 -1
  153. package/lib/cjs/utils/expand-vars.js +0 -33
  154. package/lib/cjs/utils/expand-vars.js.map +0 -1
  155. package/lib/cjs/utils/memory.js +0 -72
  156. package/lib/cjs/utils/memory.js.map +0 -1
  157. package/lib/cjs/utils/tokens.js +0 -38
  158. package/lib/cjs/utils/tokens.js.map +0 -1
  159. package/lib/cjs/vars.js +0 -20
  160. package/lib/cjs/vars.js.map +0 -1
  161. package/lib/cjs/workflows.js +0 -15
  162. package/lib/cjs/workflows.js.map +0 -1
  163. package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js +0 -29
  164. package/lib/esm/activities/advanced/createDocumentTypeFromInteractionRun.js.map +0 -1
  165. package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js +0 -63
  166. package/lib/esm/activities/advanced/createOrUpdateDocumentFromInteractionRun.js.map +0 -1
  167. package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js +0 -15
  168. package/lib/esm/activities/advanced/updateDocumentFromInteractionRun.js.map +0 -1
  169. package/lib/esm/activities/chunkDocument.js +0 -76
  170. package/lib/esm/activities/chunkDocument.js.map +0 -1
  171. package/lib/esm/activities/createDocumentFromOther.js +0 -58
  172. package/lib/esm/activities/createDocumentFromOther.js.map +0 -1
  173. package/lib/esm/activities/executeInteraction.js +0 -130
  174. package/lib/esm/activities/executeInteraction.js.map +0 -1
  175. package/lib/esm/activities/extractDocumentText.js +0 -132
  176. package/lib/esm/activities/extractDocumentText.js.map +0 -1
  177. package/lib/esm/activities/generateDocumentProperties.js +0 -56
  178. package/lib/esm/activities/generateDocumentProperties.js.map +0 -1
  179. package/lib/esm/activities/generateEmbeddings.js +0 -256
  180. package/lib/esm/activities/generateEmbeddings.js.map +0 -1
  181. package/lib/esm/activities/generateImageRendition.js +0 -98
  182. package/lib/esm/activities/generateImageRendition.js.map +0 -1
  183. package/lib/esm/activities/generateOrAssignContentType.js +0 -100
  184. package/lib/esm/activities/generateOrAssignContentType.js.map +0 -1
  185. package/lib/esm/activities/getObjectFromStore.js +0 -17
  186. package/lib/esm/activities/getObjectFromStore.js.map +0 -1
  187. package/lib/esm/activities/index-dsl.js +0 -18
  188. package/lib/esm/activities/index-dsl.js.map +0 -1
  189. package/lib/esm/activities/index.js +0 -6
  190. package/lib/esm/activities/index.js.map +0 -1
  191. package/lib/esm/activities/media/processPdfWithTextract.js +0 -98
  192. package/lib/esm/activities/media/processPdfWithTextract.js.map +0 -1
  193. package/lib/esm/activities/media/transcribeMediaWithGladia.js +0 -48
  194. package/lib/esm/activities/media/transcribeMediaWithGladia.js.map +0 -1
  195. package/lib/esm/activities/notifyWebhook.js +0 -31
  196. package/lib/esm/activities/notifyWebhook.js.map +0 -1
  197. package/lib/esm/activities/setDocumentStatus.js +0 -12
  198. package/lib/esm/activities/setDocumentStatus.js.map +0 -1
  199. package/lib/esm/conversion/TextractProcessor.js +0 -410
  200. package/lib/esm/conversion/TextractProcessor.js.map +0 -1
  201. package/lib/esm/conversion/image.js +0 -16
  202. package/lib/esm/conversion/image.js.map +0 -1
  203. package/lib/esm/conversion/mutool.js +0 -139
  204. package/lib/esm/conversion/mutool.js.map +0 -1
  205. package/lib/esm/conversion/pandoc.js +0 -36
  206. package/lib/esm/conversion/pandoc.js.map +0 -1
  207. package/lib/esm/dsl/conditions.js +0 -75
  208. package/lib/esm/dsl/conditions.js.map +0 -1
  209. package/lib/esm/dsl/dsl-workflow.js +0 -216
  210. package/lib/esm/dsl/dsl-workflow.js.map +0 -1
  211. package/lib/esm/dsl/dslProxyActivities.js +0 -20
  212. package/lib/esm/dsl/dslProxyActivities.js.map +0 -1
  213. package/lib/esm/dsl/projections.js +0 -55
  214. package/lib/esm/dsl/projections.js.map +0 -1
  215. package/lib/esm/dsl/setup/ActivityContext.js +0 -91
  216. package/lib/esm/dsl/setup/ActivityContext.js.map +0 -1
  217. package/lib/esm/dsl/setup/fetch/DataProvider.js +0 -47
  218. package/lib/esm/dsl/setup/fetch/DataProvider.js.map +0 -1
  219. package/lib/esm/dsl/setup/fetch/index.js +0 -12
  220. package/lib/esm/dsl/setup/fetch/index.js.map +0 -1
  221. package/lib/esm/dsl/setup/fetch/providers.js +0 -61
  222. package/lib/esm/dsl/setup/fetch/providers.js.map +0 -1
  223. package/lib/esm/dsl/test/test-child-workflow.js +0 -5
  224. package/lib/esm/dsl/test/test-child-workflow.js.map +0 -1
  225. package/lib/esm/dsl/validation.js +0 -118
  226. package/lib/esm/dsl/validation.js.map +0 -1
  227. package/lib/esm/dsl/vars.js +0 -335
  228. package/lib/esm/dsl/vars.js.map +0 -1
  229. package/lib/esm/dsl/walk.js +0 -96
  230. package/lib/esm/dsl/walk.js.map +0 -1
  231. package/lib/esm/dsl.js +0 -4
  232. package/lib/esm/dsl.js.map +0 -1
  233. package/lib/esm/errors.js +0 -30
  234. package/lib/esm/errors.js.map +0 -1
  235. package/lib/esm/index.js +0 -32
  236. package/lib/esm/index.js.map +0 -1
  237. package/lib/esm/iterative-generation/activities/extractToc.js +0 -44
  238. package/lib/esm/iterative-generation/activities/extractToc.js.map +0 -1
  239. package/lib/esm/iterative-generation/activities/finalizeOutput.js +0 -66
  240. package/lib/esm/iterative-generation/activities/finalizeOutput.js.map +0 -1
  241. package/lib/esm/iterative-generation/activities/generatePart.js +0 -70
  242. package/lib/esm/iterative-generation/activities/generatePart.js.map +0 -1
  243. package/lib/esm/iterative-generation/activities/generateToc.js +0 -88
  244. package/lib/esm/iterative-generation/activities/generateToc.js.map +0 -1
  245. package/lib/esm/iterative-generation/activities/index.js +0 -5
  246. package/lib/esm/iterative-generation/activities/index.js.map +0 -1
  247. package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js +0 -52
  248. package/lib/esm/iterative-generation/iterativeGenerationWorkflow.js.map +0 -1
  249. package/lib/esm/iterative-generation/types.js +0 -2
  250. package/lib/esm/iterative-generation/types.js.map +0 -1
  251. package/lib/esm/iterative-generation/utils.js +0 -112
  252. package/lib/esm/iterative-generation/utils.js.map +0 -1
  253. package/lib/esm/result-types.js +0 -7
  254. package/lib/esm/result-types.js.map +0 -1
  255. package/lib/esm/system/notifyWebhookWorkflow.js +0 -43
  256. package/lib/esm/system/notifyWebhookWorkflow.js.map +0 -1
  257. package/lib/esm/system/recalculateEmbeddingsWorkflow.js +0 -25
  258. package/lib/esm/system/recalculateEmbeddingsWorkflow.js.map +0 -1
  259. package/lib/esm/utils/auth.js +0 -8
  260. package/lib/esm/utils/auth.js.map +0 -1
  261. package/lib/esm/utils/blobs.js +0 -52
  262. package/lib/esm/utils/blobs.js.map +0 -1
  263. package/lib/esm/utils/client.js +0 -22
  264. package/lib/esm/utils/client.js.map +0 -1
  265. package/lib/esm/utils/expand-vars.js +0 -30
  266. package/lib/esm/utils/expand-vars.js.map +0 -1
  267. package/lib/esm/utils/memory.js +0 -60
  268. package/lib/esm/utils/memory.js.map +0 -1
  269. package/lib/esm/utils/tokens.js +0 -34
  270. package/lib/esm/utils/tokens.js.map +0 -1
  271. package/lib/esm/vars.js +0 -4
  272. package/lib/esm/vars.js.map +0 -1
  273. package/lib/esm/workflows.js +0 -8
  274. package/lib/esm/workflows.js.map +0 -1
  275. package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts +0 -17
  276. package/lib/types/activities/advanced/createDocumentTypeFromInteractionRun.d.ts.map +0 -1
  277. package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts +0 -29
  278. package/lib/types/activities/advanced/createOrUpdateDocumentFromInteractionRun.d.ts.map +0 -1
  279. package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts +0 -19
  280. package/lib/types/activities/advanced/updateDocumentFromInteractionRun.d.ts.map +0 -1
  281. package/lib/types/activities/chunkDocument.d.ts +0 -18
  282. package/lib/types/activities/chunkDocument.d.ts.map +0 -1
  283. package/lib/types/activities/createDocumentFromOther.d.ts +0 -21
  284. package/lib/types/activities/createDocumentFromOther.d.ts.map +0 -1
  285. package/lib/types/activities/executeInteraction.d.ts +0 -44
  286. package/lib/types/activities/executeInteraction.d.ts.map +0 -1
  287. package/lib/types/activities/extractDocumentText.d.ts +0 -10
  288. package/lib/types/activities/extractDocumentText.d.ts.map +0 -1
  289. package/lib/types/activities/generateDocumentProperties.d.ts +0 -32
  290. package/lib/types/activities/generateDocumentProperties.d.ts.map +0 -1
  291. package/lib/types/activities/generateEmbeddings.d.ts +0 -49
  292. package/lib/types/activities/generateEmbeddings.d.ts.map +0 -1
  293. package/lib/types/activities/generateImageRendition.d.ts +0 -17
  294. package/lib/types/activities/generateImageRendition.d.ts.map +0 -1
  295. package/lib/types/activities/generateOrAssignContentType.d.ts +0 -44
  296. package/lib/types/activities/generateOrAssignContentType.d.ts.map +0 -1
  297. package/lib/types/activities/getObjectFromStore.d.ts +0 -14
  298. package/lib/types/activities/getObjectFromStore.d.ts.map +0 -1
  299. package/lib/types/activities/index-dsl.d.ts +0 -17
  300. package/lib/types/activities/index-dsl.d.ts.map +0 -1
  301. package/lib/types/activities/index.d.ts +0 -6
  302. package/lib/types/activities/index.d.ts.map +0 -1
  303. package/lib/types/activities/media/processPdfWithTextract.d.ts +0 -26
  304. package/lib/types/activities/media/processPdfWithTextract.d.ts.map +0 -1
  305. package/lib/types/activities/media/transcribeMediaWithGladia.d.ts +0 -14
  306. package/lib/types/activities/media/transcribeMediaWithGladia.d.ts.map +0 -1
  307. package/lib/types/activities/notifyWebhook.d.ts +0 -17
  308. package/lib/types/activities/notifyWebhook.d.ts.map +0 -1
  309. package/lib/types/activities/setDocumentStatus.d.ts +0 -15
  310. package/lib/types/activities/setDocumentStatus.d.ts.map +0 -1
  311. package/lib/types/conversion/TextractProcessor.d.ts +0 -45
  312. package/lib/types/conversion/TextractProcessor.d.ts.map +0 -1
  313. package/lib/types/conversion/image.d.ts +0 -9
  314. package/lib/types/conversion/image.d.ts.map +0 -1
  315. package/lib/types/conversion/mutool.d.ts +0 -19
  316. package/lib/types/conversion/mutool.d.ts.map +0 -1
  317. package/lib/types/conversion/pandoc.d.ts +0 -2
  318. package/lib/types/conversion/pandoc.d.ts.map +0 -1
  319. package/lib/types/dsl/conditions.d.ts +0 -2
  320. package/lib/types/dsl/conditions.d.ts.map +0 -1
  321. package/lib/types/dsl/dsl-workflow.d.ts +0 -5
  322. package/lib/types/dsl/dsl-workflow.d.ts.map +0 -1
  323. package/lib/types/dsl/dslProxyActivities.d.ts +0 -10
  324. package/lib/types/dsl/dslProxyActivities.d.ts.map +0 -1
  325. package/lib/types/dsl/projections.d.ts +0 -4
  326. package/lib/types/dsl/projections.d.ts.map +0 -1
  327. package/lib/types/dsl/setup/ActivityContext.d.ts +0 -14
  328. package/lib/types/dsl/setup/ActivityContext.d.ts.map +0 -1
  329. package/lib/types/dsl/setup/fetch/DataProvider.d.ts +0 -9
  330. package/lib/types/dsl/setup/fetch/DataProvider.d.ts.map +0 -1
  331. package/lib/types/dsl/setup/fetch/index.d.ts +0 -6
  332. package/lib/types/dsl/setup/fetch/index.d.ts.map +0 -1
  333. package/lib/types/dsl/setup/fetch/providers.d.ts +0 -25
  334. package/lib/types/dsl/setup/fetch/providers.d.ts.map +0 -1
  335. package/lib/types/dsl/test/test-child-workflow.d.ts +0 -4
  336. package/lib/types/dsl/test/test-child-workflow.d.ts.map +0 -1
  337. package/lib/types/dsl/validation.d.ts +0 -4
  338. package/lib/types/dsl/validation.d.ts.map +0 -1
  339. package/lib/types/dsl/vars.d.ts +0 -48
  340. package/lib/types/dsl/vars.d.ts.map +0 -1
  341. package/lib/types/dsl/walk.d.ts +0 -18
  342. package/lib/types/dsl/walk.d.ts.map +0 -1
  343. package/lib/types/dsl.d.ts +0 -4
  344. package/lib/types/dsl.d.ts.map +0 -1
  345. package/lib/types/errors.d.ts +0 -16
  346. package/lib/types/errors.d.ts.map +0 -1
  347. package/lib/types/index.d.ts +0 -31
  348. package/lib/types/index.d.ts.map +0 -1
  349. package/lib/types/iterative-generation/activities/extractToc.d.ts +0 -10
  350. package/lib/types/iterative-generation/activities/extractToc.d.ts.map +0 -1
  351. package/lib/types/iterative-generation/activities/finalizeOutput.d.ts +0 -3
  352. package/lib/types/iterative-generation/activities/finalizeOutput.d.ts.map +0 -1
  353. package/lib/types/iterative-generation/activities/generatePart.d.ts +0 -3
  354. package/lib/types/iterative-generation/activities/generatePart.d.ts.map +0 -1
  355. package/lib/types/iterative-generation/activities/generateToc.d.ts +0 -4
  356. package/lib/types/iterative-generation/activities/generateToc.d.ts.map +0 -1
  357. package/lib/types/iterative-generation/activities/index.d.ts +0 -5
  358. package/lib/types/iterative-generation/activities/index.d.ts.map +0 -1
  359. package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts +0 -3
  360. package/lib/types/iterative-generation/iterativeGenerationWorkflow.d.ts.map +0 -1
  361. package/lib/types/iterative-generation/types.d.ts +0 -79
  362. package/lib/types/iterative-generation/types.d.ts.map +0 -1
  363. package/lib/types/iterative-generation/utils.d.ts +0 -27
  364. package/lib/types/iterative-generation/utils.d.ts.map +0 -1
  365. package/lib/types/result-types.d.ts +0 -22
  366. package/lib/types/result-types.d.ts.map +0 -1
  367. package/lib/types/system/notifyWebhookWorkflow.d.ts +0 -3
  368. package/lib/types/system/notifyWebhookWorkflow.d.ts.map +0 -1
  369. package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts +0 -40
  370. package/lib/types/system/recalculateEmbeddingsWorkflow.d.ts.map +0 -1
  371. package/lib/types/utils/auth.d.ts +0 -4
  372. package/lib/types/utils/auth.d.ts.map +0 -1
  373. package/lib/types/utils/blobs.d.ts +0 -8
  374. package/lib/types/utils/blobs.d.ts.map +0 -1
  375. package/lib/types/utils/client.d.ts +0 -7
  376. package/lib/types/utils/client.d.ts.map +0 -1
  377. package/lib/types/utils/expand-vars.d.ts +0 -8
  378. package/lib/types/utils/expand-vars.d.ts.map +0 -1
  379. package/lib/types/utils/memory.d.ts +0 -12
  380. package/lib/types/utils/memory.d.ts.map +0 -1
  381. package/lib/types/utils/tokens.d.ts +0 -11
  382. package/lib/types/utils/tokens.d.ts.map +0 -1
  383. package/lib/types/vars.d.ts +0 -3
  384. package/lib/types/vars.d.ts.map +0 -1
  385. package/lib/types/workflows.d.ts +0 -8
  386. package/lib/types/workflows.d.ts.map +0 -1
  387. package/lib/workflows-bundle.js +0 -19897
@@ -1,33 +1,41 @@
1
1
  import { log } from "@temporalio/activity";
2
- import { ContentObject, CreateContentObjectPayload, DSLActivityExecutionPayload, DSLActivitySpec } from '@vertesia/common';
3
- import { mutoolPdfToText } from '../conversion/mutool.js';
4
- import { manyToMarkdown } from '../conversion/pandoc.js';
2
+ import {
3
+ ContentObject,
4
+ CreateContentObjectPayload,
5
+ DSLActivityExecutionPayload,
6
+ DSLActivitySpec,
7
+ } from "@vertesia/common";
8
+ import { mutoolPdfToText } from "../conversion/mutool.js";
9
+ import { markdownWithPandoc } from "../conversion/pandoc.js";
5
10
  import { setupActivity } from "../dsl/setup/ActivityContext.js";
6
- import { NoDocumentFound } from '../errors.js';
7
- import { TextExtractionResult, TextExtractionStatus } from '../result-types.js';
8
- import { fetchBlobAsBuffer, md5 } from '../utils/blobs.js';
9
- import { countTokens } from '../utils/tokens.js';
11
+ import { NoDocumentFound } from "../errors.js";
12
+ import { TextExtractionResult, TextExtractionStatus } from "../result-types.js";
13
+ import { fetchBlobAsBuffer, md5 } from "../utils/blobs.js";
14
+ import { countTokens } from "../utils/tokens.js";
15
+ import { markdownWithMarkitdown } from "../conversion/markitdown.js";
10
16
 
11
17
  //@ts-ignore
12
18
  const JSON: DSLActivitySpec = {
13
- name: 'extractDocumentText',
14
- }
19
+ name: "extractDocumentText",
20
+ };
15
21
 
16
22
  // doesn't have any own param
17
- export interface ExtractDocumentTextParams { };
23
+ export interface ExtractDocumentTextParams {}
18
24
  export interface ExtractDocumentText extends DSLActivitySpec<ExtractDocumentTextParams> {
19
- name: 'extractDocumentText';
25
+ name: "extractDocumentText";
20
26
  projection?: never;
21
27
  }
22
28
 
23
- export async function extractDocumentText(payload: DSLActivityExecutionPayload<ExtractDocumentTextParams>): Promise<TextExtractionResult> {
29
+ export async function extractDocumentText(
30
+ payload: DSLActivityExecutionPayload<ExtractDocumentTextParams>,
31
+ ): Promise<TextExtractionResult> {
24
32
  const { client, objectId } = await setupActivity(payload);
25
33
 
26
34
  const r = await client.objects.find({
27
35
  query: { _id: objectId },
28
36
  limit: 1,
29
- select: "+text"
30
- })
37
+ select: "+text",
38
+ });
31
39
  const doc = r[0] as ContentObject;
32
40
  if (!doc) {
33
41
  log.error(`Document ${objectId} not found`);
@@ -36,7 +44,6 @@ export async function extractDocumentText(payload: DSLActivityExecutionPayload<E
36
44
 
37
45
  log.info(`Extracting text for object ${doc.id}`);
38
46
 
39
-
40
47
  if (!doc.content?.type || !doc.content?.source) {
41
48
  if (doc.text) {
42
49
  return createResponse(doc, doc.text, TextExtractionStatus.skipped, "Text present and no source or type");
@@ -58,74 +65,80 @@ export async function extractDocumentText(payload: DSLActivityExecutionPayload<E
58
65
  return createResponse(doc, "", TextExtractionStatus.error, e.message);
59
66
  }
60
67
 
61
-
62
68
  let txt: string;
63
69
 
64
70
  switch (doc.content.type) {
65
-
66
- case 'application/pdf':
67
- //if pdf is more than 2MB, use mutool
71
+ case "application/pdf":
68
72
  txt = await mutoolPdfToText(fileBuffer);
69
73
  break;
70
74
 
71
- case 'text/plain':
72
- txt = fileBuffer.toString('utf8')
75
+ case "text/plain":
76
+ txt = fileBuffer.toString("utf8");
73
77
  break;
74
78
 
75
79
  //docx
76
- case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
77
- txt = await manyToMarkdown(fileBuffer, 'docx');
80
+ case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
81
+ txt = await markdownWithMarkitdown(fileBuffer, "docx");
82
+ break;
83
+
84
+ //pptx
85
+ case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
86
+ txt = await markdownWithMarkitdown(fileBuffer, "pptx");
78
87
  break;
79
88
 
80
89
  //html
81
- case 'text/html':
82
- txt = await manyToMarkdown(fileBuffer, 'html');
90
+ case "text/html":
91
+ txt = await markdownWithPandoc(fileBuffer, "html");
83
92
  break;
84
93
 
85
94
  //opendocument
86
- case 'application/vnd.oasis.opendocument.text':
87
- txt = await manyToMarkdown(fileBuffer, 'odt');
95
+ case "application/vnd.oasis.opendocument.text":
96
+ txt = await markdownWithPandoc(fileBuffer, "odt");
88
97
  break;
89
98
 
90
99
  //rtf
91
- case 'application/rtf':
92
- txt = await manyToMarkdown(fileBuffer, 'rtf');
100
+ case "application/rtf":
101
+ txt = await markdownWithPandoc(fileBuffer, "rtf");
93
102
  break;
94
103
 
95
104
  //markdown
96
- case 'text/markdown':
97
- txt = fileBuffer.toString('utf8');
105
+ case "text/markdown":
106
+ txt = fileBuffer.toString("utf8");
98
107
  break;
99
108
 
100
109
  //csv
101
- case 'text/csv':
102
- txt = fileBuffer.toString('utf8');
110
+ case "text/csv":
111
+ txt = fileBuffer.toString("utf8");
103
112
  break;
104
113
 
105
114
  //typescript
106
- case 'application/typescript':
107
- txt = fileBuffer.toString('utf8');
115
+ case "application/typescript":
116
+ txt = fileBuffer.toString("utf8");
108
117
  break;
109
118
 
110
119
  //javascript
111
- case 'application/javascript':
112
- txt = fileBuffer.toString('utf8');
120
+ case "application/javascript":
121
+ txt = fileBuffer.toString("utf8");
113
122
  break;
114
123
 
115
124
  //json
116
- case 'application/json':
117
- txt = fileBuffer.toString('utf8');
125
+ case "application/json":
126
+ txt = fileBuffer.toString("utf8");
118
127
  break;
119
128
 
120
129
  default:
121
130
  if (sniffIfText(fileBuffer)) {
122
- txt = fileBuffer.toString('utf8'); //TODO: add charset detection
131
+ txt = fileBuffer.toString("utf8"); //TODO: add charset detection
123
132
  break;
124
133
  }
125
- return createResponse(doc, doc.text ?? '', TextExtractionStatus.skipped, `Unsupported mime type: ${doc.content.type}`);
134
+ return createResponse(
135
+ doc,
136
+ doc.text ?? "",
137
+ TextExtractionStatus.skipped,
138
+ `Unsupported mime type: ${doc.content.type}`,
139
+ );
126
140
  }
127
141
 
128
-
129
142
  const tokensData = countTokens(txt);
130
143
  const etag = doc.content.etag ?? md5(txt);
131
144
 
@@ -135,15 +148,20 @@ export async function extractDocumentText(payload: DSLActivityExecutionPayload<E
135
148
  tokens: {
136
149
  ...tokensData,
137
150
  etag: etag,
138
- }
139
- }
151
+ },
152
+ };
140
153
 
141
154
  await client.objects.update(doc.id, updateData);
142
155
 
143
156
  return createResponse(doc, txt, TextExtractionStatus.success);
144
157
  }
145
158
 
146
- function createResponse(doc: ContentObject, text: string, status: TextExtractionStatus, message?: string): TextExtractionResult {
159
+ function createResponse(
160
+ doc: ContentObject,
161
+ text: string,
162
+ status: TextExtractionStatus,
163
+ message?: string,
164
+ ): TextExtractionResult {
147
165
  return {
148
166
  status,
149
167
  message,
@@ -151,18 +169,37 @@ function createResponse(doc: ContentObject, text: string, status: TextExtraction
151
169
  len: text.length,
152
170
  objectId: doc.id,
153
171
  hasText: !!text,
154
- }
155
-
172
+ };
156
173
  }
157
174
 
158
-
159
- //if file is less than 100KB, check if it looks like text
160
175
  function sniffIfText(buf: Buffer) {
161
- if (buf.length < 100 * 1024) {
162
- const s = buf.toString('utf8');
163
- if (s.length > 0) {
164
- return true;
176
+ // If file is too large, don't even try
177
+ if (buf.length > 500 * 1024) {
178
+ return false;
179
+ }
180
+
181
+ // Count binary/control characters
182
+ let binaryCount = 0;
183
+ const sampleSize = Math.min(buf.length, 1000); // Check first 1000 bytes
184
+
185
+ for (let i = 0; i < sampleSize; i++) {
186
+ // Count control characters (except common whitespace)
187
+ const byte = buf[i];
188
+ if ((byte < 32 && ![9, 10, 13].includes(byte)) || byte === 0) {
189
+ binaryCount++;
165
190
  }
166
191
  }
167
- return false;
192
+
193
+ // If more than 10% binary/control chars, probably not text
194
+ if (binaryCount / sampleSize > 0.1) {
195
+ return false;
196
+ }
197
+
198
+ // Additional check for valid UTF-8 encoding
199
+ try {
200
+ const s = buf.toString("utf8");
201
+ return s.length > 0 && !s.includes("\uFFFD"); // Replacement character
202
+ } catch (e) {
203
+ return false;
204
+ }
168
205
  }
@@ -1,10 +1,10 @@
1
- import { DSLActivityExecutionPayload, DSLActivitySpec } from "@vertesia/common";
2
1
  import { log } from "@temporalio/activity";
2
+ import { DSLActivityExecutionPayload, DSLActivitySpec } from "@vertesia/common";
3
3
  import { setupActivity } from "../dsl/setup/ActivityContext.js";
4
4
  import { TruncateSpec } from "../utils/tokens.js";
5
5
  import { InteractionExecutionParams, executeInteractionFromActivity } from "./executeInteraction.js";
6
6
 
7
- const INT_EXTRACT_INFORMATION = "sys:ExtractInformation"
7
+ const INT_EXTRACT_INFORMATION = "sys:ExtractInformation";
8
8
  export interface GenerateDocumentPropertiesParams extends InteractionExecutionParams {
9
9
  typesHint?: string[];
10
10
  /**
@@ -17,10 +17,12 @@ export interface GenerateDocumentPropertiesParams extends InteractionExecutionPa
17
17
  use_vision?: boolean;
18
18
  }
19
19
  export interface GenerateDocumentProperties extends DSLActivitySpec<GenerateDocumentPropertiesParams> {
20
- name: 'generateDocumentProperties';
20
+ name: "generateDocumentProperties";
21
21
  }
22
22
 
23
- export async function generateDocumentProperties(payload: DSLActivityExecutionPayload<GenerateDocumentPropertiesParams>) {
23
+ export async function generateDocumentProperties(
24
+ payload: DSLActivityExecutionPayload<GenerateDocumentPropertiesParams>,
25
+ ) {
24
26
  const context = await setupActivity<GenerateDocumentPropertiesParams>(payload);
25
27
  const { params, client, objectId } = context;
26
28
  const interactionName = params.interactionName ?? INT_EXTRACT_INFORMATION;
@@ -32,7 +34,7 @@ export async function generateDocumentProperties(payload: DSLActivityExecutionPa
32
34
 
33
35
  if (!doc?.text && !params.use_vision && !doc?.content?.type?.startsWith("image/")) {
34
36
  log.warn(`Object ${objectId} not found or text is empty`);
35
- return { status: "failed", error: "no-text" }
37
+ return { status: "failed", error: "no-text" };
36
38
  }
37
39
 
38
40
  if (!type || !type.object_schema) {
@@ -50,16 +52,19 @@ export async function generateDocumentProperties(payload: DSLActivityExecutionPa
50
52
  }
51
53
 
52
54
  log.info(`Object ${objectId} is not an image or pdf`);
53
- return undefined
54
- }
55
+ return undefined;
56
+ };
55
57
 
56
58
  const promptData = {
57
59
  content: doc.text ?? undefined,
58
60
  image: getImageRef() ?? undefined,
59
61
  human_context: project?.configuration?.human_context ?? undefined,
60
- }
62
+ };
61
63
 
62
- log.info(` Extracting information from object ${objectId} with type ${type.name}`, payload.debug_mode ? { params, } : undefined);
64
+ log.info(
65
+ ` Extracting information from object ${objectId} with type ${type.name}`,
66
+ payload.debug_mode ? { params } : undefined,
67
+ );
63
68
 
64
69
  const infoRes = await executeInteractionFromActivity(
65
70
  client,
@@ -70,24 +75,40 @@ export async function generateDocumentProperties(payload: DSLActivityExecutionPa
70
75
  result_schema: type.object_schema,
71
76
  },
72
77
  promptData,
73
- payload.debug_mode ?? false
78
+ payload.debug_mode ?? false,
74
79
  );
75
80
 
81
+ const getText = () => {
82
+ if (doc.text) {
83
+ return undefined;
84
+ }
85
+ let text = "";
86
+ if (infoRes.result.title) {
87
+ text += infoRes.result.title + "\n";
88
+ }
89
+ if (infoRes.result.description) {
90
+ text += infoRes.result.description;
91
+ }
92
+ if (text) {
93
+ return text;
94
+ } else {
95
+ return undefined;
96
+ }
97
+ };
98
+
76
99
  log.info(`Extracted information from object ${objectId} with type ${type.name}`, { runId: infoRes.id });
77
100
  await client.objects.update(doc.id, {
78
101
  properties: {
79
102
  ...infoRes.result,
80
- etag: doc.text_etag
103
+ etag: doc.text_etag,
81
104
  },
82
- text: infoRes.result.description ?? undefined,
105
+ text: getText(),
83
106
  generation_run_info: {
84
107
  id: infoRes.id,
85
108
  date: new Date().toISOString(),
86
109
  model: infoRes.modelId,
87
- }
110
+ },
88
111
  });
89
112
 
90
-
91
113
  return { status: "completed" };
92
-
93
- }
114
+ }
@@ -1,19 +1,42 @@
1
- import { VertesiaClient } from "@vertesia/client";
2
- import { ContentObject, DSLActivityExecutionPayload, DSLActivitySpec, ProjectConfigurationEmbeddings, SupportedEmbeddingTypes } from "@vertesia/common";
3
1
  import { EmbeddingsResult } from "@llumiverse/core";
4
2
  import { log } from "@temporalio/activity";
5
- import * as tf from '@tensorflow/tfjs-node';
3
+ import { VertesiaClient } from "@vertesia/client";
4
+ import { ContentObject, DSLActivityExecutionPayload, DSLActivitySpec, ProjectConfigurationEmbeddings, SupportedEmbeddingTypes } from "@vertesia/common";
6
5
  import { setupActivity } from "../dsl/setup/ActivityContext.js";
7
6
  import { NoDocumentFound } from '../errors.js';
8
7
  import { fetchBlobAsBase64, md5 } from "../utils/blobs.js";
8
+ import { DocPart, getContentParts } from "../utils/chunks.js";
9
9
  import { countTokens } from "../utils/tokens.js";
10
10
 
11
11
 
12
12
  export interface GenerateEmbeddingsParams {
13
+
14
+ /**
15
+ * The model to use for embedding generation
16
+ * If not set, the default model for the project will be used
17
+ */
13
18
  model?: string;
19
+
20
+ /**
21
+ * The environment to use for embedding generation
22
+ * If not set, the default environment for the project will be used
23
+ */
14
24
  environment?: string;
25
+
26
+ /**
27
+ * If true, force embedding generation even if the document already has embeddings
28
+ */
15
29
  force?: boolean;
30
+
31
+ /**
32
+ * The embedding type to generate
33
+ */
16
34
  type: SupportedEmbeddingTypes;
35
+
36
+ /**
37
+ * The DocParts to use for long documents
38
+ */
39
+ parts?: DocPart[];
17
40
  }
18
41
 
19
42
  export interface GenerateEmbeddings extends DSLActivitySpec<GenerateEmbeddingsParams> {
@@ -103,7 +126,7 @@ interface ExecuteGenerateEmbeddingsParams {
103
126
  force?: boolean;
104
127
  }
105
128
 
106
- async function generateTextEmbeddings({ document, client, type, config }: ExecuteGenerateEmbeddingsParams) {
129
+ async function generateTextEmbeddings({ document, client, type, config }: ExecuteGenerateEmbeddingsParams, parts?: DocPart[],) {
107
130
  // if (!force && document.embeddings[type]?.etag === (document.text_etag ?? md5(document.text))) {
108
131
  // return { id: objectId, status: "skipped", message: "embeddings already generated" }
109
132
  // }
@@ -125,6 +148,8 @@ async function generateTextEmbeddings({ document, client, type, config }: Execut
125
148
 
126
149
  const { environment, model } = config;
127
150
 
151
+ const partDefinitions = parts ?? [];
152
+
128
153
  // Count tokens if not already done
129
154
  if (!document.tokens?.count && type === SupportedEmbeddingTypes.text) {
130
155
  log.debug('Updating token count for document: ' + document.id);
@@ -150,79 +175,64 @@ async function generateTextEmbeddings({ document, client, type, config }: Execut
150
175
  if (type === SupportedEmbeddingTypes.text && document.tokens?.count && document.tokens?.count > maxTokens) {
151
176
  log.info('Document too large, generating embeddings for parts');
152
177
 
153
- if (!document.parts || document.parts.length === 0) {
154
- return { id: document.id, status: "skipped", message: "no parts found" }
178
+
179
+ if (!document.text) {
180
+ return { id: document.id, status: "failed", message: "no text found" }
155
181
  }
156
182
 
157
- const docParts = await Promise.all(document.parts?.map(async (partId) => client.objects.retrieve(partId, "+text +embeddings +properties +tokens")));
158
- log.info(`Retrieved ${docParts.length} parts`)
183
+ if (!partDefinitions || partDefinitions.length === 0) {
184
+ log.info('No parts found for document, skipping embeddings generation');
185
+ return { id: document.id, status: "failed", message: "no parts found" }
186
+ }
159
187
 
160
- const generatePartEmbeddings = async (part: ContentObject<any>, i: number) => {
161
- try {
162
- log.info(`Generating embeddings for part ${part.id}`, { text_len: part.text?.length })
163
- if (!part.text) {
164
- return { id: part.id, number: i, result: null, status: "skipped", message: "no text found" }
165
- }
166
188
 
167
- if (part.tokens?.count && part.tokens.count > maxTokens) {
168
- log.info('Part too large, skipping embeddings generation for part', { part: part.id, tokens: part.tokens.count });
169
- return { id: part.id, number: i, result: null, message: "part too large" }
189
+ log.info('Generating embeddings for parts', { parts: partDefinitions, max_tokens: maxTokens });
190
+ const docParts = getContentParts(document.text, partDefinitions);
191
+
192
+
193
+ log.info(`Retrieved ${docParts.length} parts`)
194
+ const start = new Date().getTime();
195
+ const generatePartEmbeddings = async (partContent: string, i: number) => {
196
+ const localStart = new Date().getTime();
197
+ try {
198
+ log.info(`Generating embeddings for part ${i}`, { text_len: partContent.length })
199
+ if (!partContent) {
200
+ return { id: i, number: i, result: null, status: "skipped", message: "no text found" }
170
201
  }
171
202
 
172
- const e = await generateEmbeddingsFromStudio(part.text, environment, client, model).catch(e => {
173
- log.error('Error generating embeddings for part', { part: part.id, tokens: part.tokens, text_length: part.text?.length, error: e });
203
+ const e = await generateEmbeddingsFromStudio(partContent, environment, client, model).catch(e => {
204
+ log.error('Error generating embeddings for part ' + i, { text_length: partContent.length, error: e });
174
205
  return null;
175
206
  });
176
207
 
177
208
  if (!e || !e.values) {
178
- return { id: part.id, number: i, result: null, message: "no embeddings generated" }
209
+ return { id: i, number: i, result: null, message: "no embeddings generated" }
179
210
  }
180
211
 
181
- log.info(`Embeddings generated for part ${part.id}, updating object in the store.`)
182
- await client.objects.setEmbedding(part.id, SupportedEmbeddingTypes.text,
183
- {
184
- values: e.values,
185
- model: e.model,
186
- etag: part.text_etag
187
- }).catch(err => {
188
- log.info(`Error updating embeddings on part ${part.id}`);
189
- return { id: part.id, number: i, result: null, message: "error setting embeddings on part", error: err.message }
190
- })
191
-
192
- log.info('Generated embeddings for part: ' + part.id);
193
- return { id: part.id, number: i, result: e }
212
+ if (e.values.length === 0) {
213
+ return { id: i, number: i, result: null, message: "no embeddings generated" }
214
+ }
215
+ log.info(`Generated embeddings for part ${i}`, { len: e.values.length, duration: new Date().getTime() - localStart });
216
+
217
+ return { number: i, result: e }
194
218
  } catch (err: any) {
195
- log.info(`Error generating ${type} embeddings for part ${part.id} of ${document.id}`, { error: err });
196
- return { id: part.id, number: i, result: null, message: "error generating embeddings", error: err.message }
219
+ log.info(`Error generating ${type} embeddings for part ${i} of ${document.id}`, { error: err });
220
+ return { number: i, result: null, message: "error generating embeddings", error: err.message }
197
221
  }
198
222
  }
199
223
 
200
- const promises = docParts.map((p, i) => generatePartEmbeddings(p, i))
201
- const res = await Promise.all(promises);
202
- // let i = 0;
203
- // for (const p of docParts) {
204
- // log.info(`Processing part ${p.id}`)
205
- // const r = await generatePartEmbeddings(p, i++);
206
- // res.push(r)
207
- // }
208
-
209
-
210
- // Filter out parts without embeddings
211
- const validEmbeddings = res.filter(item => item.result !== null) as { id: string, number: number, result: EmbeddingsResult }[];
212
-
213
- // Compute the document-level embedding using TensorFlow for attention mechanism
214
- log.info('Computing document-level embedding using TF');
215
- const documentEmbedding = computeAttentionEmbedding(validEmbeddings.map(item => item.result.values));
216
-
217
- // Save the document-level embedding
224
+ const partEmbeddings = await Promise.all(docParts.map((part, i) => generatePartEmbeddings(part, i)));
225
+ const validPartEmbeddings = partEmbeddings.filter(e => e.result !== null).map(e => e.result);
226
+ const averagedEmbedding = computeAttentionEmbedding(validPartEmbeddings.map(e => e.values));
227
+ log.info(`Averaged embeddings for document ${document.id} in ${(new Date().getTime() - start) / 1000} seconds`, { len: averagedEmbedding.length, count: validPartEmbeddings.length, max_tokens: maxTokens });
218
228
  await client.objects.setEmbedding(document.id, type,
219
229
  {
220
- values: documentEmbedding,
221
- model: "attention",
230
+ values: averagedEmbedding,
231
+ model: validPartEmbeddings[0].model,
222
232
  etag: document.text_etag
223
233
  }
224
234
  );
225
- return { id: document.id, status: "completed", parts: docParts.map(i => i.id), len: documentEmbedding.length, part_embeddings: res.map(r => { return { id: r.id, status: r.status, error: r.error, message: r.message } }) }
235
+ log.info(`Object ${document.id} embedding set`, { type, len: averagedEmbedding.length });
226
236
 
227
237
  } else {
228
238
  log.info(`Generating ${type} embeddings for document`);
@@ -311,35 +321,37 @@ async function generateEmbeddingsFromStudio(text: string, env: string, client: V
311
321
 
312
322
  }
313
323
 
314
- function computeAttentionEmbedding(embeddingsArray: number[][], axis: number = 0) {
315
- if (embeddingsArray.length === 0) return [];
316
- log.info('Computing attention embedding for', { embeddingsArrays: embeddingsArray.map(a => a.length) });
324
+ //Simplified attention mechanism
325
+ // This is a naive implementation and should be replaced with a more sophisticated
326
+ // using tensorflow in a specific package
327
+ function computeAttentionEmbedding(chunkEmbeddings: number[][]): number[] {
328
+ if (chunkEmbeddings.length === 0) return [];
329
+
317
330
  const start = new Date().getTime();
318
331
 
319
- // Convert embeddings array to TensorFlow tensor
320
- const embeddingsTensor = tf.tensor(embeddingsArray);
332
+ // Generate random attention weights
333
+ const attentionWeights = chunkEmbeddings.map(() => Math.random());
321
334
 
322
- // Initialize trainable attention weights
323
- const attentionWeights = tf.variable(tf.randomNormal([embeddingsArray.length]), true);
335
+ // Apply softmax to get attention scores
336
+ const expWeights = attentionWeights.map(w => Math.exp(w));
337
+ const sumExpWeights = expWeights.reduce((sum, val) => sum + val, 0);
338
+ const attentionScores = expWeights.map(w => w / sumExpWeights);
324
339
 
325
- // Compute attention scoresje sui
326
- const attentionScores = tf.softmax(attentionWeights);
340
+ // Get embedding dimension
341
+ const embeddingDim = chunkEmbeddings[0].length;
327
342
 
328
- // Compute weighted sum of embeddings
329
- const weightedEmbeddings = tf.mul(embeddingsTensor.transpose(), attentionScores).transpose();
330
- const documentEmbeddingTensor = tf.sum(weightedEmbeddings, axis);
343
+ // Initialize document embedding
344
+ const documentEmbedding = new Array(embeddingDim).fill(0);
331
345
 
332
- // Convert the result back to a JavaScript array
333
- const documentEmbedding = documentEmbeddingTensor.arraySync() as number[];
334
- const duration = (new Date().getTime() - start);
335
- log.info(`Computed attention embeddings in ${duration}ms - array size: ${documentEmbedding.length}`, { length: documentEmbedding.length });
346
+ // Weighted sum of embeddings
347
+ for (let i = 0; i < chunkEmbeddings.length; i++) {
348
+ for (let j = 0; j < embeddingDim; j++) {
349
+ documentEmbedding[j] += chunkEmbeddings[i][j] * attentionScores[i];
350
+ }
351
+ }
336
352
 
337
- // Clean up tensors
338
- embeddingsTensor.dispose();
339
- attentionWeights.dispose();
340
- attentionScores.dispose();
341
- weightedEmbeddings.dispose();
342
- documentEmbeddingTensor.dispose();
353
+ const duration = new Date().getTime() - start;
354
+ console.log(`Computed document embedding in ${duration}ms for ${chunkEmbeddings.length} chunks`);
343
355
 
344
356
  return documentEmbedding;
345
- }
357
+ }