Flowfile 0.5.1-py3-none-any.whl → 0.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346)
  1. build_backends/main.py +25 -22
  2. build_backends/main_prd.py +10 -19
  3. flowfile/__init__.py +194 -74
  4. flowfile/__main__.py +10 -7
  5. flowfile/api.py +51 -57
  6. flowfile/web/__init__.py +14 -9
  7. flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
  8. flowfile/web/static/assets/AdminView-f9847d67.js +713 -0
  9. flowfile/web/static/assets/CloudConnectionView-cf85f943.css +72 -0
  10. flowfile/web/static/assets/{CloudConnectionManager-0dfba9f2.js → CloudConnectionView-faace55b.js} +11 -11
  11. flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
  12. flowfile/web/static/assets/{CloudStorageReader-d5b1b6c9.js → CloudStorageReader-d86ecaa7.js} +10 -8
  13. flowfile/web/static/assets/{CloudStorageWriter-00d87aad.js → CloudStorageWriter-0f4d9a44.js} +10 -8
  14. flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
  15. flowfile/web/static/assets/ColumnActionInput-c44b7aee.css +159 -0
  16. flowfile/web/static/assets/ColumnActionInput-f4189ae0.js +330 -0
  17. flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
  18. flowfile/web/static/assets/{ColumnSelector-4685e75d.js → ColumnSelector-e66b33da.js} +3 -5
  19. flowfile/web/static/assets/ContextMenu-49463352.js +9 -0
  20. flowfile/web/static/assets/ContextMenu-dd5f3f25.js +9 -0
  21. flowfile/web/static/assets/ContextMenu-f709b884.js +9 -0
  22. flowfile/web/static/assets/ContextMenu.vue_vue_type_script_setup_true_lang-a1bd6314.js +59 -0
  23. flowfile/web/static/assets/{CrossJoin-702a3edd.js → CrossJoin-24694b8f.js} +12 -10
  24. flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
  25. flowfile/web/static/assets/{CustomNode-b1519993.js → CustomNode-569d45ff.js} +43 -24
  26. flowfile/web/static/assets/CustomNode-edb9b939.css +42 -0
  27. flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-c20a1e16.css} +23 -21
  28. flowfile/web/static/assets/{DatabaseConnectionSettings-6f3e4ea5.js → DatabaseConnectionSettings-cfc08938.js} +5 -4
  29. flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-5bf8c75b.css} +41 -46
  30. flowfile/web/static/assets/{DatabaseReader-d38c7295.js → DatabaseReader-701feabb.js} +25 -15
  31. flowfile/web/static/assets/{DatabaseManager-cf5ef661.js → DatabaseView-0482e5b5.js} +11 -11
  32. flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
  33. flowfile/web/static/assets/{DatabaseWriter-b04ef46a.js → DatabaseWriter-16721989.js} +17 -10
  34. flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-bdcf2c8b.css} +29 -27
  35. flowfile/web/static/assets/{designer-8da3ba3a.css → DesignerView-49abb835.css} +783 -663
  36. flowfile/web/static/assets/{designer-9633482a.js → DesignerView-f64749fb.js} +1292 -3253
  37. flowfile/web/static/assets/{documentation-ca400224.js → DocumentationView-61bd2990.js} +5 -5
  38. flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-9ea6e871.css} +9 -9
  39. flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
  40. flowfile/web/static/assets/{ExploreData-5fa10ed8.js → ExploreData-e2735b13.js} +18 -9
  41. flowfile/web/static/assets/{ExternalSource-d39af878.js → ExternalSource-2535c3b2.js} +9 -7
  42. flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-7ac7373f.css} +20 -20
  43. flowfile/web/static/assets/Filter-2cdbc93c.js +287 -0
  44. flowfile/web/static/assets/Filter-7494ea97.css +48 -0
  45. flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
  46. flowfile/web/static/assets/{Formula-6b04fb1d.js → Formula-fcda3c2c.js} +13 -11
  47. flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
  48. flowfile/web/static/assets/{FuzzyMatch-999521f4.js → FuzzyMatch-f8d3b7d3.js} +12 -10
  49. flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-4b4d7db9.css} +5 -5
  50. flowfile/web/static/assets/{GraphSolver-17dd2198.js → GraphSolver-72eaa695.js} +14 -12
  51. flowfile/web/static/assets/GroupBy-5792782d.css +9 -0
  52. flowfile/web/static/assets/{GroupBy-6b039e18.js → GroupBy-8aa0598b.js} +9 -7
  53. flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
  54. flowfile/web/static/assets/{Join-24d0f113.js → Join-e40f0ffa.js} +13 -11
  55. flowfile/web/static/assets/LoginView-5111c9ae.js +134 -0
  56. flowfile/web/static/assets/LoginView-d325d632.css +172 -0
  57. flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
  58. flowfile/web/static/assets/{ManualInput-34639209.js → ManualInput-9b6f3224.js} +170 -116
  59. flowfile/web/static/assets/{MultiSelect-0e8724a3.js → MultiSelect-ef28e19e.js} +2 -2
  60. flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js → MultiSelect.vue_vue_type_script_setup_true_lang-83b3bbfd.js} +1 -1
  61. flowfile/web/static/assets/NodeDesigner-94cd4dd3.css +1429 -0
  62. flowfile/web/static/assets/NodeDesigner-d2b7ee2b.js +2712 -0
  63. flowfile/web/static/assets/{NumericInput-3d63a470.js → NumericInput-1d789794.js} +2 -2
  64. flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js → NumericInput.vue_vue_type_script_setup_true_lang-7775f83e.js} +5 -2
  65. flowfile/web/static/assets/Output-692dd25d.css +37 -0
  66. flowfile/web/static/assets/{Output-edea9802.js → Output-cefef801.js} +14 -10
  67. flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
  68. flowfile/web/static/assets/{Pivot-61d19301.js → Pivot-bab1b75b.js} +12 -10
  69. flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
  70. flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
  71. flowfile/web/static/assets/{PivotValidation-f97fec5b.js → PivotValidation-e7941f91.js} +3 -3
  72. flowfile/web/static/assets/{PivotValidation-de9f43fe.js → PivotValidation-fba09336.js} +3 -3
  73. flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
  74. flowfile/web/static/assets/{PolarsCode-bc3c9984.js → PolarsCode-740e40fa.js} +18 -9
  75. flowfile/web/static/assets/PopOver-862d7e28.js +939 -0
  76. flowfile/web/static/assets/PopOver-d96599db.css +33 -0
  77. flowfile/web/static/assets/{Read-64a3f259.js → Read-225cc63f.js} +16 -12
  78. flowfile/web/static/assets/{Read-e808b239.css → Read-90f366bc.css} +15 -15
  79. flowfile/web/static/assets/{RecordCount-3d5039be.js → RecordCount-ffc71eca.js} +6 -4
  80. flowfile/web/static/assets/{RecordId-597510e0.js → RecordId-a70bb8df.js} +9 -7
  81. flowfile/web/static/assets/{SQLQueryComponent-df51adbe.js → SQLQueryComponent-15a421f5.js} +3 -3
  82. flowfile/web/static/assets/SQLQueryComponent-edb90b98.css +29 -0
  83. flowfile/web/static/assets/{Sample-4be0a507.js → Sample-6c26afc7.js} +6 -4
  84. flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
  85. flowfile/web/static/assets/SecretSelector-ceed9496.js +113 -0
  86. flowfile/web/static/assets/{SecretManager-4839be57.js → SecretsView-214d255a.js} +35 -36
  87. flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
  88. flowfile/web/static/assets/{Select-9b72f201.js → Select-8fc29999.js} +9 -7
  89. flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
  90. flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
  91. flowfile/web/static/assets/{SettingsSection-7ded385d.js → SettingsSection-3f70e4c3.js} +3 -3
  92. flowfile/web/static/assets/{SettingsSection-f0f75a42.js → SettingsSection-83090218.js} +3 -3
  93. flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
  94. flowfile/web/static/assets/{SettingsSection-e1e9c953.js → SettingsSection-9f0d1725.js} +3 -3
  95. flowfile/web/static/assets/SetupView-3fa0aa03.js +160 -0
  96. flowfile/web/static/assets/SetupView-e2da3442.css +230 -0
  97. flowfile/web/static/assets/{SingleSelect-6c777aac.js → SingleSelect-a4a568cb.js} +2 -2
  98. flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js → SingleSelect.vue_vue_type_script_setup_true_lang-c8ebdd33.js} +1 -1
  99. flowfile/web/static/assets/{SliderInput-7cb93e62.js → SliderInput-be533e71.js} +7 -4
  100. flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
  101. flowfile/web/static/assets/{Sort-6cbde21a.js → Sort-154dad81.js} +9 -7
  102. flowfile/web/static/assets/Sort-4abb7fae.css +9 -0
  103. flowfile/web/static/assets/{TextInput-d9a40c11.js → TextInput-454e2bda.js} +2 -2
  104. flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-5896c375.js → TextInput.vue_vue_type_script_setup_true_lang-e86510d0.js} +5 -2
  105. flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
  106. flowfile/web/static/assets/{TextToRows-c4fcbf4d.js → TextToRows-ea73433d.js} +11 -10
  107. flowfile/web/static/assets/{ToggleSwitch-4ef91d19.js → ToggleSwitch-9d7b30f1.js} +2 -2
  108. flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-00f2580e.js} +1 -1
  109. flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-394a1f78.css} +14 -14
  110. flowfile/web/static/assets/{UnavailableFields-a03f512c.js → UnavailableFields-b72a2c72.js} +4 -4
  111. flowfile/web/static/assets/{Union-bfe9b996.js → Union-1e44f263.js} +8 -6
  112. flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
  113. flowfile/web/static/assets/Unique-2b705521.css +3 -0
  114. flowfile/web/static/assets/{Unique-5d023a27.js → Unique-a3bc6d0a.js} +13 -10
  115. flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-b6ad6427.css} +7 -7
  116. flowfile/web/static/assets/{Unpivot-91cc5354.js → Unpivot-e27935fc.js} +11 -9
  117. flowfile/web/static/assets/{UnpivotValidation-7ee2de44.js → UnpivotValidation-72497680.js} +3 -3
  118. flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
  119. flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
  120. flowfile/web/static/assets/{VueGraphicWalker-e51b9924.js → VueGraphicWalker-d9ab70a3.js} +4 -4
  121. flowfile/web/static/assets/{api-cf1221f0.js → api-a2102880.js} +1 -1
  122. flowfile/web/static/assets/{api-c1bad5ca.js → api-f75042b0.js} +1 -1
  123. flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-1d6acbd9.css} +41 -41
  124. flowfile/web/static/assets/{dropDown-614b998d.js → dropDown-2798a109.js} +3 -3
  125. flowfile/web/static/assets/{fullEditor-f7971590.js → fullEditor-cf7d7d93.js} +11 -10
  126. flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-fe9f7e18.css} +77 -65
  127. flowfile/web/static/assets/{genericNodeSettings-4fe5f36b.js → genericNodeSettings-14eac1c3.js} +5 -5
  128. flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
  129. flowfile/web/static/assets/{index-5429bbf8.js → index-387a6f18.js} +41806 -40958
  130. flowfile/web/static/assets/index-6b367bb5.js +38 -0
  131. flowfile/web/static/assets/{index-50508d4d.css → index-e96ab018.css} +2184 -569
  132. flowfile/web/static/assets/index-f0a6e5a5.js +2696 -0
  133. flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
  134. flowfile/web/static/assets/nodeInput-ed2ae8d7.js +2 -0
  135. flowfile/web/static/assets/{outputCsv-076b85ab.js → outputCsv-3c1757e8.js} +3 -3
  136. flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
  137. flowfile/web/static/assets/{outputExcel-0fd17dbe.js → outputExcel-686e1f48.js} +3 -3
  138. flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
  139. flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
  140. flowfile/web/static/assets/{outputParquet-b61e0847.js → outputParquet-df28faa7.js} +4 -4
  141. flowfile/web/static/assets/{readCsv-c767cb37.css → readCsv-3bfac4c3.css} +15 -15
  142. flowfile/web/static/assets/{readCsv-a8bb8b61.js → readCsv-e37eee21.js} +3 -3
  143. flowfile/web/static/assets/{readExcel-806d2826.css → readExcel-3db6b763.css} +13 -13
  144. flowfile/web/static/assets/{readExcel-67b4aee0.js → readExcel-a13f14bb.js} +5 -5
  145. flowfile/web/static/assets/{readParquet-92ce1dbc.js → readParquet-344cf746.js} +3 -3
  146. flowfile/web/static/assets/{readParquet-48c81530.css → readParquet-c5244ad5.css} +4 -4
  147. flowfile/web/static/assets/secrets.api-ae198c5c.js +65 -0
  148. flowfile/web/static/assets/{selectDynamic-92e25ee3.js → selectDynamic-6b4b0767.js} +5 -5
  149. flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
  150. flowfile/web/static/assets/{vue-codemirror.esm-41b0e0d7.js → vue-codemirror.esm-31ba0e0b.js} +31 -640
  151. flowfile/web/static/assets/{vue-content-loader.es-2c8e608f.js → vue-content-loader.es-4469c8ff.js} +1 -1
  152. flowfile/web/static/index.html +2 -2
  153. {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/METADATA +3 -4
  154. flowfile-0.5.4.dist-info/RECORD +407 -0
  155. flowfile_core/__init__.py +13 -6
  156. flowfile_core/auth/jwt.py +51 -16
  157. flowfile_core/auth/models.py +32 -7
  158. flowfile_core/auth/password.py +89 -0
  159. flowfile_core/auth/secrets.py +64 -19
  160. flowfile_core/configs/__init__.py +9 -7
  161. flowfile_core/configs/flow_logger.py +15 -14
  162. flowfile_core/configs/node_store/__init__.py +72 -4
  163. flowfile_core/configs/node_store/nodes.py +155 -172
  164. flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
  165. flowfile_core/configs/settings.py +28 -15
  166. flowfile_core/database/connection.py +7 -6
  167. flowfile_core/database/init_db.py +96 -2
  168. flowfile_core/database/models.py +3 -1
  169. flowfile_core/fileExplorer/__init__.py +17 -0
  170. flowfile_core/fileExplorer/funcs.py +145 -57
  171. flowfile_core/fileExplorer/utils.py +10 -11
  172. flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
  173. flowfile_core/flowfile/analytics/analytics_processor.py +26 -24
  174. flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
  175. flowfile_core/flowfile/analytics/utils.py +1 -1
  176. flowfile_core/flowfile/code_generator/__init__.py +11 -0
  177. flowfile_core/flowfile/code_generator/code_generator.py +706 -247
  178. flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
  179. flowfile_core/flowfile/connection_manager/models.py +1 -1
  180. flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
  181. flowfile_core/flowfile/database_connection_manager/models.py +1 -1
  182. flowfile_core/flowfile/extensions.py +17 -12
  183. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
  184. flowfile_core/flowfile/flow_data_engine/create/funcs.py +115 -83
  185. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +493 -423
  186. flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
  187. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
  188. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
  189. flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
  190. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
  191. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +31 -20
  192. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
  193. flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
  194. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +14 -15
  195. flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
  196. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
  197. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
  198. flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
  199. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
  200. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
  201. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +190 -127
  202. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
  203. flowfile_core/flowfile/flow_data_engine/utils.py +99 -67
  204. flowfile_core/flowfile/flow_graph.py +920 -571
  205. flowfile_core/flowfile/flow_graph_utils.py +31 -49
  206. flowfile_core/flowfile/flow_node/flow_node.py +379 -258
  207. flowfile_core/flowfile/flow_node/models.py +53 -41
  208. flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
  209. flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
  210. flowfile_core/flowfile/handler.py +80 -30
  211. flowfile_core/flowfile/manage/compatibility_enhancements.py +209 -126
  212. flowfile_core/flowfile/manage/io_flowfile.py +54 -57
  213. flowfile_core/flowfile/node_designer/__init__.py +19 -13
  214. flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
  215. flowfile_core/flowfile/node_designer/custom_node.py +162 -36
  216. flowfile_core/flowfile/node_designer/ui_components.py +278 -34
  217. flowfile_core/flowfile/schema_callbacks.py +71 -51
  218. flowfile_core/flowfile/setting_generator/__init__.py +0 -1
  219. flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
  220. flowfile_core/flowfile/setting_generator/settings.py +64 -53
  221. flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
  222. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
  223. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
  224. flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
  225. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
  226. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
  227. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
  228. flowfile_core/flowfile/util/calculate_layout.py +9 -13
  229. flowfile_core/flowfile/util/execution_orderer.py +25 -17
  230. flowfile_core/flowfile/util/node_skipper.py +4 -4
  231. flowfile_core/flowfile/utils.py +19 -21
  232. flowfile_core/main.py +26 -19
  233. flowfile_core/routes/auth.py +284 -11
  234. flowfile_core/routes/cloud_connections.py +25 -25
  235. flowfile_core/routes/logs.py +21 -29
  236. flowfile_core/routes/public.py +46 -4
  237. flowfile_core/routes/routes.py +70 -34
  238. flowfile_core/routes/secrets.py +25 -27
  239. flowfile_core/routes/user_defined_components.py +483 -4
  240. flowfile_core/run_lock.py +0 -1
  241. flowfile_core/schemas/__init__.py +4 -6
  242. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
  243. flowfile_core/schemas/cloud_storage_schemas.py +96 -66
  244. flowfile_core/schemas/input_schema.py +231 -144
  245. flowfile_core/schemas/output_model.py +49 -34
  246. flowfile_core/schemas/schemas.py +116 -89
  247. flowfile_core/schemas/transform_schema.py +518 -263
  248. flowfile_core/schemas/yaml_types.py +21 -7
  249. flowfile_core/secret_manager/secret_manager.py +123 -18
  250. flowfile_core/types.py +29 -9
  251. flowfile_core/utils/arrow_reader.py +7 -6
  252. flowfile_core/utils/excel_file_manager.py +3 -3
  253. flowfile_core/utils/fileManager.py +7 -7
  254. flowfile_core/utils/fl_executor.py +8 -10
  255. flowfile_core/utils/utils.py +4 -4
  256. flowfile_core/utils/validate_setup.py +5 -4
  257. flowfile_frame/__init__.py +117 -51
  258. flowfile_frame/adapters.py +2 -9
  259. flowfile_frame/adding_expr.py +73 -32
  260. flowfile_frame/cloud_storage/frame_helpers.py +27 -23
  261. flowfile_frame/cloud_storage/secret_manager.py +12 -26
  262. flowfile_frame/config.py +2 -5
  263. flowfile_frame/database/__init__.py +36 -0
  264. flowfile_frame/database/connection_manager.py +205 -0
  265. flowfile_frame/database/frame_helpers.py +249 -0
  266. flowfile_frame/expr.py +311 -218
  267. flowfile_frame/expr.pyi +160 -159
  268. flowfile_frame/expr_name.py +23 -23
  269. flowfile_frame/flow_frame.py +571 -476
  270. flowfile_frame/flow_frame.pyi +123 -104
  271. flowfile_frame/flow_frame_methods.py +227 -246
  272. flowfile_frame/group_frame.py +50 -20
  273. flowfile_frame/join.py +2 -2
  274. flowfile_frame/lazy.py +129 -87
  275. flowfile_frame/lazy_methods.py +83 -30
  276. flowfile_frame/list_name_space.py +55 -50
  277. flowfile_frame/selectors.py +148 -68
  278. flowfile_frame/series.py +9 -7
  279. flowfile_frame/utils.py +19 -21
  280. flowfile_worker/__init__.py +12 -7
  281. flowfile_worker/configs.py +41 -33
  282. flowfile_worker/create/__init__.py +14 -9
  283. flowfile_worker/create/funcs.py +114 -77
  284. flowfile_worker/create/models.py +46 -43
  285. flowfile_worker/create/pl_types.py +14 -15
  286. flowfile_worker/create/read_excel_tables.py +34 -41
  287. flowfile_worker/create/utils.py +22 -19
  288. flowfile_worker/external_sources/s3_source/main.py +18 -51
  289. flowfile_worker/external_sources/s3_source/models.py +34 -27
  290. flowfile_worker/external_sources/sql_source/main.py +8 -5
  291. flowfile_worker/external_sources/sql_source/models.py +13 -9
  292. flowfile_worker/flow_logger.py +10 -8
  293. flowfile_worker/funcs.py +214 -155
  294. flowfile_worker/main.py +11 -17
  295. flowfile_worker/models.py +35 -28
  296. flowfile_worker/process_manager.py +2 -3
  297. flowfile_worker/routes.py +121 -90
  298. flowfile_worker/secrets.py +114 -21
  299. flowfile_worker/spawner.py +89 -54
  300. flowfile_worker/utils.py +3 -2
  301. shared/__init__.py +2 -7
  302. shared/storage_config.py +25 -13
  303. test_utils/postgres/commands.py +3 -2
  304. test_utils/postgres/fixtures.py +9 -9
  305. test_utils/s3/commands.py +1 -1
  306. test_utils/s3/data_generator.py +3 -4
  307. test_utils/s3/demo_data_generator.py +4 -7
  308. test_utils/s3/fixtures.py +7 -5
  309. tools/migrate/__init__.py +1 -1
  310. tools/migrate/__main__.py +16 -29
  311. tools/migrate/legacy_schemas.py +251 -190
  312. tools/migrate/migrate.py +193 -181
  313. tools/migrate/tests/conftest.py +1 -3
  314. tools/migrate/tests/test_migrate.py +36 -41
  315. tools/migrate/tests/test_migration_e2e.py +28 -29
  316. tools/migrate/tests/test_node_migrations.py +50 -20
  317. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
  318. flowfile/web/static/assets/ContextMenu-23e909da.js +0 -41
  319. flowfile/web/static/assets/ContextMenu-4c74eef1.css +0 -26
  320. flowfile/web/static/assets/ContextMenu-63cfa99b.css +0 -26
  321. flowfile/web/static/assets/ContextMenu-70ae0c79.js +0 -41
  322. flowfile/web/static/assets/ContextMenu-c13f91d0.css +0 -26
  323. flowfile/web/static/assets/ContextMenu-f149cf7c.js +0 -41
  324. flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
  325. flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
  326. flowfile/web/static/assets/Filter-9b6d08db.js +0 -164
  327. flowfile/web/static/assets/Filter-f62091b3.css +0 -20
  328. flowfile/web/static/assets/GroupBy-b9505323.css +0 -51
  329. flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
  330. flowfile/web/static/assets/Output-283fe388.css +0 -37
  331. flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
  332. flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
  333. flowfile/web/static/assets/SQLQueryComponent-36cef432.css +0 -27
  334. flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
  335. flowfile/web/static/assets/Sort-3643d625.css +0 -51
  336. flowfile/web/static/assets/Unique-f9fb0809.css +0 -51
  337. flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
  338. flowfile/web/static/assets/nodeInput-5d0d6b79.js +0 -41
  339. flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
  340. flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
  341. flowfile/web/static/assets/secretApi-68435402.js +0 -46
  342. flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
  343. flowfile-0.5.1.dist-info/RECORD +0 -388
  344. {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/WHEEL +0 -0
  345. {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/entry_points.txt +0 -0
  346. {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,52 +1,50 @@
  # Standard library imports
+ from __future__ import annotations
+
  import logging
  import os
+ from collections.abc import Callable, Generator, Iterable
  from copy import deepcopy
  from dataclasses import dataclass
  from math import ceil
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
+ from typing import Any, Literal, TypeVar, Union

- from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
+ import polars as pl

  # Third-party imports
  from loky import Future
- import polars as pl
+ from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
  from polars.exceptions import PanicException
- from polars_grouper import graph_solver
  from polars_expr_transformer import simple_function_to_expr as to_expr
+ from polars_grouper import graph_solver
  from pyarrow import Table as PaTable
  from pyarrow.parquet import ParquetFile

  # Local imports - Core
  from flowfile_core.configs import logger
- from flowfile_core.utils.utils import ensure_similarity_dicts
  from flowfile_core.configs.flow_logger import NodeLogger
- from flowfile_core.schemas import (
- cloud_storage_schemas,
- input_schema,
- transform_schema as transform_schemas
- )
- from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location

  # Local imports - Flow File Components
  from flowfile_core.flowfile.flow_data_engine import utils
- from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
- ensure_path_has_wildcard_pattern,
- get_first_file_from_s3_dir)
+ from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
+ CloudStorageReader,
+ ensure_path_has_wildcard_pattern,
+ get_first_file_from_s3_dir,
+ )
  from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
  FlowfileColumn,
  assert_if_flowfile_schema,
- convert_stats_to_column_info
+ convert_stats_to_column_info,
  )
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
  from flowfile_core.flowfile.flow_data_engine.join import (
- verify_join_select_integrity,
- verify_join_map_integrity,
- rename_df_table_for_join,
+ get_col_name_to_delete,
  get_undo_rename_mapping_join,
- get_col_name_to_delete
+ rename_df_table_for_join,
+ verify_join_map_integrity,
+ verify_join_select_integrity,
  )
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
  from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -55,19 +53,21 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
  ExternalDfFetcher,
  ExternalExecutorTracker,
  ExternalFuzzyMatchFetcher,
- fetch_unique_values
- )
- from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
- get_join_count,
- write_threaded
+ fetch_unique_values,
  )
-
+ from flowfile_core.flowfile.flow_data_engine.threaded_processes import write_threaded
  from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
+ from flowfile_core.schemas import cloud_storage_schemas, input_schema
+ from flowfile_core.schemas import transform_schema as transform_schemas
+ from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
+ from flowfile_core.utils.utils import ensure_similarity_dicts

- T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
+ T = TypeVar("T", pl.DataFrame, pl.LazyFrame)


- def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager) -> Tuple[T, T, Dict[str, str]]:
+ def _handle_duplication_join_keys(
+ left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager
+ ) -> tuple[T, T, dict[str, str]]:
  """Temporarily renames join keys to avoid conflicts during a join.

  This helper function checks the join type and renames the join key columns
@@ -88,20 +88,26 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transfo
  """

  def _construct_temp_name(column_name: str) -> str:
- return "__FL_TEMP__"+column_name
+ return "__FL_TEMP__" + column_name

- if join_manager.how == 'right':
- left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
- for jk in join_manager.left_manager.get_join_key_selects())
+ if join_manager.how == "right":
+ left_df = left_df.with_columns(
+ pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+ for jk in join_manager.left_manager.get_join_key_selects()
+ )
  reverse_actions = {
  _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
- for jk in join_manager.left_manager.get_join_key_selects()}
- elif join_manager.how in ('left', 'inner'):
- right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
- for jk in join_manager.right_manager.get_join_key_selects())
+ for jk in join_manager.left_manager.get_join_key_selects()
+ }
+ elif join_manager.how in ("left", "inner"):
+ right_df = right_df.with_columns(
+ pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+ for jk in join_manager.right_manager.get_join_key_selects()
+ )
  reverse_actions = {
  _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
- for jk in join_manager.right_manager.get_join_key_selects()}
+ for jk in join_manager.right_manager.get_join_key_selects()
+ }
  else:
  reverse_actions = {}
  return left_df, right_df, reverse_actions
@@ -118,12 +124,12 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
  Args:
  join_input: The JoinInput settings object to modify.
  """
- if join_input.how in ('semi', 'anti'):
+ if join_input.how in ("semi", "anti"):
  for jk in join_input.right_select.renames:
  jk.keep = False


- def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+ def get_select_columns(full_select_input: list[transform_schemas.SelectInput]) -> list[str]:
  """Extracts a list of column names to be selected from a SelectInput list.

  This function filters a list of `SelectInput` objects to return the names
@@ -156,15 +162,16 @@ class FlowDataEngine:
  errors: A list of errors encountered during operations.
  _schema: A cached list of `FlowfileColumn` objects representing the schema.
  """
+
  # Core attributes
- _data_frame: Union[pl.DataFrame, pl.LazyFrame]
- columns: List[Any]
+ _data_frame: pl.DataFrame | pl.LazyFrame
+ columns: list[Any]

  # Metadata attributes
  name: str = None
  number_of_records: int = None
- errors: List = None
- _schema: Optional[List['FlowfileColumn']] = None
+ errors: list = None
+ _schema: list["FlowfileColumn"] | None = None

  # Configuration attributes
  _optimize_memory: bool = False
@@ -173,16 +180,16 @@ class FlowDataEngine:
  _calculate_schema_stats: bool = False

  # Cache and optimization attributes
- __col_name_idx_map: Dict = None
- __data_map: Dict = None
- __optimized_columns: List = None
+ __col_name_idx_map: dict = None
+ __data_map: dict = None
+ __optimized_columns: list = None
  __sample__: str = None
  __number_of_fields: int = None
- _col_idx: Dict[str, int] = None
+ _col_idx: dict[str, int] = None

  # Source tracking
- _org_path: Optional[str] = None
- _external_source: Optional[ExternalDataSource] = None
+ _org_path: str | None = None
+ _external_source: ExternalDataSource | None = None

  # State tracking
  sorted_by: int = None
@@ -195,17 +202,21 @@ class FlowDataEngine:
  _number_of_records_callback: Callable = None
  _data_callback: Callable = None

- def __init__(self,
- raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
- path_ref: str = None,
- name: str = None,
- optimize_memory: bool = True,
- schema: List['FlowfileColumn'] | List[str] | pl.Schema = None,
- number_of_records: int = None,
- calculate_schema_stats: bool = False,
- streamable: bool = True,
- number_of_records_callback: Callable = None,
- data_callback: Callable = None):
+ def __init__(
+ self,
+ raw_data: Union[
+ list[dict], list[Any], dict[str, Any], "ParquetFile", pl.DataFrame, pl.LazyFrame, input_schema.RawData
+ ] = None,
+ path_ref: str = None,
+ name: str = None,
+ optimize_memory: bool = True,
+ schema: list["FlowfileColumn"] | list[str] | pl.Schema = None,
+ number_of_records: int = None,
+ calculate_schema_stats: bool = False,
+ streamable: bool = True,
+ number_of_records_callback: Callable = None,
+ data_callback: Callable = None,
+ ):
  """Initializes the FlowDataEngine from various data sources.

  Args:
@@ -265,12 +276,12 @@ class FlowDataEngine:
  elif isinstance(raw_data, (list, dict)):
  self._handle_python_data(raw_data)

- def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
+ def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: int | None):
  """(Internal) Initializes the engine from an eager Polars DataFrame."""
  self.data_frame = df
  self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

- def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
+ def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: int | None, optimize_memory: bool):
  """(Internal) Initializes the engine from a Polars LazyFrame."""
  self.data_frame = lf
  self._lazy = True
@@ -281,14 +292,14 @@ class FlowDataEngine:
  else:
  self.number_of_records = lf.select(pl.len()).collect()[0, 0]

- def _handle_python_data(self, data: Union[List, Dict]):
+ def _handle_python_data(self, data: list | dict):
  """(Internal) Dispatches Python collections to the correct handler."""
  if isinstance(data, dict):
  self._handle_dict_input(data)
  else:
  self._handle_list_input(data)

- def _handle_dict_input(self, data: Dict):
+ def _handle_dict_input(self, data: dict):
  """(Internal) Initializes the engine from a Python dictionary."""
  if len(data) == 0:
  self.initialize_empty_fl()
@@ -312,8 +323,12 @@ class FlowDataEngine:
  raw_data: An instance of `RawData` containing the data and schema.
  """
  flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
- polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
- for flowfile_column in flowfile_schema])
+ polars_schema = pl.Schema(
+ [
+ (flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+ for flowfile_column in flowfile_schema
+ ]
+ )
  try:
  df = pl.DataFrame(raw_data.data, polars_schema, strict=False)
  except TypeError as e:
@@ -323,7 +338,7 @@ class FlowDataEngine:
  self.data_frame = df.lazy()
  self.lazy = True

- def _handle_list_input(self, data: List):
+ def _handle_list_input(self, data: list):
  """(Internal) Initializes the engine from a list of records."""
  number_of_records = len(data)
  if number_of_records > 0:
@@ -336,19 +351,19 @@ class FlowDataEngine:
  self.number_of_records = 0

  @staticmethod
- def _process_list_data(data: List) -> List[Dict]:
+ def _process_list_data(data: list) -> list[dict]:
  """(Internal) Normalizes list data into a list of dictionaries.

  Ensures that a list of objects or non-dict items is converted into a
  uniform list of dictionaries suitable for Polars DataFrame creation.
  """
- if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
+ if not (isinstance(data[0], dict) or hasattr(data[0], "__dict__")):
  try:
  return pl.DataFrame(data).to_dicts()
  except TypeError:
- raise Exception('Value must be able to be converted to dictionary')
+ raise Exception("Value must be able to be converted to dictionary")
  except Exception as e:
- raise Exception(f'Value must be able to be converted to dictionary: {e}')
+ raise Exception(f"Value must be able to be converted to dictionary: {e}")

  if not isinstance(data[0], dict):
  data = [row.__dict__ for row in data]
@@ -375,49 +390,37 @@ class FlowDataEngine:

  logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")

- if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+ if write_settings.write_mode == "append" and write_settings.file_format != "delta":
  raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
  storage_options = CloudStorageReader.get_storage_options(connection)
  credential_provider = CloudStorageReader.get_credential_provider(connection)
  # Dispatch to the correct writer based on file format
  if write_settings.file_format == "parquet":
  self._write_parquet_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
+ write_settings.resource_path, storage_options, credential_provider, write_settings
  )
  elif write_settings.file_format == "delta":
  self._write_delta_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
+ write_settings.resource_path, storage_options, credential_provider, write_settings
  )
  elif write_settings.file_format == "csv":
- self._write_csv_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
- )
+ self._write_csv_to_cloud(write_settings.resource_path, storage_options, credential_provider, write_settings)
  elif write_settings.file_format == "json":
  self._write_json_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
+ write_settings.resource_path, storage_options, credential_provider, write_settings
  )
  else:
  raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")

  logger.info(f"Successfully wrote data to {write_settings.resource_path}")

- def _write_parquet_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_parquet_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a Parquet file in cloud storage.

  Uses `sink_parquet` for efficient streaming writes. Falls back to a
@@ -437,18 +440,20 @@ class FlowDataEngine:
  except Exception as e:
  logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
  pl_df = self.collect()
- sink_kwargs['file'] = sink_kwargs.pop("path")
+ sink_kwargs["file"] = sink_kwargs.pop("path")
  pl_df.write_parquet(**sink_kwargs)

  except Exception as e:
  logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
  raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")

- def _write_delta_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_delta_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.

  This operation requires collecting the data first, as `write_delta` operates
@@ -464,11 +469,13 @@ class FlowDataEngine:
  sink_kwargs["credential_provider"] = credential_provider
  self.collect().write_delta(**sink_kwargs)

- def _write_csv_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_csv_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a CSV file in cloud storage.

  Uses `sink_csv` for efficient, streaming writes of the data.
@@ -490,11 +497,13 @@ class FlowDataEngine:
  logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
  raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")

- def _write_json_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_json_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.

  Uses `sink_ndjson` for efficient, streaming writes.
@@ -512,7 +521,9 @@ class FlowDataEngine:
  raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")

  @classmethod
- def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
+ def from_cloud_storage_obj(
+ cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal
+ ) -> "FlowDataEngine":
  """Creates a FlowDataEngine from an object in cloud storage.

  This method supports reading from various cloud storage providers like AWS S3,
@@ -549,31 +560,22 @@ class FlowDataEngine:
  )
  elif read_settings.file_format == "delta":
  return cls._read_delta_from_cloud(
- read_settings.resource_path,
- storage_options,
- credential_provider,
- read_settings
+ read_settings.resource_path, storage_options, credential_provider, read_settings
  )
  elif read_settings.file_format == "csv":
  return cls._read_csv_from_cloud(
- read_settings.resource_path,
- storage_options,
- credential_provider,
- read_settings
+ read_settings.resource_path, storage_options, credential_provider, read_settings
  )
  elif read_settings.file_format == "json":
  return cls._read_json_from_cloud(
  read_settings.resource_path,
  storage_options,
  credential_provider,
- read_settings.scan_mode == "directory"
+ read_settings.scan_mode == "directory",
  )
  elif read_settings.file_format == "iceberg":
  return cls._read_iceberg_from_cloud(
- read_settings.resource_path,
- storage_options,
- credential_provider,
- read_settings
+ read_settings.resource_path, storage_options, credential_provider, read_settings
  )

  elif read_settings.file_format in ["delta", "iceberg"]:
@@ -583,33 +585,40 @@ class FlowDataEngine:
  raise ValueError(f"Unsupported file format: {read_settings.file_format}")

  @staticmethod
- def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
- file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
+ def _get_schema_from_first_file_in_dir(
+ source: str, storage_options: dict[str, Any], file_format: Literal["csv", "parquet", "json", "delta"]
+ ) -> list[FlowfileColumn] | None:
  """Infers the schema by scanning the first file in a cloud directory."""
  try:
  scan_func = getattr(pl, "scan_" + file_format)
  first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
- return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
- scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
+ return convert_stats_to_column_info(
+ FlowDataEngine._create_schema_stats_from_pl_schema(
+ scan_func(first_file_ref, storage_options=storage_options).collect_schema()
+ )
+ )
  except Exception as e:
  logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")

-
  @classmethod
- def _read_iceberg_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ def _read_iceberg_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+ ) -> "FlowDataEngine":
  """Reads Iceberg table(s) from cloud storage."""
- raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+ raise NotImplementedError("Failed to read Iceberg table from cloud storage: Not yet implemented")

  @classmethod
- def _read_parquet_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- is_directory: bool) -> "FlowDataEngine":
+ def _read_parquet_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ is_directory: bool,
+ ) -> "FlowDataEngine":
  """Reads Parquet file(s) from cloud storage."""
  try:
  # Use scan_parquet for lazy evaluation
@@ -633,7 +642,7 @@ class FlowDataEngine:
  number_of_records=6_666_666, # Set so the provider is not accessed for this stat
  optimize_memory=True,
  streamable=True,
- schema=schema
+ schema=schema,
  )

  except Exception as e:
@@ -641,18 +650,20 @@ class FlowDataEngine:
  raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")

  @classmethod
- def _read_delta_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ def _read_delta_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+ ) -> "FlowDataEngine":
  """Reads a Delta Lake table from cloud storage."""
  try:
  logger.info("Reading Delta file from cloud storage...")
  logger.info(f"read_settings: {read_settings}")
  scan_kwargs = {"source": resource_path}
  if read_settings.delta_version:
- scan_kwargs['version'] = read_settings.delta_version
+ scan_kwargs["version"] = read_settings.delta_version
  if storage_options:
  scan_kwargs["storage_options"] = storage_options
  if credential_provider:
@@ -663,18 +674,20 @@ class FlowDataEngine:
  lf,
  number_of_records=6_666_666, # Set so the provider is not accessed for this stat
  optimize_memory=True,
- streamable=True
+ streamable=True,
  )
  except Exception as e:
  logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
  raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")

  @classmethod
- def _read_csv_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ def _read_csv_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+ ) -> "FlowDataEngine":
  """Reads CSV file(s) from cloud storage."""
  try:
  scan_kwargs = {
@@ -703,7 +716,7 @@ class FlowDataEngine:
  number_of_records=6_666_666, # Will be calculated lazily
  optimize_memory=True,
  streamable=True,
- schema=schema
+ schema=schema,
  )

  except Exception as e:
@@ -711,11 +724,13 @@ class FlowDataEngine:
  raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")

  @classmethod
- def _read_json_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- is_directory: bool) -> "FlowDataEngine":
+ def _read_json_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ is_directory: bool,
+ ) -> "FlowDataEngine":
  """Reads JSON file(s) from cloud storage."""
  try:
  if is_directory:
@@ -755,8 +770,9 @@ class FlowDataEngine:
  else:
  self.data_frame = pl.read_parquet(path_ref)

- def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
- calculate_schema_stats: bool):
+ def _finalize_initialization(
+ self, name: str, optimize_memory: bool, schema: Any | None, calculate_schema_stats: bool
+ ):
  """Finalizes initialization by setting remaining attributes."""
  _ = calculate_schema_stats
  self.name = name
@@ -803,23 +819,20 @@ class FlowDataEngine:
  def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
  """Sets the underlying Polars DataFrame or LazyFrame."""
  if self.lazy and isinstance(df, pl.DataFrame):
- raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
+ raise Exception("Cannot set a non-lazy dataframe to a lazy flowfile")
  self._data_frame = df

  @staticmethod
- def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+ def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> list[dict]:
  """Converts a Polars Schema into a list of schema statistics dictionaries."""
- return [
- dict(column_name=k, pl_datatype=v, col_index=i)
- for i, (k, v) in enumerate(pl_schema.items())
- ]
+ return [dict(column_name=k, pl_datatype=v, col_index=i) for i, (k, v) in enumerate(pl_schema.items())]

- def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+ def _add_schema_from_schema_stats(self, schema_stats: list[dict]):
  """Populates the schema from a list of schema statistics dictionaries."""
  self._schema = convert_stats_to_column_info(schema_stats)

  @property
- def schema(self) -> List[FlowfileColumn]:
+ def schema(self) -> list[FlowfileColumn]:
  """The schema of the DataFrame as a list of `FlowfileColumn` objects.

  This property lazily calculates the schema if it hasn't been determined yet.
@@ -866,8 +879,10 @@ class FlowDataEngine:
  if n_records is None:
  logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
  else:
- logger.info(f'Fetching {n_records} record(s) for Table object "{id(self)}". '
- f'Settings: streaming={self._streamable}')
+ logger.info(
+ f'Fetching {n_records} record(s) for Table object "{id(self)}". '
+ f"Settings: streaming={self._streamable}"
+ )

  if not self.lazy:
  return self.data_frame
@@ -881,16 +896,15 @@ class FlowDataEngine:
  def _collect_data(self, n_records: int = None) -> pl.DataFrame:
  """Internal method to handle data collection logic."""
  if n_records is None:
-
  self.collect_external()
  if self._streamable:
  try:
- logger.info('Collecting data in streaming mode')
+ logger.info("Collecting data in streaming mode")
  return self.data_frame.collect(engine="streaming")
  except PanicException:
  self._streamable = False

- logger.info('Collecting data in non-streaming mode')
+ logger.info("Collecting data in non-streaming mode")
  return self.data_frame.collect()

  if self.external_source is not None:
@@ -919,7 +933,7 @@ class FlowDataEngine:
  return self._create_partial_dataframe(ok_cols, error_cols, n_records)
  return self._create_empty_dataframe(n_records)

- def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
+ def _identify_valid_columns(self, n_records: int) -> tuple[list[str], list[tuple[str, Any]]]:
  """Identifies which columns can be collected successfully."""
  ok_cols = []
  error_cols = []
@@ -931,30 +945,30 @@ class FlowDataEngine:
  error_cols.append((c, self.data_frame.schema[c]))
  return ok_cols, error_cols

- def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
- n_records: int) -> pl.DataFrame:
+ def _create_partial_dataframe(
+ self, ok_cols: list[str], error_cols: list[tuple[str, Any]], n_records: int
+ ) -> pl.DataFrame:
  """Creates a DataFrame with partial data for columns that could be collected."""
  df = self.data_frame.select(ok_cols)
- df = df.with_columns([
- pl.lit(None).alias(column_name).cast(data_type)
- for column_name, data_type in error_cols
- ])
+ df = df.with_columns([pl.lit(None).alias(column_name).cast(data_type) for column_name, data_type in error_cols])
  return df.select(self.columns).head(n_records).collect()

  def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
  """Creates an empty DataFrame with the correct schema."""
  if self.number_of_records > 0:
- return pl.DataFrame({
- column_name: pl.Series(
- name=column_name,
- values=[None] * min(self.number_of_records, n_records)
- ).cast(data_type)
- for column_name, data_type in self.data_frame.schema.items()
- })
+ return pl.DataFrame(
+ {
+ column_name: pl.Series(
+ name=column_name, values=[None] * min(self.number_of_records, n_records)
+ ).cast(data_type)
+ for column_name, data_type in self.data_frame.schema.items()
+ }
+ )
  return pl.DataFrame(schema=self.data_frame.schema)

- def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
- calculate_schema_stats: bool = True) -> "FlowDataEngine":
+ def do_group_by(
+ self, group_by_input: transform_schemas.GroupByInput, calculate_schema_stats: bool = True
+ ) -> "FlowDataEngine":
  """Performs a group-by operation on the DataFrame.

  Args:
@@ -966,27 +980,35 @@ class FlowDataEngine:
966
980
  Returns:
967
981
  A new `FlowDataEngine` instance with the grouped and aggregated data.
968
982
  """
969
- aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
970
- group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']
983
+ aggregations = [c for c in group_by_input.agg_cols if c.agg != "groupby"]
984
+ group_columns = [c for c in group_by_input.agg_cols if c.agg == "groupby"]
971
985
 
972
986
  if len(group_columns) == 0:
973
987
  return FlowDataEngine(
974
- self.data_frame.select(
975
- ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
976
- ),
977
- calculate_schema_stats=calculate_schema_stats
988
+ self.data_frame.select(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
989
+ calculate_schema_stats=calculate_schema_stats,
978
990
  )
979
991
 
980
992
  df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
981
993
  group_by_columns = [n_c.new_name for n_c in group_columns]
994
+
995
+ # Handle case where there are no aggregations - just get unique combinations of group columns
996
+ if len(aggregations) == 0:
997
+ return FlowDataEngine(
998
+ df.select(group_by_columns).unique(),
999
+ calculate_schema_stats=calculate_schema_stats,
1000
+ )
1001
+
1002
+ grouped_df = df.group_by(*group_by_columns)
1003
+ agg_exprs = [ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations]
1004
+ result_df = grouped_df.agg(agg_exprs)
1005
+
982
1006
  return FlowDataEngine(
983
- df.group_by(*group_by_columns).agg(
984
- ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
985
- ),
986
- calculate_schema_stats=calculate_schema_stats
1007
+ result_df,
1008
+ calculate_schema_stats=calculate_schema_stats,
987
1009
  )
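For context, a minimal standalone Polars sketch of the grouping semantics implemented above: rows are grouped on the "groupby" columns, the remaining entries become aggregation expressions, and when no aggregations are given only the unique key combinations remain. Column names here are illustrative, not part of the package.

    import polars as pl

    df = pl.DataFrame({"store": ["a", "a", "b"], "sales": [10, 20, 5]})

    # With aggregations: group on the key column and aggregate the rest.
    grouped = df.group_by("store").agg(pl.col("sales").sum().alias("sales_sum"))

    # Without aggregations: only the unique combinations of the group columns remain.
    unique_keys = df.select("store").unique()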
988
1010
 
989
- def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
1011
+ def do_sort(self, sorts: list[transform_schemas.SortByInput]) -> "FlowDataEngine":
990
1012
  """Sorts the DataFrame by one or more columns.
991
1013
 
992
1014
  Args:
@@ -999,12 +1021,13 @@ class FlowDataEngine:
999
1021
  if not sorts:
1000
1022
  return self
1001
1023
 
1002
- descending = [s.how == 'desc' or s.how.lower() == 'descending' for s in sorts]
1024
+ descending = [s.how == "desc" or s.how.lower() == "descending" for s in sorts]
1003
1025
  df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
1004
1026
  return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)
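A small standalone sketch of the sort call built above, with one descending flag per sort column (illustrative data):

    import polars as pl

    df = pl.DataFrame({"year": [2021, 2020, 2021], "amount": [3, 1, 2]})
    # The descending list is aligned with the sort columns, mirroring the
    # per-column 'desc'/'descending' handling above.
    sorted_df = df.sort(["year", "amount"], descending=[True, False])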
1005
1027
 
1006
- def change_column_types(self, transforms: List[transform_schemas.SelectInput],
1007
- calculate_schema: bool = False) -> "FlowDataEngine":
1028
+ def change_column_types(
1029
+ self, transforms: list[transform_schemas.SelectInput], calculate_schema: bool = False
1030
+ ) -> "FlowDataEngine":
1008
1031
  """Changes the data type of one or more columns.
1009
1032
 
1010
1033
  Args:
@@ -1018,7 +1041,8 @@ class FlowDataEngine:
1018
1041
  dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
1019
1042
  idx_mapping = list(
1020
1043
  (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
1021
- for transform in transforms if transform.data_type is not None
1044
+ for transform in transforms
1045
+ if transform.data_type is not None
1022
1046
  )
1023
1047
 
1024
1048
  actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]
@@ -1032,10 +1056,10 @@ class FlowDataEngine:
1032
1056
  df,
1033
1057
  number_of_records=self.number_of_records,
1034
1058
  calculate_schema_stats=calculate_schema,
1035
- streamable=self._streamable
1059
+ streamable=self._streamable,
1036
1060
  )
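Illustrative Polars sketch of the casting step above: only columns whose current dtype differs from the requested type need a cast expression, which is what the idx_mapping/actual_transforms filtering achieves. Names and types below are examples.

    import polars as pl

    lf = pl.LazyFrame({"id": ["1", "2"], "price": [1.5, 2.5]})
    # Cast the string column to an integer type; columns already of the target
    # dtype are left untouched.
    lf = lf.with_columns(pl.col("id").cast(pl.Int64))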
1037
1061
 
1038
- def save(self, path: str, data_type: str = 'parquet') -> Future:
1062
+ def save(self, path: str, data_type: str = "parquet") -> Future:
1039
1063
  """Saves the DataFrame to a file in a separate thread.
1040
1064
 
1041
1065
  Args:
@@ -1049,7 +1073,7 @@ class FlowDataEngine:
1049
1073
  df = deepcopy(self.data_frame)
1050
1074
  return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
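write_threaded is defined elsewhere in the package; as a rough sketch of the pattern the docstring describes (write in a background thread, hand back a Future), something like the following would work. This is an assumption about the general idea, not the actual implementation.

    import polars as pl
    from concurrent.futures import Future, ThreadPoolExecutor

    _executor = ThreadPoolExecutor(max_workers=1)  # hypothetical executor, for illustration only

    def write_parquet_threaded(df: pl.DataFrame, path: str) -> Future:
        # Submit the write so the caller can continue and await the Future later.
        return _executor.submit(df.write_parquet, path)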
1051
1075
 
1052
- def to_pylist(self) -> List[Dict]:
1076
+ def to_pylist(self) -> list[dict]:
1053
1077
  """Converts the DataFrame to a list of Python dictionaries.
1054
1078
 
1055
1079
  Returns:
@@ -1083,15 +1107,15 @@ class FlowDataEngine:
1083
1107
  data = list(self.to_dict().values())
1084
1108
  return input_schema.RawData(columns=columns, data=data)
1085
1109
 
1086
- def to_dict(self) -> Dict[str, List]:
1110
+ def to_dict(self) -> dict[str, list]:
1087
1111
  """Converts the DataFrame to a Python dictionary of columns.
1088
1112
 
1089
- Each key in the dictionary is a column name, and the corresponding value
1090
- is a list of the data in that column.
1113
+ Each key in the dictionary is a column name, and the corresponding value
1114
+ is a list of the data in that column.
1091
1115
 
1092
- Returns:
1093
- A dictionary mapping column names to lists of their values.
1094
- """
1116
+ Returns:
1117
+ A dictionary mapping column names to lists of their values.
1118
+ """
1095
1119
  if self.lazy:
1096
1120
  return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
1097
1121
  else:
@@ -1131,7 +1155,7 @@ class FlowDataEngine:
1131
1155
  return cls(pl.read_sql(sql, conn))
1132
1156
 
1133
1157
  @classmethod
1134
- def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
1158
+ def create_from_schema(cls, schema: list[FlowfileColumn]) -> "FlowDataEngine":
1135
1159
  """Creates an empty FlowDataEngine from a schema definition.
1136
1160
 
1137
1161
  Args:
@@ -1162,14 +1186,14 @@ class FlowDataEngine:
1162
1186
  """
1163
1187
  received_table.set_absolute_filepath()
1164
1188
  file_type_handlers = {
1165
- 'csv': create_funcs.create_from_path_csv,
1166
- 'parquet': create_funcs.create_from_path_parquet,
1167
- 'excel': create_funcs.create_from_path_excel
1189
+ "csv": create_funcs.create_from_path_csv,
1190
+ "parquet": create_funcs.create_from_path_parquet,
1191
+ "excel": create_funcs.create_from_path_excel,
1168
1192
  }
1169
1193
 
1170
1194
  handler = file_type_handlers.get(received_table.file_type)
1171
1195
  if not handler:
1172
- raise Exception(f'Cannot create from {received_table.file_type}')
1196
+ raise Exception(f"Cannot create from {received_table.file_type}")
1173
1197
 
1174
1198
  flow_file = cls(handler(received_table))
1175
1199
  flow_file._org_path = received_table.abs_file_path
@@ -1190,7 +1214,7 @@ class FlowDataEngine:
1190
1214
  return cls(create_fake_data(number_of_records))
1191
1215
 
1192
1216
  @classmethod
1193
- def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
1217
+ def generate_enumerator(cls, length: int = 1000, output_name: str = "output_column") -> "FlowDataEngine":
1194
1218
  """Generates a FlowDataEngine with a single column containing a sequence of integers.
1195
1219
 
1196
1220
  Args:
@@ -1204,8 +1228,9 @@ class FlowDataEngine:
1204
1228
  length = 10_000_000
1205
1229
  return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
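For illustration, materializing a small enumerator in the same way as above:

    import polars as pl

    # A lazy frame with a single UInt32 column holding 0..4.
    enum_lf = pl.LazyFrame().select(pl.int_range(0, 5, dtype=pl.UInt32).alias("output_column"))
    print(enum_lf.collect())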
1206
1230
 
1207
- def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
1208
- pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
1231
+ def _handle_schema(
1232
+ self, schema: list[FlowfileColumn] | list[str] | pl.Schema | None, pl_schema: pl.Schema
1233
+ ) -> list[FlowfileColumn] | None:
1209
1234
  """Handles schema processing and validation during initialization."""
1210
1235
  if schema is None and pl_schema is not None:
1211
1236
  return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
@@ -1216,7 +1241,8 @@ class FlowDataEngine:
1216
1241
  elif pl_schema is not None and schema is not None:
1217
1242
  if schema.__len__() != pl_schema.__len__():
1218
1243
  raise Exception(
1219
- f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
1244
+ f"Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}"
1245
+ )
1220
1246
  if isinstance(schema, pl.Schema):
1221
1247
  return self._handle_polars_schema(schema, pl_schema)
1222
1248
  elif isinstance(schema, list) and len(schema) == 0:
@@ -1225,31 +1251,29 @@ class FlowDataEngine:
1225
1251
  return self._handle_string_schema(schema, pl_schema)
1226
1252
  return schema
1227
1253
 
1228
- def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
1254
+ def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> list[FlowfileColumn]:
1229
1255
  """Handles Polars schema conversion."""
1230
1256
  flow_file_columns = [
1231
1257
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
1232
- for col_name, dtype in zip(schema.names(), schema.dtypes())
1258
+ for col_name, dtype in zip(schema.names(), schema.dtypes(), strict=False)
1233
1259
  ]
1234
1260
 
1235
1261
  select_arg = [
1236
1262
  pl.col(o).alias(n).cast(schema_dtype)
1237
- for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
1263
+ for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes(), strict=False)
1238
1264
  ]
1239
1265
 
1240
1266
  self.data_frame = self.data_frame.select(select_arg)
1241
1267
  return flow_file_columns
1242
1268
 
1243
- def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
1269
+ def _handle_string_schema(self, schema: list[str], pl_schema: pl.Schema) -> list[FlowfileColumn]:
1244
1270
  """Handles string-based schema conversion."""
1245
1271
  flow_file_columns = [
1246
1272
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
1247
- for col_name, dtype in zip(schema, pl_schema.dtypes())
1273
+ for col_name, dtype in zip(schema, pl_schema.dtypes(), strict=False)
1248
1274
  ]
1249
1275
 
1250
- self.data_frame = self.data_frame.rename({
1251
- o: n for o, n in zip(pl_schema.names(), schema)
1252
- })
1276
+ self.data_frame = self.data_frame.rename({o: n for o, n in zip(pl_schema.names(), schema, strict=False)})
1253
1277
 
1254
1278
  return flow_file_columns
1255
1279
 
@@ -1267,25 +1291,16 @@ class FlowDataEngine:
1267
1291
  A new `FlowDataEngine` instance with the exploded rows.
1268
1292
  """
1269
1293
  output_column_name = (
1270
- split_input.output_column_name
1271
- if split_input.output_column_name
1272
- else split_input.column_to_split
1294
+ split_input.output_column_name if split_input.output_column_name else split_input.column_to_split
1273
1295
  )
1274
1296
 
1275
1297
  split_value = (
1276
- split_input.split_fixed_value
1277
- if split_input.split_by_fixed_value
1278
- else pl.col(split_input.split_by_column)
1298
+ split_input.split_fixed_value if split_input.split_by_fixed_value else pl.col(split_input.split_by_column)
1279
1299
  )
1280
1300
 
1281
- df = (
1282
- self.data_frame.with_columns(
1283
- pl.col(split_input.column_to_split)
1284
- .str.split(by=split_value)
1285
- .alias(output_column_name)
1286
- )
1287
- .explode(output_column_name)
1288
- )
1301
+ df = self.data_frame.with_columns(
1302
+ pl.col(split_input.column_to_split).str.split(by=split_value).alias(output_column_name)
1303
+ ).explode(output_column_name)
1289
1304
 
1290
1305
  return FlowDataEngine(df)
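Standalone Polars sketch of the split-and-explode transformation above, using a fixed delimiter and example data:

    import polars as pl

    df = pl.DataFrame({"id": [1, 2], "tags": ["a,b", "c"]})
    # Split the string column on the delimiter, then explode one list element per row.
    out = df.with_columns(pl.col("tags").str.split(",").alias("tag")).explode("tag")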
1291
1306
 
@@ -1305,15 +1320,9 @@ class FlowDataEngine:
1305
1320
  lf = self.data_frame
1306
1321
 
1307
1322
  if unpivot_input.data_type_selector_expr is not None:
1308
- result = lf.unpivot(
1309
- on=unpivot_input.data_type_selector_expr(),
1310
- index=unpivot_input.index_columns
1311
- )
1323
+ result = lf.unpivot(on=unpivot_input.data_type_selector_expr(), index=unpivot_input.index_columns)
1312
1324
  elif unpivot_input.value_columns is not None:
1313
- result = lf.unpivot(
1314
- on=unpivot_input.value_columns,
1315
- index=unpivot_input.index_columns
1316
- )
1325
+ result = lf.unpivot(on=unpivot_input.value_columns, index=unpivot_input.index_columns)
1317
1326
  else:
1318
1327
  result = lf.unpivot()
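A minimal sketch of the unpivot call used above, with explicit value and index columns (illustrative names); it relies on the same LazyFrame.unpivot API as the method:

    import polars as pl

    lf = pl.LazyFrame({"id": [1, 2], "q1": [10, 20], "q2": [30, 40]})
    # Melt q1/q2 into variable/value pairs, keeping "id" as the index column.
    long = lf.unpivot(on=["q1", "q2"], index=["id"]).collect()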
1319
1328
 
@@ -1333,19 +1342,24 @@ class FlowDataEngine:
1333
1342
  """
1334
1343
  # Get unique values for pivot columns
1335
1344
  max_unique_vals = 200
1336
- new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
1337
- .unique()
1338
- .sort(pivot_input.pivot_column)
1339
- .limit(max_unique_vals).cast(pl.String))
1345
+ new_cols_unique = fetch_unique_values(
1346
+ self.data_frame.select(pivot_input.pivot_column)
1347
+ .unique()
1348
+ .sort(pivot_input.pivot_column)
1349
+ .limit(max_unique_vals)
1350
+ .cast(pl.String)
1351
+ )
1340
1352
  if len(new_cols_unique) >= max_unique_vals:
1341
1353
  if node_logger:
1342
- node_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
1343
- f' Max unique values: {max_unique_vals}')
1354
+ node_logger.warning(
1355
+ "Pivot column has too many unique values. Please consider using a different column."
1356
+ f" Max unique values: {max_unique_vals}"
1357
+ )
1344
1358
 
1345
1359
  if len(pivot_input.index_columns) == 0:
1346
1360
  no_index_cols = True
1347
- pivot_input.index_columns = ['__temp__']
1348
- ff = self.apply_flowfile_formula('1', col_name='__temp__')
1361
+ pivot_input.index_columns = ["__temp__"]
1362
+ ff = self.apply_flowfile_formula("1", col_name="__temp__")
1349
1363
  else:
1350
1364
  no_index_cols = False
1351
1365
  ff = self
@@ -1355,36 +1369,32 @@ class FlowDataEngine:
1355
1369
  grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
1356
1370
  pivot_column = pivot_input.get_pivot_column()
1357
1371
 
1358
- input_df = grouped_ff.data_frame.with_columns(
1359
- pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
1360
- )
1372
+ input_df = grouped_ff.data_frame.with_columns(pivot_column.cast(pl.String).alias(pivot_input.pivot_column))
1361
1373
  number_of_aggregations = len(pivot_input.aggregations)
1362
1374
  df = (
1363
- input_df.select(
1364
- *index_columns,
1365
- pivot_column,
1366
- pivot_input.get_values_expr()
1367
- )
1375
+ input_df.select(*index_columns, pivot_column, pivot_input.get_values_expr())
1368
1376
  .group_by(*index_columns)
1369
- .agg([
1370
- (pl.col('vals').filter(pivot_column == new_col_value))
1371
- .first()
1372
- .alias(new_col_value)
1373
- for new_col_value in new_cols_unique
1374
- ])
1377
+ .agg(
1378
+ [
1379
+ (pl.col("vals").filter(pivot_column == new_col_value)).first().alias(new_col_value)
1380
+ for new_col_value in new_cols_unique
1381
+ ]
1382
+ )
1375
1383
  .select(
1376
1384
  *index_columns,
1377
1385
  *[
1378
- pl.col(new_col).struct.field(agg).alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
1386
+ pl.col(new_col)
1387
+ .struct.field(agg)
1388
+ .alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
1379
1389
  for new_col in new_cols_unique
1380
1390
  for agg in pivot_input.aggregations
1381
- ]
1391
+ ],
1382
1392
  )
1383
1393
  )
1384
1394
 
1385
1395
  # Clean up temporary columns if needed
1386
1396
  if no_index_cols:
1387
- df = df.drop('__temp__')
1397
+ df = df.drop("__temp__")
1388
1398
  pivot_input.index_columns = []
1389
1399
 
1390
1400
  return FlowDataEngine(df, calculate_schema_stats=False)
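For context, a compact standalone sketch of the pivot strategy implemented above: cast the pivot column to string, then build one output column per unique pivot value via filter + first inside a group_by. The real method additionally caps the number of unique values, supports multiple aggregations, and uses a temporary index column when none is given; the data below is illustrative.

    import polars as pl

    df = pl.DataFrame({"id": [1, 1, 2], "month": ["jan", "feb", "jan"], "val": [10, 20, 30]})
    unique_months = ["jan", "feb"]  # in the method this list is fetched from the data and capped
    wide = df.group_by("id").agg(
        [pl.col("val").filter(pl.col("month") == m).first().alias(m) for m in unique_months]
    )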
@@ -1403,7 +1413,7 @@ class FlowDataEngine:
1403
1413
  try:
1404
1414
  f = to_expr(predicate)
1405
1415
  except Exception as e:
1406
- logger.warning(f'Error in filter expression: {e}')
1416
+ logger.warning(f"Error in filter expression: {e}")
1407
1417
  f = to_expr("False")
1408
1418
  df = self.data_frame.filter(f)
1409
1419
  return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
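to_expr is a Flowfile helper for parsing the predicate string; when parsing fails, the fallback above effectively filters on a constant False. In plain Polars terms (illustrative data):

    import polars as pl

    lf = pl.LazyFrame({"qty": [1, 5, 10]})
    kept = lf.filter(pl.col("qty") > 3).collect()
    nothing = lf.filter(pl.lit(False)).collect()  # what the fallback predicate selects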
@@ -1430,29 +1440,27 @@ class FlowDataEngine:
1430
1440
  select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
1431
1441
 
1432
1442
  df = (
1433
- self.data_frame
1434
- .with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
1443
+ self.data_frame.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
1435
1444
  .with_columns(
1436
- (pl.cum_count(record_id_settings.output_column_name)
1437
- .over(record_id_settings.group_by_columns) + record_id_settings.offset - 1)
1438
- .alias(record_id_settings.output_column_name)
1445
+ (
1446
+ pl.cum_count(record_id_settings.output_column_name).over(record_id_settings.group_by_columns)
1447
+ + record_id_settings.offset
1448
+ - 1
1449
+ ).alias(record_id_settings.output_column_name)
1439
1450
  )
1440
1451
  .select(select_cols)
1441
1452
  )
1442
1453
 
1443
- output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
1454
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
1444
1455
  output_schema.extend(self.schema)
1445
1456
 
1446
1457
  return FlowDataEngine(df, schema=output_schema)
1447
1458
 
1448
1459
  def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
1449
1460
  """Adds a simple sequential record ID column."""
1450
- df = self.data_frame.with_row_index(
1451
- record_id_settings.output_column_name,
1452
- record_id_settings.offset
1453
- )
1461
+ df = self.data_frame.with_row_index(record_id_settings.output_column_name, record_id_settings.offset)
1454
1462
 
1455
- output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
1463
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
1456
1464
  output_schema.extend(self.schema)
1457
1465
 
1458
1466
  return FlowDataEngine(df, schema=output_schema)
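Illustrative Polars sketches of the two record-ID variants above: a plain row index with an offset, and a per-group counter built with a cumulative count over the grouping columns (example names and offset):

    import polars as pl

    df = pl.DataFrame({"grp": ["a", "a", "b"], "val": [1, 2, 3]})
    offset = 1  # example offset

    # Simple variant: sequential record id starting at the offset.
    simple = df.with_row_index("record_id", offset=offset)

    # Grouped variant: the counter restarts within each group, then the offset is applied.
    grouped = df.with_columns(
        (pl.col("val").cum_count().over("grp") + offset - 1).alias("record_id")
    )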
@@ -1484,7 +1492,7 @@ class FlowDataEngine:
1484
1492
 
1485
1493
  def __repr__(self) -> str:
1486
1494
  """Returns a string representation of the FlowDataEngine."""
1487
- return f'flow data engine\n{self.data_frame.__repr__()}'
1495
+ return f"flow data engine\n{self.data_frame.__repr__()}"
1488
1496
 
1489
1497
  def __call__(self) -> "FlowDataEngine":
1490
1498
  """Makes the class instance callable, returning itself."""
@@ -1504,16 +1512,16 @@ class FlowDataEngine:
1504
1512
  Returns:
1505
1513
  The same `FlowDataEngine` instance, now backed by the cached data.
1506
1514
  """
1507
- edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
1508
- flow_id=-1,
1509
- node_id=-1)
1510
- logger.info('Caching data in background')
1515
+ edf = ExternalDfFetcher(
1516
+ lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False, flow_id=-1, node_id=-1
1517
+ )
1518
+ logger.info("Caching data in background")
1511
1519
  result = edf.get_result()
1512
1520
  if isinstance(result, pl.LazyFrame):
1513
- logger.info('Data cached')
1521
+ logger.info("Data cached")
1514
1522
  del self._data_frame
1515
1523
  self.data_frame = result
1516
- logger.info('Data loaded from cache')
1524
+ logger.info("Data loaded from cache")
1517
1525
  return self
1518
1526
 
1519
1527
  def collect_external(self):
@@ -1525,14 +1533,14 @@ class FlowDataEngine:
1525
1533
  re-evaluated.
1526
1534
  """
1527
1535
  if self._external_source is not None:
1528
- logger.info('Collecting external source')
1536
+ logger.info("Collecting external source")
1529
1537
  if self.external_source.get_pl_df() is not None:
1530
1538
  self.data_frame = self.external_source.get_pl_df().lazy()
1531
1539
  else:
1532
1540
  self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
1533
1541
  self._schema = None # enforce reset schema
1534
1542
 
1535
- def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
1543
+ def get_output_sample(self, n_rows: int = 10) -> list[dict]:
1536
1544
  """Gets a sample of the data as a list of dictionaries.
1537
1545
 
1538
1546
  This is typically used to display a preview of the data in a UI.
@@ -1560,14 +1568,20 @@ class FlowDataEngine:
1560
1568
  try:
1561
1569
  df = df.head(n_rows).collect()
1562
1570
  except Exception as e:
1563
- logger.warning(f'Error in getting sample: {e}')
1571
+ logger.warning(f"Error in getting sample: {e}")
1564
1572
  df = df.head(n_rows).collect(engine="auto")
1565
1573
  else:
1566
1574
  df = self.collect()
1567
1575
  return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
1568
1576
 
1569
- def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
1570
- seed: int = None, execution_location: Optional[ExecutionLocationsLiteral] = None) -> "FlowDataEngine":
1577
+ def get_sample(
1578
+ self,
1579
+ n_rows: int = 100,
1580
+ random: bool = False,
1581
+ shuffle: bool = False,
1582
+ seed: int = None,
1583
+ execution_location: ExecutionLocationsLiteral | None = None,
1584
+ ) -> "FlowDataEngine":
1571
1585
  """Gets a sample of rows from the DataFrame.
1572
1586
 
1573
1587
  Args:
@@ -1579,22 +1593,23 @@ class FlowDataEngine:
1579
1593
  Returns:
1580
1594
  A new `FlowDataEngine` instance containing the sampled data.
1581
1595
  """
1582
- logging.info(f'Getting sample of {n_rows} rows')
1596
+ logging.info(f"Getting sample of {n_rows} rows")
1583
1597
  if random:
1584
1598
  if self.lazy and self.external_source is not None:
1585
1599
  self.collect_external()
1586
1600
 
1587
1601
  if self.lazy and shuffle:
1588
- sample_df = (self.data_frame.collect(engine="streaming" if self._streamable else "auto")
1589
- .sample(n_rows, seed=seed, shuffle=shuffle))
1602
+ sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(
1603
+ n_rows, seed=seed, shuffle=shuffle
1604
+ )
1590
1605
  elif shuffle:
1591
1606
  sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
1592
1607
  else:
1593
1608
  if execution_location is None:
1594
1609
  execution_location = get_global_execution_location()
1595
- n_rows = min(n_rows, self.get_number_of_records(
1596
- calculate_in_worker_process=execution_location == "remote")
1597
- )
1610
+ n_rows = min(
1611
+ n_rows, self.get_number_of_records(calculate_in_worker_process=execution_location == "remote")
1612
+ )
1598
1613
 
1599
1614
  every_n_records = ceil(self.number_of_records / n_rows)
1600
1615
  sample_df = self.data_frame.gather_every(every_n_records)
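A standalone sketch of the two sampling paths above: a shuffled sample on a collected frame versus keeping every n-th row when no shuffle is requested (example numbers):

    import polars as pl

    lf = pl.LazyFrame({"x": list(range(100))})

    # Shuffled sample: collect first, then draw 10 random rows.
    shuffled = lf.collect().sample(10, seed=42, shuffle=True)

    # Deterministic sample: keep every 10th row of the lazy frame.
    every_nth = lf.gather_every(10).collect()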
@@ -1619,8 +1634,9 @@ class FlowDataEngine:
1619
1634
  else:
1620
1635
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
1621
1636
 
1622
- def iter_batches(self, batch_size: int = 1000,
1623
- columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
1637
+ def iter_batches(
1638
+ self, batch_size: int = 1000, columns: list | tuple | str = None
1639
+ ) -> Generator["FlowDataEngine", None, None]:
1624
1640
  """Iterates over the DataFrame in batches.
1625
1641
 
1626
1642
  Args:
@@ -1638,9 +1654,14 @@ class FlowDataEngine:
1638
1654
  for batch in batches:
1639
1655
  yield FlowDataEngine(batch)
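The batches themselves come from a helper that is not part of this hunk; as a rough sketch of the general pattern (slicing a frame into fixed-size chunks), one could write the following. This is an assumption about the idea, not the actual helper.

    import polars as pl
    from collections.abc import Iterator

    def iter_slices(df: pl.DataFrame, batch_size: int = 1000) -> Iterator[pl.DataFrame]:
        # Yield consecutive slices of at most batch_size rows.
        for start in range(0, df.height, batch_size):
            yield df.slice(start, batch_size)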
1640
1656
 
1641
- def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1642
- other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
1643
- node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
1657
+ def start_fuzzy_join(
1658
+ self,
1659
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1660
+ other: "FlowDataEngine",
1661
+ file_ref: str,
1662
+ flow_id: int = -1,
1663
+ node_id: int | str = -1,
1664
+ ) -> ExternalFuzzyMatchFetcher:
1644
1665
  """Starts a fuzzy join operation in a background process.
1645
1666
 
1646
1667
  This method prepares the data and initiates the fuzzy matching in a
@@ -1658,51 +1679,70 @@ class FlowDataEngine:
1658
1679
  progress and retrieve the result of the fuzzy join.
1659
1680
  """
1660
1681
  fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1661
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1662
- fuzzy_match_input_manager=fuzzy_match_input_manager)
1663
-
1664
- return ExternalFuzzyMatchFetcher(left_df, right_df,
1665
- fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1666
- file_ref=file_ref + '_fm',
1667
- wait_on_completion=False,
1668
- flow_id=flow_id,
1669
- node_id=node_id)
1670
-
1671
- def fuzzy_join_external(self,
1672
- fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1673
- other: "FlowDataEngine",
1674
- file_ref: str = None,
1675
- flow_id: int = -1,
1676
- node_id: int = -1
1677
- ):
1682
+ left_df, right_df = prepare_for_fuzzy_match(
1683
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1684
+ )
1685
+
1686
+ return ExternalFuzzyMatchFetcher(
1687
+ left_df,
1688
+ right_df,
1689
+ fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1690
+ file_ref=file_ref + "_fm",
1691
+ wait_on_completion=False,
1692
+ flow_id=flow_id,
1693
+ node_id=node_id,
1694
+ )
1695
+
1696
+ def fuzzy_join_external(
1697
+ self,
1698
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1699
+ other: "FlowDataEngine",
1700
+ file_ref: str = None,
1701
+ flow_id: int = -1,
1702
+ node_id: int = -1,
1703
+ ):
1678
1704
  if file_ref is None:
1679
- file_ref = str(id(self)) + '_' + str(id(other))
1705
+ file_ref = str(id(self)) + "_" + str(id(other))
1680
1706
  fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1681
1707
 
1682
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1683
- fuzzy_match_input_manager=fuzzy_match_input_manager)
1684
- external_tracker = ExternalFuzzyMatchFetcher(left_df, right_df,
1685
- fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1686
- file_ref=file_ref + '_fm',
1687
- wait_on_completion=False,
1688
- flow_id=flow_id,
1689
- node_id=node_id)
1708
+ left_df, right_df = prepare_for_fuzzy_match(
1709
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1710
+ )
1711
+ external_tracker = ExternalFuzzyMatchFetcher(
1712
+ left_df,
1713
+ right_df,
1714
+ fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1715
+ file_ref=file_ref + "_fm",
1716
+ wait_on_completion=False,
1717
+ flow_id=flow_id,
1718
+ node_id=node_id,
1719
+ )
1690
1720
  return FlowDataEngine(external_tracker.get_result())
1691
1721
 
1692
- def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1693
- other: "FlowDataEngine",
1694
- node_logger: NodeLogger = None) -> "FlowDataEngine":
1722
+ def fuzzy_join(
1723
+ self,
1724
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1725
+ other: "FlowDataEngine",
1726
+ node_logger: NodeLogger = None,
1727
+ ) -> "FlowDataEngine":
1695
1728
  fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1696
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1697
- fuzzy_match_input_manager=fuzzy_match_input_manager)
1729
+ left_df, right_df = prepare_for_fuzzy_match(
1730
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1731
+ )
1698
1732
  fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
1699
- return FlowDataEngine(fuzzy_match_dfs(left_df, right_df, fuzzy_maps=fuzzy_mappings,
1700
- logger=node_logger.logger if node_logger else logger)
1701
- .lazy())
1733
+ return FlowDataEngine(
1734
+ fuzzy_match_dfs(
1735
+ left_df, right_df, fuzzy_maps=fuzzy_mappings, logger=node_logger.logger if node_logger else logger
1736
+ ).lazy()
1737
+ )
1702
1738
 
1703
- def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
1704
- auto_generate_selection: bool, verify_integrity: bool,
1705
- other: "FlowDataEngine") -> "FlowDataEngine":
1739
+ def do_cross_join(
1740
+ self,
1741
+ cross_join_input: transform_schemas.CrossJoinInput,
1742
+ auto_generate_selection: bool,
1743
+ verify_integrity: bool,
1744
+ other: "FlowDataEngine",
1745
+ ) -> "FlowDataEngine":
1706
1746
  """Performs a cross join with another DataFrame.
1707
1747
 
1708
1748
  A cross join produces the Cartesian product of the two DataFrames.
@@ -1723,26 +1763,41 @@ class FlowDataEngine:
1723
1763
  self.lazy = True
1724
1764
  other.lazy = True
1725
1765
  cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
1726
- verify_join_select_integrity(cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns)
1727
- right_select = [v.old_name for v in cross_join_input_manager.right_select.renames
1728
- if (v.keep or v.join_key) and v.is_available]
1729
- left_select = [v.old_name for v in cross_join_input_manager.left_select.renames
1730
- if (v.keep or v.join_key) and v.is_available]
1766
+ verify_join_select_integrity(
1767
+ cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns
1768
+ )
1769
+ right_select = [
1770
+ v.old_name
1771
+ for v in cross_join_input_manager.right_select.renames
1772
+ if (v.keep or v.join_key) and v.is_available
1773
+ ]
1774
+ left_select = [
1775
+ v.old_name
1776
+ for v in cross_join_input_manager.left_select.renames
1777
+ if (v.keep or v.join_key) and v.is_available
1778
+ ]
1731
1779
  cross_join_input_manager.auto_rename(rename_mode="suffix")
1732
1780
  left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
1733
1781
  right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)
1734
1782
 
1735
- joined_df = left.join(right, how='cross')
1783
+ joined_df = left.join(right, how="cross")
1736
1784
 
1737
- cols_to_delete_after = [col.new_name for col in
1738
- cross_join_input_manager.left_select.renames + cross_join_input_manager.left_select.renames
1739
- if col.join_key and not col.keep and col.is_available]
1785
+ cols_to_delete_after = [
1786
+ col.new_name
1787
+ for col in cross_join_input_manager.left_select.renames + cross_join_input_manager.right_select.renames
1788
+ if col.join_key and not col.keep and col.is_available
1789
+ ]
1740
1790
 
1741
1791
  fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
1742
1792
  return fl
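For context, the core of the cross join above in standalone Polars, i.e. the Cartesian product of the selected and renamed sides (example data):

    import polars as pl

    left = pl.LazyFrame({"size": ["S", "M"]})
    right = pl.LazyFrame({"color": ["red", "blue"]})
    # Every row of left is paired with every row of right: 4 rows in total.
    product = left.join(right, how="cross").collect()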
1743
1793
 
1744
- def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1745
- verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
1794
+ def join(
1795
+ self,
1796
+ join_input: transform_schemas.JoinInput,
1797
+ auto_generate_selection: bool,
1798
+ verify_integrity: bool,
1799
+ other: "FlowDataEngine",
1800
+ ) -> "FlowDataEngine":
1746
1801
  """Performs a standard SQL-style join with another DataFrame."""
1747
1802
  # Create manager from input
1748
1803
  join_manager = transform_schemas.JoinInputManager(join_input)
@@ -1754,40 +1809,52 @@ class FlowDataEngine:
1754
1809
  join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
1755
1810
  verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
1756
1811
  if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
1757
- raise Exception('Join is not valid by the data fields')
1812
+ raise Exception("Join is not valid by the data fields")
1758
1813
 
1759
1814
  if auto_generate_selection:
1760
1815
  join_manager.auto_rename()
1761
1816
 
1762
1817
  # Use manager properties throughout
1763
- left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(join_manager.left_manager.get_rename_table())
1764
- right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(join_manager.right_manager.get_rename_table())
1818
+ left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
1819
+ join_manager.left_manager.get_rename_table()
1820
+ )
1821
+ right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(
1822
+ join_manager.right_manager.get_rename_table()
1823
+ )
1765
1824
 
1766
1825
  left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
1767
1826
  left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
1768
- if join_manager.how == 'right':
1827
+ if join_manager.how == "right":
1769
1828
  joined_df = right.join(
1770
1829
  other=left,
1771
1830
  left_on=join_manager.right_join_keys,
1772
1831
  right_on=join_manager.left_join_keys,
1773
1832
  how="left",
1774
- suffix="").rename(reverse_join_key_mapping)
1833
+ suffix="",
1834
+ ).rename(reverse_join_key_mapping)
1775
1835
  else:
1776
1836
  joined_df = left.join(
1777
1837
  other=right,
1778
1838
  left_on=join_manager.left_join_keys,
1779
1839
  right_on=join_manager.right_join_keys,
1780
1840
  how=join_manager.how,
1781
- suffix="").rename(reverse_join_key_mapping)
1841
+ suffix="",
1842
+ ).rename(reverse_join_key_mapping)
1782
1843
 
1783
- left_cols_to_delete_after = [get_col_name_to_delete(col, 'left')
1784
- for col in join_manager.input.left_select.renames
1785
- if not col.keep and col.is_available and col.join_key]
1844
+ left_cols_to_delete_after = [
1845
+ get_col_name_to_delete(col, "left")
1846
+ for col in join_manager.input.left_select.renames
1847
+ if not col.keep and col.is_available and col.join_key
1848
+ ]
1786
1849
 
1787
- right_cols_to_delete_after = [get_col_name_to_delete(col, 'right')
1788
- for col in join_manager.input.right_select.renames
1789
- if not col.keep and col.is_available and col.join_key
1790
- and join_manager.how in ("left", "right", "inner", "cross", "outer")]
1850
+ right_cols_to_delete_after = [
1851
+ get_col_name_to_delete(col, "right")
1852
+ for col in join_manager.input.right_select.renames
1853
+ if not col.keep
1854
+ and col.is_available
1855
+ and col.join_key
1856
+ and join_manager.how in ("left", "right", "inner", "cross", "outer")
1857
+ ]
1791
1858
 
1792
1859
  if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
1793
1860
  joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
@@ -1795,8 +1862,7 @@ class FlowDataEngine:
1795
1862
  undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
1796
1863
  joined_df = joined_df.rename(undo_join_key_remapping)
1797
1864
 
1798
- return FlowDataEngine(joined_df, calculate_schema_stats=False,
1799
- number_of_records=0, streamable=False)
1865
+ return FlowDataEngine(joined_df, calculate_schema_stats=False, number_of_records=0, streamable=False)
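A minimal standalone sketch of the join call built above, including the "right" case expressed as a flipped left join and the empty suffix used so key collisions are handled explicitly (illustrative data):

    import polars as pl

    left = pl.LazyFrame({"id": [1, 2], "l_val": ["a", "b"]})
    right = pl.LazyFrame({"id": [2, 3], "r_val": ["x", "y"]})

    # Ordinary left join.
    left_join = left.join(right, left_on="id", right_on="id", how="left", suffix="").collect()

    # A right join can be expressed by swapping the frames and joining left.
    right_join = right.join(left, left_on="id", right_on="id", how="left", suffix="").collect()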
1800
1866
 
1801
1867
  def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
1802
1868
  """Solves a graph problem represented by 'from' and 'to' columns.
@@ -1811,8 +1877,9 @@ class FlowDataEngine:
1811
1877
  A new `FlowDataEngine` instance with the solved graph data.
1812
1878
  """
1813
1879
  lf = self.data_frame.with_columns(
1814
- graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
1815
- .alias(graph_solver_input.output_column_name)
1880
+ graph_solver(graph_solver_input.col_from, graph_solver_input.col_to).alias(
1881
+ graph_solver_input.output_column_name
1882
+ )
1816
1883
  )
1817
1884
  return FlowDataEngine(lf)
1818
1885
 
@@ -1827,7 +1894,7 @@ class FlowDataEngine:
1827
1894
  A new `FlowDataEngine` instance with the added column.
1828
1895
  """
1829
1896
  if col_name is None:
1830
- col_name = 'new_values'
1897
+ col_name = "new_values"
1831
1898
  return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
1832
1899
 
1833
1900
  def get_record_count(self) -> "FlowDataEngine":
@@ -1837,7 +1904,7 @@ class FlowDataEngine:
1837
1904
  Returns:
1838
1905
  A new `FlowDataEngine` instance.
1839
1906
  """
1840
- return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))
1907
+ return FlowDataEngine(self.data_frame.select(pl.len().alias("number_of_records")))
1841
1908
 
1842
1909
  def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
1843
1910
  """Asserts that this DataFrame is equal to another.
@@ -1860,13 +1927,13 @@ class FlowDataEngine:
1860
1927
  other = other.select_columns(self.columns)
1861
1928
 
1862
1929
  if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
1863
- raise Exception('Number of records is not equal')
1930
+ raise Exception("Number of records is not equal")
1864
1931
 
1865
1932
  if self.columns != other.columns:
1866
- raise Exception('Schema is not equal')
1933
+ raise Exception("Schema is not equal")
1867
1934
 
1868
1935
  if strict_schema:
1869
- assert self.data_frame.schema == other.data_frame.schema, 'Data types do not match'
1936
+ assert self.data_frame.schema == other.data_frame.schema, "Data types do not match"
1870
1937
 
1871
1938
  if ordered:
1872
1939
  self_lf = self.data_frame.sort(by=self.columns)
@@ -1876,7 +1943,7 @@ class FlowDataEngine:
1876
1943
  other_lf = other.data_frame
1877
1944
 
1878
1945
  self.lazy, other.lazy = org_laziness
1879
- assert self_lf.equals(other_lf), 'Data is not equal'
1946
+ assert self_lf.equals(other_lf), "Data is not equal"
1880
1947
 
1881
1948
  def initialize_empty_fl(self):
1882
1949
  """Initializes an empty LazyFrame."""
@@ -1891,7 +1958,7 @@ class FlowDataEngine:
1891
1958
  operation_type="calculate_number_of_records",
1892
1959
  flow_id=-1,
1893
1960
  node_id=-1,
1894
- wait_on_completion=True
1961
+ wait_on_completion=True,
1895
1962
  ).result
1896
1963
  return number_of_records
1897
1964
 
@@ -1907,8 +1974,9 @@ class FlowDataEngine:
1907
1974
  """
1908
1975
  return self.get_number_of_records(force_calculate=force_calculate)
1909
1976
 
1910
- def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1911
- calculate_in_worker_process: bool = False) -> int:
1977
+ def get_number_of_records(
1978
+ self, warn: bool = False, force_calculate: bool = False, calculate_in_worker_process: bool = False
1979
+ ) -> int:
1912
1980
  """Gets the total number of records in the DataFrame.
1913
1981
 
1914
1982
  For lazy frames, this may trigger a full data scan, which can be expensive.
@@ -1938,12 +2006,13 @@ class FlowDataEngine:
1938
2006
  except Exception as e:
1939
2007
  logger.error(f"Error: {e}")
1940
2008
  if warn:
1941
- logger.warning('Calculating the number of records this can be expensive on a lazy frame')
2009
+ logger.warning("Calculating the number of records this can be expensive on a lazy frame")
1942
2010
  try:
1943
2011
  self.number_of_records = self.data_frame.select(pl.len()).collect(
1944
- engine="streaming" if self._streamable else "auto")[0, 0]
2012
+ engine="streaming" if self._streamable else "auto"
2013
+ )[0, 0]
1945
2014
  except Exception:
1946
- raise ValueError('Could not get number of records')
2015
+ raise ValueError("Could not get number of records")
1947
2016
  else:
1948
2017
  self.number_of_records = self.data_frame.__len__()
1949
2018
  return self.number_of_records
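For illustration, the counting path on a lazy frame reduces to selecting pl.len() and collecting a single value:

    import polars as pl

    lf = pl.LazyFrame({"x": [1, 2, 3]})
    n = lf.select(pl.len()).collect()[0, 0]  # 3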
@@ -1984,7 +2053,7 @@ class FlowDataEngine:
1984
2053
  return self._external_source
1985
2054
 
1986
2055
  @property
1987
- def cols_idx(self) -> Dict[str, int]:
2056
+ def cols_idx(self) -> dict[str, int]:
1988
2057
  """A dictionary mapping column names to their integer index."""
1989
2058
  if self._col_idx is None:
1990
2059
  self._col_idx = {c: i for i, c in enumerate(self.columns)}
@@ -2006,7 +2075,7 @@ class FlowDataEngine:
2006
2075
  [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
2007
2076
  )
2008
2077
 
2009
- def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
2078
+ def select_columns(self, list_select: list[str] | tuple[str] | str) -> "FlowDataEngine":
2010
2079
  """Selects a subset of columns from the DataFrame.
2011
2080
 
2012
2081
  Args:
@@ -2019,17 +2088,17 @@ class FlowDataEngine:
2019
2088
  list_select = [list_select]
2020
2089
 
2021
2090
  idx_to_keep = [self.cols_idx.get(c) for c in list_select]
2022
- selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
2091
+ selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep, strict=False) if id_to_keep is not None]
2023
2092
  new_schema = [self.schema[i] for i in idx_to_keep if i is not None]
2024
2093
 
2025
2094
  return FlowDataEngine(
2026
2095
  self.data_frame.select(selects),
2027
2096
  number_of_records=self.number_of_records,
2028
2097
  schema=new_schema,
2029
- streamable=self._streamable
2098
+ streamable=self._streamable,
2030
2099
  )
2031
2100
 
2032
- def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
2101
+ def drop_columns(self, columns: list[str]) -> "FlowDataEngine":
2033
2102
  """Drops specified columns from the DataFrame.
2034
2103
 
2035
2104
  Args:
@@ -2043,12 +2112,10 @@ class FlowDataEngine:
2043
2112
  new_schema = [self.schema[i] for i in idx_to_keep]
2044
2113
 
2045
2114
  return FlowDataEngine(
2046
- self.data_frame.select(cols_for_select),
2047
- number_of_records=self.number_of_records,
2048
- schema=new_schema
2115
+ self.data_frame.select(cols_for_select), number_of_records=self.number_of_records, schema=new_schema
2049
2116
  )
2050
2117
 
2051
- def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
2118
+ def reorganize_order(self, column_order: list[str]) -> "FlowDataEngine":
2052
2119
  """Reorganizes columns into a specified order.
2053
2120
 
2054
2121
  Args:
@@ -2061,8 +2128,9 @@ class FlowDataEngine:
2061
2128
  schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
2062
2129
  return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)
2063
2130
 
2064
- def apply_flowfile_formula(self, func: str, col_name: str,
2065
- output_data_type: pl.DataType = None) -> "FlowDataEngine":
2131
+ def apply_flowfile_formula(
2132
+ self, func: str, col_name: str, output_data_type: pl.DataType = None
2133
+ ) -> "FlowDataEngine":
2066
2134
  """Applies a formula to create a new column or transform an existing one.
2067
2135
 
2068
2136
  Args:
@@ -2081,8 +2149,7 @@ class FlowDataEngine:
2081
2149
 
2082
2150
  return FlowDataEngine(df2, number_of_records=self.number_of_records)
2083
2151
 
2084
- def apply_sql_formula(self, func: str, col_name: str,
2085
- output_data_type: pl.DataType = None) -> "FlowDataEngine":
2152
+ def apply_sql_formula(self, func: str, col_name: str, output_data_type: pl.DataType = None) -> "FlowDataEngine":
2086
2153
  """Applies an SQL-style formula using `pl.sql_expr`.
2087
2154
 
2088
2155
  Args:
@@ -2101,8 +2168,9 @@ class FlowDataEngine:
2101
2168
 
2102
2169
  return FlowDataEngine(df, number_of_records=self.number_of_records)
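A tiny standalone example of pl.sql_expr, which the method above uses to evaluate SQL-style expressions (illustrative column name):

    import polars as pl

    lf = pl.LazyFrame({"price": [10.0, 20.0]})
    with_tax = lf.with_columns(pl.sql_expr("price * 1.21").alias("price_incl_vat"))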
2103
2170
 
2104
- def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
2105
- execute_remote: bool = True) -> "FlowDataEngine":
2171
+ def output(
2172
+ self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str, execute_remote: bool = True
2173
+ ) -> "FlowDataEngine":
2106
2174
  """Writes the DataFrame to an output file.
2107
2175
 
2108
2176
  Can execute the write operation locally or in a remote worker process.
@@ -2116,7 +2184,7 @@ class FlowDataEngine:
2116
2184
  Returns:
2117
2185
  The same `FlowDataEngine` instance for chaining.
2118
2186
  """
2119
- logger.info('Starting to write output')
2187
+ logger.info("Starting to write output")
2120
2188
  if execute_remote:
2121
2189
  status = utils.write_output(
2122
2190
  self.data_frame,
@@ -2126,11 +2194,11 @@ class FlowDataEngine:
2126
2194
  sheet_name=output_fs.sheet_name,
2127
2195
  delimiter=output_fs.delimiter,
2128
2196
  flow_id=flow_id,
2129
- node_id=node_id
2197
+ node_id=node_id,
2130
2198
  )
2131
2199
  tracker = ExternalExecutorTracker(status)
2132
2200
  tracker.get_result()
2133
- logger.info('Finished writing output')
2201
+ logger.info("Finished writing output")
2134
2202
  else:
2135
2203
  logger.info("Starting to write results locally")
2136
2204
  utils.local_write_output(
@@ -2172,11 +2240,10 @@ class FlowDataEngine:
2172
2240
  if isinstance(other, FlowDataEngine):
2173
2241
  other = [other]
2174
2242
 
2175
- dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
2176
- return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
2243
+ dfs: list[pl.LazyFrame] | list[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
2244
+ return FlowDataEngine(pl.concat(dfs, how="diagonal_relaxed"))
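Standalone sketch of the diagonal-relaxed concat used above: frames with different columns are stacked, missing columns are filled with nulls, and compatible dtypes are relaxed to a common supertype (example data):

    import polars as pl

    a = pl.DataFrame({"id": [1, 2], "x": [1.0, 2.0]})
    b = pl.DataFrame({"id": [3], "y": ["new"]})
    combined = pl.concat([a, b], how="diagonal_relaxed")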
2177
2245
 
2178
- def do_select(self, select_inputs: transform_schemas.SelectInputs,
2179
- keep_missing: bool = True) -> "FlowDataEngine":
2246
+ def do_select(self, select_inputs: transform_schemas.SelectInputs, keep_missing: bool = True) -> "FlowDataEngine":
2180
2247
  """Performs a complex column selection, renaming, and reordering operation.
2181
2248
 
2182
2249
  Args:
@@ -2192,7 +2259,8 @@ class FlowDataEngine:
2192
2259
 
2193
2260
  if not keep_missing:
2194
2261
  drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
2195
- set(r.old_name for r in renames if not r.keep))
2262
+ set(r.old_name for r in renames if not r.keep)
2263
+ )
2196
2264
  keep_cols = []
2197
2265
  else:
2198
2266
  keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
@@ -2212,12 +2280,14 @@ class FlowDataEngine:
2212
2280
 
2213
2281
  rename_dict = {r.old_name: r.new_name for r in available_renames}
2214
2282
  fl = self.select_columns(
2215
- list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols)
2283
+ list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
2284
+ )
2216
2285
  fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
2217
2286
  ndf = fl.data_frame.rename(rename_dict)
2218
2287
  renames.sort(key=lambda r: 0 if r.position is None else r.position)
2219
- sorted_cols = utils.match_order(ndf.collect_schema().names(),
2220
- [r.new_name for r in renames] + self.data_frame.collect_schema().names())
2288
+ sorted_cols = utils.match_order(
2289
+ ndf.collect_schema().names(), [r.new_name for r in renames] + self.data_frame.collect_schema().names()
2290
+ )
2221
2291
  output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
2222
2292
  return output_file.reorganize_order(sorted_cols)
2223
2293
 
@@ -2225,10 +2295,9 @@ class FlowDataEngine:
2225
2295
  """Sets whether DataFrame operations should be streamable."""
2226
2296
  self._streamable = streamable
2227
2297
 
2228
- def _calculate_schema(self) -> List[Dict]:
2298
+ def _calculate_schema(self) -> list[dict]:
2229
2299
  """Calculates schema statistics."""
2230
2300
  if self.external_source is not None:
2231
-
2232
2301
  self.collect_external()
2233
2302
  v = utils.calculate_schema(self.data_frame)
2234
2303
  return v
@@ -2247,8 +2316,9 @@ class FlowDataEngine:
2247
2316
  """Creates a FlowDataEngine from a path in a worker process."""
2248
2317
  received_table.set_absolute_filepath()
2249
2318
 
2250
- external_fetcher = ExternalCreateFetcher(received_table=received_table,
2251
- file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
2319
+ external_fetcher = ExternalCreateFetcher(
2320
+ received_table=received_table, file_type=received_table.file_type, flow_id=flow_id, node_id=node_id
2321
+ )
2252
2322
  return cls(external_fetcher.get_result())
2253
2323
 
2254
2324
 
@@ -2271,10 +2341,10 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
2271
2341
  if len(flowfile_tables) == 0:
2272
2342
  kwargs = {}
2273
2343
  elif len(flowfile_tables) == 1:
2274
- kwargs = {'input_df': flowfile_tables[0].data_frame}
2344
+ kwargs = {"input_df": flowfile_tables[0].data_frame}
2275
2345
  else:
2276
- kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
2346
+ kwargs = {f"input_df_{i+1}": flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
2277
2347
  df = polars_executable(**kwargs)
2278
2348
  if isinstance(df, pl.DataFrame):
2279
2349
  logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
2280
- return FlowDataEngine(df)
2350
+ return FlowDataEngine(df)
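Based on the kwargs built above, a single input is exposed to the user code as input_df and multiple inputs as input_df_1, input_df_2, and so on; polars_executable itself is defined elsewhere in the package. A hedged usage sketch, assuming the code string is a Polars expression over those names and that FlowDataEngine and execute_polars_code are imported from this module:

    import polars as pl

    engine = FlowDataEngine(pl.DataFrame({"a": [1, 2, 3]}))
    result = execute_polars_code(
        engine,
        code="input_df.with_columns((pl.col('a') * 2).alias('a_doubled'))",
    )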