Flowfile 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. build_backends/main.py +25 -22
  2. build_backends/main_prd.py +10 -19
  3. flowfile/__init__.py +178 -74
  4. flowfile/__main__.py +10 -7
  5. flowfile/api.py +51 -57
  6. flowfile/web/__init__.py +14 -9
  7. flowfile/web/static/assets/AdminView-49392a9a.js +713 -0
  8. flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
  9. flowfile/web/static/assets/CloudConnectionView-36bcd6df.css +72 -0
  10. flowfile/web/static/assets/{CloudConnectionManager-0dfba9f2.js → CloudConnectionView-f13f202b.js} +11 -11
  11. flowfile/web/static/assets/{CloudStorageReader-d5b1b6c9.js → CloudStorageReader-0023d4a5.js} +10 -8
  12. flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
  13. flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
  14. flowfile/web/static/assets/{CloudStorageWriter-00d87aad.js → CloudStorageWriter-8e781e11.js} +10 -8
  15. flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
  16. flowfile/web/static/assets/{ColumnSelector-4685e75d.js → ColumnSelector-8ad68ea9.js} +3 -5
  17. flowfile/web/static/assets/{ContextMenu-c13f91d0.css → ContextMenu-26d4dd27.css} +6 -6
  18. flowfile/web/static/assets/{ContextMenu-23e909da.js → ContextMenu-31ee57f0.js} +3 -3
  19. flowfile/web/static/assets/{ContextMenu-70ae0c79.js → ContextMenu-69a74055.js} +3 -3
  20. flowfile/web/static/assets/{ContextMenu-f149cf7c.js → ContextMenu-8e2051c6.js} +3 -3
  21. flowfile/web/static/assets/{ContextMenu-4c74eef1.css → ContextMenu-8ec1729e.css} +6 -6
  22. flowfile/web/static/assets/{ContextMenu-63cfa99b.css → ContextMenu-9b310c60.css} +6 -6
  23. flowfile/web/static/assets/{CrossJoin-702a3edd.js → CrossJoin-03df6938.js} +12 -10
  24. flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
  25. flowfile/web/static/assets/CustomNode-59e99a86.css +32 -0
  26. flowfile/web/static/assets/{CustomNode-b1519993.js → CustomNode-8479239b.js} +36 -24
  27. flowfile/web/static/assets/{DatabaseConnectionSettings-6f3e4ea5.js → DatabaseConnectionSettings-869e3efd.js} +5 -4
  28. flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-e91df89a.css} +13 -13
  29. flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-36898a00.css} +24 -24
  30. flowfile/web/static/assets/{DatabaseReader-d38c7295.js → DatabaseReader-c58b9552.js} +25 -15
  31. flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
  32. flowfile/web/static/assets/{DatabaseManager-cf5ef661.js → DatabaseView-d26a9140.js} +11 -11
  33. flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-217a99f1.css} +19 -19
  34. flowfile/web/static/assets/{DatabaseWriter-b04ef46a.js → DatabaseWriter-4d05ddc7.js} +17 -10
  35. flowfile/web/static/assets/{designer-8da3ba3a.css → DesignerView-a6d0ee84.css} +614 -546
  36. flowfile/web/static/assets/{designer-9633482a.js → DesignerView-e6f5c0e8.js} +1107 -3170
  37. flowfile/web/static/assets/{documentation-ca400224.js → DocumentationView-2e78ef1b.js} +5 -5
  38. flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-fd46c656.css} +7 -7
  39. flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
  40. flowfile/web/static/assets/{ExploreData-5fa10ed8.js → ExploreData-7b54caca.js} +18 -9
  41. flowfile/web/static/assets/{ExternalSource-d39af878.js → ExternalSource-3fa399b2.js} +9 -7
  42. flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-47ab05a3.css} +17 -17
  43. flowfile/web/static/assets/Filter-7494ea97.css +48 -0
  44. flowfile/web/static/assets/Filter-8cbbdbf3.js +287 -0
  45. flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
  46. flowfile/web/static/assets/{Formula-6b04fb1d.js → Formula-aac42b1e.js} +13 -11
  47. flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
  48. flowfile/web/static/assets/{FuzzyMatch-999521f4.js → FuzzyMatch-cd9bbfca.js} +12 -10
  49. flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-c24dec17.css} +5 -5
  50. flowfile/web/static/assets/{GraphSolver-17dd2198.js → GraphSolver-c7e6780e.js} +13 -11
  51. flowfile/web/static/assets/{GroupBy-6b039e18.js → GroupBy-93c5d22b.js} +9 -7
  52. flowfile/web/static/assets/{GroupBy-b9505323.css → GroupBy-be7ac0bf.css} +10 -10
  53. flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
  54. flowfile/web/static/assets/{Join-24d0f113.js → Join-a19b2de2.js} +13 -11
  55. flowfile/web/static/assets/LoginView-0df4ed0a.js +134 -0
  56. flowfile/web/static/assets/LoginView-d325d632.css +172 -0
  57. flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
  58. flowfile/web/static/assets/{ManualInput-34639209.js → ManualInput-8d3374b2.js} +170 -116
  59. flowfile/web/static/assets/{MultiSelect-0e8724a3.js → MultiSelect-ad1b6243.js} +2 -2
  60. flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js → MultiSelect.vue_vue_type_script_setup_true_lang-e278950d.js} +1 -1
  61. flowfile/web/static/assets/NodeDesigner-40b647c9.js +2610 -0
  62. flowfile/web/static/assets/NodeDesigner-5f53be3f.css +1429 -0
  63. flowfile/web/static/assets/{NumericInput-3d63a470.js → NumericInput-7100234c.js} +2 -2
  64. flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js → NumericInput.vue_vue_type_script_setup_true_lang-5130219f.js} +5 -2
  65. flowfile/web/static/assets/{Output-283fe388.css → Output-35e97000.css} +6 -6
  66. flowfile/web/static/assets/{Output-edea9802.js → Output-f5efd2aa.js} +12 -9
  67. flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
  68. flowfile/web/static/assets/{Pivot-61d19301.js → Pivot-d981d23c.js} +11 -9
  69. flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
  70. flowfile/web/static/assets/{PivotValidation-f97fec5b.js → PivotValidation-39386e95.js} +3 -3
  71. flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
  72. flowfile/web/static/assets/{PivotValidation-de9f43fe.js → PivotValidation-63de1f73.js} +3 -3
  73. flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
  74. flowfile/web/static/assets/{PolarsCode-bc3c9984.js → PolarsCode-f9d69217.js} +18 -9
  75. flowfile/web/static/assets/PopOver-b22f049e.js +939 -0
  76. flowfile/web/static/assets/PopOver-d96599db.css +33 -0
  77. flowfile/web/static/assets/{Read-e808b239.css → Read-36e7bd51.css} +12 -12
  78. flowfile/web/static/assets/{Read-64a3f259.js → Read-aec2e377.js} +14 -11
  79. flowfile/web/static/assets/{RecordCount-3d5039be.js → RecordCount-78ed6845.js} +6 -4
  80. flowfile/web/static/assets/{RecordId-597510e0.js → RecordId-2156e890.js} +8 -6
  81. flowfile/web/static/assets/{SQLQueryComponent-36cef432.css → SQLQueryComponent-1c2f26b4.css} +5 -5
  82. flowfile/web/static/assets/{SQLQueryComponent-df51adbe.js → SQLQueryComponent-48c72f5b.js} +3 -3
  83. flowfile/web/static/assets/{Sample-4be0a507.js → Sample-1352ca74.js} +6 -4
  84. flowfile/web/static/assets/SecretSelector-22b5ff89.js +113 -0
  85. flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
  86. flowfile/web/static/assets/{SecretManager-4839be57.js → SecretsView-17df66ee.js} +35 -36
  87. flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
  88. flowfile/web/static/assets/{Select-9b72f201.js → Select-0aee4c54.js} +9 -7
  89. flowfile/web/static/assets/{SettingsSection-f0f75a42.js → SettingsSection-0784e157.js} +3 -3
  90. flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
  91. flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
  92. flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
  93. flowfile/web/static/assets/{SettingsSection-e1e9c953.js → SettingsSection-cd341bb6.js} +3 -3
  94. flowfile/web/static/assets/{SettingsSection-7ded385d.js → SettingsSection-f2002a6d.js} +3 -3
  95. flowfile/web/static/assets/{SingleSelect-6c777aac.js → SingleSelect-460cc0ea.js} +2 -2
  96. flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js → SingleSelect.vue_vue_type_script_setup_true_lang-30741bb2.js} +1 -1
  97. flowfile/web/static/assets/{SliderInput-7cb93e62.js → SliderInput-5d926864.js} +7 -4
  98. flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
  99. flowfile/web/static/assets/{Sort-6cbde21a.js → Sort-3cdc971b.js} +9 -7
  100. flowfile/web/static/assets/{Unique-f9fb0809.css → Sort-8a871341.css} +10 -10
  101. flowfile/web/static/assets/{TextInput-d9a40c11.js → TextInput-a2d0bfbd.js} +2 -2
  102. flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-5896c375.js → TextInput.vue_vue_type_script_setup_true_lang-abad1ca2.js} +5 -2
  103. flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
  104. flowfile/web/static/assets/{TextToRows-c4fcbf4d.js → TextToRows-918945f7.js} +11 -10
  105. flowfile/web/static/assets/{ToggleSwitch-4ef91d19.js → ToggleSwitch-f0ef5196.js} +2 -2
  106. flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-5605c793.js} +1 -1
  107. flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-54d2f518.css} +6 -6
  108. flowfile/web/static/assets/{UnavailableFields-a03f512c.js → UnavailableFields-bdad6144.js} +4 -4
  109. flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
  110. flowfile/web/static/assets/{Union-bfe9b996.js → Union-e8ab8c86.js} +8 -6
  111. flowfile/web/static/assets/{Unique-5d023a27.js → Unique-8cd4f976.js} +13 -10
  112. flowfile/web/static/assets/{Sort-3643d625.css → Unique-9fb2f567.css} +10 -10
  113. flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-710a2948.css} +7 -7
  114. flowfile/web/static/assets/{Unpivot-91cc5354.js → Unpivot-8da14095.js} +10 -8
  115. flowfile/web/static/assets/{UnpivotValidation-7ee2de44.js → UnpivotValidation-6f7d89ff.js} +3 -3
  116. flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
  117. flowfile/web/static/assets/{VueGraphicWalker-e51b9924.js → VueGraphicWalker-3fb312e1.js} +4 -4
  118. flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
  119. flowfile/web/static/assets/{api-cf1221f0.js → api-24483f0d.js} +1 -1
  120. flowfile/web/static/assets/{api-c1bad5ca.js → api-8b81fa73.js} +1 -1
  121. flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-3d8dc5fa.css} +40 -40
  122. flowfile/web/static/assets/{dropDown-614b998d.js → dropDown-ac0fda9d.js} +3 -3
  123. flowfile/web/static/assets/{fullEditor-f7971590.js → fullEditor-5497a84a.js} +11 -10
  124. flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-a0be62b3.css} +74 -62
  125. flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
  126. flowfile/web/static/assets/{genericNodeSettings-4fe5f36b.js → genericNodeSettings-99014e1d.js} +5 -5
  127. flowfile/web/static/assets/index-07dda503.js +38 -0
  128. flowfile/web/static/assets/index-3ba44389.js +2696 -0
  129. flowfile/web/static/assets/{index-50508d4d.css → index-e6289dd0.css} +1945 -569
  130. flowfile/web/static/assets/{index-5429bbf8.js → index-fb6493ae.js} +41626 -40867
  131. flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
  132. flowfile/web/static/assets/nodeInput-0eb13f1a.js +2 -0
  133. flowfile/web/static/assets/{outputCsv-076b85ab.js → outputCsv-8f8ba42d.js} +3 -3
  134. flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
  135. flowfile/web/static/assets/{outputExcel-0fd17dbe.js → outputExcel-393f4fef.js} +3 -3
  136. flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
  137. flowfile/web/static/assets/{outputParquet-b61e0847.js → outputParquet-07c81f65.js} +4 -4
  138. flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
  139. flowfile/web/static/assets/{readCsv-a8bb8b61.js → readCsv-07f6d9ad.js} +3 -3
  140. flowfile/web/static/assets/{readCsv-c767cb37.css → readCsv-3bfac4c3.css} +15 -15
  141. flowfile/web/static/assets/{readExcel-806d2826.css → readExcel-3db6b763.css} +13 -13
  142. flowfile/web/static/assets/{readExcel-67b4aee0.js → readExcel-ed69bc8f.js} +5 -5
  143. flowfile/web/static/assets/{readParquet-48c81530.css → readParquet-c5244ad5.css} +4 -4
  144. flowfile/web/static/assets/{readParquet-92ce1dbc.js → readParquet-e3ed4528.js} +3 -3
  145. flowfile/web/static/assets/secrets.api-002e7d7e.js +65 -0
  146. flowfile/web/static/assets/{selectDynamic-92e25ee3.js → selectDynamic-80b92899.js} +5 -5
  147. flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
  148. flowfile/web/static/assets/{vue-codemirror.esm-41b0e0d7.js → vue-codemirror.esm-0965f39f.js} +31 -640
  149. flowfile/web/static/assets/{vue-content-loader.es-2c8e608f.js → vue-content-loader.es-c506ad97.js} +1 -1
  150. flowfile/web/static/index.html +2 -2
  151. {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/METADATA +2 -3
  152. flowfile-0.5.3.dist-info/RECORD +402 -0
  153. flowfile_core/__init__.py +13 -6
  154. flowfile_core/auth/jwt.py +51 -16
  155. flowfile_core/auth/models.py +32 -7
  156. flowfile_core/auth/password.py +89 -0
  157. flowfile_core/auth/secrets.py +8 -6
  158. flowfile_core/configs/__init__.py +9 -7
  159. flowfile_core/configs/flow_logger.py +15 -14
  160. flowfile_core/configs/node_store/__init__.py +72 -4
  161. flowfile_core/configs/node_store/nodes.py +155 -172
  162. flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
  163. flowfile_core/configs/settings.py +28 -15
  164. flowfile_core/database/connection.py +7 -6
  165. flowfile_core/database/init_db.py +96 -2
  166. flowfile_core/database/models.py +3 -1
  167. flowfile_core/fileExplorer/__init__.py +17 -0
  168. flowfile_core/fileExplorer/funcs.py +123 -57
  169. flowfile_core/fileExplorer/utils.py +10 -11
  170. flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
  171. flowfile_core/flowfile/analytics/analytics_processor.py +26 -24
  172. flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
  173. flowfile_core/flowfile/analytics/utils.py +1 -1
  174. flowfile_core/flowfile/code_generator/code_generator.py +358 -244
  175. flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
  176. flowfile_core/flowfile/connection_manager/models.py +1 -1
  177. flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
  178. flowfile_core/flowfile/database_connection_manager/models.py +1 -1
  179. flowfile_core/flowfile/extensions.py +17 -12
  180. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
  181. flowfile_core/flowfile/flow_data_engine/create/funcs.py +115 -83
  182. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +481 -423
  183. flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
  184. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
  185. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
  186. flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
  187. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
  188. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +31 -20
  189. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
  190. flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
  191. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +14 -15
  192. flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
  193. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
  194. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
  195. flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
  196. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
  197. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
  198. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +190 -127
  199. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
  200. flowfile_core/flowfile/flow_data_engine/utils.py +99 -67
  201. flowfile_core/flowfile/flow_graph.py +918 -571
  202. flowfile_core/flowfile/flow_graph_utils.py +31 -49
  203. flowfile_core/flowfile/flow_node/flow_node.py +330 -233
  204. flowfile_core/flowfile/flow_node/models.py +53 -41
  205. flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
  206. flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
  207. flowfile_core/flowfile/handler.py +80 -30
  208. flowfile_core/flowfile/manage/compatibility_enhancements.py +209 -126
  209. flowfile_core/flowfile/manage/io_flowfile.py +54 -57
  210. flowfile_core/flowfile/node_designer/__init__.py +15 -13
  211. flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
  212. flowfile_core/flowfile/node_designer/custom_node.py +162 -36
  213. flowfile_core/flowfile/node_designer/ui_components.py +135 -34
  214. flowfile_core/flowfile/schema_callbacks.py +71 -51
  215. flowfile_core/flowfile/setting_generator/__init__.py +0 -1
  216. flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
  217. flowfile_core/flowfile/setting_generator/settings.py +64 -53
  218. flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
  219. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
  220. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
  221. flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
  222. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
  223. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
  224. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
  225. flowfile_core/flowfile/util/calculate_layout.py +9 -13
  226. flowfile_core/flowfile/util/execution_orderer.py +25 -17
  227. flowfile_core/flowfile/util/node_skipper.py +4 -4
  228. flowfile_core/flowfile/utils.py +19 -21
  229. flowfile_core/main.py +26 -19
  230. flowfile_core/routes/auth.py +284 -11
  231. flowfile_core/routes/cloud_connections.py +25 -25
  232. flowfile_core/routes/logs.py +21 -29
  233. flowfile_core/routes/public.py +3 -3
  234. flowfile_core/routes/routes.py +70 -34
  235. flowfile_core/routes/secrets.py +25 -27
  236. flowfile_core/routes/user_defined_components.py +483 -4
  237. flowfile_core/run_lock.py +0 -1
  238. flowfile_core/schemas/__init__.py +4 -6
  239. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
  240. flowfile_core/schemas/cloud_storage_schemas.py +59 -53
  241. flowfile_core/schemas/input_schema.py +231 -144
  242. flowfile_core/schemas/output_model.py +49 -34
  243. flowfile_core/schemas/schemas.py +116 -89
  244. flowfile_core/schemas/transform_schema.py +518 -263
  245. flowfile_core/schemas/yaml_types.py +21 -7
  246. flowfile_core/secret_manager/secret_manager.py +17 -13
  247. flowfile_core/types.py +29 -9
  248. flowfile_core/utils/arrow_reader.py +7 -6
  249. flowfile_core/utils/excel_file_manager.py +3 -3
  250. flowfile_core/utils/fileManager.py +7 -7
  251. flowfile_core/utils/fl_executor.py +8 -10
  252. flowfile_core/utils/utils.py +4 -4
  253. flowfile_core/utils/validate_setup.py +5 -4
  254. flowfile_frame/__init__.py +106 -51
  255. flowfile_frame/adapters.py +2 -9
  256. flowfile_frame/adding_expr.py +73 -32
  257. flowfile_frame/cloud_storage/frame_helpers.py +27 -23
  258. flowfile_frame/cloud_storage/secret_manager.py +12 -26
  259. flowfile_frame/config.py +2 -5
  260. flowfile_frame/expr.py +311 -218
  261. flowfile_frame/expr.pyi +160 -159
  262. flowfile_frame/expr_name.py +23 -23
  263. flowfile_frame/flow_frame.py +571 -476
  264. flowfile_frame/flow_frame.pyi +123 -104
  265. flowfile_frame/flow_frame_methods.py +227 -246
  266. flowfile_frame/group_frame.py +50 -20
  267. flowfile_frame/join.py +2 -2
  268. flowfile_frame/lazy.py +129 -87
  269. flowfile_frame/lazy_methods.py +83 -30
  270. flowfile_frame/list_name_space.py +55 -50
  271. flowfile_frame/selectors.py +148 -68
  272. flowfile_frame/series.py +9 -7
  273. flowfile_frame/utils.py +19 -21
  274. flowfile_worker/__init__.py +12 -7
  275. flowfile_worker/configs.py +11 -19
  276. flowfile_worker/create/__init__.py +14 -9
  277. flowfile_worker/create/funcs.py +114 -77
  278. flowfile_worker/create/models.py +46 -43
  279. flowfile_worker/create/pl_types.py +14 -15
  280. flowfile_worker/create/read_excel_tables.py +34 -41
  281. flowfile_worker/create/utils.py +22 -19
  282. flowfile_worker/external_sources/s3_source/main.py +18 -51
  283. flowfile_worker/external_sources/s3_source/models.py +34 -27
  284. flowfile_worker/external_sources/sql_source/main.py +8 -5
  285. flowfile_worker/external_sources/sql_source/models.py +13 -9
  286. flowfile_worker/flow_logger.py +10 -8
  287. flowfile_worker/funcs.py +214 -155
  288. flowfile_worker/main.py +11 -17
  289. flowfile_worker/models.py +35 -28
  290. flowfile_worker/process_manager.py +2 -3
  291. flowfile_worker/routes.py +121 -90
  292. flowfile_worker/secrets.py +9 -6
  293. flowfile_worker/spawner.py +80 -49
  294. flowfile_worker/utils.py +3 -2
  295. shared/__init__.py +2 -7
  296. shared/storage_config.py +25 -13
  297. test_utils/postgres/commands.py +3 -2
  298. test_utils/postgres/fixtures.py +9 -9
  299. test_utils/s3/commands.py +1 -1
  300. test_utils/s3/data_generator.py +3 -4
  301. test_utils/s3/demo_data_generator.py +4 -7
  302. test_utils/s3/fixtures.py +7 -5
  303. tools/migrate/__init__.py +1 -1
  304. tools/migrate/__main__.py +16 -29
  305. tools/migrate/legacy_schemas.py +251 -190
  306. tools/migrate/migrate.py +193 -181
  307. tools/migrate/tests/conftest.py +1 -3
  308. tools/migrate/tests/test_migrate.py +36 -41
  309. tools/migrate/tests/test_migration_e2e.py +28 -29
  310. tools/migrate/tests/test_node_migrations.py +50 -20
  311. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
  312. flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
  313. flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
  314. flowfile/web/static/assets/Filter-9b6d08db.js +0 -164
  315. flowfile/web/static/assets/Filter-f62091b3.css +0 -20
  316. flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
  317. flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
  318. flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
  319. flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
  320. flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
  321. flowfile/web/static/assets/nodeInput-5d0d6b79.js +0 -41
  322. flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
  323. flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
  324. flowfile/web/static/assets/secretApi-68435402.js +0 -46
  325. flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
  326. flowfile-0.5.1.dist-info/RECORD +0 -388
  327. {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/WHEEL +0 -0
  328. {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/entry_points.txt +0 -0
  329. {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,52 +1,50 @@
  # Standard library imports
+ from __future__ import annotations
+
  import logging
  import os
+ from collections.abc import Callable, Generator, Iterable
  from copy import deepcopy
  from dataclasses import dataclass
  from math import ceil
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
+ from typing import Any, Literal, TypeVar, Union

- from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
+ import polars as pl

  # Third-party imports
  from loky import Future
- import polars as pl
+ from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
  from polars.exceptions import PanicException
- from polars_grouper import graph_solver
  from polars_expr_transformer import simple_function_to_expr as to_expr
+ from polars_grouper import graph_solver
  from pyarrow import Table as PaTable
  from pyarrow.parquet import ParquetFile

  # Local imports - Core
  from flowfile_core.configs import logger
- from flowfile_core.utils.utils import ensure_similarity_dicts
  from flowfile_core.configs.flow_logger import NodeLogger
- from flowfile_core.schemas import (
- cloud_storage_schemas,
- input_schema,
- transform_schema as transform_schemas
- )
- from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location

  # Local imports - Flow File Components
  from flowfile_core.flowfile.flow_data_engine import utils
- from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
- ensure_path_has_wildcard_pattern,
- get_first_file_from_s3_dir)
+ from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
+ CloudStorageReader,
+ ensure_path_has_wildcard_pattern,
+ get_first_file_from_s3_dir,
+ )
  from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
  FlowfileColumn,
  assert_if_flowfile_schema,
- convert_stats_to_column_info
+ convert_stats_to_column_info,
  )
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
  from flowfile_core.flowfile.flow_data_engine.join import (
- verify_join_select_integrity,
- verify_join_map_integrity,
- rename_df_table_for_join,
+ get_col_name_to_delete,
  get_undo_rename_mapping_join,
- get_col_name_to_delete
+ rename_df_table_for_join,
+ verify_join_map_integrity,
+ verify_join_select_integrity,
  )
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
  from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -55,19 +53,21 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
  ExternalDfFetcher,
  ExternalExecutorTracker,
  ExternalFuzzyMatchFetcher,
- fetch_unique_values
- )
- from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
- get_join_count,
- write_threaded
+ fetch_unique_values,
  )
-
+ from flowfile_core.flowfile.flow_data_engine.threaded_processes import write_threaded
  from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
+ from flowfile_core.schemas import cloud_storage_schemas, input_schema
+ from flowfile_core.schemas import transform_schema as transform_schemas
+ from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
+ from flowfile_core.utils.utils import ensure_similarity_dicts

- T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
+ T = TypeVar("T", pl.DataFrame, pl.LazyFrame)


- def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager) -> Tuple[T, T, Dict[str, str]]:
+ def _handle_duplication_join_keys(
+ left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager
+ ) -> tuple[T, T, dict[str, str]]:
  """Temporarily renames join keys to avoid conflicts during a join.

  This helper function checks the join type and renames the join key columns
@@ -88,20 +88,26 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transfo
  """

  def _construct_temp_name(column_name: str) -> str:
- return "__FL_TEMP__"+column_name
+ return "__FL_TEMP__" + column_name

- if join_manager.how == 'right':
- left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
- for jk in join_manager.left_manager.get_join_key_selects())
+ if join_manager.how == "right":
+ left_df = left_df.with_columns(
+ pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+ for jk in join_manager.left_manager.get_join_key_selects()
+ )
  reverse_actions = {
  _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
- for jk in join_manager.left_manager.get_join_key_selects()}
- elif join_manager.how in ('left', 'inner'):
- right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
- for jk in join_manager.right_manager.get_join_key_selects())
+ for jk in join_manager.left_manager.get_join_key_selects()
+ }
+ elif join_manager.how in ("left", "inner"):
+ right_df = right_df.with_columns(
+ pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+ for jk in join_manager.right_manager.get_join_key_selects()
+ )
  reverse_actions = {
  _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
- for jk in join_manager.right_manager.get_join_key_selects()}
+ for jk in join_manager.right_manager.get_join_key_selects()
+ }
  else:
  reverse_actions = {}
  return left_df, right_df, reverse_actions
@@ -118,12 +124,12 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
  Args:
  join_input: The JoinInput settings object to modify.
  """
- if join_input.how in ('semi', 'anti'):
+ if join_input.how in ("semi", "anti"):
  for jk in join_input.right_select.renames:
  jk.keep = False


- def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+ def get_select_columns(full_select_input: list[transform_schemas.SelectInput]) -> list[str]:
  """Extracts a list of column names to be selected from a SelectInput list.

  This function filters a list of `SelectInput` objects to return the names
@@ -156,15 +162,16 @@ class FlowDataEngine:
  errors: A list of errors encountered during operations.
  _schema: A cached list of `FlowfileColumn` objects representing the schema.
  """
+
  # Core attributes
- _data_frame: Union[pl.DataFrame, pl.LazyFrame]
- columns: List[Any]
+ _data_frame: pl.DataFrame | pl.LazyFrame
+ columns: list[Any]

  # Metadata attributes
  name: str = None
  number_of_records: int = None
- errors: List = None
- _schema: Optional[List['FlowfileColumn']] = None
+ errors: list = None
+ _schema: list["FlowfileColumn"] | None = None

  # Configuration attributes
  _optimize_memory: bool = False
@@ -173,16 +180,16 @@ class FlowDataEngine:
  _calculate_schema_stats: bool = False

  # Cache and optimization attributes
- __col_name_idx_map: Dict = None
- __data_map: Dict = None
- __optimized_columns: List = None
+ __col_name_idx_map: dict = None
+ __data_map: dict = None
+ __optimized_columns: list = None
  __sample__: str = None
  __number_of_fields: int = None
- _col_idx: Dict[str, int] = None
+ _col_idx: dict[str, int] = None

  # Source tracking
- _org_path: Optional[str] = None
- _external_source: Optional[ExternalDataSource] = None
+ _org_path: str | None = None
+ _external_source: ExternalDataSource | None = None

  # State tracking
  sorted_by: int = None
@@ -195,17 +202,21 @@ class FlowDataEngine:
  _number_of_records_callback: Callable = None
  _data_callback: Callable = None

- def __init__(self,
- raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
- path_ref: str = None,
- name: str = None,
- optimize_memory: bool = True,
- schema: List['FlowfileColumn'] | List[str] | pl.Schema = None,
- number_of_records: int = None,
- calculate_schema_stats: bool = False,
- streamable: bool = True,
- number_of_records_callback: Callable = None,
- data_callback: Callable = None):
+ def __init__(
+ self,
+ raw_data: Union[
+ list[dict], list[Any], dict[str, Any], "ParquetFile", pl.DataFrame, pl.LazyFrame, input_schema.RawData
+ ] = None,
+ path_ref: str = None,
+ name: str = None,
+ optimize_memory: bool = True,
+ schema: list["FlowfileColumn"] | list[str] | pl.Schema = None,
+ number_of_records: int = None,
+ calculate_schema_stats: bool = False,
+ streamable: bool = True,
+ number_of_records_callback: Callable = None,
+ data_callback: Callable = None,
+ ):
  """Initializes the FlowDataEngine from various data sources.

  Args:
@@ -265,12 +276,12 @@ class FlowDataEngine:
  elif isinstance(raw_data, (list, dict)):
  self._handle_python_data(raw_data)

- def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
+ def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: int | None):
  """(Internal) Initializes the engine from an eager Polars DataFrame."""
  self.data_frame = df
  self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

- def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
+ def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: int | None, optimize_memory: bool):
  """(Internal) Initializes the engine from a Polars LazyFrame."""
  self.data_frame = lf
  self._lazy = True
@@ -281,14 +292,14 @@ class FlowDataEngine:
  else:
  self.number_of_records = lf.select(pl.len()).collect()[0, 0]

- def _handle_python_data(self, data: Union[List, Dict]):
+ def _handle_python_data(self, data: list | dict):
  """(Internal) Dispatches Python collections to the correct handler."""
  if isinstance(data, dict):
  self._handle_dict_input(data)
  else:
  self._handle_list_input(data)

- def _handle_dict_input(self, data: Dict):
+ def _handle_dict_input(self, data: dict):
  """(Internal) Initializes the engine from a Python dictionary."""
  if len(data) == 0:
  self.initialize_empty_fl()
@@ -312,8 +323,12 @@ class FlowDataEngine:
  raw_data: An instance of `RawData` containing the data and schema.
  """
  flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
- polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
- for flowfile_column in flowfile_schema])
+ polars_schema = pl.Schema(
+ [
+ (flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+ for flowfile_column in flowfile_schema
+ ]
+ )
  try:
  df = pl.DataFrame(raw_data.data, polars_schema, strict=False)
  except TypeError as e:
@@ -323,7 +338,7 @@ class FlowDataEngine:
  self.data_frame = df.lazy()
  self.lazy = True

- def _handle_list_input(self, data: List):
+ def _handle_list_input(self, data: list):
  """(Internal) Initializes the engine from a list of records."""
  number_of_records = len(data)
  if number_of_records > 0:
@@ -336,19 +351,19 @@ class FlowDataEngine:
  self.number_of_records = 0

  @staticmethod
- def _process_list_data(data: List) -> List[Dict]:
+ def _process_list_data(data: list) -> list[dict]:
  """(Internal) Normalizes list data into a list of dictionaries.

  Ensures that a list of objects or non-dict items is converted into a
  uniform list of dictionaries suitable for Polars DataFrame creation.
  """
- if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
+ if not (isinstance(data[0], dict) or hasattr(data[0], "__dict__")):
  try:
  return pl.DataFrame(data).to_dicts()
  except TypeError:
- raise Exception('Value must be able to be converted to dictionary')
+ raise Exception("Value must be able to be converted to dictionary")
  except Exception as e:
- raise Exception(f'Value must be able to be converted to dictionary: {e}')
+ raise Exception(f"Value must be able to be converted to dictionary: {e}")

  if not isinstance(data[0], dict):
  data = [row.__dict__ for row in data]
@@ -375,49 +390,37 @@ class FlowDataEngine:

  logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")

- if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+ if write_settings.write_mode == "append" and write_settings.file_format != "delta":
  raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
  storage_options = CloudStorageReader.get_storage_options(connection)
  credential_provider = CloudStorageReader.get_credential_provider(connection)
  # Dispatch to the correct writer based on file format
  if write_settings.file_format == "parquet":
  self._write_parquet_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
+ write_settings.resource_path, storage_options, credential_provider, write_settings
  )
  elif write_settings.file_format == "delta":
  self._write_delta_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
+ write_settings.resource_path, storage_options, credential_provider, write_settings
  )
  elif write_settings.file_format == "csv":
- self._write_csv_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
- )
+ self._write_csv_to_cloud(write_settings.resource_path, storage_options, credential_provider, write_settings)
  elif write_settings.file_format == "json":
  self._write_json_to_cloud(
- write_settings.resource_path,
- storage_options,
- credential_provider,
- write_settings
+ write_settings.resource_path, storage_options, credential_provider, write_settings
  )
  else:
  raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")

  logger.info(f"Successfully wrote data to {write_settings.resource_path}")

- def _write_parquet_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_parquet_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a Parquet file in cloud storage.

  Uses `sink_parquet` for efficient streaming writes. Falls back to a
@@ -437,18 +440,20 @@ class FlowDataEngine:
  except Exception as e:
  logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
  pl_df = self.collect()
- sink_kwargs['file'] = sink_kwargs.pop("path")
+ sink_kwargs["file"] = sink_kwargs.pop("path")
  pl_df.write_parquet(**sink_kwargs)

  except Exception as e:
  logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
  raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")

- def _write_delta_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_delta_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.

  This operation requires collecting the data first, as `write_delta` operates
@@ -464,11 +469,13 @@ class FlowDataEngine:
  sink_kwargs["credential_provider"] = credential_provider
  self.collect().write_delta(**sink_kwargs)

- def _write_csv_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_csv_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a CSV file in cloud storage.

  Uses `sink_csv` for efficient, streaming writes of the data.
@@ -490,11 +497,13 @@ class FlowDataEngine:
  logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
  raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")

- def _write_json_to_cloud(self,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ def _write_json_to_cloud(
+ self,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+ ):
  """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.

  Uses `sink_ndjson` for efficient, streaming writes.
@@ -512,7 +521,9 @@ class FlowDataEngine:
  raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")

  @classmethod
- def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
+ def from_cloud_storage_obj(
+ cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal
+ ) -> "FlowDataEngine":
  """Creates a FlowDataEngine from an object in cloud storage.

  This method supports reading from various cloud storage providers like AWS S3,
@@ -549,31 +560,22 @@ class FlowDataEngine:
  )
  elif read_settings.file_format == "delta":
  return cls._read_delta_from_cloud(
- read_settings.resource_path,
- storage_options,
- credential_provider,
- read_settings
+ read_settings.resource_path, storage_options, credential_provider, read_settings
  )
  elif read_settings.file_format == "csv":
  return cls._read_csv_from_cloud(
- read_settings.resource_path,
- storage_options,
- credential_provider,
- read_settings
+ read_settings.resource_path, storage_options, credential_provider, read_settings
  )
  elif read_settings.file_format == "json":
  return cls._read_json_from_cloud(
  read_settings.resource_path,
  storage_options,
  credential_provider,
- read_settings.scan_mode == "directory"
+ read_settings.scan_mode == "directory",
  )
  elif read_settings.file_format == "iceberg":
  return cls._read_iceberg_from_cloud(
- read_settings.resource_path,
- storage_options,
- credential_provider,
- read_settings
+ read_settings.resource_path, storage_options, credential_provider, read_settings
  )

  elif read_settings.file_format in ["delta", "iceberg"]:
@@ -583,33 +585,40 @@ class FlowDataEngine:
  raise ValueError(f"Unsupported file format: {read_settings.file_format}")

  @staticmethod
- def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
- file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
+ def _get_schema_from_first_file_in_dir(
+ source: str, storage_options: dict[str, Any], file_format: Literal["csv", "parquet", "json", "delta"]
+ ) -> list[FlowfileColumn] | None:
  """Infers the schema by scanning the first file in a cloud directory."""
  try:
  scan_func = getattr(pl, "scan_" + file_format)
  first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
- return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
- scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
+ return convert_stats_to_column_info(
+ FlowDataEngine._create_schema_stats_from_pl_schema(
+ scan_func(first_file_ref, storage_options=storage_options).collect_schema()
+ )
+ )
  except Exception as e:
  logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")

-
  @classmethod
- def _read_iceberg_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ def _read_iceberg_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+ ) -> "FlowDataEngine":
  """Reads Iceberg table(s) from cloud storage."""
- raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+ raise NotImplementedError("Failed to read Iceberg table from cloud storage: Not yet implemented")

  @classmethod
- def _read_parquet_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- is_directory: bool) -> "FlowDataEngine":
+ def _read_parquet_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ is_directory: bool,
+ ) -> "FlowDataEngine":
  """Reads Parquet file(s) from cloud storage."""
  try:
  # Use scan_parquet for lazy evaluation
@@ -633,7 +642,7 @@ class FlowDataEngine:
  number_of_records=6_666_666, # Set so the provider is not accessed for this stat
  optimize_memory=True,
  streamable=True,
- schema=schema
+ schema=schema,
  )

  except Exception as e:
@@ -641,18 +650,20 @@ class FlowDataEngine:
  raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")

  @classmethod
- def _read_delta_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ def _read_delta_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+ ) -> "FlowDataEngine":
  """Reads a Delta Lake table from cloud storage."""
  try:
  logger.info("Reading Delta file from cloud storage...")
  logger.info(f"read_settings: {read_settings}")
  scan_kwargs = {"source": resource_path}
  if read_settings.delta_version:
- scan_kwargs['version'] = read_settings.delta_version
+ scan_kwargs["version"] = read_settings.delta_version
  if storage_options:
  scan_kwargs["storage_options"] = storage_options
  if credential_provider:
@@ -663,18 +674,20 @@ class FlowDataEngine:
  lf,
  number_of_records=6_666_666, # Set so the provider is not accessed for this stat
  optimize_memory=True,
- streamable=True
+ streamable=True,
  )
  except Exception as e:
  logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
  raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")

  @classmethod
- def _read_csv_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ def _read_csv_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+ ) -> "FlowDataEngine":
  """Reads CSV file(s) from cloud storage."""
  try:
  scan_kwargs = {
@@ -703,7 +716,7 @@ class FlowDataEngine:
  number_of_records=6_666_666, # Will be calculated lazily
  optimize_memory=True,
  streamable=True,
- schema=schema
+ schema=schema,
  )

  except Exception as e:
@@ -711,11 +724,13 @@ class FlowDataEngine:
  raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")

  @classmethod
- def _read_json_from_cloud(cls,
- resource_path: str,
- storage_options: Dict[str, Any],
- credential_provider: Optional[Callable],
- is_directory: bool) -> "FlowDataEngine":
+ def _read_json_from_cloud(
+ cls,
+ resource_path: str,
+ storage_options: dict[str, Any],
+ credential_provider: Callable | None,
+ is_directory: bool,
+ ) -> "FlowDataEngine":
  """Reads JSON file(s) from cloud storage."""
  try:
  if is_directory:
@@ -755,8 +770,9 @@ class FlowDataEngine:
  else:
  self.data_frame = pl.read_parquet(path_ref)

- def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
- calculate_schema_stats: bool):
+ def _finalize_initialization(
+ self, name: str, optimize_memory: bool, schema: Any | None, calculate_schema_stats: bool
+ ):
  """Finalizes initialization by setting remaining attributes."""
  _ = calculate_schema_stats
  self.name = name
@@ -803,23 +819,20 @@ class FlowDataEngine:
  def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
  """Sets the underlying Polars DataFrame or LazyFrame."""
  if self.lazy and isinstance(df, pl.DataFrame):
- raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
+ raise Exception("Cannot set a non-lazy dataframe to a lazy flowfile")
  self._data_frame = df

  @staticmethod
- def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+ def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> list[dict]:
  """Converts a Polars Schema into a list of schema statistics dictionaries."""
- return [
- dict(column_name=k, pl_datatype=v, col_index=i)
- for i, (k, v) in enumerate(pl_schema.items())
- ]
+ return [dict(column_name=k, pl_datatype=v, col_index=i) for i, (k, v) in enumerate(pl_schema.items())]

- def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+ def _add_schema_from_schema_stats(self, schema_stats: list[dict]):
  """Populates the schema from a list of schema statistics dictionaries."""
  self._schema = convert_stats_to_column_info(schema_stats)

  @property
- def schema(self) -> List[FlowfileColumn]:
+ def schema(self) -> list[FlowfileColumn]:
  """The schema of the DataFrame as a list of `FlowfileColumn` objects.

  This property lazily calculates the schema if it hasn't been determined yet.
@@ -866,8 +879,10 @@ class FlowDataEngine:
  if n_records is None:
  logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
  else:
- logger.info(f'Fetching {n_records} record(s) for Table object "{id(self)}". '
- f'Settings: streaming={self._streamable}')
+ logger.info(
+ f'Fetching {n_records} record(s) for Table object "{id(self)}". '
+ f"Settings: streaming={self._streamable}"
+ )

  if not self.lazy:
  return self.data_frame
@@ -881,16 +896,15 @@ class FlowDataEngine:
  def _collect_data(self, n_records: int = None) -> pl.DataFrame:
  """Internal method to handle data collection logic."""
  if n_records is None:
-
  self.collect_external()
  if self._streamable:
  try:
- logger.info('Collecting data in streaming mode')
+ logger.info("Collecting data in streaming mode")
  return self.data_frame.collect(engine="streaming")
  except PanicException:
  self._streamable = False

- logger.info('Collecting data in non-streaming mode')
+ logger.info("Collecting data in non-streaming mode")
  return self.data_frame.collect()

  if self.external_source is not None:
@@ -919,7 +933,7 @@ class FlowDataEngine:
  return self._create_partial_dataframe(ok_cols, error_cols, n_records)
  return self._create_empty_dataframe(n_records)

- def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
+ def _identify_valid_columns(self, n_records: int) -> tuple[list[str], list[tuple[str, Any]]]:
  """Identifies which columns can be collected successfully."""
  ok_cols = []
  error_cols = []
@@ -931,30 +945,30 @@ class FlowDataEngine:
931
945
  error_cols.append((c, self.data_frame.schema[c]))
932
946
  return ok_cols, error_cols
933
947
 
934
- def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
935
- n_records: int) -> pl.DataFrame:
948
+ def _create_partial_dataframe(
949
+ self, ok_cols: list[str], error_cols: list[tuple[str, Any]], n_records: int
950
+ ) -> pl.DataFrame:
936
951
  """Creates a DataFrame with partial data for columns that could be collected."""
937
952
  df = self.data_frame.select(ok_cols)
938
- df = df.with_columns([
939
- pl.lit(None).alias(column_name).cast(data_type)
940
- for column_name, data_type in error_cols
941
- ])
953
+ df = df.with_columns([pl.lit(None).alias(column_name).cast(data_type) for column_name, data_type in error_cols])
942
954
  return df.select(self.columns).head(n_records).collect()
943
955
 
944
956
  def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
945
957
  """Creates an empty DataFrame with the correct schema."""
946
958
  if self.number_of_records > 0:
947
- return pl.DataFrame({
948
- column_name: pl.Series(
949
- name=column_name,
950
- values=[None] * min(self.number_of_records, n_records)
951
- ).cast(data_type)
952
- for column_name, data_type in self.data_frame.schema.items()
953
- })
959
+ return pl.DataFrame(
960
+ {
961
+ column_name: pl.Series(
962
+ name=column_name, values=[None] * min(self.number_of_records, n_records)
963
+ ).cast(data_type)
964
+ for column_name, data_type in self.data_frame.schema.items()
965
+ }
966
+ )
954
967
  return pl.DataFrame(schema=self.data_frame.schema)
955
968
 
956
- def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
957
- calculate_schema_stats: bool = True) -> "FlowDataEngine":
969
+ def do_group_by(
970
+ self, group_by_input: transform_schemas.GroupByInput, calculate_schema_stats: bool = True
971
+ ) -> "FlowDataEngine":
958
972
  """Performs a group-by operation on the DataFrame.
959
973
 
960
974
  Args:
@@ -966,27 +980,23 @@ class FlowDataEngine:
966
980
  Returns:
967
981
  A new `FlowDataEngine` instance with the grouped and aggregated data.
968
982
  """
969
- aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
970
- group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']
983
+ aggregations = [c for c in group_by_input.agg_cols if c.agg != "groupby"]
984
+ group_columns = [c for c in group_by_input.agg_cols if c.agg == "groupby"]
971
985
 
972
986
  if len(group_columns) == 0:
973
987
  return FlowDataEngine(
974
- self.data_frame.select(
975
- ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
976
- ),
977
- calculate_schema_stats=calculate_schema_stats
988
+ self.data_frame.select(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
989
+ calculate_schema_stats=calculate_schema_stats,
978
990
  )
979
991
 
980
992
  df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
981
993
  group_by_columns = [n_c.new_name for n_c in group_columns]
982
994
  return FlowDataEngine(
983
- df.group_by(*group_by_columns).agg(
984
- ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
985
- ),
986
- calculate_schema_stats=calculate_schema_stats
995
+ df.group_by(*group_by_columns).agg(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
996
+ calculate_schema_stats=calculate_schema_stats,
987
997
  )
988
998
 
989
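As context for the `do_group_by` reformatting above: after the rename, the grouped branch is a straight Polars `group_by(...).agg(...)` over the non-groupby aggregation columns. A minimal equivalent in plain Polars (column names are illustrative, not from the package):

```python
import polars as pl

lf = pl.LazyFrame({"city": ["a", "a", "b"], "sales": [10, 20, 5]})

# One expression per aggregation column, aliased to its output name,
# the same shape as ac.agg_func(ac.old_name).alias(ac.new_name) above.
out = lf.group_by("city").agg(pl.col("sales").sum().alias("total_sales")).collect()
print(out)
```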
- def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
999
+ def do_sort(self, sorts: list[transform_schemas.SortByInput]) -> "FlowDataEngine":
990
1000
  """Sorts the DataFrame by one or more columns.
991
1001
 
992
1002
  Args:
@@ -999,12 +1009,13 @@ class FlowDataEngine:
999
1009
  if not sorts:
1000
1010
  return self
1001
1011
 
1002
- descending = [s.how == 'desc' or s.how.lower() == 'descending' for s in sorts]
1012
+ descending = [s.how == "desc" or s.how.lower() == "descending" for s in sorts]
1003
1013
  df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
1004
1014
  return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)
1005
1015
 
1006
- def change_column_types(self, transforms: List[transform_schemas.SelectInput],
1007
- calculate_schema: bool = False) -> "FlowDataEngine":
1016
+ def change_column_types(
1017
+ self, transforms: list[transform_schemas.SelectInput], calculate_schema: bool = False
1018
+ ) -> "FlowDataEngine":
1008
1019
  """Changes the data type of one or more columns.
1009
1020
 
1010
1021
  Args:
@@ -1018,7 +1029,8 @@ class FlowDataEngine:
1018
1029
  dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
1019
1030
  idx_mapping = list(
1020
1031
  (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
1021
- for transform in transforms if transform.data_type is not None
1032
+ for transform in transforms
1033
+ if transform.data_type is not None
1022
1034
  )
1023
1035
 
1024
1036
  actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]
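For reference, the per-column type change this method applies is an ordinary Polars cast, and only columns whose current base dtype differs are touched. Illustrative sketch (not Flowfile code):

```python
import polars as pl

lf = pl.LazyFrame({"id": ["1", "2"], "price": [1.5, 2.0]})

# Cast selected columns to new dtypes, leaving everything else untouched.
lf = lf.with_columns(pl.col("id").cast(pl.Int64), pl.col("price").cast(pl.Float32))
print(lf.collect_schema())
```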
@@ -1032,10 +1044,10 @@ class FlowDataEngine:
1032
1044
  df,
1033
1045
  number_of_records=self.number_of_records,
1034
1046
  calculate_schema_stats=calculate_schema,
1035
- streamable=self._streamable
1047
+ streamable=self._streamable,
1036
1048
  )
1037
1049
 
1038
- def save(self, path: str, data_type: str = 'parquet') -> Future:
1050
+ def save(self, path: str, data_type: str = "parquet") -> Future:
1039
1051
  """Saves the DataFrame to a file in a separate thread.
1040
1052
 
1041
1053
  Args:
@@ -1049,7 +1061,7 @@ class FlowDataEngine:
1049
1061
  df = deepcopy(self.data_frame)
1050
1062
  return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
1051
1063
 
1052
- def to_pylist(self) -> List[Dict]:
1064
+ def to_pylist(self) -> list[dict]:
1053
1065
  """Converts the DataFrame to a list of Python dictionaries.
1054
1066
 
1055
1067
  Returns:
@@ -1083,15 +1095,15 @@ class FlowDataEngine:
1083
1095
  data = list(self.to_dict().values())
1084
1096
  return input_schema.RawData(columns=columns, data=data)
1085
1097
 
1086
- def to_dict(self) -> Dict[str, List]:
1098
+ def to_dict(self) -> dict[str, list]:
1087
1099
  """Converts the DataFrame to a Python dictionary of columns.
1088
1100
 
1089
- Each key in the dictionary is a column name, and the corresponding value
1090
- is a list of the data in that column.
1101
+ Each key in the dictionary is a column name, and the corresponding value
1102
+ is a list of the data in that column.
1091
1103
 
1092
- Returns:
1093
- A dictionary mapping column names to lists of their values.
1094
- """
1104
+ Returns:
1105
+ A dictionary mapping column names to lists of their values.
1106
+ """
1095
1107
  if self.lazy:
1096
1108
  return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
1097
1109
  else:
@@ -1131,7 +1143,7 @@ class FlowDataEngine:
1131
1143
  return cls(pl.read_sql(sql, conn))
1132
1144
 
1133
1145
  @classmethod
1134
- def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
1146
+ def create_from_schema(cls, schema: list[FlowfileColumn]) -> "FlowDataEngine":
1135
1147
  """Creates an empty FlowDataEngine from a schema definition.
1136
1148
 
1137
1149
  Args:
@@ -1162,14 +1174,14 @@ class FlowDataEngine:
1162
1174
  """
1163
1175
  received_table.set_absolute_filepath()
1164
1176
  file_type_handlers = {
1165
- 'csv': create_funcs.create_from_path_csv,
1166
- 'parquet': create_funcs.create_from_path_parquet,
1167
- 'excel': create_funcs.create_from_path_excel
1177
+ "csv": create_funcs.create_from_path_csv,
1178
+ "parquet": create_funcs.create_from_path_parquet,
1179
+ "excel": create_funcs.create_from_path_excel,
1168
1180
  }
1169
1181
 
1170
1182
  handler = file_type_handlers.get(received_table.file_type)
1171
1183
  if not handler:
1172
- raise Exception(f'Cannot create from {received_table.file_type}')
1184
+ raise Exception(f"Cannot create from {received_table.file_type}")
1173
1185
 
1174
1186
  flow_file = cls(handler(received_table))
1175
1187
  flow_file._org_path = received_table.abs_file_path
@@ -1190,7 +1202,7 @@ class FlowDataEngine:
1190
1202
  return cls(create_fake_data(number_of_records))
1191
1203
 
1192
1204
  @classmethod
1193
- def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
1205
+ def generate_enumerator(cls, length: int = 1000, output_name: str = "output_column") -> "FlowDataEngine":
1194
1206
  """Generates a FlowDataEngine with a single column containing a sequence of integers.
1195
1207
 
1196
1208
  Args:
@@ -1204,8 +1216,9 @@ class FlowDataEngine:
1204
1216
  length = 10_000_000
1205
1217
  return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
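The enumerator above is just a lazy integer range; a tiny standalone equivalent:

```python
import polars as pl

# A single UInt32 column counting 0..length-1, built lazily.
length, output_name = 5, "output_column"
lf = pl.LazyFrame().select(pl.int_range(0, length, dtype=pl.UInt32).alias(output_name))
print(lf.collect())
```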
1206
1218
 
1207
- def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
1208
- pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
1219
+ def _handle_schema(
1220
+ self, schema: list[FlowfileColumn] | list[str] | pl.Schema | None, pl_schema: pl.Schema
1221
+ ) -> list[FlowfileColumn] | None:
1209
1222
  """Handles schema processing and validation during initialization."""
1210
1223
  if schema is None and pl_schema is not None:
1211
1224
  return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
@@ -1216,7 +1229,8 @@ class FlowDataEngine:
1216
1229
  elif pl_schema is not None and schema is not None:
1217
1230
  if schema.__len__() != pl_schema.__len__():
1218
1231
  raise Exception(
1219
- f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
1232
+ f"Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}"
1233
+ )
1220
1234
  if isinstance(schema, pl.Schema):
1221
1235
  return self._handle_polars_schema(schema, pl_schema)
1222
1236
  elif isinstance(schema, list) and len(schema) == 0:
@@ -1225,31 +1239,29 @@ class FlowDataEngine:
1225
1239
  return self._handle_string_schema(schema, pl_schema)
1226
1240
  return schema
1227
1241
 
1228
- def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
1242
+ def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> list[FlowfileColumn]:
1229
1243
  """Handles Polars schema conversion."""
1230
1244
  flow_file_columns = [
1231
1245
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
1232
- for col_name, dtype in zip(schema.names(), schema.dtypes())
1246
+ for col_name, dtype in zip(schema.names(), schema.dtypes(), strict=False)
1233
1247
  ]
1234
1248
 
1235
1249
  select_arg = [
1236
1250
  pl.col(o).alias(n).cast(schema_dtype)
1237
- for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
1251
+ for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes(), strict=False)
1238
1252
  ]
1239
1253
 
1240
1254
  self.data_frame = self.data_frame.select(select_arg)
1241
1255
  return flow_file_columns
1242
1256
 
1243
- def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
1257
+ def _handle_string_schema(self, schema: list[str], pl_schema: pl.Schema) -> list[FlowfileColumn]:
1244
1258
  """Handles string-based schema conversion."""
1245
1259
  flow_file_columns = [
1246
1260
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
1247
- for col_name, dtype in zip(schema, pl_schema.dtypes())
1261
+ for col_name, dtype in zip(schema, pl_schema.dtypes(), strict=False)
1248
1262
  ]
1249
1263
 
1250
- self.data_frame = self.data_frame.rename({
1251
- o: n for o, n in zip(pl_schema.names(), schema)
1252
- })
1264
+ self.data_frame = self.data_frame.rename({o: n for o, n in zip(pl_schema.names(), schema, strict=False)})
1253
1265
 
1254
1266
  return flow_file_columns
1255
1267
 
@@ -1267,25 +1279,16 @@ class FlowDataEngine:
1267
1279
  A new `FlowDataEngine` instance with the exploded rows.
1268
1280
  """
1269
1281
  output_column_name = (
1270
- split_input.output_column_name
1271
- if split_input.output_column_name
1272
- else split_input.column_to_split
1282
+ split_input.output_column_name if split_input.output_column_name else split_input.column_to_split
1273
1283
  )
1274
1284
 
1275
1285
  split_value = (
1276
- split_input.split_fixed_value
1277
- if split_input.split_by_fixed_value
1278
- else pl.col(split_input.split_by_column)
1286
+ split_input.split_fixed_value if split_input.split_by_fixed_value else pl.col(split_input.split_by_column)
1279
1287
  )
1280
1288
 
1281
- df = (
1282
- self.data_frame.with_columns(
1283
- pl.col(split_input.column_to_split)
1284
- .str.split(by=split_value)
1285
- .alias(output_column_name)
1286
- )
1287
- .explode(output_column_name)
1288
- )
1289
+ df = self.data_frame.with_columns(
1290
+ pl.col(split_input.column_to_split).str.split(by=split_value).alias(output_column_name)
1291
+ ).explode(output_column_name)
1289
1292
 
1290
1293
  return FlowDataEngine(df)
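The split-to-rows hunk above is `str.split` followed by `explode`; a compact Polars illustration with made-up column names:

```python
import polars as pl

lf = pl.LazyFrame({"id": [1, 2], "tags": ["red,blue", "green"]})

# Split the string column into lists, then explode each list item onto its own row.
out = lf.with_columns(pl.col("tags").str.split(by=",").alias("tag")).explode("tag").collect()
print(out)
```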
1291
1294
 
@@ -1305,15 +1308,9 @@ class FlowDataEngine:
1305
1308
  lf = self.data_frame
1306
1309
 
1307
1310
  if unpivot_input.data_type_selector_expr is not None:
1308
- result = lf.unpivot(
1309
- on=unpivot_input.data_type_selector_expr(),
1310
- index=unpivot_input.index_columns
1311
- )
1311
+ result = lf.unpivot(on=unpivot_input.data_type_selector_expr(), index=unpivot_input.index_columns)
1312
1312
  elif unpivot_input.value_columns is not None:
1313
- result = lf.unpivot(
1314
- on=unpivot_input.value_columns,
1315
- index=unpivot_input.index_columns
1316
- )
1313
+ result = lf.unpivot(on=unpivot_input.value_columns, index=unpivot_input.index_columns)
1317
1314
  else:
1318
1315
  result = lf.unpivot()
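For context, `unpivot` (the long-format reshape formerly called `melt`) takes `on` for the value columns and `index` for the identifier columns. Illustrative only:

```python
import polars as pl

lf = pl.LazyFrame({"id": [1, 2], "q1": [10, 30], "q2": [20, 40]})

# Wide -> long: one row per (id, variable, value) combination.
out = lf.unpivot(on=["q1", "q2"], index="id").collect()
print(out)
```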
1319
1316
 
@@ -1333,19 +1330,24 @@ class FlowDataEngine:
1333
1330
  """
1334
1331
  # Get unique values for pivot columns
1335
1332
  max_unique_vals = 200
1336
- new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
1337
- .unique()
1338
- .sort(pivot_input.pivot_column)
1339
- .limit(max_unique_vals).cast(pl.String))
1333
+ new_cols_unique = fetch_unique_values(
1334
+ self.data_frame.select(pivot_input.pivot_column)
1335
+ .unique()
1336
+ .sort(pivot_input.pivot_column)
1337
+ .limit(max_unique_vals)
1338
+ .cast(pl.String)
1339
+ )
1340
1340
  if len(new_cols_unique) >= max_unique_vals:
1341
1341
  if node_logger:
1342
- node_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
1343
- f' Max unique values: {max_unique_vals}')
1342
+ node_logger.warning(
1343
+ "Pivot column has too many unique values. Please consider using a different column."
1344
+ f" Max unique values: {max_unique_vals}"
1345
+ )
1344
1346
 
1345
1347
  if len(pivot_input.index_columns) == 0:
1346
1348
  no_index_cols = True
1347
- pivot_input.index_columns = ['__temp__']
1348
- ff = self.apply_flowfile_formula('1', col_name='__temp__')
1349
+ pivot_input.index_columns = ["__temp__"]
1350
+ ff = self.apply_flowfile_formula("1", col_name="__temp__")
1349
1351
  else:
1350
1352
  no_index_cols = False
1351
1353
  ff = self
@@ -1355,36 +1357,32 @@ class FlowDataEngine:
1355
1357
  grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
1356
1358
  pivot_column = pivot_input.get_pivot_column()
1357
1359
 
1358
- input_df = grouped_ff.data_frame.with_columns(
1359
- pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
1360
- )
1360
+ input_df = grouped_ff.data_frame.with_columns(pivot_column.cast(pl.String).alias(pivot_input.pivot_column))
1361
1361
  number_of_aggregations = len(pivot_input.aggregations)
1362
1362
  df = (
1363
- input_df.select(
1364
- *index_columns,
1365
- pivot_column,
1366
- pivot_input.get_values_expr()
1367
- )
1363
+ input_df.select(*index_columns, pivot_column, pivot_input.get_values_expr())
1368
1364
  .group_by(*index_columns)
1369
- .agg([
1370
- (pl.col('vals').filter(pivot_column == new_col_value))
1371
- .first()
1372
- .alias(new_col_value)
1373
- for new_col_value in new_cols_unique
1374
- ])
1365
+ .agg(
1366
+ [
1367
+ (pl.col("vals").filter(pivot_column == new_col_value)).first().alias(new_col_value)
1368
+ for new_col_value in new_cols_unique
1369
+ ]
1370
+ )
1375
1371
  .select(
1376
1372
  *index_columns,
1377
1373
  *[
1378
- pl.col(new_col).struct.field(agg).alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
1374
+ pl.col(new_col)
1375
+ .struct.field(agg)
1376
+ .alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
1379
1377
  for new_col in new_cols_unique
1380
1378
  for agg in pivot_input.aggregations
1381
- ]
1379
+ ],
1382
1380
  )
1383
1381
  )
1384
1382
 
1385
1383
  # Clean up temporary columns if needed
1386
1384
  if no_index_cols:
1387
- df = df.drop('__temp__')
1385
+ df = df.drop("__temp__")
1388
1386
  pivot_input.index_columns = []
1389
1387
 
1390
1388
  return FlowDataEngine(df, calculate_schema_stats=False)
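The pivot above avoids `DataFrame.pivot` (which is eager-only) by grouping on the index columns and picking the first matching value per unique pivot key. A small lazy-friendly sketch of that idea, with illustrative names and a hard-coded list of unique values:

```python
import polars as pl

lf = pl.LazyFrame(
    {"region": ["eu", "eu", "us"], "quarter": ["q1", "q2", "q1"], "sales": [10, 20, 5]}
)
quarters = ["q1", "q2"]  # normally fetched as the unique values of the pivot column

# One output column per unique pivot value: first 'sales' where quarter == value.
out = (
    lf.group_by("region")
    .agg([pl.col("sales").filter(pl.col("quarter") == q).first().alias(q) for q in quarters])
    .collect()
)
print(out)
```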
@@ -1403,7 +1401,7 @@ class FlowDataEngine:
1403
1401
  try:
1404
1402
  f = to_expr(predicate)
1405
1403
  except Exception as e:
1406
- logger.warning(f'Error in filter expression: {e}')
1404
+ logger.warning(f"Error in filter expression: {e}")
1407
1405
  f = to_expr("False")
1408
1406
  df = self.data_frame.filter(f)
1409
1407
  return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
@@ -1430,29 +1428,27 @@ class FlowDataEngine:
1430
1428
  select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
1431
1429
 
1432
1430
  df = (
1433
- self.data_frame
1434
- .with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
1431
+ self.data_frame.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
1435
1432
  .with_columns(
1436
- (pl.cum_count(record_id_settings.output_column_name)
1437
- .over(record_id_settings.group_by_columns) + record_id_settings.offset - 1)
1438
- .alias(record_id_settings.output_column_name)
1433
+ (
1434
+ pl.cum_count(record_id_settings.output_column_name).over(record_id_settings.group_by_columns)
1435
+ + record_id_settings.offset
1436
+ - 1
1437
+ ).alias(record_id_settings.output_column_name)
1439
1438
  )
1440
1439
  .select(select_cols)
1441
1440
  )
1442
1441
 
1443
- output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
1442
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
1444
1443
  output_schema.extend(self.schema)
1445
1444
 
1446
1445
  return FlowDataEngine(df, schema=output_schema)
1447
1446
 
1448
1447
  def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
1449
1448
  """Adds a simple sequential record ID column."""
1450
- df = self.data_frame.with_row_index(
1451
- record_id_settings.output_column_name,
1452
- record_id_settings.offset
1453
- )
1449
+ df = self.data_frame.with_row_index(record_id_settings.output_column_name, record_id_settings.offset)
1454
1450
 
1455
- output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
1451
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
1456
1452
  output_schema.extend(self.schema)
1457
1453
 
1458
1454
  return FlowDataEngine(df, schema=output_schema)
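Both record-id variants above map to standard Polars constructs: `with_row_index` for a plain sequential id, and a cumulative count over the group keys for a per-group id. A simplified sketch (not the package's exact expressions):

```python
import polars as pl

lf = pl.LazyFrame({"group": ["a", "a", "b"], "value": [10, 20, 30]})

plain = lf.with_row_index("record_id", offset=1)                  # 1, 2, 3
grouped = lf.with_columns(
    pl.col("value").cum_count().over("group").alias("record_id")  # 1, 2, 1
)
print(plain.collect(), grouped.collect())
```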
@@ -1484,7 +1480,7 @@ class FlowDataEngine:
1484
1480
 
1485
1481
  def __repr__(self) -> str:
1486
1482
  """Returns a string representation of the FlowDataEngine."""
1487
- return f'flow data engine\n{self.data_frame.__repr__()}'
1483
+ return f"flow data engine\n{self.data_frame.__repr__()}"
1488
1484
 
1489
1485
  def __call__(self) -> "FlowDataEngine":
1490
1486
  """Makes the class instance callable, returning itself."""
@@ -1504,16 +1500,16 @@ class FlowDataEngine:
1504
1500
  Returns:
1505
1501
  The same `FlowDataEngine` instance, now backed by the cached data.
1506
1502
  """
1507
- edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
1508
- flow_id=-1,
1509
- node_id=-1)
1510
- logger.info('Caching data in background')
1503
+ edf = ExternalDfFetcher(
1504
+ lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False, flow_id=-1, node_id=-1
1505
+ )
1506
+ logger.info("Caching data in background")
1511
1507
  result = edf.get_result()
1512
1508
  if isinstance(result, pl.LazyFrame):
1513
- logger.info('Data cached')
1509
+ logger.info("Data cached")
1514
1510
  del self._data_frame
1515
1511
  self.data_frame = result
1516
- logger.info('Data loaded from cache')
1512
+ logger.info("Data loaded from cache")
1517
1513
  return self
1518
1514
 
1519
1515
  def collect_external(self):
@@ -1525,14 +1521,14 @@ class FlowDataEngine:
1525
1521
  re-evaluated.
1526
1522
  """
1527
1523
  if self._external_source is not None:
1528
- logger.info('Collecting external source')
1524
+ logger.info("Collecting external source")
1529
1525
  if self.external_source.get_pl_df() is not None:
1530
1526
  self.data_frame = self.external_source.get_pl_df().lazy()
1531
1527
  else:
1532
1528
  self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
1533
1529
  self._schema = None # enforce reset schema
1534
1530
 
1535
- def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
1531
+ def get_output_sample(self, n_rows: int = 10) -> list[dict]:
1536
1532
  """Gets a sample of the data as a list of dictionaries.
1537
1533
 
1538
1534
  This is typically used to display a preview of the data in a UI.
@@ -1560,14 +1556,20 @@ class FlowDataEngine:
1560
1556
  try:
1561
1557
  df = df.head(n_rows).collect()
1562
1558
  except Exception as e:
1563
- logger.warning(f'Error in getting sample: {e}')
1559
+ logger.warning(f"Error in getting sample: {e}")
1564
1560
  df = df.head(n_rows).collect(engine="auto")
1565
1561
  else:
1566
1562
  df = self.collect()
1567
1563
  return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
1568
1564
 
1569
- def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
1570
- seed: int = None, execution_location: Optional[ExecutionLocationsLiteral] = None) -> "FlowDataEngine":
1565
+ def get_sample(
1566
+ self,
1567
+ n_rows: int = 100,
1568
+ random: bool = False,
1569
+ shuffle: bool = False,
1570
+ seed: int = None,
1571
+ execution_location: ExecutionLocationsLiteral | None = None,
1572
+ ) -> "FlowDataEngine":
1571
1573
  """Gets a sample of rows from the DataFrame.
1572
1574
 
1573
1575
  Args:
@@ -1579,22 +1581,23 @@ class FlowDataEngine:
1579
1581
  Returns:
1580
1582
  A new `FlowDataEngine` instance containing the sampled data.
1581
1583
  """
1582
- logging.info(f'Getting sample of {n_rows} rows')
1584
+ logging.info(f"Getting sample of {n_rows} rows")
1583
1585
  if random:
1584
1586
  if self.lazy and self.external_source is not None:
1585
1587
  self.collect_external()
1586
1588
 
1587
1589
  if self.lazy and shuffle:
1588
- sample_df = (self.data_frame.collect(engine="streaming" if self._streamable else "auto")
1589
- .sample(n_rows, seed=seed, shuffle=shuffle))
1590
+ sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(
1591
+ n_rows, seed=seed, shuffle=shuffle
1592
+ )
1590
1593
  elif shuffle:
1591
1594
  sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
1592
1595
  else:
1593
1596
  if execution_location is None:
1594
1597
  execution_location = get_global_execution_location()
1595
- n_rows = min(n_rows, self.get_number_of_records(
1596
- calculate_in_worker_process=execution_location == "remote")
1597
- )
1598
+ n_rows = min(
1599
+ n_rows, self.get_number_of_records(calculate_in_worker_process=execution_location == "remote")
1600
+ )
1598
1601
 
1599
1602
  every_n_records = ceil(self.number_of_records / n_rows)
1600
1603
  sample_df = self.data_frame.gather_every(every_n_records)
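The sampling branches above distinguish a true shuffled sample, which requires materialising the frame, from a cheap systematic sample that stays lazy by taking every n-th row. Both in plain Polars:

```python
import polars as pl

lf = pl.LazyFrame({"x": range(1000)})

shuffled = lf.collect().sample(10, shuffle=True, seed=42)  # random, but needs a full collect
systematic = lf.gather_every(100).collect()                # every 100th row, lazy until the end
print(shuffled.height, systematic.height)
```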
@@ -1619,8 +1622,9 @@ class FlowDataEngine:
1619
1622
  else:
1620
1623
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
1621
1624
 
1622
- def iter_batches(self, batch_size: int = 1000,
1623
- columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
1625
+ def iter_batches(
1626
+ self, batch_size: int = 1000, columns: list | tuple | str = None
1627
+ ) -> Generator["FlowDataEngine", None, None]:
1624
1628
  """Iterates over the DataFrame in batches.
1625
1629
 
1626
1630
  Args:
@@ -1638,9 +1642,14 @@ class FlowDataEngine:
1638
1642
  for batch in batches:
1639
1643
  yield FlowDataEngine(batch)
1640
1644
 
1641
- def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1642
- other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
1643
- node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
1645
+ def start_fuzzy_join(
1646
+ self,
1647
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1648
+ other: "FlowDataEngine",
1649
+ file_ref: str,
1650
+ flow_id: int = -1,
1651
+ node_id: int | str = -1,
1652
+ ) -> ExternalFuzzyMatchFetcher:
1644
1653
  """Starts a fuzzy join operation in a background process.
1645
1654
 
1646
1655
  This method prepares the data and initiates the fuzzy matching in a
@@ -1658,51 +1667,70 @@ class FlowDataEngine:
1658
1667
  progress and retrieve the result of the fuzzy join.
1659
1668
  """
1660
1669
  fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1661
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1662
- fuzzy_match_input_manager=fuzzy_match_input_manager)
1663
-
1664
- return ExternalFuzzyMatchFetcher(left_df, right_df,
1665
- fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1666
- file_ref=file_ref + '_fm',
1667
- wait_on_completion=False,
1668
- flow_id=flow_id,
1669
- node_id=node_id)
1670
-
1671
- def fuzzy_join_external(self,
1672
- fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1673
- other: "FlowDataEngine",
1674
- file_ref: str = None,
1675
- flow_id: int = -1,
1676
- node_id: int = -1
1677
- ):
1670
+ left_df, right_df = prepare_for_fuzzy_match(
1671
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1672
+ )
1673
+
1674
+ return ExternalFuzzyMatchFetcher(
1675
+ left_df,
1676
+ right_df,
1677
+ fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1678
+ file_ref=file_ref + "_fm",
1679
+ wait_on_completion=False,
1680
+ flow_id=flow_id,
1681
+ node_id=node_id,
1682
+ )
1683
+
1684
+ def fuzzy_join_external(
1685
+ self,
1686
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1687
+ other: "FlowDataEngine",
1688
+ file_ref: str = None,
1689
+ flow_id: int = -1,
1690
+ node_id: int = -1,
1691
+ ):
1678
1692
  if file_ref is None:
1679
- file_ref = str(id(self)) + '_' + str(id(other))
1693
+ file_ref = str(id(self)) + "_" + str(id(other))
1680
1694
  fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1681
1695
 
1682
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1683
- fuzzy_match_input_manager=fuzzy_match_input_manager)
1684
- external_tracker = ExternalFuzzyMatchFetcher(left_df, right_df,
1685
- fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1686
- file_ref=file_ref + '_fm',
1687
- wait_on_completion=False,
1688
- flow_id=flow_id,
1689
- node_id=node_id)
1696
+ left_df, right_df = prepare_for_fuzzy_match(
1697
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1698
+ )
1699
+ external_tracker = ExternalFuzzyMatchFetcher(
1700
+ left_df,
1701
+ right_df,
1702
+ fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1703
+ file_ref=file_ref + "_fm",
1704
+ wait_on_completion=False,
1705
+ flow_id=flow_id,
1706
+ node_id=node_id,
1707
+ )
1690
1708
  return FlowDataEngine(external_tracker.get_result())
1691
1709
 
1692
- def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1693
- other: "FlowDataEngine",
1694
- node_logger: NodeLogger = None) -> "FlowDataEngine":
1710
+ def fuzzy_join(
1711
+ self,
1712
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1713
+ other: "FlowDataEngine",
1714
+ node_logger: NodeLogger = None,
1715
+ ) -> "FlowDataEngine":
1695
1716
  fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1696
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1697
- fuzzy_match_input_manager=fuzzy_match_input_manager)
1717
+ left_df, right_df = prepare_for_fuzzy_match(
1718
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1719
+ )
1698
1720
  fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
1699
- return FlowDataEngine(fuzzy_match_dfs(left_df, right_df, fuzzy_maps=fuzzy_mappings,
1700
- logger=node_logger.logger if node_logger else logger)
1701
- .lazy())
1721
+ return FlowDataEngine(
1722
+ fuzzy_match_dfs(
1723
+ left_df, right_df, fuzzy_maps=fuzzy_mappings, logger=node_logger.logger if node_logger else logger
1724
+ ).lazy()
1725
+ )
1702
1726
 
1703
- def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
1704
- auto_generate_selection: bool, verify_integrity: bool,
1705
- other: "FlowDataEngine") -> "FlowDataEngine":
1727
+ def do_cross_join(
1728
+ self,
1729
+ cross_join_input: transform_schemas.CrossJoinInput,
1730
+ auto_generate_selection: bool,
1731
+ verify_integrity: bool,
1732
+ other: "FlowDataEngine",
1733
+ ) -> "FlowDataEngine":
1706
1734
  """Performs a cross join with another DataFrame.
1707
1735
 
1708
1736
  A cross join produces the Cartesian product of the two DataFrames.
@@ -1723,26 +1751,41 @@ class FlowDataEngine:
1723
1751
  self.lazy = True
1724
1752
  other.lazy = True
1725
1753
  cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
1726
- verify_join_select_integrity(cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns)
1727
- right_select = [v.old_name for v in cross_join_input_manager.right_select.renames
1728
- if (v.keep or v.join_key) and v.is_available]
1729
- left_select = [v.old_name for v in cross_join_input_manager.left_select.renames
1730
- if (v.keep or v.join_key) and v.is_available]
1754
+ verify_join_select_integrity(
1755
+ cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns
1756
+ )
1757
+ right_select = [
1758
+ v.old_name
1759
+ for v in cross_join_input_manager.right_select.renames
1760
+ if (v.keep or v.join_key) and v.is_available
1761
+ ]
1762
+ left_select = [
1763
+ v.old_name
1764
+ for v in cross_join_input_manager.left_select.renames
1765
+ if (v.keep or v.join_key) and v.is_available
1766
+ ]
1731
1767
  cross_join_input_manager.auto_rename(rename_mode="suffix")
1732
1768
  left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
1733
1769
  right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)
1734
1770
 
1735
- joined_df = left.join(right, how='cross')
1771
+ joined_df = left.join(right, how="cross")
1736
1772
 
1737
- cols_to_delete_after = [col.new_name for col in
1738
- cross_join_input_manager.left_select.renames + cross_join_input_manager.left_select.renames
1739
- if col.join_key and not col.keep and col.is_available]
1773
+ cols_to_delete_after = [
1774
+ col.new_name
1775
+ for col in cross_join_input_manager.left_select.renames + cross_join_input_manager.right_select.renames
1776
+ if col.join_key and not col.keep and col.is_available
1777
+ ]
1740
1778
 
1741
1779
  fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
1742
1780
  return fl
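The heart of the cross-join hunk above is Polars' Cartesian-product join; everything else is column selection and renaming around it. Minimal form with illustrative data:

```python
import polars as pl

left = pl.LazyFrame({"size": ["s", "m"]})
right = pl.LazyFrame({"colour": ["red", "blue"]})

# Every row of `left` paired with every row of `right` (2 x 2 = 4 rows).
out = left.join(right, how="cross").collect()
print(out)
```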
1743
1781
 
1744
- def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1745
- verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
1782
+ def join(
1783
+ self,
1784
+ join_input: transform_schemas.JoinInput,
1785
+ auto_generate_selection: bool,
1786
+ verify_integrity: bool,
1787
+ other: "FlowDataEngine",
1788
+ ) -> "FlowDataEngine":
1746
1789
  """Performs a standard SQL-style join with another DataFrame."""
1747
1790
  # Create manager from input
1748
1791
  join_manager = transform_schemas.JoinInputManager(join_input)
@@ -1754,40 +1797,52 @@ class FlowDataEngine:
1754
1797
  join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
1755
1798
  verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
1756
1799
  if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
1757
- raise Exception('Join is not valid by the data fields')
1800
+ raise Exception("Join is not valid by the data fields")
1758
1801
 
1759
1802
  if auto_generate_selection:
1760
1803
  join_manager.auto_rename()
1761
1804
 
1762
1805
  # Use manager properties throughout
1763
- left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(join_manager.left_manager.get_rename_table())
1764
- right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(join_manager.right_manager.get_rename_table())
1806
+ left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
1807
+ join_manager.left_manager.get_rename_table()
1808
+ )
1809
+ right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(
1810
+ join_manager.right_manager.get_rename_table()
1811
+ )
1765
1812
 
1766
1813
  left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
1767
1814
  left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
1768
- if join_manager.how == 'right':
1815
+ if join_manager.how == "right":
1769
1816
  joined_df = right.join(
1770
1817
  other=left,
1771
1818
  left_on=join_manager.right_join_keys,
1772
1819
  right_on=join_manager.left_join_keys,
1773
1820
  how="left",
1774
- suffix="").rename(reverse_join_key_mapping)
1821
+ suffix="",
1822
+ ).rename(reverse_join_key_mapping)
1775
1823
  else:
1776
1824
  joined_df = left.join(
1777
1825
  other=right,
1778
1826
  left_on=join_manager.left_join_keys,
1779
1827
  right_on=join_manager.right_join_keys,
1780
1828
  how=join_manager.how,
1781
- suffix="").rename(reverse_join_key_mapping)
1829
+ suffix="",
1830
+ ).rename(reverse_join_key_mapping)
1782
1831
 
1783
- left_cols_to_delete_after = [get_col_name_to_delete(col, 'left')
1784
- for col in join_manager.input.left_select.renames
1785
- if not col.keep and col.is_available and col.join_key]
1832
+ left_cols_to_delete_after = [
1833
+ get_col_name_to_delete(col, "left")
1834
+ for col in join_manager.input.left_select.renames
1835
+ if not col.keep and col.is_available and col.join_key
1836
+ ]
1786
1837
 
1787
- right_cols_to_delete_after = [get_col_name_to_delete(col, 'right')
1788
- for col in join_manager.input.right_select.renames
1789
- if not col.keep and col.is_available and col.join_key
1790
- and join_manager.how in ("left", "right", "inner", "cross", "outer")]
1838
+ right_cols_to_delete_after = [
1839
+ get_col_name_to_delete(col, "right")
1840
+ for col in join_manager.input.right_select.renames
1841
+ if not col.keep
1842
+ and col.is_available
1843
+ and col.join_key
1844
+ and join_manager.how in ("left", "right", "inner", "cross", "outer")
1845
+ ]
1791
1846
 
1792
1847
  if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
1793
1848
  joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
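For orientation in the hunk above: the join itself is a standard Polars `join` with explicit `left_on`/`right_on`, and a "right" join is expressed by swapping the operands and joining left. Illustrative sketch:

```python
import polars as pl

orders = pl.LazyFrame({"customer_id": [1, 2, 2], "amount": [10, 20, 30]})
customers = pl.LazyFrame({"id": [1, 2, 3], "name": ["ann", "bob", "cee"]})

inner = orders.join(customers, left_on="customer_id", right_on="id", how="inner")

# A "right" join that keeps all customers: swap operands and join left.
right_like = customers.join(orders, left_on="id", right_on="customer_id", how="left")
print(inner.collect(), right_like.collect())
```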
@@ -1795,8 +1850,7 @@ class FlowDataEngine:
1795
1850
  undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
1796
1851
  joined_df = joined_df.rename(undo_join_key_remapping)
1797
1852
 
1798
- return FlowDataEngine(joined_df, calculate_schema_stats=False,
1799
- number_of_records=0, streamable=False)
1853
+ return FlowDataEngine(joined_df, calculate_schema_stats=False, number_of_records=0, streamable=False)
1800
1854
 
1801
1855
  def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
1802
1856
  """Solves a graph problem represented by 'from' and 'to' columns.
@@ -1811,8 +1865,9 @@ class FlowDataEngine:
1811
1865
  A new `FlowDataEngine` instance with the solved graph data.
1812
1866
  """
1813
1867
  lf = self.data_frame.with_columns(
1814
- graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
1815
- .alias(graph_solver_input.output_column_name)
1868
+ graph_solver(graph_solver_input.col_from, graph_solver_input.col_to).alias(
1869
+ graph_solver_input.output_column_name
1870
+ )
1816
1871
  )
1817
1872
  return FlowDataEngine(lf)
1818
1873
 
@@ -1827,7 +1882,7 @@ class FlowDataEngine:
1827
1882
  A new `FlowDataEngine` instance with the added column.
1828
1883
  """
1829
1884
  if col_name is None:
1830
- col_name = 'new_values'
1885
+ col_name = "new_values"
1831
1886
  return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
1832
1887
 
1833
1888
  def get_record_count(self) -> "FlowDataEngine":
@@ -1837,7 +1892,7 @@ class FlowDataEngine:
1837
1892
  Returns:
1838
1893
  A new `FlowDataEngine` instance.
1839
1894
  """
1840
- return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))
1895
+ return FlowDataEngine(self.data_frame.select(pl.len().alias("number_of_records")))
1841
1896
 
1842
1897
  def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
1843
1898
  """Asserts that this DataFrame is equal to another.
@@ -1860,13 +1915,13 @@ class FlowDataEngine:
1860
1915
  other = other.select_columns(self.columns)
1861
1916
 
1862
1917
  if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
1863
- raise Exception('Number of records is not equal')
1918
+ raise Exception("Number of records is not equal")
1864
1919
 
1865
1920
  if self.columns != other.columns:
1866
- raise Exception('Schema is not equal')
1921
+ raise Exception("Schema is not equal")
1867
1922
 
1868
1923
  if strict_schema:
1869
- assert self.data_frame.schema == other.data_frame.schema, 'Data types do not match'
1924
+ assert self.data_frame.schema == other.data_frame.schema, "Data types do not match"
1870
1925
 
1871
1926
  if ordered:
1872
1927
  self_lf = self.data_frame.sort(by=self.columns)
@@ -1876,7 +1931,7 @@ class FlowDataEngine:
1876
1931
  other_lf = other.data_frame
1877
1932
 
1878
1933
  self.lazy, other.lazy = org_laziness
1879
- assert self_lf.equals(other_lf), 'Data is not equal'
1934
+ assert self_lf.equals(other_lf), "Data is not equal"
1880
1935
 
1881
1936
  def initialize_empty_fl(self):
1882
1937
  """Initializes an empty LazyFrame."""
@@ -1891,7 +1946,7 @@ class FlowDataEngine:
1891
1946
  operation_type="calculate_number_of_records",
1892
1947
  flow_id=-1,
1893
1948
  node_id=-1,
1894
- wait_on_completion=True
1949
+ wait_on_completion=True,
1895
1950
  ).result
1896
1951
  return number_of_records
1897
1952
 
@@ -1907,8 +1962,9 @@ class FlowDataEngine:
1907
1962
  """
1908
1963
  return self.get_number_of_records(force_calculate=force_calculate)
1909
1964
 
1910
- def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1911
- calculate_in_worker_process: bool = False) -> int:
1965
+ def get_number_of_records(
1966
+ self, warn: bool = False, force_calculate: bool = False, calculate_in_worker_process: bool = False
1967
+ ) -> int:
1912
1968
  """Gets the total number of records in the DataFrame.
1913
1969
 
1914
1970
  For lazy frames, this may trigger a full data scan, which can be expensive.
@@ -1938,12 +1994,13 @@ class FlowDataEngine:
1938
1994
  except Exception as e:
1939
1995
  logger.error(f"Error: {e}")
1940
1996
  if warn:
1941
- logger.warning('Calculating the number of records this can be expensive on a lazy frame')
1997
+ logger.warning("Calculating the number of records this can be expensive on a lazy frame")
1942
1998
  try:
1943
1999
  self.number_of_records = self.data_frame.select(pl.len()).collect(
1944
- engine="streaming" if self._streamable else "auto")[0, 0]
2000
+ engine="streaming" if self._streamable else "auto"
2001
+ )[0, 0]
1945
2002
  except Exception:
1946
- raise ValueError('Could not get number of records')
2003
+ raise ValueError("Could not get number of records")
1947
2004
  else:
1948
2005
  self.number_of_records = self.data_frame.__len__()
1949
2006
  return self.number_of_records
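Counting rows on a lazy frame, as done above, only needs `select(pl.len())`, which lets Polars optimise the plan instead of materialising every column:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Collect just the row count; [0, 0] indexes the single-cell result.
n_rows = lf.select(pl.len()).collect()[0, 0]
print(n_rows)  # 3
```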
@@ -1984,7 +2041,7 @@ class FlowDataEngine:
1984
2041
  return self._external_source
1985
2042
 
1986
2043
  @property
1987
- def cols_idx(self) -> Dict[str, int]:
2044
+ def cols_idx(self) -> dict[str, int]:
1988
2045
  """A dictionary mapping column names to their integer index."""
1989
2046
  if self._col_idx is None:
1990
2047
  self._col_idx = {c: i for i, c in enumerate(self.columns)}
@@ -2006,7 +2063,7 @@ class FlowDataEngine:
2006
2063
  [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
2007
2064
  )
2008
2065
 
2009
- def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
2066
+ def select_columns(self, list_select: list[str] | tuple[str] | str) -> "FlowDataEngine":
2010
2067
  """Selects a subset of columns from the DataFrame.
2011
2068
 
2012
2069
  Args:
@@ -2019,17 +2076,17 @@ class FlowDataEngine:
2019
2076
  list_select = [list_select]
2020
2077
 
2021
2078
  idx_to_keep = [self.cols_idx.get(c) for c in list_select]
2022
- selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
2079
+ selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep, strict=False) if id_to_keep is not None]
2023
2080
  new_schema = [self.schema[i] for i in idx_to_keep if i is not None]
2024
2081
 
2025
2082
  return FlowDataEngine(
2026
2083
  self.data_frame.select(selects),
2027
2084
  number_of_records=self.number_of_records,
2028
2085
  schema=new_schema,
2029
- streamable=self._streamable
2086
+ streamable=self._streamable,
2030
2087
  )
2031
2088
 
2032
- def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
2089
+ def drop_columns(self, columns: list[str]) -> "FlowDataEngine":
2033
2090
  """Drops specified columns from the DataFrame.
2034
2091
 
2035
2092
  Args:
@@ -2043,12 +2100,10 @@ class FlowDataEngine:
2043
2100
  new_schema = [self.schema[i] for i in idx_to_keep]
2044
2101
 
2045
2102
  return FlowDataEngine(
2046
- self.data_frame.select(cols_for_select),
2047
- number_of_records=self.number_of_records,
2048
- schema=new_schema
2103
+ self.data_frame.select(cols_for_select), number_of_records=self.number_of_records, schema=new_schema
2049
2104
  )
2050
2105
 
2051
- def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
2106
+ def reorganize_order(self, column_order: list[str]) -> "FlowDataEngine":
2052
2107
  """Reorganizes columns into a specified order.
2053
2108
 
2054
2109
  Args:
@@ -2061,8 +2116,9 @@ class FlowDataEngine:
2061
2116
  schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
2062
2117
  return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)
2063
2118
 
2064
- def apply_flowfile_formula(self, func: str, col_name: str,
2065
- output_data_type: pl.DataType = None) -> "FlowDataEngine":
2119
+ def apply_flowfile_formula(
2120
+ self, func: str, col_name: str, output_data_type: pl.DataType = None
2121
+ ) -> "FlowDataEngine":
2066
2122
  """Applies a formula to create a new column or transform an existing one.
2067
2123
 
2068
2124
  Args:
@@ -2081,8 +2137,7 @@ class FlowDataEngine:
2081
2137
 
2082
2138
  return FlowDataEngine(df2, number_of_records=self.number_of_records)
2083
2139
 
2084
- def apply_sql_formula(self, func: str, col_name: str,
2085
- output_data_type: pl.DataType = None) -> "FlowDataEngine":
2140
+ def apply_sql_formula(self, func: str, col_name: str, output_data_type: pl.DataType = None) -> "FlowDataEngine":
2086
2141
  """Applies an SQL-style formula using `pl.sql_expr`.
2087
2142
 
2088
2143
  Args:
@@ -2101,8 +2156,9 @@ class FlowDataEngine:
2101
2156
 
2102
2157
  return FlowDataEngine(df, number_of_records=self.number_of_records)
2103
2158
 
2104
- def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
2105
- execute_remote: bool = True) -> "FlowDataEngine":
2159
+ def output(
2160
+ self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str, execute_remote: bool = True
2161
+ ) -> "FlowDataEngine":
2106
2162
  """Writes the DataFrame to an output file.
2107
2163
 
2108
2164
  Can execute the write operation locally or in a remote worker process.
@@ -2116,7 +2172,7 @@ class FlowDataEngine:
2116
2172
  Returns:
2117
2173
  The same `FlowDataEngine` instance for chaining.
2118
2174
  """
2119
- logger.info('Starting to write output')
2175
+ logger.info("Starting to write output")
2120
2176
  if execute_remote:
2121
2177
  status = utils.write_output(
2122
2178
  self.data_frame,
@@ -2126,11 +2182,11 @@ class FlowDataEngine:
2126
2182
  sheet_name=output_fs.sheet_name,
2127
2183
  delimiter=output_fs.delimiter,
2128
2184
  flow_id=flow_id,
2129
- node_id=node_id
2185
+ node_id=node_id,
2130
2186
  )
2131
2187
  tracker = ExternalExecutorTracker(status)
2132
2188
  tracker.get_result()
2133
- logger.info('Finished writing output')
2189
+ logger.info("Finished writing output")
2134
2190
  else:
2135
2191
  logger.info("Starting to write results locally")
2136
2192
  utils.local_write_output(
@@ -2172,11 +2228,10 @@ class FlowDataEngine:
2172
2228
  if isinstance(other, FlowDataEngine):
2173
2229
  other = [other]
2174
2230
 
2175
- dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
2176
- return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
2231
+ dfs: list[pl.LazyFrame] | list[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
2232
+ return FlowDataEngine(pl.concat(dfs, how="diagonal_relaxed"))
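`diagonal_relaxed`, used above, stacks frames even when their columns differ, filling missing columns with nulls and relaxing dtypes where needed. Small illustration:

```python
import polars as pl

a = pl.LazyFrame({"id": [1, 2], "x": [1.0, 2.0]})
b = pl.LazyFrame({"id": [3], "y": ["new"]})

# Union of columns; rows from `a` get a null `y`, rows from `b` get a null `x`.
out = pl.concat([a, b], how="diagonal_relaxed").collect()
print(out)
```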
2177
2233
 
2178
- def do_select(self, select_inputs: transform_schemas.SelectInputs,
2179
- keep_missing: bool = True) -> "FlowDataEngine":
2234
+ def do_select(self, select_inputs: transform_schemas.SelectInputs, keep_missing: bool = True) -> "FlowDataEngine":
2180
2235
  """Performs a complex column selection, renaming, and reordering operation.
2181
2236
 
2182
2237
  Args:
@@ -2192,7 +2247,8 @@ class FlowDataEngine:
2192
2247
 
2193
2248
  if not keep_missing:
2194
2249
  drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
2195
- set(r.old_name for r in renames if not r.keep))
2250
+ set(r.old_name for r in renames if not r.keep)
2251
+ )
2196
2252
  keep_cols = []
2197
2253
  else:
2198
2254
  keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
@@ -2212,12 +2268,14 @@ class FlowDataEngine:
2212
2268
 
2213
2269
  rename_dict = {r.old_name: r.new_name for r in available_renames}
2214
2270
  fl = self.select_columns(
2215
- list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols)
2271
+ list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
2272
+ )
2216
2273
  fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
2217
2274
  ndf = fl.data_frame.rename(rename_dict)
2218
2275
  renames.sort(key=lambda r: 0 if r.position is None else r.position)
2219
- sorted_cols = utils.match_order(ndf.collect_schema().names(),
2220
- [r.new_name for r in renames] + self.data_frame.collect_schema().names())
2276
+ sorted_cols = utils.match_order(
2277
+ ndf.collect_schema().names(), [r.new_name for r in renames] + self.data_frame.collect_schema().names()
2278
+ )
2221
2279
  output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
2222
2280
  return output_file.reorganize_order(sorted_cols)
2223
2281
 
@@ -2225,10 +2283,9 @@ class FlowDataEngine:
2225
2283
  """Sets whether DataFrame operations should be streamable."""
2226
2284
  self._streamable = streamable
2227
2285
 
2228
- def _calculate_schema(self) -> List[Dict]:
2286
+ def _calculate_schema(self) -> list[dict]:
2229
2287
  """Calculates schema statistics."""
2230
2288
  if self.external_source is not None:
2231
-
2232
2289
  self.collect_external()
2233
2290
  v = utils.calculate_schema(self.data_frame)
2234
2291
  return v
@@ -2247,8 +2304,9 @@ class FlowDataEngine:
2247
2304
  """Creates a FlowDataEngine from a path in a worker process."""
2248
2305
  received_table.set_absolute_filepath()
2249
2306
 
2250
- external_fetcher = ExternalCreateFetcher(received_table=received_table,
2251
- file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
2307
+ external_fetcher = ExternalCreateFetcher(
2308
+ received_table=received_table, file_type=received_table.file_type, flow_id=flow_id, node_id=node_id
2309
+ )
2252
2310
  return cls(external_fetcher.get_result())
2253
2311
 
2254
2312
 
@@ -2271,10 +2329,10 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
2271
2329
  if len(flowfile_tables) == 0:
2272
2330
  kwargs = {}
2273
2331
  elif len(flowfile_tables) == 1:
2274
- kwargs = {'input_df': flowfile_tables[0].data_frame}
2332
+ kwargs = {"input_df": flowfile_tables[0].data_frame}
2275
2333
  else:
2276
- kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
2334
+ kwargs = {f"input_df_{i+1}": flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
2277
2335
  df = polars_executable(**kwargs)
2278
2336
  if isinstance(df, pl.DataFrame):
2279
2337
  logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
2280
- return FlowDataEngine(df)
2338
+ return FlowDataEngine(df)
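To make the kwarg construction in `execute_polars_code` concrete: with one input the user code sees `input_df`, with several it sees `input_df_1`, `input_df_2`, and so on. A self-contained mock of that call shape; the `polars_executable` body here is a stand-in for the compiled user code, not the package's own compiler:

```python
import polars as pl

def polars_executable(input_df_1: pl.LazyFrame, input_df_2: pl.LazyFrame) -> pl.LazyFrame:
    # Stand-in for the compiled user code: it simply joins the two injected frames.
    return input_df_1.join(input_df_2, on="customer_id", how="left")

kwargs = {
    "input_df_1": pl.LazyFrame({"customer_id": [1, 2], "amount": [10, 20]}),
    "input_df_2": pl.LazyFrame({"customer_id": [1, 2], "name": ["ann", "bob"]}),
}
print(polars_executable(**kwargs).collect())
```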