Flowfile 0.4.1-py3-none-any.whl → 0.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332)
  1. build_backends/main.py +25 -22
  2. build_backends/main_prd.py +10 -19
  3. flowfile/__init__.py +179 -73
  4. flowfile/__main__.py +10 -7
  5. flowfile/api.py +52 -59
  6. flowfile/web/__init__.py +14 -9
  7. flowfile/web/static/assets/AdminView-49392a9a.js +713 -0
  8. flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
  9. flowfile/web/static/assets/CloudConnectionView-36bcd6df.css +72 -0
  10. flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionView-f13f202b.js} +11 -11
  11. flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-0023d4a5.js} +10 -8
  12. flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
  13. flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
  14. flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-8e781e11.js} +10 -8
  15. flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
  16. flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-8ad68ea9.js} +3 -5
  17. flowfile/web/static/assets/{ContextMenu-c13f91d0.css → ContextMenu-26d4dd27.css} +6 -6
  18. flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-31ee57f0.js} +3 -3
  19. flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-69a74055.js} +3 -3
  20. flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-8e2051c6.js} +3 -3
  21. flowfile/web/static/assets/{ContextMenu-4c74eef1.css → ContextMenu-8ec1729e.css} +6 -6
  22. flowfile/web/static/assets/{ContextMenu-63cfa99b.css → ContextMenu-9b310c60.css} +6 -6
  23. flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-03df6938.js} +12 -10
  24. flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
  25. flowfile/web/static/assets/CustomNode-59e99a86.css +32 -0
  26. flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-8479239b.js} +36 -24
  27. flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-869e3efd.js} +5 -4
  28. flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-e91df89a.css} +13 -13
  29. flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-36898a00.css} +24 -24
  30. flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-c58b9552.js} +25 -15
  31. flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
  32. flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseView-d26a9140.js} +11 -11
  33. flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-217a99f1.css} +19 -19
  34. flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-4d05ddc7.js} +17 -10
  35. flowfile/web/static/assets/{designer-e3c150ec.css → DesignerView-a6d0ee84.css} +629 -538
  36. flowfile/web/static/assets/{designer-f3656d8c.js → DesignerView-e6f5c0e8.js} +1214 -3209
  37. flowfile/web/static/assets/{documentation-52b241e7.js → DocumentationView-2e78ef1b.js} +5 -5
  38. flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-fd46c656.css} +7 -7
  39. flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
  40. flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-7b54caca.js} +18 -9
  41. flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-3fa399b2.js} +9 -7
  42. flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-47ab05a3.css} +17 -17
  43. flowfile/web/static/assets/Filter-7494ea97.css +48 -0
  44. flowfile/web/static/assets/Filter-8cbbdbf3.js +287 -0
  45. flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
  46. flowfile/web/static/assets/{Formula-71472193.js → Formula-aac42b1e.js} +13 -11
  47. flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
  48. flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-cd9bbfca.js} +12 -10
  49. flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-c24dec17.css} +5 -5
  50. flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-c7e6780e.js} +13 -11
  51. flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-93c5d22b.js} +9 -7
  52. flowfile/web/static/assets/{GroupBy-b9505323.css → GroupBy-be7ac0bf.css} +10 -10
  53. flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
  54. flowfile/web/static/assets/{Join-a1b800be.js → Join-a19b2de2.js} +13 -11
  55. flowfile/web/static/assets/LoginView-0df4ed0a.js +134 -0
  56. flowfile/web/static/assets/LoginView-d325d632.css +172 -0
  57. flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
  58. flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-8d3374b2.js} +170 -116
  59. flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-ad1b6243.js} +2 -2
  60. flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-e278950d.js} +1 -1
  61. flowfile/web/static/assets/NodeDesigner-40b647c9.js +2610 -0
  62. flowfile/web/static/assets/NodeDesigner-5f53be3f.css +1429 -0
  63. flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-7100234c.js} +2 -2
  64. flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-5130219f.js} +5 -2
  65. flowfile/web/static/assets/{Output-ddc9079f.css → Output-35e97000.css} +6 -6
  66. flowfile/web/static/assets/{Output-76750610.js → Output-f5efd2aa.js} +60 -38
  67. flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
  68. flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-d981d23c.js} +11 -9
  69. flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
  70. flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-39386e95.js} +3 -3
  71. flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
  72. flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-63de1f73.js} +3 -3
  73. flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
  74. flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-f9d69217.js} +18 -9
  75. flowfile/web/static/assets/PopOver-b22f049e.js +939 -0
  76. flowfile/web/static/assets/PopOver-d96599db.css +33 -0
  77. flowfile/web/static/assets/{Read-6b17491f.css → Read-36e7bd51.css} +12 -12
  78. flowfile/web/static/assets/{Read-637b72a7.js → Read-aec2e377.js} +83 -105
  79. flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-78ed6845.js} +6 -4
  80. flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-2156e890.js} +8 -6
  81. flowfile/web/static/assets/{SQLQueryComponent-36cef432.css → SQLQueryComponent-1c2f26b4.css} +5 -5
  82. flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-48c72f5b.js} +3 -3
  83. flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-1352ca74.js} +6 -4
  84. flowfile/web/static/assets/SecretSelector-22b5ff89.js +113 -0
  85. flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
  86. flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretsView-17df66ee.js} +35 -36
  87. flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
  88. flowfile/web/static/assets/{Select-850215fd.js → Select-0aee4c54.js} +9 -7
  89. flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-0784e157.js} +3 -3
  90. flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
  91. flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
  92. flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
  93. flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-cd341bb6.js} +3 -3
  94. flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-f2002a6d.js} +3 -3
  95. flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-460cc0ea.js} +2 -2
  96. flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-30741bb2.js} +1 -1
  97. flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-5d926864.js} +7 -4
  98. flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
  99. flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-3cdc971b.js} +9 -7
  100. flowfile/web/static/assets/{Unique-f9fb0809.css → Sort-8a871341.css} +10 -10
  101. flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-a2d0bfbd.js} +2 -2
  102. flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-abad1ca2.js} +5 -2
  103. flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
  104. flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-918945f7.js} +11 -10
  105. flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-f0ef5196.js} +2 -2
  106. flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-5605c793.js} +1 -1
  107. flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-54d2f518.css} +6 -6
  108. flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-bdad6144.js} +4 -4
  109. flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
  110. flowfile/web/static/assets/{Union-b563478a.js → Union-e8ab8c86.js} +8 -6
  111. flowfile/web/static/assets/{Unique-f90db5db.js → Unique-8cd4f976.js} +13 -22
  112. flowfile/web/static/assets/{Sort-3643d625.css → Unique-9fb2f567.css} +10 -10
  113. flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-710a2948.css} +7 -7
  114. flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-8da14095.js} +10 -8
  115. flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-6f7d89ff.js} +3 -3
  116. flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
  117. flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-3fb312e1.js} +4 -4
  118. flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
  119. flowfile/web/static/assets/{api-4c8e3822.js → api-24483f0d.js} +1 -1
  120. flowfile/web/static/assets/{api-2d6adc4f.js → api-8b81fa73.js} +1 -1
  121. flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-3d8dc5fa.css} +40 -40
  122. flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-ac0fda9d.js} +3 -3
  123. flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-5497a84a.js} +11 -10
  124. flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-a0be62b3.css} +74 -62
  125. flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
  126. flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-99014e1d.js} +5 -5
  127. flowfile/web/static/assets/index-07dda503.js +38 -0
  128. flowfile/web/static/assets/index-3ba44389.js +2696 -0
  129. flowfile/web/static/assets/{index-50508d4d.css → index-e6289dd0.css} +1945 -569
  130. flowfile/web/static/assets/{index-246f201c.js → index-fb6493ae.js} +41626 -40869
  131. flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
  132. flowfile/web/static/assets/nodeInput-0eb13f1a.js +2 -0
  133. flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-8f8ba42d.js} +3 -3
  134. flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
  135. flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-393f4fef.js} +3 -3
  136. flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
  137. flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-07c81f65.js} +4 -4
  138. flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
  139. flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-07f6d9ad.js} +21 -20
  140. flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-3bfac4c3.css} +15 -15
  141. flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-3db6b763.css} +13 -13
  142. flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-ed69bc8f.js} +10 -12
  143. flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-c5244ad5.css} +4 -4
  144. flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-e3ed4528.js} +4 -7
  145. flowfile/web/static/assets/secrets.api-002e7d7e.js +65 -0
  146. flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-80b92899.js} +5 -5
  147. flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
  148. flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-0965f39f.js} +31 -637
  149. flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-c506ad97.js} +1 -1
  150. flowfile/web/static/index.html +2 -2
  151. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/METADATA +4 -4
  152. flowfile-0.5.3.dist-info/RECORD +402 -0
  153. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/WHEEL +1 -1
  154. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/entry_points.txt +1 -0
  155. flowfile_core/__init__.py +13 -3
  156. flowfile_core/auth/jwt.py +51 -16
  157. flowfile_core/auth/models.py +32 -7
  158. flowfile_core/auth/password.py +89 -0
  159. flowfile_core/auth/secrets.py +8 -6
  160. flowfile_core/configs/__init__.py +9 -7
  161. flowfile_core/configs/flow_logger.py +15 -14
  162. flowfile_core/configs/node_store/__init__.py +72 -4
  163. flowfile_core/configs/node_store/nodes.py +155 -172
  164. flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
  165. flowfile_core/configs/settings.py +28 -15
  166. flowfile_core/database/connection.py +7 -6
  167. flowfile_core/database/init_db.py +96 -2
  168. flowfile_core/database/models.py +3 -1
  169. flowfile_core/fileExplorer/__init__.py +17 -0
  170. flowfile_core/fileExplorer/funcs.py +123 -57
  171. flowfile_core/fileExplorer/utils.py +10 -11
  172. flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
  173. flowfile_core/flowfile/analytics/analytics_processor.py +27 -24
  174. flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
  175. flowfile_core/flowfile/analytics/utils.py +1 -1
  176. flowfile_core/flowfile/code_generator/code_generator.py +391 -279
  177. flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
  178. flowfile_core/flowfile/connection_manager/models.py +1 -1
  179. flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
  180. flowfile_core/flowfile/database_connection_manager/models.py +1 -1
  181. flowfile_core/flowfile/extensions.py +17 -12
  182. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
  183. flowfile_core/flowfile/flow_data_engine/create/funcs.py +152 -103
  184. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +526 -477
  185. flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
  186. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
  187. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
  188. flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
  189. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
  190. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +43 -32
  191. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
  192. flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
  193. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +15 -11
  194. flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
  195. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
  196. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
  197. flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
  198. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
  199. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
  200. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +360 -191
  201. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
  202. flowfile_core/flowfile/flow_data_engine/utils.py +101 -67
  203. flowfile_core/flowfile/flow_graph.py +1011 -561
  204. flowfile_core/flowfile/flow_graph_utils.py +31 -49
  205. flowfile_core/flowfile/flow_node/flow_node.py +332 -232
  206. flowfile_core/flowfile/flow_node/models.py +54 -41
  207. flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
  208. flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
  209. flowfile_core/flowfile/handler.py +82 -32
  210. flowfile_core/flowfile/manage/compatibility_enhancements.py +493 -47
  211. flowfile_core/flowfile/manage/io_flowfile.py +391 -0
  212. flowfile_core/flowfile/node_designer/__init__.py +15 -13
  213. flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
  214. flowfile_core/flowfile/node_designer/custom_node.py +162 -36
  215. flowfile_core/flowfile/node_designer/ui_components.py +136 -35
  216. flowfile_core/flowfile/schema_callbacks.py +77 -54
  217. flowfile_core/flowfile/setting_generator/__init__.py +0 -1
  218. flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
  219. flowfile_core/flowfile/setting_generator/settings.py +72 -55
  220. flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
  221. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
  222. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
  223. flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
  224. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
  225. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
  226. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
  227. flowfile_core/flowfile/util/calculate_layout.py +9 -13
  228. flowfile_core/flowfile/util/execution_orderer.py +25 -17
  229. flowfile_core/flowfile/util/node_skipper.py +4 -4
  230. flowfile_core/flowfile/utils.py +19 -21
  231. flowfile_core/main.py +26 -19
  232. flowfile_core/routes/auth.py +284 -11
  233. flowfile_core/routes/cloud_connections.py +25 -25
  234. flowfile_core/routes/logs.py +21 -29
  235. flowfile_core/routes/public.py +3 -3
  236. flowfile_core/routes/routes.py +77 -43
  237. flowfile_core/routes/secrets.py +25 -27
  238. flowfile_core/routes/user_defined_components.py +483 -4
  239. flowfile_core/run_lock.py +0 -1
  240. flowfile_core/schemas/__init__.py +4 -6
  241. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
  242. flowfile_core/schemas/cloud_storage_schemas.py +59 -55
  243. flowfile_core/schemas/input_schema.py +398 -154
  244. flowfile_core/schemas/output_model.py +50 -35
  245. flowfile_core/schemas/schemas.py +207 -67
  246. flowfile_core/schemas/transform_schema.py +1360 -435
  247. flowfile_core/schemas/yaml_types.py +117 -0
  248. flowfile_core/secret_manager/secret_manager.py +17 -13
  249. flowfile_core/{flowfile/node_designer/data_types.py → types.py} +33 -3
  250. flowfile_core/utils/arrow_reader.py +7 -6
  251. flowfile_core/utils/excel_file_manager.py +3 -3
  252. flowfile_core/utils/fileManager.py +7 -7
  253. flowfile_core/utils/fl_executor.py +8 -10
  254. flowfile_core/utils/utils.py +4 -4
  255. flowfile_core/utils/validate_setup.py +5 -4
  256. flowfile_frame/__init__.py +107 -50
  257. flowfile_frame/adapters.py +2 -9
  258. flowfile_frame/adding_expr.py +73 -32
  259. flowfile_frame/cloud_storage/frame_helpers.py +27 -23
  260. flowfile_frame/cloud_storage/secret_manager.py +12 -26
  261. flowfile_frame/config.py +2 -5
  262. flowfile_frame/expr.py +311 -218
  263. flowfile_frame/expr.pyi +160 -159
  264. flowfile_frame/expr_name.py +23 -23
  265. flowfile_frame/flow_frame.py +581 -489
  266. flowfile_frame/flow_frame.pyi +123 -104
  267. flowfile_frame/flow_frame_methods.py +236 -252
  268. flowfile_frame/group_frame.py +50 -20
  269. flowfile_frame/join.py +2 -2
  270. flowfile_frame/lazy.py +129 -87
  271. flowfile_frame/lazy_methods.py +83 -30
  272. flowfile_frame/list_name_space.py +55 -50
  273. flowfile_frame/selectors.py +148 -68
  274. flowfile_frame/series.py +9 -7
  275. flowfile_frame/utils.py +19 -21
  276. flowfile_worker/__init__.py +12 -4
  277. flowfile_worker/configs.py +11 -19
  278. flowfile_worker/create/__init__.py +14 -27
  279. flowfile_worker/create/funcs.py +143 -94
  280. flowfile_worker/create/models.py +139 -68
  281. flowfile_worker/create/pl_types.py +14 -15
  282. flowfile_worker/create/read_excel_tables.py +34 -41
  283. flowfile_worker/create/utils.py +22 -19
  284. flowfile_worker/external_sources/s3_source/main.py +18 -51
  285. flowfile_worker/external_sources/s3_source/models.py +34 -27
  286. flowfile_worker/external_sources/sql_source/main.py +8 -5
  287. flowfile_worker/external_sources/sql_source/models.py +13 -9
  288. flowfile_worker/flow_logger.py +10 -8
  289. flowfile_worker/funcs.py +214 -155
  290. flowfile_worker/main.py +11 -17
  291. flowfile_worker/models.py +35 -28
  292. flowfile_worker/process_manager.py +2 -3
  293. flowfile_worker/routes.py +121 -93
  294. flowfile_worker/secrets.py +9 -6
  295. flowfile_worker/spawner.py +80 -49
  296. flowfile_worker/utils.py +3 -2
  297. shared/__init__.py +2 -7
  298. shared/storage_config.py +25 -13
  299. test_utils/postgres/commands.py +3 -2
  300. test_utils/postgres/fixtures.py +9 -9
  301. test_utils/s3/commands.py +1 -1
  302. test_utils/s3/data_generator.py +3 -4
  303. test_utils/s3/demo_data_generator.py +4 -7
  304. test_utils/s3/fixtures.py +7 -5
  305. tools/migrate/README.md +56 -0
  306. tools/migrate/__init__.py +12 -0
  307. tools/migrate/__main__.py +118 -0
  308. tools/migrate/legacy_schemas.py +682 -0
  309. tools/migrate/migrate.py +610 -0
  310. tools/migrate/tests/__init__.py +0 -0
  311. tools/migrate/tests/conftest.py +21 -0
  312. tools/migrate/tests/test_migrate.py +622 -0
  313. tools/migrate/tests/test_migration_e2e.py +1009 -0
  314. tools/migrate/tests/test_node_migrations.py +843 -0
  315. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
  316. flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
  317. flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
  318. flowfile/web/static/assets/Filter-812dcbca.js +0 -164
  319. flowfile/web/static/assets/Filter-f62091b3.css +0 -20
  320. flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
  321. flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
  322. flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
  323. flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
  324. flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
  325. flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
  326. flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
  327. flowfile/web/static/assets/secretApi-538058f3.js +0 -46
  328. flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
  329. flowfile-0.4.1.dist-info/RECORD +0 -376
  330. flowfile_core/flowfile/manage/open_flowfile.py +0 -143
  331. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/licenses/LICENSE +0 -0
  332. /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +526 -477
@@ -1,52 +1,50 @@
  # Standard library imports
+ from __future__ import annotations
+
  import logging
  import os
+ from collections.abc import Callable, Generator, Iterable
  from copy import deepcopy
  from dataclasses import dataclass
  from math import ceil
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
+ from typing import Any, Literal, TypeVar, Union

- from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
+ import polars as pl

  # Third-party imports
  from loky import Future
- import polars as pl
+ from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
  from polars.exceptions import PanicException
- from polars_grouper import graph_solver
  from polars_expr_transformer import simple_function_to_expr as to_expr
+ from polars_grouper import graph_solver
  from pyarrow import Table as PaTable
  from pyarrow.parquet import ParquetFile

  # Local imports - Core
  from flowfile_core.configs import logger
- from flowfile_core.utils.utils import ensure_similarity_dicts
  from flowfile_core.configs.flow_logger import NodeLogger
- from flowfile_core.schemas import (
-     cloud_storage_schemas,
-     input_schema,
-     transform_schema as transform_schemas
- )
- from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location

  # Local imports - Flow File Components
  from flowfile_core.flowfile.flow_data_engine import utils
- from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
-                                                                           ensure_path_has_wildcard_pattern,
-                                                                           get_first_file_from_s3_dir)
+ from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
+     CloudStorageReader,
+     ensure_path_has_wildcard_pattern,
+     get_first_file_from_s3_dir,
+ )
  from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
      FlowfileColumn,
      assert_if_flowfile_schema,
-     convert_stats_to_column_info
+     convert_stats_to_column_info,
  )
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
  from flowfile_core.flowfile.flow_data_engine.join import (
-     verify_join_select_integrity,
-     verify_join_map_integrity,
-     rename_df_table_for_join,
+     get_col_name_to_delete,
      get_undo_rename_mapping_join,
-     get_col_name_to_delete
+     rename_df_table_for_join,
+     verify_join_map_integrity,
+     verify_join_select_integrity,
  )
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
  from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -55,19 +53,21 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
      ExternalDfFetcher,
      ExternalExecutorTracker,
      ExternalFuzzyMatchFetcher,
-     fetch_unique_values
+     fetch_unique_values,
  )
- from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
-     get_join_count,
-     write_threaded
- )
-
+ from flowfile_core.flowfile.flow_data_engine.threaded_processes import write_threaded
  from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
+ from flowfile_core.schemas import cloud_storage_schemas, input_schema
+ from flowfile_core.schemas import transform_schema as transform_schemas
+ from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
+ from flowfile_core.utils.utils import ensure_similarity_dicts

- T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
+ T = TypeVar("T", pl.DataFrame, pl.LazyFrame)


- def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
+ def _handle_duplication_join_keys(
+     left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager
+ ) -> tuple[T, T, dict[str, str]]:
      """Temporarily renames join keys to avoid conflicts during a join.

      This helper function checks the join type and renames the join key columns
@@ -86,20 +86,28 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform
          - The (potentially modified) right DataFrame.
          - A dictionary mapping the temporary names back to their desired final names.
      """
+
      def _construct_temp_name(column_name: str) -> str:
-         return "__FL_TEMP__"+column_name
-     if join_input.how == 'right':
-         left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
-                                        for jk in join_input.left_select.join_key_selects)
+         return "__FL_TEMP__" + column_name
+
+     if join_manager.how == "right":
+         left_df = left_df.with_columns(
+             pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+             for jk in join_manager.left_manager.get_join_key_selects()
+         )
          reverse_actions = {
              _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
-             for jk in join_input.left_select.join_key_selects}
-     elif join_input.how in ('left', 'inner'):
-         right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
-                                          for jk in join_input.right_select.join_key_selects)
+             for jk in join_manager.left_manager.get_join_key_selects()
+         }
+     elif join_manager.how in ("left", "inner"):
+         right_df = right_df.with_columns(
+             pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+             for jk in join_manager.right_manager.get_join_key_selects()
+         )
          reverse_actions = {
              _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
-             for jk in join_input.right_select.join_key_selects}
+             for jk in join_manager.right_manager.get_join_key_selects()
+         }
      else:
          reverse_actions = {}
      return left_df, right_df, reverse_actions
@@ -116,12 +124,12 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
      Args:
          join_input: The JoinInput settings object to modify.
      """
-     if join_input.how in ('semi', 'anti'):
+     if join_input.how in ("semi", "anti"):
          for jk in join_input.right_select.renames:
              jk.keep = False


- def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+ def get_select_columns(full_select_input: list[transform_schemas.SelectInput]) -> list[str]:
      """Extracts a list of column names to be selected from a SelectInput list.

      This function filters a list of `SelectInput` objects to return the names
@@ -154,15 +162,16 @@ class FlowDataEngine:
          errors: A list of errors encountered during operations.
          _schema: A cached list of `FlowfileColumn` objects representing the schema.
      """
+
      # Core attributes
-     _data_frame: Union[pl.DataFrame, pl.LazyFrame]
-     columns: List[Any]
+     _data_frame: pl.DataFrame | pl.LazyFrame
+     columns: list[Any]

      # Metadata attributes
      name: str = None
      number_of_records: int = None
-     errors: List = None
-     _schema: Optional[List['FlowfileColumn']] = None
+     errors: list = None
+     _schema: list["FlowfileColumn"] | None = None

      # Configuration attributes
      _optimize_memory: bool = False
@@ -171,16 +180,16 @@ class FlowDataEngine:
      _calculate_schema_stats: bool = False

      # Cache and optimization attributes
-     __col_name_idx_map: Dict = None
-     __data_map: Dict = None
-     __optimized_columns: List = None
+     __col_name_idx_map: dict = None
+     __data_map: dict = None
+     __optimized_columns: list = None
      __sample__: str = None
      __number_of_fields: int = None
-     _col_idx: Dict[str, int] = None
+     _col_idx: dict[str, int] = None

      # Source tracking
-     _org_path: Optional[str] = None
-     _external_source: Optional[ExternalDataSource] = None
+     _org_path: str | None = None
+     _external_source: ExternalDataSource | None = None

      # State tracking
      sorted_by: int = None
@@ -193,18 +202,21 @@ class FlowDataEngine:
      _number_of_records_callback: Callable = None
      _data_callback: Callable = None

-
-     def __init__(self,
-                  raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
-                  path_ref: str = None,
-                  name: str = None,
-                  optimize_memory: bool = True,
-                  schema: List['FlowfileColumn'] | List[str] | pl.Schema = None,
-                  number_of_records: int = None,
-                  calculate_schema_stats: bool = False,
-                  streamable: bool = True,
-                  number_of_records_callback: Callable = None,
-                  data_callback: Callable = None):
+     def __init__(
+         self,
+         raw_data: Union[
+             list[dict], list[Any], dict[str, Any], "ParquetFile", pl.DataFrame, pl.LazyFrame, input_schema.RawData
+         ] = None,
+         path_ref: str = None,
+         name: str = None,
+         optimize_memory: bool = True,
+         schema: list["FlowfileColumn"] | list[str] | pl.Schema = None,
+         number_of_records: int = None,
+         calculate_schema_stats: bool = False,
+         streamable: bool = True,
+         number_of_records_callback: Callable = None,
+         data_callback: Callable = None,
+     ):
          """Initializes the FlowDataEngine from various data sources.

          Args:
@@ -264,12 +276,12 @@ class FlowDataEngine:
          elif isinstance(raw_data, (list, dict)):
              self._handle_python_data(raw_data)

-     def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
+     def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: int | None):
          """(Internal) Initializes the engine from an eager Polars DataFrame."""
          self.data_frame = df
          self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

-     def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
+     def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: int | None, optimize_memory: bool):
          """(Internal) Initializes the engine from a Polars LazyFrame."""
          self.data_frame = lf
          self._lazy = True
@@ -280,14 +292,14 @@ class FlowDataEngine:
          else:
              self.number_of_records = lf.select(pl.len()).collect()[0, 0]

-     def _handle_python_data(self, data: Union[List, Dict]):
+     def _handle_python_data(self, data: list | dict):
          """(Internal) Dispatches Python collections to the correct handler."""
          if isinstance(data, dict):
              self._handle_dict_input(data)
          else:
              self._handle_list_input(data)

-     def _handle_dict_input(self, data: Dict):
+     def _handle_dict_input(self, data: dict):
          """(Internal) Initializes the engine from a Python dictionary."""
          if len(data) == 0:
              self.initialize_empty_fl()
@@ -311,8 +323,12 @@
              raw_data: An instance of `RawData` containing the data and schema.
          """
          flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
-         polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
-                                    for flowfile_column in flowfile_schema])
+         polars_schema = pl.Schema(
+             [
+                 (flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+                 for flowfile_column in flowfile_schema
+             ]
+         )
          try:
              df = pl.DataFrame(raw_data.data, polars_schema, strict=False)
          except TypeError as e:
@@ -322,7 +338,7 @@
          self.data_frame = df.lazy()
          self.lazy = True

-     def _handle_list_input(self, data: List):
+     def _handle_list_input(self, data: list):
          """(Internal) Initializes the engine from a list of records."""
          number_of_records = len(data)
          if number_of_records > 0:
@@ -335,19 +351,19 @@
              self.number_of_records = 0

      @staticmethod
-     def _process_list_data(data: List) -> List[Dict]:
+     def _process_list_data(data: list) -> list[dict]:
          """(Internal) Normalizes list data into a list of dictionaries.

          Ensures that a list of objects or non-dict items is converted into a
          uniform list of dictionaries suitable for Polars DataFrame creation.
          """
-         if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
+         if not (isinstance(data[0], dict) or hasattr(data[0], "__dict__")):
              try:
                  return pl.DataFrame(data).to_dicts()
              except TypeError:
-                 raise Exception('Value must be able to be converted to dictionary')
+                 raise Exception("Value must be able to be converted to dictionary")
              except Exception as e:
-                 raise Exception(f'Value must be able to be converted to dictionary: {e}')
+                 raise Exception(f"Value must be able to be converted to dictionary: {e}")

          if not isinstance(data[0], dict):
              data = [row.__dict__ for row in data]
@@ -374,49 +390,37 @@ class FlowDataEngine:

          logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")

-         if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+         if write_settings.write_mode == "append" and write_settings.file_format != "delta":
              raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
          storage_options = CloudStorageReader.get_storage_options(connection)
          credential_provider = CloudStorageReader.get_credential_provider(connection)
          # Dispatch to the correct writer based on file format
          if write_settings.file_format == "parquet":
              self._write_parquet_to_cloud(
-                 write_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 write_settings
+                 write_settings.resource_path, storage_options, credential_provider, write_settings
              )
          elif write_settings.file_format == "delta":
              self._write_delta_to_cloud(
-                 write_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 write_settings
+                 write_settings.resource_path, storage_options, credential_provider, write_settings
              )
          elif write_settings.file_format == "csv":
-             self._write_csv_to_cloud(
-                 write_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 write_settings
-             )
+             self._write_csv_to_cloud(write_settings.resource_path, storage_options, credential_provider, write_settings)
          elif write_settings.file_format == "json":
              self._write_json_to_cloud(
-                 write_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 write_settings
+                 write_settings.resource_path, storage_options, credential_provider, write_settings
              )
          else:
              raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")

          logger.info(f"Successfully wrote data to {write_settings.resource_path}")

-     def _write_parquet_to_cloud(self,
-                                 resource_path: str,
-                                 storage_options: Dict[str, Any],
-                                 credential_provider: Optional[Callable],
-                                 write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+     def _write_parquet_to_cloud(
+         self,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+     ):
          """(Internal) Writes the DataFrame to a Parquet file in cloud storage.

          Uses `sink_parquet` for efficient streaming writes. Falls back to a
@@ -436,18 +440,20 @@ class FlowDataEngine:
              except Exception as e:
                  logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
                  pl_df = self.collect()
-                 sink_kwargs['file'] = sink_kwargs.pop("path")
+                 sink_kwargs["file"] = sink_kwargs.pop("path")
                  pl_df.write_parquet(**sink_kwargs)

          except Exception as e:
              logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
              raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")

-     def _write_delta_to_cloud(self,
-                               resource_path: str,
-                               storage_options: Dict[str, Any],
-                               credential_provider: Optional[Callable],
-                               write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+     def _write_delta_to_cloud(
+         self,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+     ):
          """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.

          This operation requires collecting the data first, as `write_delta` operates
@@ -463,11 +469,13 @@ class FlowDataEngine:
              sink_kwargs["credential_provider"] = credential_provider
          self.collect().write_delta(**sink_kwargs)

-     def _write_csv_to_cloud(self,
-                             resource_path: str,
-                             storage_options: Dict[str, Any],
-                             credential_provider: Optional[Callable],
-                             write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+     def _write_csv_to_cloud(
+         self,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+     ):
          """(Internal) Writes the DataFrame to a CSV file in cloud storage.

          Uses `sink_csv` for efficient, streaming writes of the data.
@@ -489,11 +497,13 @@ class FlowDataEngine:
              logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
              raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")

-     def _write_json_to_cloud(self,
-                              resource_path: str,
-                              storage_options: Dict[str, Any],
-                              credential_provider: Optional[Callable],
-                              write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+     def _write_json_to_cloud(
+         self,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+     ):
          """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.

          Uses `sink_ndjson` for efficient, streaming writes.
@@ -511,7 +521,9 @@ class FlowDataEngine:
              raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")

      @classmethod
-     def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
+     def from_cloud_storage_obj(
+         cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal
+     ) -> "FlowDataEngine":
          """Creates a FlowDataEngine from an object in cloud storage.

          This method supports reading from various cloud storage providers like AWS S3,
@@ -548,31 +560,22 @@ class FlowDataEngine:
              )
          elif read_settings.file_format == "delta":
              return cls._read_delta_from_cloud(
-                 read_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 read_settings
+                 read_settings.resource_path, storage_options, credential_provider, read_settings
              )
          elif read_settings.file_format == "csv":
              return cls._read_csv_from_cloud(
-                 read_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 read_settings
+                 read_settings.resource_path, storage_options, credential_provider, read_settings
              )
          elif read_settings.file_format == "json":
              return cls._read_json_from_cloud(
                  read_settings.resource_path,
                  storage_options,
                  credential_provider,
-                 read_settings.scan_mode == "directory"
+                 read_settings.scan_mode == "directory",
              )
          elif read_settings.file_format == "iceberg":
              return cls._read_iceberg_from_cloud(
-                 read_settings.resource_path,
-                 storage_options,
-                 credential_provider,
-                 read_settings
+                 read_settings.resource_path, storage_options, credential_provider, read_settings
              )

          elif read_settings.file_format in ["delta", "iceberg"]:
@@ -582,33 +585,40 @@ class FlowDataEngine:
              raise ValueError(f"Unsupported file format: {read_settings.file_format}")

      @staticmethod
-     def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
-                                            file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
+     def _get_schema_from_first_file_in_dir(
+         source: str, storage_options: dict[str, Any], file_format: Literal["csv", "parquet", "json", "delta"]
+     ) -> list[FlowfileColumn] | None:
          """Infers the schema by scanning the first file in a cloud directory."""
          try:
              scan_func = getattr(pl, "scan_" + file_format)
              first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
-             return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
-                 scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
+             return convert_stats_to_column_info(
+                 FlowDataEngine._create_schema_stats_from_pl_schema(
+                     scan_func(first_file_ref, storage_options=storage_options).collect_schema()
+                 )
+             )
          except Exception as e:
              logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")

-
      @classmethod
-     def _read_iceberg_from_cloud(cls,
-                                  resource_path: str,
-                                  storage_options: Dict[str, Any],
-                                  credential_provider: Optional[Callable],
-                                  read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+     def _read_iceberg_from_cloud(
+         cls,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+     ) -> "FlowDataEngine":
          """Reads Iceberg table(s) from cloud storage."""
-         raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+         raise NotImplementedError("Failed to read Iceberg table from cloud storage: Not yet implemented")

      @classmethod
-     def _read_parquet_from_cloud(cls,
-                                  resource_path: str,
-                                  storage_options: Dict[str, Any],
-                                  credential_provider: Optional[Callable],
-                                  is_directory: bool) -> "FlowDataEngine":
+     def _read_parquet_from_cloud(
+         cls,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         is_directory: bool,
+     ) -> "FlowDataEngine":
          """Reads Parquet file(s) from cloud storage."""
          try:
              # Use scan_parquet for lazy evaluation
@@ -632,7 +642,7 @@ class FlowDataEngine:
                  number_of_records=6_666_666, # Set so the provider is not accessed for this stat
                  optimize_memory=True,
                  streamable=True,
-                 schema=schema
+                 schema=schema,
              )

          except Exception as e:
@@ -640,18 +650,20 @@ class FlowDataEngine:
              raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")

      @classmethod
-     def _read_delta_from_cloud(cls,
-                                resource_path: str,
-                                storage_options: Dict[str, Any],
-                                credential_provider: Optional[Callable],
-                                read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+     def _read_delta_from_cloud(
+         cls,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+     ) -> "FlowDataEngine":
          """Reads a Delta Lake table from cloud storage."""
          try:
              logger.info("Reading Delta file from cloud storage...")
              logger.info(f"read_settings: {read_settings}")
              scan_kwargs = {"source": resource_path}
              if read_settings.delta_version:
-                 scan_kwargs['version'] = read_settings.delta_version
+                 scan_kwargs["version"] = read_settings.delta_version
              if storage_options:
                  scan_kwargs["storage_options"] = storage_options
              if credential_provider:
@@ -662,18 +674,20 @@ class FlowDataEngine:
                  lf,
                  number_of_records=6_666_666, # Set so the provider is not accessed for this stat
                  optimize_memory=True,
-                 streamable=True
+                 streamable=True,
              )
          except Exception as e:
              logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
              raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")

      @classmethod
-     def _read_csv_from_cloud(cls,
-                              resource_path: str,
-                              storage_options: Dict[str, Any],
-                              credential_provider: Optional[Callable],
-                              read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+     def _read_csv_from_cloud(
+         cls,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+     ) -> "FlowDataEngine":
          """Reads CSV file(s) from cloud storage."""
          try:
              scan_kwargs = {
@@ -702,7 +716,7 @@ class FlowDataEngine:
                  number_of_records=6_666_666, # Will be calculated lazily
                  optimize_memory=True,
                  streamable=True,
-                 schema=schema
+                 schema=schema,
              )

          except Exception as e:
@@ -710,11 +724,13 @@ class FlowDataEngine:
              raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")

      @classmethod
-     def _read_json_from_cloud(cls,
-                               resource_path: str,
-                               storage_options: Dict[str, Any],
-                               credential_provider: Optional[Callable],
-                               is_directory: bool) -> "FlowDataEngine":
+     def _read_json_from_cloud(
+         cls,
+         resource_path: str,
+         storage_options: dict[str, Any],
+         credential_provider: Callable | None,
+         is_directory: bool,
+     ) -> "FlowDataEngine":
          """Reads JSON file(s) from cloud storage."""
          try:
              if is_directory:
@@ -754,8 +770,9 @@ class FlowDataEngine:
          else:
              self.data_frame = pl.read_parquet(path_ref)

-     def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
-                                  calculate_schema_stats: bool):
+     def _finalize_initialization(
+         self, name: str, optimize_memory: bool, schema: Any | None, calculate_schema_stats: bool
+     ):
          """Finalizes initialization by setting remaining attributes."""
          _ = calculate_schema_stats
          self.name = name
@@ -802,23 +819,20 @@ class FlowDataEngine:
      def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
          """Sets the underlying Polars DataFrame or LazyFrame."""
          if self.lazy and isinstance(df, pl.DataFrame):
-             raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
+             raise Exception("Cannot set a non-lazy dataframe to a lazy flowfile")
          self._data_frame = df

      @staticmethod
-     def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+     def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> list[dict]:
          """Converts a Polars Schema into a list of schema statistics dictionaries."""
-         return [
-             dict(column_name=k, pl_datatype=v, col_index=i)
-             for i, (k, v) in enumerate(pl_schema.items())
-         ]
+         return [dict(column_name=k, pl_datatype=v, col_index=i) for i, (k, v) in enumerate(pl_schema.items())]

-     def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+     def _add_schema_from_schema_stats(self, schema_stats: list[dict]):
          """Populates the schema from a list of schema statistics dictionaries."""
          self._schema = convert_stats_to_column_info(schema_stats)

      @property
-     def schema(self) -> List[FlowfileColumn]:
+     def schema(self) -> list[FlowfileColumn]:
          """The schema of the DataFrame as a list of `FlowfileColumn` objects.
          This property lazily calculates the schema if it hasn't been determined yet.

@@ -865,8 +879,10 @@ class FlowDataEngine:
          if n_records is None:
              logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
          else:
-             logger.info(f'Fetching {n_records} record(s) for Table object "{id(self)}". '
-                         f'Settings: streaming={self._streamable}')
+             logger.info(
+                 f'Fetching {n_records} record(s) for Table object "{id(self)}". '
+                 f"Settings: streaming={self._streamable}"
+             )

          if not self.lazy:
              return self.data_frame
@@ -880,16 +896,15 @@ class FlowDataEngine:
      def _collect_data(self, n_records: int = None) -> pl.DataFrame:
          """Internal method to handle data collection logic."""
          if n_records is None:
-
              self.collect_external()
              if self._streamable:
                  try:
-                     logger.info('Collecting data in streaming mode')
+                     logger.info("Collecting data in streaming mode")
                      return self.data_frame.collect(engine="streaming")
                  except PanicException:
                      self._streamable = False

-             logger.info('Collecting data in non-streaming mode')
+             logger.info("Collecting data in non-streaming mode")
              return self.data_frame.collect()

          if self.external_source is not None:
@@ -918,7 +933,7 @@ class FlowDataEngine:
              return self._create_partial_dataframe(ok_cols, error_cols, n_records)
          return self._create_empty_dataframe(n_records)

-     def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
+     def _identify_valid_columns(self, n_records: int) -> tuple[list[str], list[tuple[str, Any]]]:
          """Identifies which columns can be collected successfully."""
          ok_cols = []
          error_cols = []
@@ -930,30 +945,30 @@ class FlowDataEngine:
                  error_cols.append((c, self.data_frame.schema[c]))
          return ok_cols, error_cols

-     def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
-                                   n_records: int) -> pl.DataFrame:
+     def _create_partial_dataframe(
+         self, ok_cols: list[str], error_cols: list[tuple[str, Any]], n_records: int
+     ) -> pl.DataFrame:
          """Creates a DataFrame with partial data for columns that could be collected."""
          df = self.data_frame.select(ok_cols)
-         df = df.with_columns([
-             pl.lit(None).alias(column_name).cast(data_type)
-             for column_name, data_type in error_cols
-         ])
+         df = df.with_columns([pl.lit(None).alias(column_name).cast(data_type) for column_name, data_type in error_cols])
          return df.select(self.columns).head(n_records).collect()

      def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
          """Creates an empty DataFrame with the correct schema."""
          if self.number_of_records > 0:
-             return pl.DataFrame({
-                 column_name: pl.Series(
-                     name=column_name,
-                     values=[None] * min(self.number_of_records, n_records)
-                 ).cast(data_type)
-                 for column_name, data_type in self.data_frame.schema.items()
-             })
+             return pl.DataFrame(
+                 {
+                     column_name: pl.Series(
+                         name=column_name, values=[None] * min(self.number_of_records, n_records)
+                     ).cast(data_type)
+                     for column_name, data_type in self.data_frame.schema.items()
+                 }
+             )
          return pl.DataFrame(schema=self.data_frame.schema)

-     def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
-                     calculate_schema_stats: bool = True) -> "FlowDataEngine":
+     def do_group_by(
+         self, group_by_input: transform_schemas.GroupByInput, calculate_schema_stats: bool = True
+     ) -> "FlowDataEngine":
          """Performs a group-by operation on the DataFrame.

          Args:
@@ -965,27 +980,23 @@ class FlowDataEngine:
965
980
  Returns:
966
981
  A new `FlowDataEngine` instance with the grouped and aggregated data.
967
982
  """
968
- aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
969
- group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']
983
+ aggregations = [c for c in group_by_input.agg_cols if c.agg != "groupby"]
984
+ group_columns = [c for c in group_by_input.agg_cols if c.agg == "groupby"]
970
985
 
971
986
  if len(group_columns) == 0:
972
987
  return FlowDataEngine(
973
- self.data_frame.select(
974
- ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
975
- ),
976
- calculate_schema_stats=calculate_schema_stats
988
+ self.data_frame.select(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
989
+ calculate_schema_stats=calculate_schema_stats,
977
990
  )
978
991
 
979
992
  df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
980
993
  group_by_columns = [n_c.new_name for n_c in group_columns]
981
994
  return FlowDataEngine(
982
- df.group_by(*group_by_columns).agg(
983
- ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
984
- ),
985
- calculate_schema_stats=calculate_schema_stats
995
+ df.group_by(*group_by_columns).agg(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
996
+ calculate_schema_stats=calculate_schema_stats,
986
997
  )
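Note: the rewritten `do_group_by` above splits `agg_cols` into group keys and aggregations, then builds the `agg()` call from `(old_name, agg_func, new_name)` entries. A minimal sketch of that pattern, with a hypothetical tuple list standing in for `GroupByInput.agg_cols`:

```python
import polars as pl

df = pl.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

# Hypothetical stand-in for GroupByInput.agg_cols: (source column, aggregation, output name).
aggregations = [("value", pl.sum, "value_sum"), ("value", pl.mean, "value_mean")]

out = df.group_by("group").agg(agg(col).alias(name) for col, agg, name in aggregations)
```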
987
998
 
988
- def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
999
+ def do_sort(self, sorts: list[transform_schemas.SortByInput]) -> "FlowDataEngine":
989
1000
  """Sorts the DataFrame by one or more columns.
990
1001
 
991
1002
  Args:
@@ -998,12 +1009,13 @@ class FlowDataEngine:
998
1009
  if not sorts:
999
1010
  return self
1000
1011
 
1001
- descending = [s.how == 'desc' or s.how.lower() == 'descending' for s in sorts]
1012
+ descending = [s.how == "desc" or s.how.lower() == "descending" for s in sorts]
1002
1013
  df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
1003
1014
  return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)
1004
1015
 
1005
- def change_column_types(self, transforms: List[transform_schemas.SelectInput],
1006
- calculate_schema: bool = False) -> "FlowDataEngine":
1016
+ def change_column_types(
1017
+ self, transforms: list[transform_schemas.SelectInput], calculate_schema: bool = False
1018
+ ) -> "FlowDataEngine":
1007
1019
  """Changes the data type of one or more columns.
1008
1020
 
1009
1021
  Args:
@@ -1017,7 +1029,8 @@ class FlowDataEngine:
1017
1029
  dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
1018
1030
  idx_mapping = list(
1019
1031
  (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
1020
- for transform in transforms if transform.data_type is not None
1032
+ for transform in transforms
1033
+ if transform.data_type is not None
1021
1034
  )
1022
1035
 
1023
1036
  actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]
@@ -1031,10 +1044,10 @@ class FlowDataEngine:
1031
1044
  df,
1032
1045
  number_of_records=self.number_of_records,
1033
1046
  calculate_schema_stats=calculate_schema,
1034
- streamable=self._streamable
1047
+ streamable=self._streamable,
1035
1048
  )
1036
1049
 
1037
- def save(self, path: str, data_type: str = 'parquet') -> Future:
1050
+ def save(self, path: str, data_type: str = "parquet") -> Future:
1038
1051
  """Saves the DataFrame to a file in a separate thread.
1039
1052
 
1040
1053
  Args:
@@ -1048,7 +1061,7 @@ class FlowDataEngine:
1048
1061
  df = deepcopy(self.data_frame)
1049
1062
  return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
1050
1063
 
1051
- def to_pylist(self) -> List[Dict]:
1064
+ def to_pylist(self) -> list[dict]:
1052
1065
  """Converts the DataFrame to a list of Python dictionaries.
1053
1066
 
1054
1067
  Returns:
@@ -1082,15 +1095,15 @@ class FlowDataEngine:
1082
1095
  data = list(self.to_dict().values())
1083
1096
  return input_schema.RawData(columns=columns, data=data)
1084
1097
 
1085
- def to_dict(self) -> Dict[str, List]:
1098
+ def to_dict(self) -> dict[str, list]:
1086
1099
  """Converts the DataFrame to a Python dictionary of columns.
1087
1100
 
1088
- Each key in the dictionary is a column name, and the corresponding value
1089
- is a list of the data in that column.
1101
+ Each key in the dictionary is a column name, and the corresponding value
1102
+ is a list of the data in that column.
1090
1103
 
1091
- Returns:
1092
- A dictionary mapping column names to lists of their values.
1093
- """
1104
+ Returns:
1105
+ A dictionary mapping column names to lists of their values.
1106
+ """
1094
1107
  if self.lazy:
1095
1108
  return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
1096
1109
  else:
@@ -1130,7 +1143,7 @@ class FlowDataEngine:
1130
1143
  return cls(pl.read_sql(sql, conn))
1131
1144
 
1132
1145
  @classmethod
1133
- def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
1146
+ def create_from_schema(cls, schema: list[FlowfileColumn]) -> "FlowDataEngine":
1134
1147
  """Creates an empty FlowDataEngine from a schema definition.
1135
1148
 
1136
1149
  Args:
@@ -1147,7 +1160,7 @@ class FlowDataEngine:
1147
1160
  return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
1148
1161
 
1149
1162
  @classmethod
1150
- def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
1163
+ def create_from_path(cls, received_table: input_schema.ReceivedTable) -> "FlowDataEngine":
1151
1164
  """Creates a FlowDataEngine from a local file path.
1152
1165
 
1153
1166
  Supports various file types like CSV, Parquet, and Excel.
@@ -1161,14 +1174,14 @@ class FlowDataEngine:
1161
1174
  """
1162
1175
  received_table.set_absolute_filepath()
1163
1176
  file_type_handlers = {
1164
- 'csv': create_funcs.create_from_path_csv,
1165
- 'parquet': create_funcs.create_from_path_parquet,
1166
- 'excel': create_funcs.create_from_path_excel
1177
+ "csv": create_funcs.create_from_path_csv,
1178
+ "parquet": create_funcs.create_from_path_parquet,
1179
+ "excel": create_funcs.create_from_path_excel,
1167
1180
  }
1168
1181
 
1169
1182
  handler = file_type_handlers.get(received_table.file_type)
1170
1183
  if not handler:
1171
- raise Exception(f'Cannot create from {received_table.file_type}')
1184
+ raise Exception(f"Cannot create from {received_table.file_type}")
1172
1185
 
1173
1186
  flow_file = cls(handler(received_table))
1174
1187
  flow_file._org_path = received_table.abs_file_path
@@ -1189,7 +1202,7 @@ class FlowDataEngine:
1189
1202
  return cls(create_fake_data(number_of_records))
1190
1203
 
1191
1204
  @classmethod
1192
- def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
1205
+ def generate_enumerator(cls, length: int = 1000, output_name: str = "output_column") -> "FlowDataEngine":
1193
1206
  """Generates a FlowDataEngine with a single column containing a sequence of integers.
1194
1207
 
1195
1208
  Args:
@@ -1203,8 +1216,9 @@ class FlowDataEngine:
1203
1216
  length = 10_000_000
1204
1217
  return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
1205
1218
 
1206
- def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
1207
- pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
1219
+ def _handle_schema(
1220
+ self, schema: list[FlowfileColumn] | list[str] | pl.Schema | None, pl_schema: pl.Schema
1221
+ ) -> list[FlowfileColumn] | None:
1208
1222
  """Handles schema processing and validation during initialization."""
1209
1223
  if schema is None and pl_schema is not None:
1210
1224
  return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
@@ -1215,7 +1229,8 @@ class FlowDataEngine:
1215
1229
  elif pl_schema is not None and schema is not None:
1216
1230
  if schema.__len__() != pl_schema.__len__():
1217
1231
  raise Exception(
1218
- f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
1232
+ f"Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}"
1233
+ )
1219
1234
  if isinstance(schema, pl.Schema):
1220
1235
  return self._handle_polars_schema(schema, pl_schema)
1221
1236
  elif isinstance(schema, list) and len(schema) == 0:
@@ -1224,31 +1239,29 @@ class FlowDataEngine:
1224
1239
  return self._handle_string_schema(schema, pl_schema)
1225
1240
  return schema
1226
1241
 
1227
- def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
1242
+ def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> list[FlowfileColumn]:
1228
1243
  """Handles Polars schema conversion."""
1229
1244
  flow_file_columns = [
1230
1245
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
1231
- for col_name, dtype in zip(schema.names(), schema.dtypes())
1246
+ for col_name, dtype in zip(schema.names(), schema.dtypes(), strict=False)
1232
1247
  ]
1233
1248
 
1234
1249
  select_arg = [
1235
1250
  pl.col(o).alias(n).cast(schema_dtype)
1236
- for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
1251
+ for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes(), strict=False)
1237
1252
  ]
1238
1253
 
1239
1254
  self.data_frame = self.data_frame.select(select_arg)
1240
1255
  return flow_file_columns
1241
1256
 
1242
- def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
1257
+ def _handle_string_schema(self, schema: list[str], pl_schema: pl.Schema) -> list[FlowfileColumn]:
1243
1258
  """Handles string-based schema conversion."""
1244
1259
  flow_file_columns = [
1245
1260
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
1246
- for col_name, dtype in zip(schema, pl_schema.dtypes())
1261
+ for col_name, dtype in zip(schema, pl_schema.dtypes(), strict=False)
1247
1262
  ]
1248
1263
 
1249
- self.data_frame = self.data_frame.rename({
1250
- o: n for o, n in zip(pl_schema.names(), schema)
1251
- })
1264
+ self.data_frame = self.data_frame.rename({o: n for o, n in zip(pl_schema.names(), schema, strict=False)})
1252
1265
 
1253
1266
  return flow_file_columns
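Note: the schema-handling hunks above add `strict=False` to every `zip(...)` call, presumably to make the truncating behaviour explicit now that Python 3.10+ offers a strict mode. A short illustration of what that flag changes:

```python
names = ["a", "b"]
dtypes = ["Int64", "String", "Float64"]

# strict=False keeps the classic behaviour: iteration stops at the shorter input.
pairs = list(zip(names, dtypes, strict=False))  # [("a", "Int64"), ("b", "String")]

# zip(names, dtypes, strict=True) would raise ValueError because the lengths differ.
```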
1254
1267
 
@@ -1266,25 +1279,16 @@ class FlowDataEngine:
1266
1279
  A new `FlowDataEngine` instance with the exploded rows.
1267
1280
  """
1268
1281
  output_column_name = (
1269
- split_input.output_column_name
1270
- if split_input.output_column_name
1271
- else split_input.column_to_split
1282
+ split_input.output_column_name if split_input.output_column_name else split_input.column_to_split
1272
1283
  )
1273
1284
 
1274
1285
  split_value = (
1275
- split_input.split_fixed_value
1276
- if split_input.split_by_fixed_value
1277
- else pl.col(split_input.split_by_column)
1286
+ split_input.split_fixed_value if split_input.split_by_fixed_value else pl.col(split_input.split_by_column)
1278
1287
  )
1279
1288
 
1280
- df = (
1281
- self.data_frame.with_columns(
1282
- pl.col(split_input.column_to_split)
1283
- .str.split(by=split_value)
1284
- .alias(output_column_name)
1285
- )
1286
- .explode(output_column_name)
1287
- )
1289
+ df = self.data_frame.with_columns(
1290
+ pl.col(split_input.column_to_split).str.split(by=split_value).alias(output_column_name)
1291
+ ).explode(output_column_name)
1288
1292
 
1289
1293
  return FlowDataEngine(df)
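Note: the split hunk above reduces to a string split followed by an explode. A minimal sketch with a fixed delimiter and hypothetical column names:

```python
import polars as pl

df = pl.DataFrame({"id": [1, 2], "tags": ["a,b,c", "d"]})

# Split the string column into lists, then explode so each element becomes its own row.
out = df.with_columns(pl.col("tags").str.split(",").alias("tag")).explode("tag")
```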
1290
1294
 
@@ -1304,15 +1308,9 @@ class FlowDataEngine:
1304
1308
  lf = self.data_frame
1305
1309
 
1306
1310
  if unpivot_input.data_type_selector_expr is not None:
1307
- result = lf.unpivot(
1308
- on=unpivot_input.data_type_selector_expr(),
1309
- index=unpivot_input.index_columns
1310
- )
1311
+ result = lf.unpivot(on=unpivot_input.data_type_selector_expr(), index=unpivot_input.index_columns)
1311
1312
  elif unpivot_input.value_columns is not None:
1312
- result = lf.unpivot(
1313
- on=unpivot_input.value_columns,
1314
- index=unpivot_input.index_columns
1315
- )
1313
+ result = lf.unpivot(on=unpivot_input.value_columns, index=unpivot_input.index_columns)
1316
1314
  else:
1317
1315
  result = lf.unpivot()
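Note: the unpivot hunk above simply forwards `on`/`index` to `LazyFrame.unpivot`. A minimal sketch, assuming a Polars version where `unpivot` has replaced `melt`:

```python
import polars as pl

lf = pl.LazyFrame({"id": [1, 2], "x": [10, 20], "y": [30, 40]})

# Long format: one row per (id, variable, value) combination.
long = lf.unpivot(on=["x", "y"], index="id").collect()
```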
1318
1316
 
@@ -1332,19 +1330,24 @@ class FlowDataEngine:
1332
1330
  """
1333
1331
  # Get unique values for pivot columns
1334
1332
  max_unique_vals = 200
1335
- new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
1336
- .unique()
1337
- .sort(pivot_input.pivot_column)
1338
- .limit(max_unique_vals).cast(pl.String))
1333
+ new_cols_unique = fetch_unique_values(
1334
+ self.data_frame.select(pivot_input.pivot_column)
1335
+ .unique()
1336
+ .sort(pivot_input.pivot_column)
1337
+ .limit(max_unique_vals)
1338
+ .cast(pl.String)
1339
+ )
1339
1340
  if len(new_cols_unique) >= max_unique_vals:
1340
1341
  if node_logger:
1341
- node_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
1342
- f' Max unique values: {max_unique_vals}')
1342
+ node_logger.warning(
1343
+ "Pivot column has too many unique values. Please consider using a different column."
1344
+ f" Max unique values: {max_unique_vals}"
1345
+ )
1343
1346
 
1344
1347
  if len(pivot_input.index_columns) == 0:
1345
1348
  no_index_cols = True
1346
- pivot_input.index_columns = ['__temp__']
1347
- ff = self.apply_flowfile_formula('1', col_name='__temp__')
1349
+ pivot_input.index_columns = ["__temp__"]
1350
+ ff = self.apply_flowfile_formula("1", col_name="__temp__")
1348
1351
  else:
1349
1352
  no_index_cols = False
1350
1353
  ff = self
@@ -1354,36 +1357,32 @@ class FlowDataEngine:
1354
1357
  grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
1355
1358
  pivot_column = pivot_input.get_pivot_column()
1356
1359
 
1357
- input_df = grouped_ff.data_frame.with_columns(
1358
- pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
1359
- )
1360
+ input_df = grouped_ff.data_frame.with_columns(pivot_column.cast(pl.String).alias(pivot_input.pivot_column))
1360
1361
  number_of_aggregations = len(pivot_input.aggregations)
1361
1362
  df = (
1362
- input_df.select(
1363
- *index_columns,
1364
- pivot_column,
1365
- pivot_input.get_values_expr()
1366
- )
1363
+ input_df.select(*index_columns, pivot_column, pivot_input.get_values_expr())
1367
1364
  .group_by(*index_columns)
1368
- .agg([
1369
- (pl.col('vals').filter(pivot_column == new_col_value))
1370
- .first()
1371
- .alias(new_col_value)
1372
- for new_col_value in new_cols_unique
1373
- ])
1365
+ .agg(
1366
+ [
1367
+ (pl.col("vals").filter(pivot_column == new_col_value)).first().alias(new_col_value)
1368
+ for new_col_value in new_cols_unique
1369
+ ]
1370
+ )
1374
1371
  .select(
1375
1372
  *index_columns,
1376
1373
  *[
1377
- pl.col(new_col).struct.field(agg).alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
1374
+ pl.col(new_col)
1375
+ .struct.field(agg)
1376
+ .alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
1378
1377
  for new_col in new_cols_unique
1379
1378
  for agg in pivot_input.aggregations
1380
- ]
1379
+ ],
1381
1380
  )
1382
1381
  )
1383
1382
 
1384
1383
  # Clean up temporary columns if needed
1385
1384
  if no_index_cols:
1386
- df = df.drop('__temp__')
1385
+ df = df.drop("__temp__")
1387
1386
  pivot_input.index_columns = []
1388
1387
 
1389
1388
  return FlowDataEngine(df, calculate_schema_stats=False)
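Note: the pivot hunks above fetch the distinct pivot values (capped at 200), then build one aggregation per value with `filter(...).first()`. A minimal sketch of that core trick with hypothetical column names:

```python
import polars as pl

df = pl.DataFrame({"idx": [1, 1, 2], "key": ["a", "b", "a"], "vals": [10, 20, 30]})
unique_keys = ["a", "b"]  # in the node these are fetched up front and capped at 200

# One output column per pivot value: the first `vals` where `key` matches.
wide = df.group_by("idx").agg(
    pl.col("vals").filter(pl.col("key") == k).first().alias(k) for k in unique_keys
)
```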
@@ -1402,7 +1401,7 @@ class FlowDataEngine:
1402
1401
  try:
1403
1402
  f = to_expr(predicate)
1404
1403
  except Exception as e:
1405
- logger.warning(f'Error in filter expression: {e}')
1404
+ logger.warning(f"Error in filter expression: {e}")
1406
1405
  f = to_expr("False")
1407
1406
  df = self.data_frame.filter(f)
1408
1407
  return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
@@ -1429,29 +1428,27 @@ class FlowDataEngine:
1429
1428
  select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
1430
1429
 
1431
1430
  df = (
1432
- self.data_frame
1433
- .with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
1431
+ self.data_frame.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
1434
1432
  .with_columns(
1435
- (pl.cum_count(record_id_settings.output_column_name)
1436
- .over(record_id_settings.group_by_columns) + record_id_settings.offset - 1)
1437
- .alias(record_id_settings.output_column_name)
1433
+ (
1434
+ pl.cum_count(record_id_settings.output_column_name).over(record_id_settings.group_by_columns)
1435
+ + record_id_settings.offset
1436
+ - 1
1437
+ ).alias(record_id_settings.output_column_name)
1438
1438
  )
1439
1439
  .select(select_cols)
1440
1440
  )
1441
1441
 
1442
- output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
1442
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
1443
1443
  output_schema.extend(self.schema)
1444
1444
 
1445
1445
  return FlowDataEngine(df, schema=output_schema)
1446
1446
 
1447
1447
  def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
1448
1448
  """Adds a simple sequential record ID column."""
1449
- df = self.data_frame.with_row_index(
1450
- record_id_settings.output_column_name,
1451
- record_id_settings.offset
1452
- )
1449
+ df = self.data_frame.with_row_index(record_id_settings.output_column_name, record_id_settings.offset)
1453
1450
 
1454
- output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
1451
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
1455
1452
  output_schema.extend(self.schema)
1456
1453
 
1457
1454
  return FlowDataEngine(df, schema=output_schema)
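Note: the two record-id hunks above boil down to `with_row_index` for the simple case and a windowed cumulative count for the grouped case. A minimal sketch (the offset arithmetic from the diff is omitted):

```python
import polars as pl

df = pl.DataFrame({"grp": ["a", "a", "b"], "v": [10, 20, 30]})

# Simple variant: a sequential index column, numbering starting at 1.
simple = df.with_row_index("record_id", offset=1)

# Grouped variant: a running count within each group.
grouped = df.with_columns(pl.cum_count("v").over("grp").alias("record_id"))
```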
@@ -1483,7 +1480,7 @@ class FlowDataEngine:
1483
1480
 
1484
1481
  def __repr__(self) -> str:
1485
1482
  """Returns a string representation of the FlowDataEngine."""
1486
- return f'flow data engine\n{self.data_frame.__repr__()}'
1483
+ return f"flow data engine\n{self.data_frame.__repr__()}"
1487
1484
 
1488
1485
  def __call__(self) -> "FlowDataEngine":
1489
1486
  """Makes the class instance callable, returning itself."""
@@ -1503,16 +1500,16 @@ class FlowDataEngine:
1503
1500
  Returns:
1504
1501
  The same `FlowDataEngine` instance, now backed by the cached data.
1505
1502
  """
1506
- edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
1507
- flow_id=-1,
1508
- node_id=-1)
1509
- logger.info('Caching data in background')
1503
+ edf = ExternalDfFetcher(
1504
+ lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False, flow_id=-1, node_id=-1
1505
+ )
1506
+ logger.info("Caching data in background")
1510
1507
  result = edf.get_result()
1511
1508
  if isinstance(result, pl.LazyFrame):
1512
- logger.info('Data cached')
1509
+ logger.info("Data cached")
1513
1510
  del self._data_frame
1514
1511
  self.data_frame = result
1515
- logger.info('Data loaded from cache')
1512
+ logger.info("Data loaded from cache")
1516
1513
  return self
1517
1514
 
1518
1515
  def collect_external(self):
@@ -1524,14 +1521,14 @@ class FlowDataEngine:
1524
1521
  re-evaluated.
1525
1522
  """
1526
1523
  if self._external_source is not None:
1527
- logger.info('Collecting external source')
1524
+ logger.info("Collecting external source")
1528
1525
  if self.external_source.get_pl_df() is not None:
1529
1526
  self.data_frame = self.external_source.get_pl_df().lazy()
1530
1527
  else:
1531
1528
  self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
1532
1529
  self._schema = None # enforce reset schema
1533
1530
 
1534
- def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
1531
+ def get_output_sample(self, n_rows: int = 10) -> list[dict]:
1535
1532
  """Gets a sample of the data as a list of dictionaries.
1536
1533
 
1537
1534
  This is typically used to display a preview of the data in a UI.
@@ -1559,14 +1556,20 @@ class FlowDataEngine:
1559
1556
  try:
1560
1557
  df = df.head(n_rows).collect()
1561
1558
  except Exception as e:
1562
- logger.warning(f'Error in getting sample: {e}')
1559
+ logger.warning(f"Error in getting sample: {e}")
1563
1560
  df = df.head(n_rows).collect(engine="auto")
1564
1561
  else:
1565
1562
  df = self.collect()
1566
1563
  return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
1567
1564
 
1568
- def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
1569
- seed: int = None, execution_location: Optional[ExecutionLocationsLiteral] = None) -> "FlowDataEngine":
1565
+ def get_sample(
1566
+ self,
1567
+ n_rows: int = 100,
1568
+ random: bool = False,
1569
+ shuffle: bool = False,
1570
+ seed: int = None,
1571
+ execution_location: ExecutionLocationsLiteral | None = None,
1572
+ ) -> "FlowDataEngine":
1570
1573
  """Gets a sample of rows from the DataFrame.
1571
1574
 
1572
1575
  Args:
@@ -1578,23 +1581,23 @@ class FlowDataEngine:
1578
1581
  Returns:
1579
1582
  A new `FlowDataEngine` instance containing the sampled data.
1580
1583
  """
1581
- logging.info(f'Getting sample of {n_rows} rows')
1582
-
1584
+ logging.info(f"Getting sample of {n_rows} rows")
1583
1585
  if random:
1584
1586
  if self.lazy and self.external_source is not None:
1585
1587
  self.collect_external()
1586
1588
 
1587
1589
  if self.lazy and shuffle:
1588
- sample_df = (self.data_frame.collect(engine="streaming" if self._streamable else "auto")
1589
- .sample(n_rows, seed=seed, shuffle=shuffle))
1590
+ sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(
1591
+ n_rows, seed=seed, shuffle=shuffle
1592
+ )
1590
1593
  elif shuffle:
1591
1594
  sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
1592
1595
  else:
1593
1596
  if execution_location is None:
1594
1597
  execution_location = get_global_execution_location()
1595
- n_rows = min(n_rows, self.get_number_of_records(
1596
- calculate_in_worker_process=execution_location == "remote")
1597
- )
1598
+ n_rows = min(
1599
+ n_rows, self.get_number_of_records(calculate_in_worker_process=execution_location == "remote")
1600
+ )
1598
1601
 
1599
1602
  every_n_records = ceil(self.number_of_records / n_rows)
1600
1603
  sample_df = self.data_frame.gather_every(every_n_records)
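Note: the non-random branch above spreads the sample evenly over the frame with `gather_every`. A minimal sketch of the same calculation:

```python
from math import ceil

import polars as pl

lf = pl.LazyFrame({"x": range(1_000)})
n_rows, total_records = 100, 1_000

# Keep every k-th row so roughly n_rows rows survive.
every_n = ceil(total_records / n_rows)
sample = lf.gather_every(every_n).collect()
```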
@@ -1619,8 +1622,9 @@ class FlowDataEngine:
1619
1622
  else:
1620
1623
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
1621
1624
 
1622
- def iter_batches(self, batch_size: int = 1000,
1623
- columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
1625
+ def iter_batches(
1626
+ self, batch_size: int = 1000, columns: list | tuple | str = None
1627
+ ) -> Generator["FlowDataEngine", None, None]:
1624
1628
  """Iterates over the DataFrame in batches.
1625
1629
 
1626
1630
  Args:
@@ -1638,9 +1642,14 @@ class FlowDataEngine:
1638
1642
  for batch in batches:
1639
1643
  yield FlowDataEngine(batch)
1640
1644
 
1641
- def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1642
- other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
1643
- node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
1645
+ def start_fuzzy_join(
1646
+ self,
1647
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1648
+ other: "FlowDataEngine",
1649
+ file_ref: str,
1650
+ flow_id: int = -1,
1651
+ node_id: int | str = -1,
1652
+ ) -> ExternalFuzzyMatchFetcher:
1644
1653
  """Starts a fuzzy join operation in a background process.
1645
1654
 
1646
1655
  This method prepares the data and initiates the fuzzy matching in a
@@ -1657,45 +1666,71 @@ class FlowDataEngine:
1657
1666
  An `ExternalFuzzyMatchFetcher` object that can be used to track the
1658
1667
  progress and retrieve the result of the fuzzy join.
1659
1668
  """
1660
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1661
- return ExternalFuzzyMatchFetcher(left_df, right_df,
1662
- fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1663
- file_ref=file_ref + '_fm',
1664
- wait_on_completion=False,
1665
- flow_id=flow_id,
1666
- node_id=node_id)
1667
-
1668
- def fuzzy_join_external(self,
1669
- fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1670
- other: "FlowDataEngine",
1671
- file_ref: str = None,
1672
- flow_id: int = -1,
1673
- node_id: int = -1
1674
- ):
1669
+ fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1670
+ left_df, right_df = prepare_for_fuzzy_match(
1671
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1672
+ )
1673
+
1674
+ return ExternalFuzzyMatchFetcher(
1675
+ left_df,
1676
+ right_df,
1677
+ fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1678
+ file_ref=file_ref + "_fm",
1679
+ wait_on_completion=False,
1680
+ flow_id=flow_id,
1681
+ node_id=node_id,
1682
+ )
1683
+
1684
+ def fuzzy_join_external(
1685
+ self,
1686
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1687
+ other: "FlowDataEngine",
1688
+ file_ref: str = None,
1689
+ flow_id: int = -1,
1690
+ node_id: int = -1,
1691
+ ):
1675
1692
  if file_ref is None:
1676
- file_ref = str(id(self)) + '_' + str(id(other))
1677
-
1678
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1679
- external_tracker = ExternalFuzzyMatchFetcher(left_df, right_df,
1680
- fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1681
- file_ref=file_ref + '_fm',
1682
- wait_on_completion=False,
1683
- flow_id=flow_id,
1684
- node_id=node_id)
1693
+ file_ref = str(id(self)) + "_" + str(id(other))
1694
+ fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1695
+
1696
+ left_df, right_df = prepare_for_fuzzy_match(
1697
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1698
+ )
1699
+ external_tracker = ExternalFuzzyMatchFetcher(
1700
+ left_df,
1701
+ right_df,
1702
+ fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
1703
+ file_ref=file_ref + "_fm",
1704
+ wait_on_completion=False,
1705
+ flow_id=flow_id,
1706
+ node_id=node_id,
1707
+ )
1685
1708
  return FlowDataEngine(external_tracker.get_result())
1686
1709
 
1687
- def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1688
- other: "FlowDataEngine",
1689
- node_logger: NodeLogger = None) -> "FlowDataEngine":
1690
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1691
- fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input.fuzzy_maps]
1692
- return FlowDataEngine(fuzzy_match_dfs(left_df, right_df, fuzzy_maps=fuzzy_mappings,
1693
- logger=node_logger.logger if node_logger else logger)
1694
- .lazy())
1695
-
1696
- def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
1697
- auto_generate_selection: bool, verify_integrity: bool,
1698
- other: "FlowDataEngine") -> "FlowDataEngine":
1710
+ def fuzzy_join(
1711
+ self,
1712
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1713
+ other: "FlowDataEngine",
1714
+ node_logger: NodeLogger = None,
1715
+ ) -> "FlowDataEngine":
1716
+ fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
1717
+ left_df, right_df = prepare_for_fuzzy_match(
1718
+ left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
1719
+ )
1720
+ fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
1721
+ return FlowDataEngine(
1722
+ fuzzy_match_dfs(
1723
+ left_df, right_df, fuzzy_maps=fuzzy_mappings, logger=node_logger.logger if node_logger else logger
1724
+ ).lazy()
1725
+ )
1726
+
1727
+ def do_cross_join(
1728
+ self,
1729
+ cross_join_input: transform_schemas.CrossJoinInput,
1730
+ auto_generate_selection: bool,
1731
+ verify_integrity: bool,
1732
+ other: "FlowDataEngine",
1733
+ ) -> "FlowDataEngine":
1699
1734
  """Performs a cross join with another DataFrame.
1700
1735
 
1701
1736
  A cross join produces the Cartesian product of the two DataFrames.
@@ -1713,101 +1748,109 @@ class FlowDataEngine:
1713
1748
  Exception: If `verify_integrity` is True and the join would result in
1714
1749
  an excessively large number of records.
1715
1750
  """
1716
-
1717
1751
  self.lazy = True
1718
-
1719
1752
  other.lazy = True
1753
+ cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
1754
+ verify_join_select_integrity(
1755
+ cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns
1756
+ )
1757
+ right_select = [
1758
+ v.old_name
1759
+ for v in cross_join_input_manager.right_select.renames
1760
+ if (v.keep or v.join_key) and v.is_available
1761
+ ]
1762
+ left_select = [
1763
+ v.old_name
1764
+ for v in cross_join_input_manager.left_select.renames
1765
+ if (v.keep or v.join_key) and v.is_available
1766
+ ]
1767
+ cross_join_input_manager.auto_rename(rename_mode="suffix")
1768
+ left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
1769
+ right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)
1720
1770
 
1721
- verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
1722
- right_select = [v.old_name for v in cross_join_input.right_select.renames
1723
- if (v.keep or v.join_key) and v.is_available]
1724
- left_select = [v.old_name for v in cross_join_input.left_select.renames
1725
- if (v.keep or v.join_key) and v.is_available]
1726
-
1727
- left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
1728
- right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
1729
-
1730
- joined_df = left.join(right, how='cross')
1771
+ joined_df = left.join(right, how="cross")
1731
1772
 
1732
- cols_to_delete_after = [col.new_name for col in
1733
- cross_join_input.left_select.renames + cross_join_input.left_select.renames
1734
- if col.join_key and not col.keep and col.is_available]
1773
+ cols_to_delete_after = [
1774
+ col.new_name
1775
+ for col in cross_join_input_manager.left_select.renames + cross_join_input_manager.right_select.renames
1776
+ if col.join_key and not col.keep and col.is_available
1777
+ ]
1735
1778
 
1736
1779
  fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
1737
1780
  return fl
1738
1781
 
1739
- def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1740
- verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
1741
- """Performs a standard SQL-style join with another DataFrame.
1742
-
1743
- Supports various join types like 'inner', 'left', 'right', 'outer', 'semi', and 'anti'.
1782
+ def join(
1783
+ self,
1784
+ join_input: transform_schemas.JoinInput,
1785
+ auto_generate_selection: bool,
1786
+ verify_integrity: bool,
1787
+ other: "FlowDataEngine",
1788
+ ) -> "FlowDataEngine":
1789
+ """Performs a standard SQL-style join with another DataFrame."""
1790
+ # Create manager from input
1791
+ join_manager = transform_schemas.JoinInputManager(join_input)
1792
+ ensure_right_unselect_for_semi_and_anti_joins(join_manager.input)
1793
+ for jk in join_manager.join_mapping:
1794
+ if jk.left_col not in {c.old_name for c in join_manager.left_select.renames}:
1795
+ join_manager.left_select.append(transform_schemas.SelectInput(jk.left_col, keep=False))
1796
+ if jk.right_col not in {c.old_name for c in join_manager.right_select.renames}:
1797
+ join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
1798
+ verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
1799
+ if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
1800
+ raise Exception("Join is not valid by the data fields")
1744
1801
 
1745
- Args:
1746
- join_input: A `JoinInput` object defining the join keys, join type,
1747
- and column selections.
1748
- auto_generate_selection: If True, automatically handles column renaming.
1749
- verify_integrity: If True, performs checks to prevent excessively large joins.
1750
- other: The right `FlowDataEngine` to join with.
1802
+ if auto_generate_selection:
1803
+ join_manager.auto_rename()
1751
1804
 
1752
- Returns:
1753
- A new `FlowDataEngine` with the joined data.
1805
+ # Use manager properties throughout
1806
+ left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
1807
+ join_manager.left_manager.get_rename_table()
1808
+ )
1809
+ right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(
1810
+ join_manager.right_manager.get_rename_table()
1811
+ )
1754
1812
 
1755
- Raises:
1756
- Exception: If the join configuration is invalid or if `verify_integrity`
1757
- is True and the join is predicted to be too large.
1758
- """
1759
- ensure_right_unselect_for_semi_and_anti_joins(join_input)
1760
- verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
1761
- if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
1762
- raise Exception('Join is not valid by the data fields')
1763
- if auto_generate_selection:
1764
- join_input.auto_rename()
1765
- left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
1766
- right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
1767
- if verify_integrity and join_input.how != 'right':
1768
- n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
1769
- right_on_keys=join_input.right_join_keys, how=join_input.how)
1770
- if n_records > 1_000_000_000:
1771
- raise Exception("Join will result in too many records, ending process")
1772
- else:
1773
- n_records = -1
1774
- left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
1775
- left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
1776
- if join_input.how == 'right':
1813
+ left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
1814
+ left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
1815
+ if join_manager.how == "right":
1777
1816
  joined_df = right.join(
1778
1817
  other=left,
1779
- left_on=join_input.right_join_keys,
1780
- right_on=join_input.left_join_keys,
1818
+ left_on=join_manager.right_join_keys,
1819
+ right_on=join_manager.left_join_keys,
1781
1820
  how="left",
1782
- suffix="").rename(reverse_join_key_mapping)
1821
+ suffix="",
1822
+ ).rename(reverse_join_key_mapping)
1783
1823
  else:
1784
1824
  joined_df = left.join(
1785
1825
  other=right,
1786
- left_on=join_input.left_join_keys,
1787
- right_on=join_input.right_join_keys,
1788
- how=join_input.how,
1789
- suffix="").rename(reverse_join_key_mapping)
1790
- left_cols_to_delete_after = [get_col_name_to_delete(col, 'left') for col in join_input.left_select.renames
1791
- if not col.keep
1792
- and col.is_available and col.join_key
1793
- ]
1794
- right_cols_to_delete_after = [get_col_name_to_delete(col, 'right') for col in join_input.right_select.renames
1795
- if not col.keep
1796
- and col.is_available and col.join_key
1797
- and join_input.how in ("left", "right", "inner", "cross", "outer")
1798
- ]
1826
+ left_on=join_manager.left_join_keys,
1827
+ right_on=join_manager.right_join_keys,
1828
+ how=join_manager.how,
1829
+ suffix="",
1830
+ ).rename(reverse_join_key_mapping)
1831
+
1832
+ left_cols_to_delete_after = [
1833
+ get_col_name_to_delete(col, "left")
1834
+ for col in join_manager.input.left_select.renames
1835
+ if not col.keep and col.is_available and col.join_key
1836
+ ]
1837
+
1838
+ right_cols_to_delete_after = [
1839
+ get_col_name_to_delete(col, "right")
1840
+ for col in join_manager.input.right_select.renames
1841
+ if not col.keep
1842
+ and col.is_available
1843
+ and col.join_key
1844
+ and join_manager.how in ("left", "right", "inner", "cross", "outer")
1845
+ ]
1846
+
1799
1847
  if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
1800
1848
  joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
1801
- undo_join_key_remapping = get_undo_rename_mapping_join(join_input)
1849
+
1850
+ undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
1802
1851
  joined_df = joined_df.rename(undo_join_key_remapping)
1803
1852
 
1804
- if verify_integrity:
1805
- return FlowDataEngine(joined_df, calculate_schema_stats=True,
1806
- number_of_records=n_records, streamable=False)
1807
- else:
1808
- fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
1809
- number_of_records=0, streamable=False)
1810
- return fl
1853
+ return FlowDataEngine(joined_df, calculate_schema_stats=False, number_of_records=0, streamable=False)
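Note: the rewritten `join` above expresses a 'right' join as a left join with the frames swapped, then renames the keys back. A minimal sketch of that swap with hypothetical frames and key names:

```python
import polars as pl

left = pl.LazyFrame({"k": [1, 2], "l_val": ["a", "b"]})
right = pl.LazyFrame({"k": [2, 3], "r_val": ["x", "y"]})

# A right join expressed as a left join with the frames swapped:
# every row of `right` is kept, missing `left` columns become null.
as_right_join = right.join(left, left_on="k", right_on="k", how="left").collect()
```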
1811
1854
 
1812
1855
  def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
1813
1856
  """Solves a graph problem represented by 'from' and 'to' columns.
@@ -1822,8 +1865,9 @@ class FlowDataEngine:
1822
1865
  A new `FlowDataEngine` instance with the solved graph data.
1823
1866
  """
1824
1867
  lf = self.data_frame.with_columns(
1825
- graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
1826
- .alias(graph_solver_input.output_column_name)
1868
+ graph_solver(graph_solver_input.col_from, graph_solver_input.col_to).alias(
1869
+ graph_solver_input.output_column_name
1870
+ )
1827
1871
  )
1828
1872
  return FlowDataEngine(lf)
1829
1873
 
@@ -1838,7 +1882,7 @@ class FlowDataEngine:
1838
1882
  A new `FlowDataEngine` instance with the added column.
1839
1883
  """
1840
1884
  if col_name is None:
1841
- col_name = 'new_values'
1885
+ col_name = "new_values"
1842
1886
  return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
1843
1887
 
1844
1888
  def get_record_count(self) -> "FlowDataEngine":
@@ -1848,7 +1892,7 @@ class FlowDataEngine:
1848
1892
  Returns:
1849
1893
  A new `FlowDataEngine` instance.
1850
1894
  """
1851
- return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))
1895
+ return FlowDataEngine(self.data_frame.select(pl.len().alias("number_of_records")))
1852
1896
 
1853
1897
  def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
1854
1898
  """Asserts that this DataFrame is equal to another.
@@ -1871,13 +1915,13 @@ class FlowDataEngine:
1871
1915
  other = other.select_columns(self.columns)
1872
1916
 
1873
1917
  if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
1874
- raise Exception('Number of records is not equal')
1918
+ raise Exception("Number of records is not equal")
1875
1919
 
1876
1920
  if self.columns != other.columns:
1877
- raise Exception('Schema is not equal')
1921
+ raise Exception("Schema is not equal")
1878
1922
 
1879
1923
  if strict_schema:
1880
- assert self.data_frame.schema == other.data_frame.schema, 'Data types do not match'
1924
+ assert self.data_frame.schema == other.data_frame.schema, "Data types do not match"
1881
1925
 
1882
1926
  if ordered:
1883
1927
  self_lf = self.data_frame.sort(by=self.columns)
@@ -1887,7 +1931,7 @@ class FlowDataEngine:
1887
1931
  other_lf = other.data_frame
1888
1932
 
1889
1933
  self.lazy, other.lazy = org_laziness
1890
- assert self_lf.equals(other_lf), 'Data is not equal'
1934
+ assert self_lf.equals(other_lf), "Data is not equal"
1891
1935
 
1892
1936
  def initialize_empty_fl(self):
1893
1937
  """Initializes an empty LazyFrame."""
@@ -1902,7 +1946,7 @@ class FlowDataEngine:
1902
1946
  operation_type="calculate_number_of_records",
1903
1947
  flow_id=-1,
1904
1948
  node_id=-1,
1905
- wait_on_completion=True
1949
+ wait_on_completion=True,
1906
1950
  ).result
1907
1951
  return number_of_records
1908
1952
 
@@ -1918,8 +1962,9 @@ class FlowDataEngine:
1918
1962
  """
1919
1963
  return self.get_number_of_records(force_calculate=force_calculate)
1920
1964
 
1921
- def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1922
- calculate_in_worker_process: bool = False) -> int:
1965
+ def get_number_of_records(
1966
+ self, warn: bool = False, force_calculate: bool = False, calculate_in_worker_process: bool = False
1967
+ ) -> int:
1923
1968
  """Gets the total number of records in the DataFrame.
1924
1969
 
1925
1970
  For lazy frames, this may trigger a full data scan, which can be expensive.
@@ -1949,12 +1994,13 @@ class FlowDataEngine:
1949
1994
  except Exception as e:
1950
1995
  logger.error(f"Error: {e}")
1951
1996
  if warn:
1952
- logger.warning('Calculating the number of records this can be expensive on a lazy frame')
1997
+ logger.warning("Calculating the number of records this can be expensive on a lazy frame")
1953
1998
  try:
1954
1999
  self.number_of_records = self.data_frame.select(pl.len()).collect(
1955
- engine="streaming" if self._streamable else "auto")[0, 0]
2000
+ engine="streaming" if self._streamable else "auto"
2001
+ )[0, 0]
1956
2002
  except Exception:
1957
- raise ValueError('Could not get number of records')
2003
+ raise ValueError("Could not get number of records")
1958
2004
  else:
1959
2005
  self.number_of_records = self.data_frame.__len__()
1960
2006
  return self.number_of_records
@@ -1995,7 +2041,7 @@ class FlowDataEngine:
1995
2041
  return self._external_source
1996
2042
 
1997
2043
  @property
1998
- def cols_idx(self) -> Dict[str, int]:
2044
+ def cols_idx(self) -> dict[str, int]:
1999
2045
  """A dictionary mapping column names to their integer index."""
2000
2046
  if self._col_idx is None:
2001
2047
  self._col_idx = {c: i for i, c in enumerate(self.columns)}
@@ -2017,7 +2063,7 @@ class FlowDataEngine:
2017
2063
  [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
2018
2064
  )
2019
2065
 
2020
- def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
2066
+ def select_columns(self, list_select: list[str] | tuple[str] | str) -> "FlowDataEngine":
2021
2067
  """Selects a subset of columns from the DataFrame.
2022
2068
 
2023
2069
  Args:
@@ -2030,17 +2076,17 @@ class FlowDataEngine:
2030
2076
  list_select = [list_select]
2031
2077
 
2032
2078
  idx_to_keep = [self.cols_idx.get(c) for c in list_select]
2033
- selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
2079
+ selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep, strict=False) if id_to_keep is not None]
2034
2080
  new_schema = [self.schema[i] for i in idx_to_keep if i is not None]
2035
2081
 
2036
2082
  return FlowDataEngine(
2037
2083
  self.data_frame.select(selects),
2038
2084
  number_of_records=self.number_of_records,
2039
2085
  schema=new_schema,
2040
- streamable=self._streamable
2086
+ streamable=self._streamable,
2041
2087
  )
2042
2088
 
2043
- def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
2089
+ def drop_columns(self, columns: list[str]) -> "FlowDataEngine":
2044
2090
  """Drops specified columns from the DataFrame.
2045
2091
 
2046
2092
  Args:
@@ -2054,12 +2100,10 @@ class FlowDataEngine:
2054
2100
  new_schema = [self.schema[i] for i in idx_to_keep]
2055
2101
 
2056
2102
  return FlowDataEngine(
2057
- self.data_frame.select(cols_for_select),
2058
- number_of_records=self.number_of_records,
2059
- schema=new_schema
2103
+ self.data_frame.select(cols_for_select), number_of_records=self.number_of_records, schema=new_schema
2060
2104
  )
2061
2105
 
2062
- def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
2106
+ def reorganize_order(self, column_order: list[str]) -> "FlowDataEngine":
2063
2107
  """Reorganizes columns into a specified order.
2064
2108
 
2065
2109
  Args:
@@ -2072,8 +2116,9 @@ class FlowDataEngine:
2072
2116
  schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
2073
2117
  return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)
2074
2118
 
2075
- def apply_flowfile_formula(self, func: str, col_name: str,
2076
- output_data_type: pl.DataType = None) -> "FlowDataEngine":
2119
+ def apply_flowfile_formula(
2120
+ self, func: str, col_name: str, output_data_type: pl.DataType = None
2121
+ ) -> "FlowDataEngine":
2077
2122
  """Applies a formula to create a new column or transform an existing one.
2078
2123
 
2079
2124
  Args:
@@ -2092,8 +2137,7 @@ class FlowDataEngine:
2092
2137
 
2093
2138
  return FlowDataEngine(df2, number_of_records=self.number_of_records)
2094
2139
 
2095
- def apply_sql_formula(self, func: str, col_name: str,
2096
- output_data_type: pl.DataType = None) -> "FlowDataEngine":
2140
+ def apply_sql_formula(self, func: str, col_name: str, output_data_type: pl.DataType = None) -> "FlowDataEngine":
2097
2141
  """Applies an SQL-style formula using `pl.sql_expr`.
2098
2142
 
2099
2143
  Args:
@@ -2105,15 +2149,16 @@ class FlowDataEngine:
2105
2149
  A new `FlowDataEngine` instance with the applied formula.
2106
2150
  """
2107
2151
  expr = to_expr(func)
2108
- if output_data_type not in (None, "Auto"):
2152
+ if output_data_type not in (None, transform_schemas.AUTO_DATA_TYPE):
2109
2153
  df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
2110
2154
  else:
2111
2155
  df = self.data_frame.with_columns(expr.alias(col_name))
2112
2156
 
2113
2157
  return FlowDataEngine(df, number_of_records=self.number_of_records)
2114
2158
 
2115
- def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
2116
- execute_remote: bool = True) -> "FlowDataEngine":
2159
+ def output(
2160
+ self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str, execute_remote: bool = True
2161
+ ) -> "FlowDataEngine":
2117
2162
  """Writes the DataFrame to an output file.
2118
2163
 
2119
2164
  Can execute the write operation locally or in a remote worker process.
@@ -2127,21 +2172,21 @@ class FlowDataEngine:
2127
2172
  Returns:
2128
2173
  The same `FlowDataEngine` instance for chaining.
2129
2174
  """
2130
- logger.info('Starting to write output')
2175
+ logger.info("Starting to write output")
2131
2176
  if execute_remote:
2132
2177
  status = utils.write_output(
2133
2178
  self.data_frame,
2134
2179
  data_type=output_fs.file_type,
2135
2180
  path=output_fs.abs_file_path,
2136
2181
  write_mode=output_fs.write_mode,
2137
- sheet_name=output_fs.output_excel_table.sheet_name,
2138
- delimiter=output_fs.output_csv_table.delimiter,
2182
+ sheet_name=output_fs.sheet_name,
2183
+ delimiter=output_fs.delimiter,
2139
2184
  flow_id=flow_id,
2140
- node_id=node_id
2185
+ node_id=node_id,
2141
2186
  )
2142
2187
  tracker = ExternalExecutorTracker(status)
2143
2188
  tracker.get_result()
2144
- logger.info('Finished writing output')
2189
+ logger.info("Finished writing output")
2145
2190
  else:
2146
2191
  logger.info("Starting to write results locally")
2147
2192
  utils.local_write_output(
@@ -2149,8 +2194,8 @@ class FlowDataEngine:
2149
2194
  data_type=output_fs.file_type,
2150
2195
  path=output_fs.abs_file_path,
2151
2196
  write_mode=output_fs.write_mode,
2152
- sheet_name=output_fs.output_excel_table.sheet_name,
2153
- delimiter=output_fs.output_csv_table.delimiter,
2197
+ sheet_name=output_fs.sheet_name,
2198
+ delimiter=output_fs.delimiter,
2154
2199
  flow_id=flow_id,
2155
2200
  node_id=node_id,
2156
2201
  )
@@ -2183,11 +2228,10 @@ class FlowDataEngine:
2183
2228
  if isinstance(other, FlowDataEngine):
2184
2229
  other = [other]
2185
2230
 
2186
- dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
2187
- return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
2231
+ dfs: list[pl.LazyFrame] | list[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
2232
+ return FlowDataEngine(pl.concat(dfs, how="diagonal_relaxed"))
2188
2233
 
2189
- def do_select(self, select_inputs: transform_schemas.SelectInputs,
2190
- keep_missing: bool = True) -> "FlowDataEngine":
2234
+ def do_select(self, select_inputs: transform_schemas.SelectInputs, keep_missing: bool = True) -> "FlowDataEngine":
2191
2235
  """Performs a complex column selection, renaming, and reordering operation.
2192
2236
 
2193
2237
  Args:
@@ -2203,7 +2247,8 @@ class FlowDataEngine:
2203
2247
 
2204
2248
  if not keep_missing:
2205
2249
  drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
2206
- set(r.old_name for r in renames if not r.keep))
2250
+ set(r.old_name for r in renames if not r.keep)
2251
+ )
2207
2252
  keep_cols = []
2208
2253
  else:
2209
2254
  keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
@@ -2223,12 +2268,14 @@ class FlowDataEngine:
2223
2268
 
2224
2269
  rename_dict = {r.old_name: r.new_name for r in available_renames}
2225
2270
  fl = self.select_columns(
2226
- list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols)
2271
+ list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
2272
+ )
2227
2273
  fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
2228
2274
  ndf = fl.data_frame.rename(rename_dict)
2229
2275
  renames.sort(key=lambda r: 0 if r.position is None else r.position)
2230
- sorted_cols = utils.match_order(ndf.collect_schema().names(),
2231
- [r.new_name for r in renames] + self.data_frame.collect_schema().names())
2276
+ sorted_cols = utils.match_order(
2277
+ ndf.collect_schema().names(), [r.new_name for r in renames] + self.data_frame.collect_schema().names()
2278
+ )
2232
2279
  output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
2233
2280
  return output_file.reorganize_order(sorted_cols)
2234
2281
 
@@ -2236,7 +2283,7 @@ class FlowDataEngine:
2236
2283
  """Sets whether DataFrame operations should be streamable."""
2237
2284
  self._streamable = streamable
2238
2285
 
2239
- def _calculate_schema(self) -> List[Dict]:
2286
+ def _calculate_schema(self) -> list[dict]:
2240
2287
  """Calculates schema statistics."""
2241
2288
  if self.external_source is not None:
2242
2289
  self.collect_external()
@@ -2256,8 +2303,10 @@ class FlowDataEngine:
2256
2303
  def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
2257
2304
  """Creates a FlowDataEngine from a path in a worker process."""
2258
2305
  received_table.set_absolute_filepath()
2259
- external_fetcher = ExternalCreateFetcher(received_table=received_table,
2260
- file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
2306
+
2307
+ external_fetcher = ExternalCreateFetcher(
2308
+ received_table=received_table, file_type=received_table.file_type, flow_id=flow_id, node_id=node_id
2309
+ )
2261
2310
  return cls(external_fetcher.get_result())
2262
2311
 
2263
2312
 
@@ -2280,10 +2329,10 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
2280
2329
  if len(flowfile_tables) == 0:
2281
2330
  kwargs = {}
2282
2331
  elif len(flowfile_tables) == 1:
2283
- kwargs = {'input_df': flowfile_tables[0].data_frame}
2332
+ kwargs = {"input_df": flowfile_tables[0].data_frame}
2284
2333
  else:
2285
- kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
2334
+ kwargs = {f"input_df_{i+1}": flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
2286
2335
  df = polars_executable(**kwargs)
2287
2336
  if isinstance(df, pl.DataFrame):
2288
2337
  logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
2289
- return FlowDataEngine(df)
2338
+ return FlowDataEngine(df)
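Note: as the hunk above shows, `execute_polars_code` exposes a single input frame to the user code as `input_df` and multiple inputs as `input_df_1`, `input_df_2`, and so on. A minimal sketch of how those keyword arguments are assembled, with a hand-written stand-in for the compiled `polars_executable`:

```python
import polars as pl


def polars_executable(input_df_1: pl.LazyFrame, input_df_2: pl.LazyFrame) -> pl.LazyFrame:
    # Stand-in for the callable compiled from the user-supplied Polars code.
    return input_df_1.join(input_df_2, on="id", how="inner")


frames = [pl.LazyFrame({"id": [1, 2]}), pl.LazyFrame({"id": [2, 3]})]
kwargs = {f"input_df_{i + 1}": f for i, f in enumerate(frames)}
result = polars_executable(**kwargs).collect()
```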