Flowfile 0.4.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332)
  1. build_backends/main.py +25 -22
  2. build_backends/main_prd.py +10 -19
  3. flowfile/__init__.py +179 -73
  4. flowfile/__main__.py +10 -7
  5. flowfile/api.py +52 -59
  6. flowfile/web/__init__.py +14 -9
  7. flowfile/web/static/assets/AdminView-49392a9a.js +713 -0
  8. flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
  9. flowfile/web/static/assets/CloudConnectionView-36bcd6df.css +72 -0
  10. flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionView-f13f202b.js} +11 -11
  11. flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-0023d4a5.js} +10 -8
  12. flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
  13. flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
  14. flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-8e781e11.js} +10 -8
  15. flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
  16. flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-8ad68ea9.js} +3 -5
  17. flowfile/web/static/assets/{ContextMenu-c13f91d0.css → ContextMenu-26d4dd27.css} +6 -6
  18. flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-31ee57f0.js} +3 -3
  19. flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-69a74055.js} +3 -3
  20. flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-8e2051c6.js} +3 -3
  21. flowfile/web/static/assets/{ContextMenu-4c74eef1.css → ContextMenu-8ec1729e.css} +6 -6
  22. flowfile/web/static/assets/{ContextMenu-63cfa99b.css → ContextMenu-9b310c60.css} +6 -6
  23. flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-03df6938.js} +12 -10
  24. flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
  25. flowfile/web/static/assets/CustomNode-59e99a86.css +32 -0
  26. flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-8479239b.js} +36 -24
  27. flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-869e3efd.js} +5 -4
  28. flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-e91df89a.css} +13 -13
  29. flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-36898a00.css} +24 -24
  30. flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-c58b9552.js} +25 -15
  31. flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
  32. flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseView-d26a9140.js} +11 -11
  33. flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-217a99f1.css} +19 -19
  34. flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-4d05ddc7.js} +17 -10
  35. flowfile/web/static/assets/{designer-e3c150ec.css → DesignerView-a6d0ee84.css} +629 -538
  36. flowfile/web/static/assets/{designer-f3656d8c.js → DesignerView-e6f5c0e8.js} +1214 -3209
  37. flowfile/web/static/assets/{documentation-52b241e7.js → DocumentationView-2e78ef1b.js} +5 -5
  38. flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-fd46c656.css} +7 -7
  39. flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
  40. flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-7b54caca.js} +18 -9
  41. flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-3fa399b2.js} +9 -7
  42. flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-47ab05a3.css} +17 -17
  43. flowfile/web/static/assets/Filter-7494ea97.css +48 -0
  44. flowfile/web/static/assets/Filter-8cbbdbf3.js +287 -0
  45. flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
  46. flowfile/web/static/assets/{Formula-71472193.js → Formula-aac42b1e.js} +13 -11
  47. flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
  48. flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-cd9bbfca.js} +12 -10
  49. flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-c24dec17.css} +5 -5
  50. flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-c7e6780e.js} +13 -11
  51. flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-93c5d22b.js} +9 -7
  52. flowfile/web/static/assets/{GroupBy-b9505323.css → GroupBy-be7ac0bf.css} +10 -10
  53. flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
  54. flowfile/web/static/assets/{Join-a1b800be.js → Join-a19b2de2.js} +13 -11
  55. flowfile/web/static/assets/LoginView-0df4ed0a.js +134 -0
  56. flowfile/web/static/assets/LoginView-d325d632.css +172 -0
  57. flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
  58. flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-8d3374b2.js} +170 -116
  59. flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-ad1b6243.js} +2 -2
  60. flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-e278950d.js} +1 -1
  61. flowfile/web/static/assets/NodeDesigner-40b647c9.js +2610 -0
  62. flowfile/web/static/assets/NodeDesigner-5f53be3f.css +1429 -0
  63. flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-7100234c.js} +2 -2
  64. flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-5130219f.js} +5 -2
  65. flowfile/web/static/assets/{Output-ddc9079f.css → Output-35e97000.css} +6 -6
  66. flowfile/web/static/assets/{Output-76750610.js → Output-f5efd2aa.js} +60 -38
  67. flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
  68. flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-d981d23c.js} +11 -9
  69. flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
  70. flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-39386e95.js} +3 -3
  71. flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
  72. flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-63de1f73.js} +3 -3
  73. flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
  74. flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-f9d69217.js} +18 -9
  75. flowfile/web/static/assets/PopOver-b22f049e.js +939 -0
  76. flowfile/web/static/assets/PopOver-d96599db.css +33 -0
  77. flowfile/web/static/assets/{Read-6b17491f.css → Read-36e7bd51.css} +12 -12
  78. flowfile/web/static/assets/{Read-637b72a7.js → Read-aec2e377.js} +83 -105
  79. flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-78ed6845.js} +6 -4
  80. flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-2156e890.js} +8 -6
  81. flowfile/web/static/assets/{SQLQueryComponent-36cef432.css → SQLQueryComponent-1c2f26b4.css} +5 -5
  82. flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-48c72f5b.js} +3 -3
  83. flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-1352ca74.js} +6 -4
  84. flowfile/web/static/assets/SecretSelector-22b5ff89.js +113 -0
  85. flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
  86. flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretsView-17df66ee.js} +35 -36
  87. flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
  88. flowfile/web/static/assets/{Select-850215fd.js → Select-0aee4c54.js} +9 -7
  89. flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-0784e157.js} +3 -3
  90. flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
  91. flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
  92. flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
  93. flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-cd341bb6.js} +3 -3
  94. flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-f2002a6d.js} +3 -3
  95. flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-460cc0ea.js} +2 -2
  96. flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-30741bb2.js} +1 -1
  97. flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-5d926864.js} +7 -4
  98. flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
  99. flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-3cdc971b.js} +9 -7
  100. flowfile/web/static/assets/{Unique-f9fb0809.css → Sort-8a871341.css} +10 -10
  101. flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-a2d0bfbd.js} +2 -2
  102. flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-abad1ca2.js} +5 -2
  103. flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
  104. flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-918945f7.js} +11 -10
  105. flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-f0ef5196.js} +2 -2
  106. flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-5605c793.js} +1 -1
  107. flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-54d2f518.css} +6 -6
  108. flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-bdad6144.js} +4 -4
  109. flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
  110. flowfile/web/static/assets/{Union-b563478a.js → Union-e8ab8c86.js} +8 -6
  111. flowfile/web/static/assets/{Unique-f90db5db.js → Unique-8cd4f976.js} +13 -22
  112. flowfile/web/static/assets/{Sort-3643d625.css → Unique-9fb2f567.css} +10 -10
  113. flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-710a2948.css} +7 -7
  114. flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-8da14095.js} +10 -8
  115. flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-6f7d89ff.js} +3 -3
  116. flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
  117. flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-3fb312e1.js} +4 -4
  118. flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
  119. flowfile/web/static/assets/{api-4c8e3822.js → api-24483f0d.js} +1 -1
  120. flowfile/web/static/assets/{api-2d6adc4f.js → api-8b81fa73.js} +1 -1
  121. flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-3d8dc5fa.css} +40 -40
  122. flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-ac0fda9d.js} +3 -3
  123. flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-5497a84a.js} +11 -10
  124. flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-a0be62b3.css} +74 -62
  125. flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
  126. flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-99014e1d.js} +5 -5
  127. flowfile/web/static/assets/index-07dda503.js +38 -0
  128. flowfile/web/static/assets/index-3ba44389.js +2696 -0
  129. flowfile/web/static/assets/{index-50508d4d.css → index-e6289dd0.css} +1945 -569
  130. flowfile/web/static/assets/{index-246f201c.js → index-fb6493ae.js} +41626 -40869
  131. flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
  132. flowfile/web/static/assets/nodeInput-0eb13f1a.js +2 -0
  133. flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-8f8ba42d.js} +3 -3
  134. flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
  135. flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-393f4fef.js} +3 -3
  136. flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
  137. flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-07c81f65.js} +4 -4
  138. flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
  139. flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-07f6d9ad.js} +21 -20
  140. flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-3bfac4c3.css} +15 -15
  141. flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-3db6b763.css} +13 -13
  142. flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-ed69bc8f.js} +10 -12
  143. flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-c5244ad5.css} +4 -4
  144. flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-e3ed4528.js} +4 -7
  145. flowfile/web/static/assets/secrets.api-002e7d7e.js +65 -0
  146. flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-80b92899.js} +5 -5
  147. flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
  148. flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-0965f39f.js} +31 -637
  149. flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-c506ad97.js} +1 -1
  150. flowfile/web/static/index.html +2 -2
  151. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/METADATA +4 -4
  152. flowfile-0.5.3.dist-info/RECORD +402 -0
  153. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/WHEEL +1 -1
  154. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/entry_points.txt +1 -0
  155. flowfile_core/__init__.py +13 -3
  156. flowfile_core/auth/jwt.py +51 -16
  157. flowfile_core/auth/models.py +32 -7
  158. flowfile_core/auth/password.py +89 -0
  159. flowfile_core/auth/secrets.py +8 -6
  160. flowfile_core/configs/__init__.py +9 -7
  161. flowfile_core/configs/flow_logger.py +15 -14
  162. flowfile_core/configs/node_store/__init__.py +72 -4
  163. flowfile_core/configs/node_store/nodes.py +155 -172
  164. flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
  165. flowfile_core/configs/settings.py +28 -15
  166. flowfile_core/database/connection.py +7 -6
  167. flowfile_core/database/init_db.py +96 -2
  168. flowfile_core/database/models.py +3 -1
  169. flowfile_core/fileExplorer/__init__.py +17 -0
  170. flowfile_core/fileExplorer/funcs.py +123 -57
  171. flowfile_core/fileExplorer/utils.py +10 -11
  172. flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
  173. flowfile_core/flowfile/analytics/analytics_processor.py +27 -24
  174. flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
  175. flowfile_core/flowfile/analytics/utils.py +1 -1
  176. flowfile_core/flowfile/code_generator/code_generator.py +391 -279
  177. flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
  178. flowfile_core/flowfile/connection_manager/models.py +1 -1
  179. flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
  180. flowfile_core/flowfile/database_connection_manager/models.py +1 -1
  181. flowfile_core/flowfile/extensions.py +17 -12
  182. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
  183. flowfile_core/flowfile/flow_data_engine/create/funcs.py +152 -103
  184. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +526 -477
  185. flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
  186. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
  187. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
  188. flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
  189. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
  190. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +43 -32
  191. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
  192. flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
  193. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +15 -11
  194. flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
  195. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
  196. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
  197. flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
  198. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
  199. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
  200. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +360 -191
  201. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
  202. flowfile_core/flowfile/flow_data_engine/utils.py +101 -67
  203. flowfile_core/flowfile/flow_graph.py +1011 -561
  204. flowfile_core/flowfile/flow_graph_utils.py +31 -49
  205. flowfile_core/flowfile/flow_node/flow_node.py +332 -232
  206. flowfile_core/flowfile/flow_node/models.py +54 -41
  207. flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
  208. flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
  209. flowfile_core/flowfile/handler.py +82 -32
  210. flowfile_core/flowfile/manage/compatibility_enhancements.py +493 -47
  211. flowfile_core/flowfile/manage/io_flowfile.py +391 -0
  212. flowfile_core/flowfile/node_designer/__init__.py +15 -13
  213. flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
  214. flowfile_core/flowfile/node_designer/custom_node.py +162 -36
  215. flowfile_core/flowfile/node_designer/ui_components.py +136 -35
  216. flowfile_core/flowfile/schema_callbacks.py +77 -54
  217. flowfile_core/flowfile/setting_generator/__init__.py +0 -1
  218. flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
  219. flowfile_core/flowfile/setting_generator/settings.py +72 -55
  220. flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
  221. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
  222. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
  223. flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
  224. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
  225. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
  226. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
  227. flowfile_core/flowfile/util/calculate_layout.py +9 -13
  228. flowfile_core/flowfile/util/execution_orderer.py +25 -17
  229. flowfile_core/flowfile/util/node_skipper.py +4 -4
  230. flowfile_core/flowfile/utils.py +19 -21
  231. flowfile_core/main.py +26 -19
  232. flowfile_core/routes/auth.py +284 -11
  233. flowfile_core/routes/cloud_connections.py +25 -25
  234. flowfile_core/routes/logs.py +21 -29
  235. flowfile_core/routes/public.py +3 -3
  236. flowfile_core/routes/routes.py +77 -43
  237. flowfile_core/routes/secrets.py +25 -27
  238. flowfile_core/routes/user_defined_components.py +483 -4
  239. flowfile_core/run_lock.py +0 -1
  240. flowfile_core/schemas/__init__.py +4 -6
  241. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
  242. flowfile_core/schemas/cloud_storage_schemas.py +59 -55
  243. flowfile_core/schemas/input_schema.py +398 -154
  244. flowfile_core/schemas/output_model.py +50 -35
  245. flowfile_core/schemas/schemas.py +207 -67
  246. flowfile_core/schemas/transform_schema.py +1360 -435
  247. flowfile_core/schemas/yaml_types.py +117 -0
  248. flowfile_core/secret_manager/secret_manager.py +17 -13
  249. flowfile_core/{flowfile/node_designer/data_types.py → types.py} +33 -3
  250. flowfile_core/utils/arrow_reader.py +7 -6
  251. flowfile_core/utils/excel_file_manager.py +3 -3
  252. flowfile_core/utils/fileManager.py +7 -7
  253. flowfile_core/utils/fl_executor.py +8 -10
  254. flowfile_core/utils/utils.py +4 -4
  255. flowfile_core/utils/validate_setup.py +5 -4
  256. flowfile_frame/__init__.py +107 -50
  257. flowfile_frame/adapters.py +2 -9
  258. flowfile_frame/adding_expr.py +73 -32
  259. flowfile_frame/cloud_storage/frame_helpers.py +27 -23
  260. flowfile_frame/cloud_storage/secret_manager.py +12 -26
  261. flowfile_frame/config.py +2 -5
  262. flowfile_frame/expr.py +311 -218
  263. flowfile_frame/expr.pyi +160 -159
  264. flowfile_frame/expr_name.py +23 -23
  265. flowfile_frame/flow_frame.py +581 -489
  266. flowfile_frame/flow_frame.pyi +123 -104
  267. flowfile_frame/flow_frame_methods.py +236 -252
  268. flowfile_frame/group_frame.py +50 -20
  269. flowfile_frame/join.py +2 -2
  270. flowfile_frame/lazy.py +129 -87
  271. flowfile_frame/lazy_methods.py +83 -30
  272. flowfile_frame/list_name_space.py +55 -50
  273. flowfile_frame/selectors.py +148 -68
  274. flowfile_frame/series.py +9 -7
  275. flowfile_frame/utils.py +19 -21
  276. flowfile_worker/__init__.py +12 -4
  277. flowfile_worker/configs.py +11 -19
  278. flowfile_worker/create/__init__.py +14 -27
  279. flowfile_worker/create/funcs.py +143 -94
  280. flowfile_worker/create/models.py +139 -68
  281. flowfile_worker/create/pl_types.py +14 -15
  282. flowfile_worker/create/read_excel_tables.py +34 -41
  283. flowfile_worker/create/utils.py +22 -19
  284. flowfile_worker/external_sources/s3_source/main.py +18 -51
  285. flowfile_worker/external_sources/s3_source/models.py +34 -27
  286. flowfile_worker/external_sources/sql_source/main.py +8 -5
  287. flowfile_worker/external_sources/sql_source/models.py +13 -9
  288. flowfile_worker/flow_logger.py +10 -8
  289. flowfile_worker/funcs.py +214 -155
  290. flowfile_worker/main.py +11 -17
  291. flowfile_worker/models.py +35 -28
  292. flowfile_worker/process_manager.py +2 -3
  293. flowfile_worker/routes.py +121 -93
  294. flowfile_worker/secrets.py +9 -6
  295. flowfile_worker/spawner.py +80 -49
  296. flowfile_worker/utils.py +3 -2
  297. shared/__init__.py +2 -7
  298. shared/storage_config.py +25 -13
  299. test_utils/postgres/commands.py +3 -2
  300. test_utils/postgres/fixtures.py +9 -9
  301. test_utils/s3/commands.py +1 -1
  302. test_utils/s3/data_generator.py +3 -4
  303. test_utils/s3/demo_data_generator.py +4 -7
  304. test_utils/s3/fixtures.py +7 -5
  305. tools/migrate/README.md +56 -0
  306. tools/migrate/__init__.py +12 -0
  307. tools/migrate/__main__.py +118 -0
  308. tools/migrate/legacy_schemas.py +682 -0
  309. tools/migrate/migrate.py +610 -0
  310. tools/migrate/tests/__init__.py +0 -0
  311. tools/migrate/tests/conftest.py +21 -0
  312. tools/migrate/tests/test_migrate.py +622 -0
  313. tools/migrate/tests/test_migration_e2e.py +1009 -0
  314. tools/migrate/tests/test_node_migrations.py +843 -0
  315. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
  316. flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
  317. flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
  318. flowfile/web/static/assets/Filter-812dcbca.js +0 -164
  319. flowfile/web/static/assets/Filter-f62091b3.css +0 -20
  320. flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
  321. flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
  322. flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
  323. flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
  324. flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
  325. flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
  326. flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
  327. flowfile/web/static/assets/secretApi-538058f3.js +0 -46
  328. flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
  329. flowfile-0.4.1.dist-info/RECORD +0 -376
  330. flowfile_core/flowfile/manage/open_flowfile.py +0 -143
  331. {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/licenses/LICENSE +0 -0
  332. /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
@@ -1,35 +1,127 @@
1
- from typing import List, Dict, Tuple, Set, Optional, Literal, Callable
2
- from dataclasses import dataclass, field
3
- import polars as pl
4
- from polars import selectors
1
+ from collections.abc import Callable
5
2
  from copy import deepcopy
3
+ from dataclasses import asdict
4
+ from enum import Enum
5
+ from typing import Any, Literal, NamedTuple
6
6
 
7
- from typing import NamedTuple
8
-
7
+ import polars as pl
9
8
  from pl_fuzzy_frame_match.models import FuzzyMapping
9
+ from polars import selectors
10
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
11
+
12
+ from flowfile_core.schemas.yaml_types import (
13
+ BasicFilterYaml,
14
+ CrossJoinInputYaml,
15
+ FilterInputYaml,
16
+ FuzzyMatchInputYaml,
17
+ JoinInputsYaml,
18
+ JoinInputYaml,
19
+ SelectInputYaml,
20
+ )
21
+ from flowfile_core.types import DataType, DataTypeStr
22
+
23
+
24
+ class FilterOperator(str, Enum):
25
+ """Supported filter comparison operators."""
26
+
27
+ EQUALS = "equals"
28
+ NOT_EQUALS = "not_equals"
29
+ GREATER_THAN = "greater_than"
30
+ GREATER_THAN_OR_EQUALS = "greater_than_or_equals"
31
+ LESS_THAN = "less_than"
32
+ LESS_THAN_OR_EQUALS = "less_than_or_equals"
33
+ CONTAINS = "contains"
34
+ NOT_CONTAINS = "not_contains"
35
+ STARTS_WITH = "starts_with"
36
+ ENDS_WITH = "ends_with"
37
+ IS_NULL = "is_null"
38
+ IS_NOT_NULL = "is_not_null"
39
+ IN = "in"
40
+ NOT_IN = "not_in"
41
+ BETWEEN = "between"
42
+
43
+ def __str__(self) -> str:
44
+ return self.value
45
+
46
+ @classmethod
47
+ def from_symbol(cls, symbol: str) -> "FilterOperator":
48
+ """Convert UI symbol to FilterOperator enum."""
49
+ symbol_mapping = {
50
+ "=": cls.EQUALS,
51
+ "==": cls.EQUALS,
52
+ "!=": cls.NOT_EQUALS,
53
+ "<>": cls.NOT_EQUALS,
54
+ ">": cls.GREATER_THAN,
55
+ ">=": cls.GREATER_THAN_OR_EQUALS,
56
+ "<": cls.LESS_THAN,
57
+ "<=": cls.LESS_THAN_OR_EQUALS,
58
+ "contains": cls.CONTAINS,
59
+ "not_contains": cls.NOT_CONTAINS,
60
+ "starts_with": cls.STARTS_WITH,
61
+ "ends_with": cls.ENDS_WITH,
62
+ "is_null": cls.IS_NULL,
63
+ "is_not_null": cls.IS_NOT_NULL,
64
+ "in": cls.IN,
65
+ "not_in": cls.NOT_IN,
66
+ "between": cls.BETWEEN,
67
+ }
68
+ if symbol in symbol_mapping:
69
+ return symbol_mapping[symbol]
70
+ # Try to match by value directly
71
+ try:
72
+ return cls(symbol)
73
+ except ValueError:
74
+ raise ValueError(f"Unknown filter operator symbol: {symbol}")
75
+
76
+ def to_symbol(self) -> str:
77
+ """Convert FilterOperator to UI-friendly symbol."""
78
+ symbol_mapping = {
79
+ FilterOperator.EQUALS: "=",
80
+ FilterOperator.NOT_EQUALS: "!=",
81
+ FilterOperator.GREATER_THAN: ">",
82
+ FilterOperator.GREATER_THAN_OR_EQUALS: ">=",
83
+ FilterOperator.LESS_THAN: "<",
84
+ FilterOperator.LESS_THAN_OR_EQUALS: "<=",
85
+ FilterOperator.CONTAINS: "contains",
86
+ FilterOperator.NOT_CONTAINS: "not_contains",
87
+ FilterOperator.STARTS_WITH: "starts_with",
88
+ FilterOperator.ENDS_WITH: "ends_with",
89
+ FilterOperator.IS_NULL: "is_null",
90
+ FilterOperator.IS_NOT_NULL: "is_not_null",
91
+ FilterOperator.IN: "in",
92
+ FilterOperator.NOT_IN: "not_in",
93
+ FilterOperator.BETWEEN: "between",
94
+ }
95
+ return symbol_mapping.get(self, self.value)
96
+
97
+
98
+ FilterModeLiteral = Literal["basic", "advanced"]
99
+
100
+ FuzzyMap = FuzzyMapping
101
+
102
+ AUTO_DATA_TYPE = "Auto"
10
103
 
11
- FuzzyMap = FuzzyMapping # For backwards compatibility
12
104
 
13
105
  def get_func_type_mapping(func: str):
14
106
  """Infers the output data type of common aggregation functions."""
15
107
  if func in ["mean", "avg", "median", "std", "var"]:
16
108
  return "Float64"
17
- elif func in ['min', 'max', 'first', 'last', "cumsum", "sum"]:
109
+ elif func in ["min", "max", "first", "last", "cumsum", "sum"]:
18
110
  return None
19
- elif func in ['count', 'n_unique']:
111
+ elif func in ["count", "n_unique"]:
20
112
  return "Int64"
21
- elif func in ['concat']:
113
+ elif func in ["concat"]:
22
114
  return "Utf8"
23
115
 
24
116
 
25
117
  def string_concat(*column: str):
26
118
  """A simple wrapper to concatenate string columns in Polars."""
27
- return pl.col(column).cast(pl.Utf8).str.concat(delimiter=',')
119
+ return pl.col(column).cast(pl.Utf8).str.concat(delimiter=",")
28
120
 
29
121
 
30
122
  SideLit = Literal["left", "right"]
31
- JoinStrategy = Literal['inner', 'left', 'right', 'full', 'semi', 'anti', 'cross', 'outer']
32
- FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'damerau_levenshtein', 'indel']
123
+ JoinStrategy = Literal["inner", "left", "right", "full", "semi", "anti", "cross", "outer"]
124
+ FuzzyTypeLiteral = Literal["levenshtein", "jaro", "jaro_winkler", "hamming", "damerau_levenshtein", "indel"]
33
125
 
34
126
 
35
127
  def construct_join_key_name(side: SideLit, column_name: str) -> str:
@@ -39,452 +131,673 @@ def construct_join_key_name(side: SideLit, column_name: str) -> str:
39
131
 
40
132
class JoinKeyRename(NamedTuple):
    """Represents the renaming of a join key from its original to a temporary name."""

    # Name of the join key column before the temporary rename.
    original_name: str
    # Temporary name the join key column is renamed to.
    temp_name: str
44
137
 
45
138
 
46
139
class JoinKeyRenameResponse(NamedTuple):
    """Contains a list of join key renames for one side of a join."""

    # Side of the join ("left" or "right") these renames belong to.
    side: SideLit
    # One entry per join key on that side.
    join_key_renames: list[JoinKeyRename]
50
144
 
51
145
 
52
146
class FullJoinKeyResponse(NamedTuple):
    """Holds the join key rename responses for both sides of a join."""

    # Renames for the left input.
    left: JoinKeyRenameResponse
    # Renames for the right input.
    right: JoinKeyRenameResponse
56
151
 
57
152
 
58
- @dataclass
59
- class SelectInput:
153
class SelectInput(BaseModel):
    """Defines how a single column should be selected, renamed, or type-cast.

    This is a core building block for any operation that involves column manipulation.
    It holds all the configuration for a single field in a selection operation.

    Attributes:
        old_name: Original column name; also the identity used by
            ``__hash__`` and ``__eq__``.
        new_name: Output column name. Defaults to ``old_name`` when omitted.
        data_type: Target type name (user-friendly or Polars name), if any.
        data_type_change: Whether ``data_type`` represents an explicit cast;
            only then is ``data_type`` written by ``to_yaml_dict``.
        is_altered: Set to ``True`` automatically when ``new_name`` differs
            from ``old_name``.
        keep: Whether the column is kept; ``False`` is the only value that is
            written by ``to_yaml_dict``.
    """

    model_config = ConfigDict(frozen=False)

    old_name: str
    original_position: int | None = None
    new_name: str | None = None
    data_type: str | None = None
    data_type_change: bool = False
    join_key: bool = False
    is_altered: bool = False
    position: int | None = None
    is_available: bool = True
    keep: bool = True

    def __init__(self, old_name: str = None, new_name: str = None, **data):
        """Allow ``SelectInput("a", "b")`` in addition to keyword construction."""
        if old_name is not None:
            data["old_name"] = old_name
        if new_name is not None:
            data["new_name"] = new_name
        super().__init__(**data)

    def to_yaml_dict(self) -> SelectInputYaml:
        """Serialize for YAML output - only user-relevant fields.

        ``new_name`` is emitted only when it differs from ``old_name``,
        ``keep`` only when it is ``False``, and ``data_type`` only when
        ``data_type_change`` is set and a type is present.
        """
        result: SelectInputYaml = {"old_name": self.old_name}
        if self.new_name != self.old_name:
            result["new_name"] = self.new_name
        if not self.keep:
            result["keep"] = self.keep
        if self.data_type_change and self.data_type:
            result["data_type"] = self.data_type
        return result

    @classmethod
    def from_yaml_dict(cls, data: dict) -> "SelectInput":
        """Load from slim YAML format.

        Raises:
            KeyError: If ``data`` has no ``"old_name"`` entry.
        """
        old_name = data["old_name"]
        new_name = data.get("new_name", old_name)
        return cls(
            old_name=old_name,
            new_name=new_name,
            keep=data.get("keep", True),
            data_type=data.get("data_type"),
            # The slim format only stores a type when a cast was requested.
            data_type_change=data.get("data_type") is not None,
            is_altered=old_name != new_name,
        )

    @model_validator(mode="after")
    def set_default_new_name(self):
        """If new_name is None, default it to old_name; flag real renames as altered."""
        if self.new_name is None:
            self.new_name = self.old_name
        if self.old_name != self.new_name:
            self.is_altered = True
        return self

    def __hash__(self):
        """Allow SelectInput to be used in sets and as dict keys (keyed on old_name)."""
        return hash(self.old_name)

    def __eq__(self, other):
        """Two SelectInputs are equal when their old_name matches (consistent with __hash__)."""
        if not isinstance(other, SelectInput):
            return False
        return self.old_name == other.old_name

    @property
    def polars_type(self) -> str | None:
        """Translates a user-friendly type name to a Polars data type string.

        ``"string"``, ``"integer"`` and ``"double"`` (case-insensitive) map to
        ``"Utf8"``, ``"Int64"`` and ``"Float64"``; any other name is returned
        unchanged. Returns ``None`` when no ``data_type`` is set, instead of
        failing on ``None.lower()``.
        """
        if self.data_type is None:
            return None
        friendly_to_polars = {"string": "Utf8", "integer": "Int64", "double": "Float64"}
        return friendly_to_polars.get(self.data_type.lower(), self.data_type)
104
235
 
105
236
 
106
- @dataclass
107
- class FieldInput:
237
class FieldInput(BaseModel):
    """Represents a single field with its name and data type, typically for defining an output column."""

    # Name of the field.
    name: str
    # Declared type of the field; defaults to AUTO_DATA_TYPE ("Auto").
    # NOTE(review): "Auto" presumably requests type inference - confirm with the consumer.
    data_type: DataType | Literal["Auto"] | DataTypeStr | None = AUTO_DATA_TYPE
115
242
 
116
243
 
117
- @dataclass
118
- class FunctionInput:
244
class FunctionInput(BaseModel):
    """Defines a formula to be applied, including the output field information."""

    field: FieldInput
    function: str

    def __init__(self, field: FieldInput = None, function: str = None, **data):
        """Allow ``FunctionInput(field, function)`` as well as keyword construction.

        Arguments left as ``None`` are not forwarded, so the normal
        required-field validation still applies to them.
        """
        explicit = {"field": field, "function": function}
        data.update({key: val for key, val in explicit.items() if val is not None})
        super().__init__(**data)
123
256
 
124
- @dataclass
125
- class BasicFilter:
126
- """Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value')."""
127
- field: str = ''
128
- filter_type: str = ''
129
- filter_value: str = ''
130
257
 
258
class BasicFilter(BaseModel):
    """A simple, single-condition filter (e.g., 'column' 'equals' 'value').

    Attributes:
        field: The column name to filter on.
        operator: The comparison operator (FilterOperator enum value or symbol).
        value: The value to compare against.
        value2: Second value for BETWEEN operator (optional).
        filter_type: Legacy name of ``operator``, kept for backward compatibility.
        filter_value: Legacy name of ``value``, kept for backward compatibility.
    """

    field: str = ""
    operator: FilterOperator | str = FilterOperator.EQUALS
    value: str = ""
    value2: str | None = None  # For BETWEEN operator

    # Keep old field names for backward compatibility
    filter_type: str | None = None
    filter_value: str | None = None

    def __init__(
        self,
        field: str = None,
        operator: FilterOperator | str = None,
        value: str = None,
        value2: str = None,
        # Backward compatibility parameters
        filter_type: str = None,
        filter_value: str = None,
        **data,
    ):
        """Build the filter, accepting both the current and the legacy argument names.

        The legacy ``filter_type`` / ``filter_value`` are only used when the
        corresponding new argument (``operator`` / ``value``) is not given.
        """
        # New-style argument wins; legacy one is the fallback.
        chosen_operator = filter_type if operator is None else operator
        if chosen_operator is not None:
            data["operator"] = chosen_operator

        chosen_value = filter_value if value is None else value
        if chosen_value is not None:
            data["value"] = chosen_value

        if field is not None:
            data["field"] = field
        if value2 is not None:
            data["value2"] = value2

        super().__init__(**data)

    @model_validator(mode="after")
    def normalize_operator(self):
        """Convert a string operator to the FilterOperator enum when possible."""
        if not isinstance(self.operator, str):
            return self
        try:
            self.operator = FilterOperator.from_symbol(self.operator)
        except ValueError:
            # Unknown symbols stay as plain strings (backward compatibility).
            pass
        return self

    def get_operator(self) -> FilterOperator:
        """Return the operator as a FilterOperator, converting from a symbol if needed."""
        current = self.operator
        return current if isinstance(current, FilterOperator) else FilterOperator.from_symbol(current)

    def to_yaml_dict(self) -> BasicFilterYaml:
        """Serialize for YAML output; ``value2`` is included only when truthy."""
        operator_repr = self.operator
        if isinstance(operator_repr, FilterOperator):
            operator_repr = operator_repr.value
        result: BasicFilterYaml = {
            "field": self.field,
            "operator": operator_repr,
            "value": self.value,
        }
        if self.value2:
            result["value2"] = self.value2
        return result

    @classmethod
    def from_yaml_dict(cls, data: dict) -> "BasicFilter":
        """Load from YAML format, falling back to defaults for missing keys."""
        loaded = {
            "field": data.get("field", ""),
            "operator": data.get("operator", FilterOperator.EQUALS),
            "value": data.get("value", ""),
            "value2": data.get("value2"),
        }
        return cls(**loaded)
144
343
 
145
- @property
146
- def old_cols(self) -> Set:
147
- """Returns a set of original column names to be kept in the selection."""
148
- return set(v.old_name for v in self.renames if v.keep)
149
344
 
150
- @property
151
- def new_cols(self) -> Set:
152
- """Returns a set of new (renamed) column names to be kept in the selection."""
153
- return set(v.new_name for v in self.renames if v.keep)
345
class FilterInput(BaseModel):
    """Defines the settings for a filter operation, supporting basic or advanced (expression-based) modes.

    Attributes:
        mode: The filter mode - "basic" or "advanced".
        basic_filter: The basic filter configuration (used when mode="basic").
        advanced_filter: The advanced filter expression string (used when mode="advanced").
        filter_type: Legacy name of ``mode``, kept for backward compatibility.
    """

    mode: FilterModeLiteral = "basic"
    basic_filter: BasicFilter | None = None
    advanced_filter: str = ""

    # Keep old field name for backward compatibility
    filter_type: str | None = None

    def __init__(
        self,
        mode: FilterModeLiteral = None,
        basic_filter: BasicFilter = None,
        advanced_filter: str = None,
        # Backward compatibility
        filter_type: str = None,
        **data,
    ):
        """Build the filter settings, accepting the legacy ``filter_type`` as ``mode``.

        ``filter_type`` is only used when ``mode`` is not given. Arguments
        left as ``None`` are not forwarded, so field defaults apply.
        """
        # Handle backward compatibility: filter_type -> mode
        if filter_type is not None and mode is None:
            data["mode"] = filter_type
        elif mode is not None:
            data["mode"] = mode

        if advanced_filter is not None:
            data["advanced_filter"] = advanced_filter
        if basic_filter is not None:
            data["basic_filter"] = basic_filter

        super().__init__(**data)

    @model_validator(mode="after")
    def ensure_basic_filter(self):
        """Ensure basic_filter exists when mode is basic."""
        if self.mode == "basic" and self.basic_filter is None:
            self.basic_filter = BasicFilter()
        return self

    def is_advanced(self) -> bool:
        """Check if filter is in advanced mode."""
        return self.mode == "advanced"

    def to_yaml_dict(self) -> FilterInputYaml:
        """Serialize for YAML output.

        Only the payload that belongs to the active mode is written.
        """
        result: FilterInputYaml = {"mode": self.mode}
        if self.mode == "basic" and self.basic_filter:
            result["basic_filter"] = self.basic_filter.to_yaml_dict()
        elif self.mode == "advanced" and self.advanced_filter:
            result["advanced_filter"] = self.advanced_filter
        return result

    @classmethod
    def from_yaml_dict(cls, data: dict) -> "FilterInput":
        """Load from YAML format.

        A ``basic_filter`` key that is present but null (e.g. an empty
        ``basic_filter:`` entry in hand-edited YAML) is treated like a missing
        key instead of failing while parsing it.
        """
        mode = data.get("mode", "basic")
        raw_basic_filter = data.get("basic_filter")
        basic_filter = None
        if raw_basic_filter is not None:
            basic_filter = BasicFilter.from_yaml_dict(raw_basic_filter)
        return cls(
            mode=mode,
            basic_filter=basic_filter,
            advanced_filter=data.get("advanced_filter", ""),
        )
167
415
 
168
- @property
169
- def drop_columns(self) -> List[SelectInput]:
170
- """Returns a list of column names that are marked to be dropped from the selection."""
171
- return [v for v in self.renames if not v.keep and v.is_available]
172
416
 
173
- @property
174
- def non_jk_drop_columns(self) -> List[SelectInput]:
175
- return [v for v in self.renames if not v.keep and v.is_available and not v.join_key]
417
class SelectInputs(BaseModel):
    """A container for a list of `SelectInput` objects (pure data, no logic)."""

    renames: list[SelectInput] = Field(default_factory=list)

    def __init__(self, renames: list[SelectInput] = None, **kwargs):
        """Accept ``renames`` positionally; ``None`` is replaced by an empty list."""
        kwargs["renames"] = [] if renames is None else renames
        super().__init__(**kwargs)

    def to_yaml_dict(self) -> JoinInputsYaml:
        """Serialize for YAML output under the ``"select"`` key."""
        serialized = [entry.to_yaml_dict() for entry in self.renames]
        return {"select": serialized}

    @classmethod
    def from_yaml_dict(cls, data: dict) -> "SelectInputs":
        """Load from slim YAML format. Supports both 'select' (new) and 'renames' (internal)."""
        fallback = data.get("renames", [])
        raw_items = data.get("select", fallback)
        parsed = [SelectInput.from_yaml_dict(raw) for raw in raw_items]
        return cls(renames=parsed)

    @classmethod
    def create_from_list(cls, col_list: list[str]) -> "SelectInputs":
        """Creates a SelectInputs object from a simple list of column names."""
        selections = [SelectInput(old_name=name) for name in col_list]
        return cls(renames=selections)

    @classmethod
    def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame) -> "SelectInputs":
        """Creates a SelectInputs object from a Polars DataFrame's columns."""
        selections = [SelectInput(old_name=name) for name in df.columns]
        return cls(renames=selections)

    def remove_select_input(self, old_key: str) -> None:
        """Removes a SelectInput from the list based on its original name."""
        remaining = []
        for entry in self.renames:
            if entry.old_name == old_key:
                continue
            remaining.append(entry)
        self.renames = remaining
210
452
 
211
453
 
212
454
class JoinInputs(SelectInputs):
    """Data model for join-specific select inputs (extends SelectInputs)."""

    def __init__(self, renames: list[SelectInput] = None, **kwargs):
        """Accept ``renames`` positionally; ``None`` is replaced by an empty list."""
        kwargs["renames"] = [] if renames is None else renames
        super().__init__(**kwargs)
235
463
 
236
464
 
237
- @dataclass
238
- class JoinMap:
465
class JoinMap(BaseModel):
    """Defines a single mapping between a left and right column for a join key."""

    left_col: str | None = None
    right_col: str | None = None

    def __init__(self, left_col: str = None, right_col: str = None, **data):
        """Allow ``JoinMap("a", "b")`` as well as keyword construction."""
        for key, column in (("left_col", left_col), ("right_col", right_col)):
            if column is not None:
                data[key] = column
        super().__init__(**data)

    @model_validator(mode="after")
    def set_default_right_col(self):
        """Default right_col to left_col when it was not provided."""
        # Assign only when missing so an explicitly given value is left untouched.
        if self.right_col is None:
            self.right_col = self.left_col
        return self
272
484
 
273
- def add_new_select_column(self, select_input: SelectInput, side: str):
274
- """Adds a new column to the selection for either the left or right side."""
275
- selects = self.right_select if side == 'right' else self.left_select
276
- select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
277
- selects.__add__(select_input)
278
485
 
486
class CrossJoinInput(BaseModel):
    """Data model for cross join operations.

    ``left_select`` and ``right_select`` accept several convenience formats
    (see ``_parse_select``) that are normalised to ``JoinInputs`` before
    field validation runs.
    """

    left_select: JoinInputs
    right_select: JoinInputs

    @model_validator(mode="before")
    @classmethod
    def parse_inputs(cls, data: Any) -> Any:
        """Parse flexible input formats before validation."""
        if isinstance(data, dict):
            # NOTE(review): this model declares no `join_mapping` field; the
            # branch mirrors JoinInput and is kept so existing inputs behave
            # the same. Confirm whether it is still needed here.
            if "join_mapping" in data:
                data["join_mapping"] = cls._parse_join_mapping(data["join_mapping"])

            # Parse left_select
            if "left_select" in data:
                data["left_select"] = cls._parse_select(data["left_select"])

            # Parse right_select
            if "right_select" in data:
                data["right_select"] = cls._parse_select(data["right_select"])

        return data

    @staticmethod
    def _parse_join_mapping(join_mapping: Any) -> list[JoinMap]:
        """Parse various join_mapping formats into a list of JoinMap.

        Accepts a list of JoinMap / dict / 2-element pair / column-name
        items, a single JoinMap, a single column name, or a 2-tuple
        ``(left, right)``.

        Raises:
            ValueError: If the value or one of its items has an unsupported shape.
        """
        # List of mapping items
        if isinstance(join_mapping, list):
            result = []
            for jm in join_mapping:
                if isinstance(jm, JoinMap):
                    result.append(jm)
                elif isinstance(jm, dict):
                    result.append(JoinMap(**jm))
                elif isinstance(jm, (tuple, list)) and len(jm) == 2:
                    result.append(JoinMap(left_col=jm[0], right_col=jm[1]))
                elif isinstance(jm, str):
                    result.append(JoinMap(left_col=jm, right_col=jm))
                else:
                    raise ValueError(f"Invalid join mapping item: {jm}")
            return result

        # Single JoinMap
        if isinstance(join_mapping, JoinMap):
            return [join_mapping]

        # String: same column on both sides
        if isinstance(join_mapping, str):
            return [JoinMap(left_col=join_mapping, right_col=join_mapping)]

        # Tuple: (left, right)
        if isinstance(join_mapping, tuple) and len(join_mapping) == 2:
            return [JoinMap(left_col=join_mapping[0], right_col=join_mapping[1])]

        raise ValueError(f"Invalid join_mapping format: {type(join_mapping)}")

    @staticmethod
    def _parse_select(select: Any) -> JoinInputs:
        """Parse various select input formats into JoinInputs.

        Accepts a ``JoinInputs`` (returned as is), a plain ``SelectInputs``
        (its ``renames`` are wrapped in a ``JoinInputs``), a list made up
        entirely of ``SelectInput`` objects, column names or dicts, or a dict
        with a ``"select"`` (slim YAML) or ``"renames"`` (internal) key.

        Raises:
            ValueError: If the value matches none of the supported formats.
        """
        # Already JoinInputs
        if isinstance(select, JoinInputs):
            return select

        # Base-class container (e.g. from SelectInputs.create_from_list):
        # reuse its entries rather than rejecting it.
        if isinstance(select, SelectInputs):
            return JoinInputs(renames=select.renames)

        # Homogeneous list of SelectInput objects, names, or dicts
        if isinstance(select, list):
            if all(isinstance(s, SelectInput) for s in select):
                return JoinInputs(renames=select)
            elif all(isinstance(s, str) for s in select):
                return JoinInputs(renames=[SelectInput(old_name=s) for s in select])
            elif all(isinstance(s, dict) for s in select):
                return JoinInputs(renames=[SelectInput(**s) for s in select])

        # Dict with 'select' (new YAML) or 'renames' (internal) key
        if isinstance(select, dict):
            if "select" in select:
                return JoinInputs(renames=[SelectInput.from_yaml_dict(s) for s in select["select"]])
            if "renames" in select:
                return JoinInputs(**select)

        raise ValueError(f"Invalid select format: {type(select)}")

    def __init__(
        self,
        left_select: JoinInputs | list[SelectInput] | list[str] = None,
        right_select: JoinInputs | list[SelectInput] | list[str] = None,
        **data,
    ):
        """Custom init for backward compatibility with positional arguments."""
        if left_select is not None:
            data["left_select"] = left_select
        if right_select is not None:
            data["right_select"] = right_select
        super().__init__(**data)

    def to_yaml_dict(self) -> CrossJoinInputYaml:
        """Serialize for YAML output."""
        return {
            "left_select": self.left_select.to_yaml_dict(),
            "right_select": self.right_select.to_yaml_dict(),
        }

    def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
        """Adds a new column to the selection for either the left or right side.

        Any ``side`` other than ``"right"`` targets the left selection.
        """
        target_input = self.right_select if side == "right" else self.left_select
        if select_input.new_name is None:
            select_input.new_name = select_input.old_name
        target_input.renames.append(select_input)
346
595
 
347
- def set_join_keys(self):
348
- """Marks the `SelectInput` objects corresponding to join keys."""
349
- [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
350
- [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
351
596
 
352
- def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
353
- """Gets the temporary rename mappings for the join keys on both sides."""
354
- return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
355
- self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
356
-
357
- def get_names_for_table_rename(self) -> List[JoinMap]:
358
- new_mappings: List[JoinMap] = []
359
- left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
360
- for join_map in self.join_mapping:
361
- new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col, join_map.left_col),
362
- right_rename_table.get(join_map.right_col, join_map.right_col)
363
- )
364
- )
365
- return new_mappings
597
+ class JoinInput(BaseModel):
598
+ """Data model for standard SQL-style join operations."""
366
599
 
367
- @property
368
- def _left_join_keys(self) -> Set:
369
- """Returns a set of the left-side join key column names."""
370
- return set(jm.left_col for jm in self.join_mapping)
600
+ join_mapping: list[JoinMap]
601
+ left_select: JoinInputs
602
+ right_select: JoinInputs
603
+ how: JoinStrategy = "inner"
371
604
 
372
- @property
373
- def _right_join_keys(self) -> Set:
374
- """Returns a set of the right-side join key column names."""
375
- return set(jm.right_col for jm in self.join_mapping)
605
+ @model_validator(mode="before")
606
+ @classmethod
607
+ def parse_inputs(cls, data: Any) -> Any:
608
+ """Parse flexible input formats before validation."""
609
+ if isinstance(data, dict):
610
+ # Parse join_mapping
611
+ if "join_mapping" in data:
612
+ data["join_mapping"] = cls._parse_join_mapping(data["join_mapping"])
376
613
 
377
- @property
378
- def left_join_keys(self) -> List[str]:
379
- """Returns an ordered list of the left-side join key column names to be used in the join."""
380
- return [jm.left_col for jm in self.used_join_mapping]
614
+ # Parse left_select
615
+ if "left_select" in data:
616
+ data["left_select"] = cls._parse_select(data["left_select"])
381
617
 
382
- @property
383
- def right_join_keys(self) -> List[str]:
384
- """Returns an ordered list of the right-side join key column names to be used in the join."""
385
- return [jm.right_col for jm in self.used_join_mapping]
618
+ # Parse right_select
619
+ if "right_select" in data:
620
+ data["right_select"] = cls._parse_select(data["right_select"])
386
621
 
387
- @property
388
- def overlapping_records(self):
389
- if self.how in ('left', 'right', 'inner'):
390
- return self.left_select.new_cols & self.right_select.new_cols
391
- else:
392
- return self.left_select.new_cols & self.right_select.new_cols
622
+ return data
393
623
 
394
- def auto_rename(self):
395
- """Automatically renames columns on the right side to prevent naming conflicts."""
396
- self.set_join_keys()
397
- overlapping_records = self.overlapping_records
398
- while len(overlapping_records) > 0:
399
- for right_col in self.right_select.renames:
400
- if right_col.new_name in overlapping_records:
401
- right_col.new_name = right_col.new_name + '_right'
402
- overlapping_records = self.overlapping_records
624
+ @staticmethod
625
+ def _parse_join_mapping(join_mapping: Any) -> list[JoinMap]:
626
+ """Parse various join_mapping formats."""
627
+ # Already a list of JoinMaps
628
+ if isinstance(join_mapping, list):
629
+ result = []
630
+ for jm in join_mapping:
631
+ if isinstance(jm, JoinMap):
632
+ result.append(jm)
633
+ elif isinstance(jm, dict):
634
+ result.append(JoinMap(**jm))
635
+ elif isinstance(jm, (tuple, list)) and len(jm) == 2:
636
+ result.append(JoinMap(left_col=jm[0], right_col=jm[1]))
637
+ elif isinstance(jm, str):
638
+ result.append(JoinMap(left_col=jm, right_col=jm))
639
+ else:
640
+ raise ValueError(f"Invalid join mapping item: {jm}")
641
+ return result
642
+
643
+ # Single JoinMap
644
+ if isinstance(join_mapping, JoinMap):
645
+ return [join_mapping]
646
+
647
+ # String: same column on both sides
648
+ if isinstance(join_mapping, str):
649
+ return [JoinMap(left_col=join_mapping, right_col=join_mapping)]
650
+
651
+ # Tuple: (left, right)
652
+ if isinstance(join_mapping, tuple) and len(join_mapping) == 2:
653
+ return [JoinMap(left_col=join_mapping[0], right_col=join_mapping[1])]
654
+
655
+ raise ValueError(f"Invalid join_mapping format: {type(join_mapping)}")
656
+
657
+ @staticmethod
658
+ def _parse_select(select: Any) -> JoinInputs:
659
+ """Parse various select input formats."""
660
+ # Already JoinInputs
661
+ if isinstance(select, JoinInputs):
662
+ return select
663
+
664
+ # List of SelectInput objects
665
+ if isinstance(select, list):
666
+ if all(isinstance(s, SelectInput) for s in select):
667
+ return JoinInputs(renames=select)
668
+ elif all(isinstance(s, str) for s in select):
669
+ return JoinInputs(renames=[SelectInput(old_name=s) for s in select])
670
+ elif all(isinstance(s, dict) for s in select):
671
+ return JoinInputs(renames=[SelectInput(**s) for s in select])
672
+
673
+ # Dict with 'select' (new YAML) or 'renames' (internal) key
674
+ if isinstance(select, dict):
675
+ if "select" in select:
676
+ return JoinInputs(renames=[SelectInput.from_yaml_dict(s) for s in select["select"]])
677
+ if "renames" in select:
678
+ return JoinInputs(**select)
679
+
680
+ raise ValueError(f"Invalid select format: {type(select)}")
681
+
682
+ def __init__(
683
+ self,
684
+ join_mapping: list[JoinMap] | JoinMap | tuple[str, str] | str | list[tuple] | list[str] = None,
685
+ left_select: JoinInputs | list[SelectInput] | list[str] = None,
686
+ right_select: JoinInputs | list[SelectInput] | list[str] = None,
687
+ how: JoinStrategy = "inner",
688
+ **data,
689
+ ):
690
+ """Custom init for backward compatibility with positional arguments."""
691
+ if join_mapping is not None:
692
+ data["join_mapping"] = join_mapping
693
+ if left_select is not None:
694
+ data["left_select"] = left_select
695
+ if right_select is not None:
696
+ data["right_select"] = right_select
697
+ if how is not None:
698
+ data["how"] = how
699
+
700
+ super().__init__(**data)
701
+
702
+ def to_yaml_dict(self) -> JoinInputYaml:
703
+ """Serialize for YAML output."""
704
+ return {
705
+ "join_mapping": [{"left_col": jm.left_col, "right_col": jm.right_col} for jm in self.join_mapping],
706
+ "left_select": self.left_select.to_yaml_dict(),
707
+ "right_select": self.right_select.to_yaml_dict(),
708
+ "how": self.how,
709
+ }
710
+
711
+ def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
712
+ """Adds a new column to the selection for either the left or right side."""
713
+ target_input = self.right_select if side == "right" else self.left_select
714
+ if select_input.new_name is None:
715
+ select_input.new_name = select_input.old_name
716
+ target_input.renames.append(select_input)
403
717
 
404
- @property
405
- def used_join_mapping(self) -> List[JoinMap]:
406
- """Returns the final join mapping after applying all renames and transformations."""
407
- new_mappings: List[JoinMap] = []
408
- left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
409
- left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
410
- right_join_rename_mapping: Dict[str, str] = self.right_select.get_join_key_rename_mapping("right")
411
- for join_map in self.join_mapping:
412
- # del self.right_select.rename_table, self.left_select.rename_table
413
- new_mappings.append(JoinMap(left_join_rename_mapping.get(left_rename_table.get(join_map.left_col, join_map.left_col)),
414
- right_join_rename_mapping.get(right_rename_table.get(join_map.right_col, join_map.right_col))
415
- )
416
- )
417
- return new_mappings
418
718
 
719
+ class FuzzyMatchInput(BaseModel):
720
+ """Data model for fuzzy matching join operations."""
419
721
 
420
- @dataclass
421
- class FuzzyMatchInput(JoinInput):
422
- """Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
423
- join_mapping: List[FuzzyMapping]
722
+ join_mapping: list[FuzzyMapping]
723
+ left_select: JoinInputs
724
+ right_select: JoinInputs
725
+ how: JoinStrategy = "inner"
424
726
  aggregate_output: bool = False
425
727
 
426
- @staticmethod
427
- def parse_fuzz_mapping(fuzz_mapping: List[FuzzyMapping] | Tuple[str, str] | str) -> List[FuzzyMapping]:
428
- if isinstance(fuzz_mapping, (tuple, list)):
429
- assert len(fuzz_mapping) > 0
430
- if all(isinstance(fm, dict) for fm in fuzz_mapping):
431
- fuzz_mapping = [FuzzyMapping(**fm) for fm in fuzz_mapping]
728
+ def __init__(
729
+ self,
730
+ left_select: JoinInputs | list[SelectInput] | list[str] = None,
731
+ right_select: JoinInputs | list[SelectInput] | list[str] = None,
732
+ **data,
733
+ ):
734
+ """Custom init for backward compatibility with positional arguments."""
735
+ if left_select is not None:
736
+ data["left_select"] = left_select
737
+ if right_select is not None:
738
+ data["right_select"] = right_select
739
+
740
+ super().__init__(**data)
741
+
742
+ def to_yaml_dict(self) -> FuzzyMatchInputYaml:
743
+ """Serialize for YAML output."""
744
+ return {
745
+ "join_mapping": [asdict(jm) for jm in self.join_mapping],
746
+ "left_select": self.left_select.to_yaml_dict(),
747
+ "right_select": self.right_select.to_yaml_dict(),
748
+ "how": self.how,
749
+ "aggregate_output": self.aggregate_output,
750
+ }
751
+
752
+ def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
753
+ """Adds a new column to the selection for either the left or right side."""
754
+ target_input = self.right_select if side == "right" else self.left_select
755
+ if select_input.new_name is None:
756
+ select_input.new_name = select_input.old_name
757
+ target_input.renames.append(select_input)
432
758
 
433
- if not isinstance(fuzz_mapping[0], FuzzyMapping):
434
- assert len(fuzz_mapping) <= 2
435
- if len(fuzz_mapping) == 2:
436
- assert isinstance(fuzz_mapping[0], str) and isinstance(fuzz_mapping[1], str)
437
- fuzz_mapping = [FuzzyMapping(*fuzz_mapping)]
438
- elif isinstance(fuzz_mapping[0], str):
439
- fuzz_mapping = [FuzzyMapping(fuzz_mapping[0], fuzz_mapping[0])]
440
- elif isinstance(fuzz_mapping, str):
441
- fuzz_mapping = [FuzzyMapping(fuzz_mapping, fuzz_mapping)]
442
- elif isinstance(fuzz_mapping, FuzzyMapping):
443
- fuzz_mapping = [fuzz_mapping]
444
- else:
445
- raise Exception('No valid join mapping as input')
446
- return fuzz_mapping
447
-
448
- def __init__(self, join_mapping: List[FuzzyMapping] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
449
- right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
450
- self.join_mapping = self.parse_fuzz_mapping(join_mapping)
451
- self.left_select = self.parse_select(left_select)
452
- self.right_select = self.parse_select(right_select)
453
- self.how = how
454
- for jm in self.join_mapping:
455
-
456
- if jm.right_col not in {v.old_name for v in self.right_select.renames}:
457
- self.right_select.append(SelectInput(jm.right_col, keep=False, join_key=True))
458
- if jm.left_col not in {v.old_name for v in self.left_select.renames}:
459
- self.left_select.append(SelectInput(jm.left_col, keep=False, join_key=True))
460
- [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
461
- [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
462
- self.aggregate_output = aggregate_output
759
+ @staticmethod
760
+ def _parse_select(select: Any) -> JoinInputs:
761
+ """Parse various select input formats."""
762
+ # Already JoinInputs
763
+ if isinstance(select, JoinInputs):
764
+ return select
765
+
766
+ # List of SelectInput objects
767
+ if isinstance(select, list):
768
+ if all(isinstance(s, SelectInput) for s in select):
769
+ return JoinInputs(renames=select)
770
+ elif all(isinstance(s, str) for s in select):
771
+ return JoinInputs(renames=[SelectInput(old_name=s) for s in select])
772
+ elif all(isinstance(s, dict) for s in select):
773
+ return JoinInputs(renames=[SelectInput(**s) for s in select])
774
+
775
+ # Dict with 'select' (new YAML) or 'renames' (internal) key
776
+ if isinstance(select, dict):
777
+ if "select" in select:
778
+ return JoinInputs(renames=[SelectInput.from_yaml_dict(s) for s in select["select"]])
779
+ if "renames" in select:
780
+ return JoinInputs(**select)
781
+
782
+ raise ValueError(f"Invalid select format: {type(select)}")
783
+
784
+ @model_validator(mode="before")
785
+ @classmethod
786
+ def parse_inputs(cls, data: Any) -> Any:
787
+ """Parse flexible input formats before validation."""
788
+ if isinstance(data, dict):
789
+ # Parse left_select
790
+ if "left_select" in data:
791
+ data["left_select"] = cls._parse_select(data["left_select"])
463
792
 
464
- @property
465
- def overlapping_records(self):
466
- return self.left_select.new_cols & self.right_select.new_cols
793
+ # Parse right_select
794
+ if "right_select" in data:
795
+ data["right_select"] = cls._parse_select(data["right_select"])
467
796
 
468
- @property
469
- def fuzzy_maps(self) -> List[FuzzyMapping]:
470
- """Returns the final fuzzy mappings after applying all column renames."""
471
- new_mappings = []
472
- left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
473
- for org_fuzzy_map in self.join_mapping:
474
- right_col = right_rename_table.get(org_fuzzy_map.right_col)
475
- left_col = left_rename_table.get(org_fuzzy_map.left_col)
476
- if right_col != org_fuzzy_map.right_col or left_col != org_fuzzy_map.left_col:
477
- new_mapping = deepcopy(org_fuzzy_map)
478
- new_mapping.left_col = left_col
479
- new_mapping.right_col = right_col
480
- new_mappings.append(new_mapping)
481
- else:
482
- new_mappings.append(org_fuzzy_map)
483
- return new_mappings
797
+ return data
484
798
 
485
799
 
486
- @dataclass
487
- class AggColl:
800
+ class AggColl(BaseModel):
488
801
  """
489
802
  A data class that represents a single aggregation operation for a group by operation.
490
803
 
@@ -493,7 +806,7 @@ class AggColl:
493
806
  old_name : str
494
807
  The name of the column in the original DataFrame to be aggregated.
495
808
 
496
- agg : Any
809
+ agg : str
497
810
  The aggregation function to use. This can be a string representing a built-in function or a custom function.
498
811
 
499
812
  new_name : Optional[str]
@@ -513,42 +826,57 @@ class AggColl:
513
826
  output_type='float'
514
827
  )
515
828
  """
829
+
516
830
  old_name: str
517
831
  agg: str
518
- new_name: Optional[str]
519
- output_type: Optional[str] = None
520
-
521
- def __init__(self, old_name: str, agg: str, new_name: str = None, output_type: str = None):
522
- """Initializes an aggregation column with its source, function, and new name."""
523
- self.old_name = str(old_name)
524
- if agg != 'groupby':
525
- self.new_name = new_name if new_name is not None else self.old_name + "_" + agg
526
- else:
527
- self.new_name = new_name if new_name is not None else self.old_name
528
- self.output_type = output_type if output_type is not None else get_func_type_mapping(agg)
529
- self.agg = agg
832
+ new_name: str | None = None
833
+ output_type: str | None = None
834
+
835
+ def __init__(self, old_name: str, agg: str, new_name: str | None = None, output_type: str | None = None):
836
+ data = {"old_name": old_name, "agg": agg}
837
+ if new_name is not None:
838
+ data["new_name"] = new_name
839
+ if output_type is not None:
840
+ data["output_type"] = output_type
841
+
842
+ super().__init__(**data)
843
+
844
+ @model_validator(mode="after")
845
+ def set_defaults(self):
846
+ """Set default new_name and output_type based on agg function."""
847
+ # Set new_name
848
+ if self.new_name is None:
849
+ if self.agg != "groupby":
850
+ self.new_name = self.old_name + "_" + self.agg
851
+ else:
852
+ self.new_name = self.old_name
853
+
854
+ # Set output_type
855
+ if self.output_type is None:
856
+ self.output_type = get_func_type_mapping(self.agg)
857
+
858
+ # Ensure old_name is a string
859
+ self.old_name = str(self.old_name)
860
+
861
+ return self
530
862
 
531
863
  @property
532
864
  def agg_func(self):
533
865
  """Returns the corresponding Polars aggregation function from the `agg` string."""
534
- if self.agg == 'groupby':
866
+ if self.agg == "groupby":
535
867
  return self.agg
536
- elif self.agg == 'concat':
868
+ elif self.agg == "concat":
537
869
  return string_concat
538
870
  else:
539
871
  return getattr(pl, self.agg) if isinstance(self.agg, str) else self.agg
540
872
 
541
873
 
542
- @dataclass
543
- class GroupByInput:
874
+ class GroupByInput(BaseModel):
544
875
  """
545
876
  A data class that represents the input for a group by operation.
546
877
 
547
878
  Attributes
548
879
  ----------
549
- group_columns : List[str]
550
- A list of column names to group the DataFrame by. These column(s) will be set as the DataFrame index.
551
-
552
880
  agg_cols : List[AggColl]
553
881
  A list of `AggColl` objects that specify the aggregation operations to perform on the DataFrame columns
554
882
  after grouping. Each `AggColl` object should specify the column to be aggregated and the aggregation
@@ -557,32 +885,41 @@ class GroupByInput:
557
885
  Example
558
886
  --------
559
887
  group_by_input = GroupByInput(
560
- agg_cols=[AggColl(old_name='ix', agg='groupby'), AggColl(old_name='groups', agg='groupby'), AggColl(old_name='col1', agg='sum'), AggColl(old_name='col2', agg='mean')]
888
+ agg_cols=[AggColl(old_name='ix', agg='groupby'), AggColl(old_name='groups', agg='groupby'),
889
+ AggColl(old_name='col1', agg='sum'), AggColl(old_name='col2', agg='mean')]
561
890
  )
562
891
  """
563
- agg_cols: List[AggColl]
892
+
893
+ agg_cols: list[AggColl]
894
+
895
+ def __init__(self, agg_cols: list[AggColl]):
896
+ """Backwards compatibility implementation"""
897
+ super().__init__(agg_cols=agg_cols)
564
898
 
565
899
 
566
- @dataclass
567
- class PivotInput:
900
+ class PivotInput(BaseModel):
568
901
  """Defines the settings for a pivot (long-to-wide) operation."""
569
- index_columns: List[str]
902
+
903
+ index_columns: list[str]
570
904
  pivot_column: str
571
905
  value_col: str
572
- aggregations: List[str]
906
+ aggregations: list[str]
573
907
 
574
908
  @property
575
- def grouped_columns(self) -> List[str]:
909
+ def grouped_columns(self) -> list[str]:
576
910
  """Returns the list of columns to be used for the initial grouping stage of the pivot."""
577
911
  return self.index_columns + [self.pivot_column]
578
912
 
579
913
  def get_group_by_input(self) -> GroupByInput:
580
914
  """Constructs the `GroupByInput` needed for the pre-aggregation step of the pivot."""
581
- group_by_cols = [AggColl(c, 'groupby') for c in self.grouped_columns]
582
- agg_cols = [AggColl(self.value_col, agg=aggregation, new_name=aggregation) for aggregation in self.aggregations]
583
- return GroupByInput(group_by_cols+agg_cols)
584
-
585
- def get_index_columns(self) -> List[pl.col]:
915
+ group_by_cols = [AggColl(old_name=c, agg="groupby") for c in self.grouped_columns]
916
+ agg_cols = [
917
+ AggColl(old_name=self.value_col, agg=aggregation, new_name=aggregation) for aggregation in self.aggregations
918
+ ]
919
+ return GroupByInput(agg_cols=group_by_cols + agg_cols)
920
+
921
+ def get_index_columns(self) -> list[pl.col]:
922
+ """Returns the index columns as Polars column expressions."""
586
923
  return [pl.col(c) for c in self.index_columns]
587
924
 
588
925
  def get_pivot_column(self) -> pl.Expr:
@@ -591,87 +928,675 @@ class PivotInput:
591
928
 
592
929
  def get_values_expr(self) -> pl.Expr:
593
930
  """Creates the struct expression used to gather the values for pivoting."""
594
- return pl.struct([pl.col(c) for c in self.aggregations]).alias('vals')
931
+ return pl.struct([pl.col(c) for c in self.aggregations]).alias("vals")
595
932
 
596
933
 
597
- @dataclass
598
- class SortByInput:
934
+ class SortByInput(BaseModel):
599
935
  """Defines a single sort condition on a column, including the direction."""
936
+
600
937
  column: str
601
- how: str = 'asc'
938
+ how: str | None = "asc"
602
939
 
603
940
 
604
- @dataclass
605
- class RecordIdInput:
941
+ class RecordIdInput(BaseModel):
606
942
  """Defines settings for adding a record ID (row number) column to the data."""
607
- output_column_name: str = 'record_id'
943
+
944
+ output_column_name: str = "record_id"
608
945
  offset: int = 1
609
- group_by: Optional[bool] = False
610
- group_by_columns: Optional[List[str]] = field(default_factory=list)
946
+ group_by: bool | None = False
947
+ group_by_columns: list[str] | None = Field(default_factory=list)
611
948
 
612
949
 
613
- @dataclass
614
- class TextToRowsInput:
950
+ class TextToRowsInput(BaseModel):
615
951
  """Defines settings for splitting a text column into multiple rows based on a delimiter."""
952
+
616
953
  column_to_split: str
617
- output_column_name: Optional[str] = None
618
- split_by_fixed_value: Optional[bool] = True
619
- split_fixed_value: Optional[str] = ','
620
- split_by_column: Optional[str] = None
954
+ output_column_name: str | None = None
955
+ split_by_fixed_value: bool | None = True
956
+ split_fixed_value: str | None = ","
957
+ split_by_column: str | None = None
621
958
 
622
959
 
623
- @dataclass
624
- class UnpivotInput:
960
+ class UnpivotInput(BaseModel):
625
961
  """Defines settings for an unpivot (wide-to-long) operation."""
626
- index_columns: Optional[List[str]] = field(default_factory=list)
627
- value_columns: Optional[List[str]] = field(default_factory=list)
628
- data_type_selector: Optional[Literal['float', 'all', 'date', 'numeric', 'string']] = None
629
- data_type_selector_mode: Optional[Literal['data_type', 'column']] = 'column'
630
-
631
- def __post_init__(self):
632
- """Ensures that list attributes are initialized correctly if they are None."""
633
- if self.index_columns is None:
634
- self.index_columns = []
635
- if self.value_columns is None:
636
- self.value_columns = []
637
- if self.data_type_selector_mode is None:
638
- self.data_type_selector_mode = 'column'
962
+
963
+ model_config = ConfigDict(arbitrary_types_allowed=True)
964
+
965
+ index_columns: list[str] = Field(default_factory=list)
966
+ value_columns: list[str] = Field(default_factory=list)
967
+ data_type_selector: Literal["float", "all", "date", "numeric", "string"] | None = None
968
+ data_type_selector_mode: Literal["data_type", "column"] = "column"
639
969
 
640
970
  @property
641
- def data_type_selector_expr(self) -> Optional[Callable]:
971
+ def data_type_selector_expr(self) -> Callable | None:
642
972
  """Returns a Polars selector function based on the `data_type_selector` string."""
643
- if self.data_type_selector_mode == 'data_type':
973
+ if self.data_type_selector_mode == "data_type":
644
974
  if self.data_type_selector is not None:
645
975
  try:
646
976
  return getattr(selectors, self.data_type_selector)
647
- except Exception as e:
648
- print(f'Could not find the selector: {self.data_type_selector}')
977
+ except Exception:
978
+ print(f"Could not find the selector: {self.data_type_selector}")
649
979
  return selectors.all
650
980
  return selectors.all
981
+ return None
651
982
 
652
983
 
653
- @dataclass
654
- class UnionInput:
984
+ class UnionInput(BaseModel):
655
985
  """Defines settings for a union (concatenation) operation."""
656
- mode: Literal['selective', 'relaxed'] = 'relaxed'
986
+
987
+ mode: Literal["selective", "relaxed"] = "relaxed"
657
988
 
658
989
 
659
- @dataclass
660
- class UniqueInput:
990
+ class UniqueInput(BaseModel):
661
991
  """Defines settings for a uniqueness operation, specifying columns and which row to keep."""
662
- columns: Optional[List[str]] = None
992
+
993
+ columns: list[str] | None = None
663
994
  strategy: Literal["first", "last", "any", "none"] = "any"
664
995
 
665
996
 
666
- @dataclass
667
- class GraphSolverInput:
997
+ class GraphSolverInput(BaseModel):
668
998
  """Defines settings for a graph-solving operation (e.g., finding connected components)."""
999
+
669
1000
  col_from: str
670
1001
  col_to: str
671
- output_column_name: Optional[str] = 'graph_group'
1002
+ output_column_name: str | None = "graph_group"
672
1003
 
673
1004
 
674
- @dataclass
675
- class PolarsCodeInput:
1005
+ class PolarsCodeInput(BaseModel):
676
1006
  """A simple container for a string of user-provided Polars code to be executed."""
1007
+
677
1008
  polars_code: str
1009
+
1010
+
1011
+ class SelectInputsManager:
1012
+ """Manager class that provides all query and mutation operations."""
1013
+
1014
+ def __init__(self, select_inputs: SelectInputs):
1015
+ self.select_inputs = select_inputs
1016
+
1017
+ # === Query Methods (read-only) ===
1018
+
1019
+ def get_old_cols(self) -> set[str]:
1020
+ """Returns a set of original column names to be kept in the selection."""
1021
+ return set(v.old_name for v in self.select_inputs.renames if v.keep)
1022
+
1023
+ def get_new_cols(self) -> set[str]:
1024
+ """Returns a set of new (renamed) column names to be kept in the selection."""
1025
+ return set(v.new_name for v in self.select_inputs.renames if v.keep)
1026
+
1027
+ def get_rename_table(self) -> dict[str, str]:
1028
+ """Generates a dictionary for use in Polars' `.rename()` method."""
1029
+ return {v.old_name: v.new_name for v in self.select_inputs.renames if v.is_available and (v.keep or v.join_key)}
1030
+
1031
+ def get_select_cols(self, include_join_key: bool = True) -> list[str]:
1032
+ """Gets a list of original column names to select from the source DataFrame."""
1033
+ return [v.old_name for v in self.select_inputs.renames if v.keep or (v.join_key and include_join_key)]
1034
+
1035
+ def has_drop_cols(self) -> bool:
1036
+ """Checks if any column is marked to be dropped from the selection."""
1037
+ return any(not v.keep for v in self.select_inputs.renames)
1038
+
1039
+ def get_drop_columns(self) -> list[SelectInput]:
1040
+ """Returns a list of SelectInput objects that are marked to be dropped."""
1041
+ return [v for v in self.select_inputs.renames if not v.keep and v.is_available]
1042
+
1043
+ def get_non_jk_drop_columns(self) -> list[SelectInput]:
1044
+ """Returns drop columns that are not join keys."""
1045
+ return [v for v in self.select_inputs.renames if not v.keep and v.is_available and not v.join_key]
1046
+
1047
+ def find_by_old_name(self, old_name: str) -> SelectInput | None:
1048
+ """Find SelectInput by original column name."""
1049
+ return next((v for v in self.select_inputs.renames if v.old_name == old_name), None)
1050
+
1051
+ def find_by_new_name(self, new_name: str) -> SelectInput | None:
1052
+ """Find SelectInput by new column name."""
1053
+ return next((v for v in self.select_inputs.renames if v.new_name == new_name), None)
1054
+
1055
+ # === Mutation Methods ===
1056
+
1057
+ def append(self, other: SelectInput) -> None:
1058
+ """Appends a new SelectInput to the list of renames."""
1059
+ self.select_inputs.renames.append(other)
1060
+
1061
+ def remove_select_input(self, old_key: str) -> None:
1062
+ """Removes a SelectInput from the list based on its original name."""
1063
+ self.select_inputs.renames = [rename for rename in self.select_inputs.renames if rename.old_name != old_key]
1064
+
1065
+ def unselect_field(self, old_key: str) -> None:
1066
+ """Marks a field to be dropped from the final selection by setting `keep` to False."""
1067
+ for rename in self.select_inputs.renames:
1068
+ if old_key == rename.old_name:
1069
+ rename.keep = False
1070
+
1071
+ # === Backward Compatibility Properties ===
1072
+
1073
+ @property
1074
+ def old_cols(self) -> set[str]:
1075
+ """Backward compatibility: Returns set of old column names."""
1076
+ return self.get_old_cols()
1077
+
1078
+ @property
1079
+ def new_cols(self) -> set[str]:
1080
+ """Backward compatibility: Returns set of new column names."""
1081
+ return self.get_new_cols()
1082
+
1083
+ @property
1084
+ def rename_table(self) -> dict[str, str]:
1085
+ """Backward compatibility: Returns rename table dictionary."""
1086
+ return self.get_rename_table()
1087
+
1088
+ @property
1089
+ def drop_columns(self) -> list[SelectInput]:
1090
+ """Backward compatibility: Returns list of columns to drop."""
1091
+ return self.get_drop_columns()
1092
+
1093
+ @property
1094
+ def non_jk_drop_columns(self) -> list[SelectInput]:
1095
+ """Backward compatibility: Returns non-join-key columns to drop."""
1096
+ return self.get_non_jk_drop_columns()
1097
+
1098
+ @property
1099
+ def renames(self) -> list[SelectInput]:
1100
+ """Backward compatibility: Direct access to renames list."""
1101
+ return self.select_inputs.renames
1102
+
1103
+ def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
1104
+ """Backward compatibility alias: Find SelectInput by original column name."""
1105
+ return self.find_by_old_name(old_name)
1106
+
1107
+ def get_select_input_on_new_name(self, new_name: str) -> SelectInput | None:
1108
+ """Backward compatibility alias: Find SelectInput by new column name."""
1109
+ return self.find_by_new_name(new_name)
1110
+
1111
+ def __add__(self, other: SelectInput) -> "SelectInputsManager":
1112
+ """Backward compatibility: Support += operator for appending."""
1113
+ self.append(other)
1114
+ return self
1115
+
1116
+
1117
+ class JoinInputsManager(SelectInputsManager):
1118
+ """Manager for join-specific operations, extends SelectInputsManager."""
1119
+
1120
+ def __init__(self, join_inputs: JoinInputs):
1121
+ super().__init__(join_inputs)
1122
+ self.join_inputs = join_inputs
1123
+
1124
+ # === Query Methods ===
1125
+
1126
+ def get_join_key_selects(self) -> list[SelectInput]:
1127
+ """Returns only the `SelectInput` objects that are marked as join keys."""
1128
+ return [v for v in self.join_inputs.renames if v.join_key]
1129
+
1130
+ def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
1131
+ """Gets the temporary rename mapping for all join keys on one side of a join."""
1132
+ join_key_selects = self.get_join_key_selects()
1133
+ join_key_list = [
1134
+ JoinKeyRename(jk.new_name, construct_join_key_name(side, jk.new_name))
1135
+ for jk in join_key_selects
1136
+ if jk.keep or not filter_drop
1137
+ ]
1138
+ return JoinKeyRenameResponse(side, join_key_list)
1139
+
1140
+ def get_join_key_rename_mapping(self, side: SideLit) -> dict[str, str]:
1141
+ """Returns a dictionary mapping original join key names to their temporary names."""
1142
+ join_key_response = self.get_join_key_renames(side)
1143
+ return {jkr.original_name: jkr.temp_name for jkr in join_key_response.join_key_renames}
1144
+
1145
+ @property
1146
+ def join_key_selects(self) -> list[SelectInput]:
1147
+ """Backward compatibility: Returns join key SelectInputs."""
1148
+ return self.get_join_key_selects()
1149
+
1150
+
1151
+ class JoinSelectManagerMixin:
1152
+ """Mixin providing common methods for join-like operations."""
1153
+
1154
+ left_manager: JoinInputsManager
1155
+ right_manager: JoinInputsManager
1156
+ input: CrossJoinInput | JoinInput | FuzzyMatchInput
1157
+
1158
+ @staticmethod
1159
+ def parse_select(select: list[SelectInput] | list[str] | list[dict] | dict) -> JoinInputs:
1160
+ """Parses various input formats into a standardized `JoinInputs` object."""
1161
+ if not select:
1162
+ return JoinInputs(renames=[])
1163
+
1164
+ if all(isinstance(c, SelectInput) for c in select):
1165
+ return JoinInputs(renames=select)
1166
+ elif all(isinstance(c, dict) for c in select):
1167
+ return JoinInputs(renames=[SelectInput(**c) for c in select])
1168
+ elif isinstance(select, dict):
1169
+ renames = select.get("renames")
1170
+ if renames:
1171
+ return JoinInputs(renames=[SelectInput(**c) for c in renames])
1172
+ return JoinInputs(renames=[])
1173
+ elif all(isinstance(c, str) for c in select):
1174
+ return JoinInputs(renames=[SelectInput(old_name=s, new_name=s) for s in select])
1175
+
1176
+ raise ValueError(f"Unable to parse select input: {type(select)}")
1177
+
1178
+ def get_overlapping_columns(self) -> set[str]:
1179
+ """Finds column names that would conflict after the join."""
1180
+ return self.left_manager.get_new_cols() & self.right_manager.get_new_cols()
1181
+
1182
+ def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
1183
+ """Generates a new, non-conflicting column name by adding a suffix if necessary."""
1184
+ current_names = self.get_overlapping_columns()
1185
+ if old_col_name not in current_names:
1186
+ return old_col_name
1187
+
1188
+ new_name = old_col_name
1189
+ while new_name in current_names:
1190
+ new_name = f"{side}_{new_name}"
1191
+ return new_name
1192
+
1193
+ def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
1194
+ """Adds a new column to the selection for either the left or right side."""
1195
+ target_input = self.input.right_select if side == "right" else self.input.left_select
1196
+
1197
+ select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
1198
+
1199
+ target_input.renames.append(select_input)
1200
+
1201
+
1202
+ class CrossJoinInputManager(JoinSelectManagerMixin):
1203
+ """Manager for cross join operations."""
1204
+
1205
+ def __init__(self, cross_join_input: CrossJoinInput):
1206
+ self.input = deepcopy(cross_join_input)
1207
+ self.left_manager = JoinInputsManager(self.input.left_select)
1208
+ self.right_manager = JoinInputsManager(self.input.right_select)
1209
+
1210
+ @classmethod
1211
+ def create(
1212
+ cls, left_select: list[SelectInput] | list[str], right_select: list[SelectInput] | list[str]
1213
+ ) -> "CrossJoinInputManager":
1214
+ """Factory method to create CrossJoinInput from various input formats."""
1215
+ left_inputs = cls.parse_select(left_select)
1216
+ right_inputs = cls.parse_select(right_select)
1217
+
1218
+ cross_join = CrossJoinInput(left_select=left_inputs, right_select=right_inputs)
1219
+ return cls(cross_join)
1220
+
1221
+ def get_overlapping_records(self) -> set[str]:
1222
+ """Finds column names that would conflict after the join."""
1223
+ return self.get_overlapping_columns()
1224
+
1225
+ def auto_rename(self, rename_mode: Literal["suffix", "prefix"] = "prefix") -> None:
1226
+ """Automatically renames columns on the right side to prevent naming conflicts."""
1227
+ overlapping_records = self.get_overlapping_records()
1228
+
1229
+ while len(overlapping_records) > 0:
1230
+ for right_col in self.input.right_select.renames:
1231
+ if right_col.new_name in overlapping_records:
1232
+ if rename_mode == "prefix":
1233
+ right_col.new_name = "right_" + right_col.new_name
1234
+ elif rename_mode == "suffix":
1235
+ right_col.new_name = right_col.new_name + "_right"
1236
+ else:
1237
+ raise ValueError(f"Unknown rename_mode: {rename_mode}")
1238
+ overlapping_records = self.get_overlapping_records()
1239
+
1240
+ # === Backward Compatibility Properties ===
1241
+
1242
+ @property
1243
+ def left_select(self) -> JoinInputsManager:
1244
+ """Backward compatibility: Access left_manager as left_select."""
1245
+ return self.left_manager
1246
+
1247
+ @property
1248
+ def right_select(self) -> JoinInputsManager:
1249
+ """Backward compatibility: Access right_manager as right_select."""
1250
+ return self.right_manager
1251
+
1252
+ @property
1253
+ def overlapping_records(self) -> set[str]:
1254
+ """Backward compatibility: Returns overlapping column names."""
1255
+ return self.get_overlapping_records()
1256
+
1257
+ def to_cross_join_input(self) -> CrossJoinInput:
1258
+ """Creates a new CrossJoinInput instance based on the current manager settings.
1259
+
1260
+ This is useful when you've modified the manager (e.g., via auto_rename) and
1261
+ want to get a fresh CrossJoinInput with all the current settings applied.
1262
+
1263
+ Returns:
1264
+ A new CrossJoinInput instance with current settings
1265
+ """
1266
+ return CrossJoinInput(
1267
+ left_select=JoinInputs(renames=self.input.left_select.renames.copy()),
1268
+ right_select=JoinInputs(renames=self.input.right_select.renames.copy()),
1269
+ )
1270
+
1271
+
1272
class JoinInputManager(JoinSelectManagerMixin):
    """Manager for standard SQL-style join operations.

    Holds a private deep copy of a ``JoinInput`` (``self.input``) plus one
    ``JoinInputsManager`` per side (``left_manager`` / ``right_manager``) built
    on that copy's select lists.
    """

    def __init__(self, join_input: JoinInput):
        """Copy ``join_input`` and flag the join-key columns on both sides.

        Args:
            join_input: Join definition to manage. It is deep-copied, so changes
                made through this manager do not touch the caller's object.
        """
        self.input = deepcopy(join_input)
        self.left_manager = JoinInputsManager(self.input.left_select)
        self.right_manager = JoinInputsManager(self.input.right_select)
        self.set_join_keys()

    @classmethod
    def create(
        cls,
        join_mapping: list[JoinMap] | tuple[str, str] | str,
        left_select: list[SelectInput] | list[str],
        right_select: list[SelectInput] | list[str],
        how: JoinStrategy = "inner",
    ) -> "JoinInputManager":
        """Factory method to create a manager from various input formats.

        Args:
            join_mapping: Join keys, as ``JoinMap`` objects, a ``(left, right)``
                pair of column names, or a single column name.
            left_select: Left-side columns, as ``SelectInput`` objects or names.
            right_select: Right-side columns, as ``SelectInput`` objects or names.
            how: Join strategy; defaults to ``"inner"``.

        Returns:
            A manager wrapping the constructed ``JoinInput``.
        """
        # The raw arguments are handed straight to the JoinInput constructor.
        # NOTE(review): presumably JoinInput normalises the shorthand formats - confirm.
        join_input = JoinInput(join_mapping=join_mapping, left_select=left_select, right_select=right_select, how=how)

        manager = cls(join_input)
        # __init__ already calls set_join_keys(); the repeat is kept as a harmless re-sync.
        manager.set_join_keys()
        return manager

    def set_join_keys(self) -> None:
        """Marks the `SelectInput` objects corresponding to join keys.

        Every select entry on each side gets ``join_key`` set to whether its
        ``old_name`` appears in the join mapping for that side.
        """
        left_join_keys = self._get_left_join_keys_set()
        right_join_keys = self._get_right_join_keys_set()

        for select_input in self.input.left_select.renames:
            select_input.join_key = select_input.old_name in left_join_keys

        for select_input in self.input.right_select.renames:
            select_input.join_key = select_input.old_name in right_join_keys

    def _get_left_join_keys_set(self) -> set[str]:
        """Internal: Returns a set of the left-side join key column names."""
        return {jm.left_col for jm in self.input.join_mapping}

    def _get_right_join_keys_set(self) -> set[str]:
        """Internal: Returns a set of the right-side join key column names."""
        return {jm.right_col for jm in self.input.join_mapping}

    def get_left_join_keys(self) -> set[str]:
        """Returns a set of the left-side join key column names."""
        return self._get_left_join_keys_set()

    def get_right_join_keys(self) -> set[str]:
        """Returns a set of the right-side join key column names."""
        return self._get_right_join_keys_set()

    def get_left_join_keys_list(self) -> list[str]:
        """Returns an ordered list of the left-side join key names from ``used_join_mapping``."""
        return [jm.left_col for jm in self.used_join_mapping]

    def get_right_join_keys_list(self) -> list[str]:
        """Returns an ordered list of the right-side join key names from ``used_join_mapping``."""
        return [jm.right_col for jm in self.used_join_mapping]

    def get_overlapping_records(self) -> set[str]:
        """Finds column names that would conflict after the join.

        Delegates to ``get_overlapping_columns()``, which is not defined in this
        class (presumably supplied by ``JoinSelectManagerMixin``).
        """
        return self.get_overlapping_columns()

    def auto_rename(self) -> None:
        """Automatically renames columns on the right side to prevent naming conflicts.

        Appends ``"_right"`` to the output name of every right-side column whose
        name is reported as overlapping. The overlap set is recomputed after each
        pass, and the loop repeats until it is empty, because a freshly suffixed
        name can itself collide with an existing column.
        """
        self.set_join_keys()
        overlapping_records = self.get_overlapping_records()

        while len(overlapping_records) > 0:
            for right_col in self.input.right_select.renames:
                if right_col.new_name in overlapping_records:
                    right_col.new_name = right_col.new_name + "_right"
            overlapping_records = self.get_overlapping_records()

    def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
        """Gets the temporary rename mappings for the join keys on both sides.

        Args:
            filter_drop: Forwarded unchanged to each side's ``get_join_key_renames``.

        Returns:
            A ``FullJoinKeyResponse`` built from the left and right results, in that order.
        """
        left_renames = self.left_manager.get_join_key_renames(side="left", filter_drop=filter_drop)
        right_renames = self.right_manager.get_join_key_renames(side="right", filter_drop=filter_drop)
        return FullJoinKeyResponse(left_renames, right_renames)

    def get_names_for_table_rename(self) -> list[JoinMap]:
        """Gets join mapping with renamed columns applied.

        Each key is looked up in its side's rename table; keys without an entry
        keep their original name.
        """
        new_mappings: list[JoinMap] = []
        left_rename_table = self.left_manager.get_rename_table()
        right_rename_table = self.right_manager.get_rename_table()

        for join_map in self.input.join_mapping:
            new_left = left_rename_table.get(join_map.left_col, join_map.left_col)
            new_right = right_rename_table.get(join_map.right_col, join_map.right_col)
            new_mappings.append(JoinMap(left_col=new_left, right_col=new_right))

        return new_mappings

    def get_used_join_mapping(self) -> list[JoinMap]:
        """Returns the final join mapping after applying all renames and transformations.

        Each key is first passed through its side's rename table (falling back to
        the key's own name) and then through that side's join-key rename mapping.
        A name with no entry in the join-key rename mapping yields ``None`` for
        that side of the resulting ``JoinMap``.
        """
        new_mappings: list[JoinMap] = []
        left_rename_table = self.left_manager.get_rename_table()
        right_rename_table = self.right_manager.get_rename_table()
        left_join_rename_mapping = self.left_manager.get_join_key_rename_mapping("left")
        right_join_rename_mapping = self.right_manager.get_join_key_rename_mapping("right")
        for join_map in self.input.join_mapping:
            left_col = left_rename_table.get(join_map.left_col, join_map.left_col)
            # An un-renamed right key must fall back to its own name, not the left
            # key's name; this matches get_names_for_table_rename.
            right_col = right_rename_table.get(join_map.right_col, join_map.right_col)

            final_left = left_join_rename_mapping.get(left_col, None)
            final_right = right_join_rename_mapping.get(right_col, None)

            new_mappings.append(JoinMap(left_col=final_left, right_col=final_right))

        return new_mappings

    def to_join_input(self) -> JoinInput:
        """Creates a new JoinInput instance based on the current manager settings.

        This is useful when you've modified the manager (e.g., via auto_rename) and
        want to get a fresh JoinInput with all the current settings applied.

        Returns:
            A new JoinInput instance with current settings
        """
        # NOTE(review): join_mapping is passed by reference, and `.copy()` on a
        # list is shallow, so entries may be shared with this manager - confirm.
        return JoinInput(
            join_mapping=self.input.join_mapping,
            left_select=JoinInputs(renames=self.input.left_select.renames.copy()),
            right_select=JoinInputs(renames=self.input.right_select.renames.copy()),
            how=self.input.how,
        )

    @property
    def left_select(self) -> JoinInputsManager:
        """Backward compatibility: Access left_manager as left_select.

        This returns the MANAGER, not the data model.
        Usage: manager.left_select.join_key_selects
        """
        return self.left_manager

    @property
    def right_select(self) -> JoinInputsManager:
        """Backward compatibility: Access right_manager as right_select.

        This returns the MANAGER, not the data model.
        Usage: manager.right_select.join_key_selects
        """
        return self.right_manager

    @property
    def how(self) -> JoinStrategy:
        """Backward compatibility: Access join strategy."""
        return self.input.how

    @property
    def join_mapping(self) -> list[JoinMap]:
        """Backward compatibility: Access join mapping."""
        return self.input.join_mapping

    @property
    def overlapping_records(self) -> set[str]:
        """Backward compatibility: Returns overlapping column names."""
        return self.get_overlapping_records()

    @property
    def used_join_mapping(self) -> list[JoinMap]:
        """Backward compatibility: Returns used join mapping.

        This property is critical - it's used by left_join_keys and right_join_keys.
        """
        return self.get_used_join_mapping()

    @property
    def left_join_keys(self) -> list[str]:
        """Backward compatibility: Returns left join keys list.

        IMPORTANT: Uses the used_join_mapping PROPERTY (not method).
        """
        return [jm.left_col for jm in self.used_join_mapping]

    @property
    def right_join_keys(self) -> list[str]:
        """Backward compatibility: Returns right join keys list.

        IMPORTANT: Uses the used_join_mapping PROPERTY (not method).
        """
        return [jm.right_col for jm in self.used_join_mapping]

    @property
    def _left_join_keys(self) -> set[str]:
        """Backward compatibility: Private property for left join key set."""
        return self._get_left_join_keys_set()

    @property
    def _right_join_keys(self) -> set[str]:
        """Backward compatibility: Private property for right join key set."""
        return self._get_right_join_keys_set()
1466
+
1467
+
1468
+ class FuzzyMatchInputManager(JoinInputManager):
1469
+ """Manager for fuzzy matching join operations."""
1470
+
1471
+ def __init__(self, fuzzy_input: FuzzyMatchInput):
1472
+ self.fuzzy_input = deepcopy(fuzzy_input)
1473
+ super().__init__(
1474
+ JoinInput(
1475
+ join_mapping=[
1476
+ JoinMap(left_col=fm.left_col, right_col=fm.right_col) for fm in self.fuzzy_input.join_mapping
1477
+ ],
1478
+ left_select=self.fuzzy_input.left_select,
1479
+ right_select=self.fuzzy_input.right_select,
1480
+ how=self.fuzzy_input.how,
1481
+ )
1482
+ )
1483
+
1484
+ @classmethod
1485
+ def create(
1486
+ cls,
1487
+ join_mapping: list[FuzzyMapping] | tuple[str, str] | str,
1488
+ left_select: list[SelectInput] | list[str],
1489
+ right_select: list[SelectInput] | list[str],
1490
+ aggregate_output: bool = False,
1491
+ how: JoinStrategy = "inner",
1492
+ ) -> "FuzzyMatchInputManager":
1493
+ """Factory method to create FuzzyMatchInput from various input formats."""
1494
+ parsed_mapping = cls.parse_fuzz_mapping(join_mapping)
1495
+ left_inputs = cls.parse_select(left_select)
1496
+ right_inputs = cls.parse_select(right_select)
1497
+
1498
+ fuzzy_input = FuzzyMatchInput(
1499
+ join_mapping=parsed_mapping,
1500
+ left_select=left_inputs,
1501
+ right_select=right_inputs,
1502
+ how=how,
1503
+ aggregate_output=aggregate_output,
1504
+ )
1505
+
1506
+ manager = cls(fuzzy_input)
1507
+
1508
+ right_old_names = {v.old_name for v in fuzzy_input.right_select.renames}
1509
+ left_old_names = {v.old_name for v in fuzzy_input.left_select.renames}
1510
+
1511
+ for jm in parsed_mapping:
1512
+ if jm.right_col not in right_old_names:
1513
+ manager.right_manager.append(SelectInput(old_name=jm.right_col, keep=False, join_key=True))
1514
+ if jm.left_col not in left_old_names:
1515
+ manager.left_manager.append(SelectInput(old_name=jm.left_col, keep=False, join_key=True))
1516
+
1517
+ manager.set_join_keys()
1518
+ return manager
1519
+
1520
+ @staticmethod
1521
+ def parse_fuzz_mapping(
1522
+ fuzz_mapping: list[FuzzyMapping] | tuple[str, str] | str | FuzzyMapping | list[dict],
1523
+ ) -> list[FuzzyMapping]:
1524
+ """Parses various input formats into a list of FuzzyMapping objects."""
1525
+ if isinstance(fuzz_mapping, (tuple, list)):
1526
+ if len(fuzz_mapping) == 0:
1527
+ raise ValueError("Fuzzy mapping cannot be empty")
1528
+
1529
+ if all(isinstance(fm, dict) for fm in fuzz_mapping):
1530
+ return [FuzzyMapping(**fm) for fm in fuzz_mapping]
1531
+
1532
+ if all(isinstance(fm, FuzzyMapping) for fm in fuzz_mapping):
1533
+ return fuzz_mapping
1534
+
1535
+ if len(fuzz_mapping) <= 2:
1536
+ if len(fuzz_mapping) == 2:
1537
+ if isinstance(fuzz_mapping[0], str) and isinstance(fuzz_mapping[1], str):
1538
+ return [FuzzyMapping(left_col=fuzz_mapping[0], right_col=fuzz_mapping[1])]
1539
+ elif len(fuzz_mapping) == 1 and isinstance(fuzz_mapping[0], str):
1540
+ return [FuzzyMapping(left_col=fuzz_mapping[0], right_col=fuzz_mapping[0])]
1541
+
1542
+ elif isinstance(fuzz_mapping, str):
1543
+ return [FuzzyMapping(left_col=fuzz_mapping, right_col=fuzz_mapping)]
1544
+
1545
+ elif isinstance(fuzz_mapping, FuzzyMapping):
1546
+ return [fuzz_mapping]
1547
+
1548
+ raise ValueError(f"No valid fuzzy mapping as input: {type(fuzz_mapping)}")
1549
+
1550
+ def get_fuzzy_maps(self) -> list[FuzzyMapping]:
1551
+ """Returns the final fuzzy mappings after applying all column renames."""
1552
+ new_mappings = []
1553
+ left_rename_table = self.left_manager.get_rename_table()
1554
+ right_rename_table = self.right_manager.get_rename_table()
1555
+
1556
+ for org_fuzzy_map in self.fuzzy_input.join_mapping:
1557
+ right_col = right_rename_table.get(org_fuzzy_map.right_col, org_fuzzy_map.right_col)
1558
+ left_col = left_rename_table.get(org_fuzzy_map.left_col, org_fuzzy_map.left_col)
1559
+
1560
+ if right_col != org_fuzzy_map.right_col or left_col != org_fuzzy_map.left_col:
1561
+ new_mapping = deepcopy(org_fuzzy_map)
1562
+ new_mapping.left_col = left_col
1563
+ new_mapping.right_col = right_col
1564
+ new_mappings.append(new_mapping)
1565
+ else:
1566
+ new_mappings.append(org_fuzzy_map)
1567
+
1568
+ return new_mappings
1569
+
1570
+ # === Backward Compatibility Properties ===
1571
+
1572
+ @property
1573
+ def fuzzy_maps(self) -> list[FuzzyMapping]:
1574
+ """Backward compatibility: Returns fuzzy mappings."""
1575
+ return self.get_fuzzy_maps()
1576
+
1577
+ @property
1578
+ def join_mapping(self) -> list[FuzzyMapping]:
1579
+ """Backward compatibility: Access fuzzy join mapping."""
1580
+ return self.get_fuzzy_maps()
1581
+
1582
+ @property
1583
+ def aggregate_output(self) -> bool:
1584
+ """Backward compatibility: Access aggregate_output setting."""
1585
+ return self.fuzzy_input.aggregate_output
1586
+
1587
+ def to_fuzzy_match_input(self) -> FuzzyMatchInput:
1588
+ """Creates a new FuzzyMatchInput instance based on the current manager settings.
1589
+
1590
+ This is useful when you've modified the manager (e.g., via auto_rename) and
1591
+ want to get a fresh FuzzyMatchInput with all the current settings applied.
1592
+
1593
+ Returns:
1594
+ A new FuzzyMatchInput instance with current settings
1595
+ """
1596
+ return FuzzyMatchInput(
1597
+ join_mapping=self.fuzzy_input.join_mapping,
1598
+ left_select=JoinInputs(renames=self.input.left_select.renames.copy()),
1599
+ right_select=JoinInputs(renames=self.input.right_select.renames.copy()),
1600
+ how=self.fuzzy_input.how,
1601
+ aggregate_output=self.fuzzy_input.aggregate_output,
1602
+ )