Flowfile 0.4.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backends/main.py +25 -22
- build_backends/main_prd.py +10 -19
- flowfile/__init__.py +179 -73
- flowfile/__main__.py +10 -7
- flowfile/api.py +52 -59
- flowfile/web/__init__.py +14 -9
- flowfile/web/static/assets/AdminView-49392a9a.js +713 -0
- flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
- flowfile/web/static/assets/CloudConnectionView-36bcd6df.css +72 -0
- flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionView-f13f202b.js} +11 -11
- flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-0023d4a5.js} +10 -8
- flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
- flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
- flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-8e781e11.js} +10 -8
- flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
- flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-8ad68ea9.js} +3 -5
- flowfile/web/static/assets/{ContextMenu-c13f91d0.css → ContextMenu-26d4dd27.css} +6 -6
- flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-31ee57f0.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-69a74055.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-8e2051c6.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-4c74eef1.css → ContextMenu-8ec1729e.css} +6 -6
- flowfile/web/static/assets/{ContextMenu-63cfa99b.css → ContextMenu-9b310c60.css} +6 -6
- flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-03df6938.js} +12 -10
- flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
- flowfile/web/static/assets/CustomNode-59e99a86.css +32 -0
- flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-8479239b.js} +36 -24
- flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-869e3efd.js} +5 -4
- flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-e91df89a.css} +13 -13
- flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-36898a00.css} +24 -24
- flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-c58b9552.js} +25 -15
- flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
- flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseView-d26a9140.js} +11 -11
- flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-217a99f1.css} +19 -19
- flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-4d05ddc7.js} +17 -10
- flowfile/web/static/assets/{designer-e3c150ec.css → DesignerView-a6d0ee84.css} +629 -538
- flowfile/web/static/assets/{designer-f3656d8c.js → DesignerView-e6f5c0e8.js} +1214 -3209
- flowfile/web/static/assets/{documentation-52b241e7.js → DocumentationView-2e78ef1b.js} +5 -5
- flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-fd46c656.css} +7 -7
- flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
- flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-7b54caca.js} +18 -9
- flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-3fa399b2.js} +9 -7
- flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-47ab05a3.css} +17 -17
- flowfile/web/static/assets/Filter-7494ea97.css +48 -0
- flowfile/web/static/assets/Filter-8cbbdbf3.js +287 -0
- flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
- flowfile/web/static/assets/{Formula-71472193.js → Formula-aac42b1e.js} +13 -11
- flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
- flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-cd9bbfca.js} +12 -10
- flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-c24dec17.css} +5 -5
- flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-c7e6780e.js} +13 -11
- flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-93c5d22b.js} +9 -7
- flowfile/web/static/assets/{GroupBy-b9505323.css → GroupBy-be7ac0bf.css} +10 -10
- flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
- flowfile/web/static/assets/{Join-a1b800be.js → Join-a19b2de2.js} +13 -11
- flowfile/web/static/assets/LoginView-0df4ed0a.js +134 -0
- flowfile/web/static/assets/LoginView-d325d632.css +172 -0
- flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
- flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-8d3374b2.js} +170 -116
- flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-ad1b6243.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-e278950d.js} +1 -1
- flowfile/web/static/assets/NodeDesigner-40b647c9.js +2610 -0
- flowfile/web/static/assets/NodeDesigner-5f53be3f.css +1429 -0
- flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-7100234c.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-5130219f.js} +5 -2
- flowfile/web/static/assets/{Output-ddc9079f.css → Output-35e97000.css} +6 -6
- flowfile/web/static/assets/{Output-76750610.js → Output-f5efd2aa.js} +60 -38
- flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
- flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-d981d23c.js} +11 -9
- flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
- flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-39386e95.js} +3 -3
- flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
- flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-63de1f73.js} +3 -3
- flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
- flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-f9d69217.js} +18 -9
- flowfile/web/static/assets/PopOver-b22f049e.js +939 -0
- flowfile/web/static/assets/PopOver-d96599db.css +33 -0
- flowfile/web/static/assets/{Read-6b17491f.css → Read-36e7bd51.css} +12 -12
- flowfile/web/static/assets/{Read-637b72a7.js → Read-aec2e377.js} +83 -105
- flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-78ed6845.js} +6 -4
- flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-2156e890.js} +8 -6
- flowfile/web/static/assets/{SQLQueryComponent-36cef432.css → SQLQueryComponent-1c2f26b4.css} +5 -5
- flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-48c72f5b.js} +3 -3
- flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-1352ca74.js} +6 -4
- flowfile/web/static/assets/SecretSelector-22b5ff89.js +113 -0
- flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
- flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretsView-17df66ee.js} +35 -36
- flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
- flowfile/web/static/assets/{Select-850215fd.js → Select-0aee4c54.js} +9 -7
- flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-0784e157.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-cd341bb6.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-f2002a6d.js} +3 -3
- flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-460cc0ea.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-30741bb2.js} +1 -1
- flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-5d926864.js} +7 -4
- flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
- flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-3cdc971b.js} +9 -7
- flowfile/web/static/assets/{Unique-f9fb0809.css → Sort-8a871341.css} +10 -10
- flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-a2d0bfbd.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-abad1ca2.js} +5 -2
- flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
- flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-918945f7.js} +11 -10
- flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-f0ef5196.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-5605c793.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-54d2f518.css} +6 -6
- flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-bdad6144.js} +4 -4
- flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
- flowfile/web/static/assets/{Union-b563478a.js → Union-e8ab8c86.js} +8 -6
- flowfile/web/static/assets/{Unique-f90db5db.js → Unique-8cd4f976.js} +13 -22
- flowfile/web/static/assets/{Sort-3643d625.css → Unique-9fb2f567.css} +10 -10
- flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-710a2948.css} +7 -7
- flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-8da14095.js} +10 -8
- flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-6f7d89ff.js} +3 -3
- flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
- flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-3fb312e1.js} +4 -4
- flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
- flowfile/web/static/assets/{api-4c8e3822.js → api-24483f0d.js} +1 -1
- flowfile/web/static/assets/{api-2d6adc4f.js → api-8b81fa73.js} +1 -1
- flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-3d8dc5fa.css} +40 -40
- flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-ac0fda9d.js} +3 -3
- flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-5497a84a.js} +11 -10
- flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-a0be62b3.css} +74 -62
- flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
- flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-99014e1d.js} +5 -5
- flowfile/web/static/assets/index-07dda503.js +38 -0
- flowfile/web/static/assets/index-3ba44389.js +2696 -0
- flowfile/web/static/assets/{index-50508d4d.css → index-e6289dd0.css} +1945 -569
- flowfile/web/static/assets/{index-246f201c.js → index-fb6493ae.js} +41626 -40869
- flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
- flowfile/web/static/assets/nodeInput-0eb13f1a.js +2 -0
- flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-8f8ba42d.js} +3 -3
- flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
- flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-393f4fef.js} +3 -3
- flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
- flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-07c81f65.js} +4 -4
- flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
- flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-07f6d9ad.js} +21 -20
- flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-3bfac4c3.css} +15 -15
- flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-3db6b763.css} +13 -13
- flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-ed69bc8f.js} +10 -12
- flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-c5244ad5.css} +4 -4
- flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-e3ed4528.js} +4 -7
- flowfile/web/static/assets/secrets.api-002e7d7e.js +65 -0
- flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-80b92899.js} +5 -5
- flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
- flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-0965f39f.js} +31 -637
- flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-c506ad97.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/METADATA +4 -4
- flowfile-0.5.3.dist-info/RECORD +402 -0
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/WHEEL +1 -1
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/entry_points.txt +1 -0
- flowfile_core/__init__.py +13 -3
- flowfile_core/auth/jwt.py +51 -16
- flowfile_core/auth/models.py +32 -7
- flowfile_core/auth/password.py +89 -0
- flowfile_core/auth/secrets.py +8 -6
- flowfile_core/configs/__init__.py +9 -7
- flowfile_core/configs/flow_logger.py +15 -14
- flowfile_core/configs/node_store/__init__.py +72 -4
- flowfile_core/configs/node_store/nodes.py +155 -172
- flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
- flowfile_core/configs/settings.py +28 -15
- flowfile_core/database/connection.py +7 -6
- flowfile_core/database/init_db.py +96 -2
- flowfile_core/database/models.py +3 -1
- flowfile_core/fileExplorer/__init__.py +17 -0
- flowfile_core/fileExplorer/funcs.py +123 -57
- flowfile_core/fileExplorer/utils.py +10 -11
- flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
- flowfile_core/flowfile/analytics/analytics_processor.py +27 -24
- flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
- flowfile_core/flowfile/analytics/utils.py +1 -1
- flowfile_core/flowfile/code_generator/code_generator.py +391 -279
- flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
- flowfile_core/flowfile/database_connection_manager/models.py +1 -1
- flowfile_core/flowfile/extensions.py +17 -12
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +152 -103
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +526 -477
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +43 -32
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +15 -11
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
- flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +360 -191
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
- flowfile_core/flowfile/flow_data_engine/utils.py +101 -67
- flowfile_core/flowfile/flow_graph.py +1011 -561
- flowfile_core/flowfile/flow_graph_utils.py +31 -49
- flowfile_core/flowfile/flow_node/flow_node.py +332 -232
- flowfile_core/flowfile/flow_node/models.py +54 -41
- flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
- flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
- flowfile_core/flowfile/handler.py +82 -32
- flowfile_core/flowfile/manage/compatibility_enhancements.py +493 -47
- flowfile_core/flowfile/manage/io_flowfile.py +391 -0
- flowfile_core/flowfile/node_designer/__init__.py +15 -13
- flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
- flowfile_core/flowfile/node_designer/custom_node.py +162 -36
- flowfile_core/flowfile/node_designer/ui_components.py +136 -35
- flowfile_core/flowfile/schema_callbacks.py +77 -54
- flowfile_core/flowfile/setting_generator/__init__.py +0 -1
- flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
- flowfile_core/flowfile/setting_generator/settings.py +72 -55
- flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
- flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
- flowfile_core/flowfile/util/calculate_layout.py +9 -13
- flowfile_core/flowfile/util/execution_orderer.py +25 -17
- flowfile_core/flowfile/util/node_skipper.py +4 -4
- flowfile_core/flowfile/utils.py +19 -21
- flowfile_core/main.py +26 -19
- flowfile_core/routes/auth.py +284 -11
- flowfile_core/routes/cloud_connections.py +25 -25
- flowfile_core/routes/logs.py +21 -29
- flowfile_core/routes/public.py +3 -3
- flowfile_core/routes/routes.py +77 -43
- flowfile_core/routes/secrets.py +25 -27
- flowfile_core/routes/user_defined_components.py +483 -4
- flowfile_core/run_lock.py +0 -1
- flowfile_core/schemas/__init__.py +4 -6
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
- flowfile_core/schemas/cloud_storage_schemas.py +59 -55
- flowfile_core/schemas/input_schema.py +398 -154
- flowfile_core/schemas/output_model.py +50 -35
- flowfile_core/schemas/schemas.py +207 -67
- flowfile_core/schemas/transform_schema.py +1360 -435
- flowfile_core/schemas/yaml_types.py +117 -0
- flowfile_core/secret_manager/secret_manager.py +17 -13
- flowfile_core/{flowfile/node_designer/data_types.py → types.py} +33 -3
- flowfile_core/utils/arrow_reader.py +7 -6
- flowfile_core/utils/excel_file_manager.py +3 -3
- flowfile_core/utils/fileManager.py +7 -7
- flowfile_core/utils/fl_executor.py +8 -10
- flowfile_core/utils/utils.py +4 -4
- flowfile_core/utils/validate_setup.py +5 -4
- flowfile_frame/__init__.py +107 -50
- flowfile_frame/adapters.py +2 -9
- flowfile_frame/adding_expr.py +73 -32
- flowfile_frame/cloud_storage/frame_helpers.py +27 -23
- flowfile_frame/cloud_storage/secret_manager.py +12 -26
- flowfile_frame/config.py +2 -5
- flowfile_frame/expr.py +311 -218
- flowfile_frame/expr.pyi +160 -159
- flowfile_frame/expr_name.py +23 -23
- flowfile_frame/flow_frame.py +581 -489
- flowfile_frame/flow_frame.pyi +123 -104
- flowfile_frame/flow_frame_methods.py +236 -252
- flowfile_frame/group_frame.py +50 -20
- flowfile_frame/join.py +2 -2
- flowfile_frame/lazy.py +129 -87
- flowfile_frame/lazy_methods.py +83 -30
- flowfile_frame/list_name_space.py +55 -50
- flowfile_frame/selectors.py +148 -68
- flowfile_frame/series.py +9 -7
- flowfile_frame/utils.py +19 -21
- flowfile_worker/__init__.py +12 -4
- flowfile_worker/configs.py +11 -19
- flowfile_worker/create/__init__.py +14 -27
- flowfile_worker/create/funcs.py +143 -94
- flowfile_worker/create/models.py +139 -68
- flowfile_worker/create/pl_types.py +14 -15
- flowfile_worker/create/read_excel_tables.py +34 -41
- flowfile_worker/create/utils.py +22 -19
- flowfile_worker/external_sources/s3_source/main.py +18 -51
- flowfile_worker/external_sources/s3_source/models.py +34 -27
- flowfile_worker/external_sources/sql_source/main.py +8 -5
- flowfile_worker/external_sources/sql_source/models.py +13 -9
- flowfile_worker/flow_logger.py +10 -8
- flowfile_worker/funcs.py +214 -155
- flowfile_worker/main.py +11 -17
- flowfile_worker/models.py +35 -28
- flowfile_worker/process_manager.py +2 -3
- flowfile_worker/routes.py +121 -93
- flowfile_worker/secrets.py +9 -6
- flowfile_worker/spawner.py +80 -49
- flowfile_worker/utils.py +3 -2
- shared/__init__.py +2 -7
- shared/storage_config.py +25 -13
- test_utils/postgres/commands.py +3 -2
- test_utils/postgres/fixtures.py +9 -9
- test_utils/s3/commands.py +1 -1
- test_utils/s3/data_generator.py +3 -4
- test_utils/s3/demo_data_generator.py +4 -7
- test_utils/s3/fixtures.py +7 -5
- tools/migrate/README.md +56 -0
- tools/migrate/__init__.py +12 -0
- tools/migrate/__main__.py +118 -0
- tools/migrate/legacy_schemas.py +682 -0
- tools/migrate/migrate.py +610 -0
- tools/migrate/tests/__init__.py +0 -0
- tools/migrate/tests/conftest.py +21 -0
- tools/migrate/tests/test_migrate.py +622 -0
- tools/migrate/tests/test_migration_e2e.py +1009 -0
- tools/migrate/tests/test_node_migrations.py +843 -0
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
- flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
- flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
- flowfile/web/static/assets/Filter-812dcbca.js +0 -164
- flowfile/web/static/assets/Filter-f62091b3.css +0 -20
- flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
- flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
- flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
- flowfile/web/static/assets/secretApi-538058f3.js +0 -46
- flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
- flowfile-0.4.1.dist-info/RECORD +0 -376
- flowfile_core/flowfile/manage/open_flowfile.py +0 -143
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/licenses/LICENSE +0 -0
- /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
|
@@ -1,35 +1,127 @@
|
|
|
1
|
-
from
|
|
2
|
-
from dataclasses import dataclass, field
|
|
3
|
-
import polars as pl
|
|
4
|
-
from polars import selectors
|
|
1
|
+
from collections.abc import Callable
|
|
5
2
|
from copy import deepcopy
|
|
3
|
+
from dataclasses import asdict
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any, Literal, NamedTuple
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
import polars as pl
|
|
9
8
|
from pl_fuzzy_frame_match.models import FuzzyMapping
|
|
9
|
+
from polars import selectors
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
11
|
+
|
|
12
|
+
from flowfile_core.schemas.yaml_types import (
|
|
13
|
+
BasicFilterYaml,
|
|
14
|
+
CrossJoinInputYaml,
|
|
15
|
+
FilterInputYaml,
|
|
16
|
+
FuzzyMatchInputYaml,
|
|
17
|
+
JoinInputsYaml,
|
|
18
|
+
JoinInputYaml,
|
|
19
|
+
SelectInputYaml,
|
|
20
|
+
)
|
|
21
|
+
from flowfile_core.types import DataType, DataTypeStr
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class FilterOperator(str, Enum):
|
|
25
|
+
"""Supported filter comparison operators."""
|
|
26
|
+
|
|
27
|
+
EQUALS = "equals"
|
|
28
|
+
NOT_EQUALS = "not_equals"
|
|
29
|
+
GREATER_THAN = "greater_than"
|
|
30
|
+
GREATER_THAN_OR_EQUALS = "greater_than_or_equals"
|
|
31
|
+
LESS_THAN = "less_than"
|
|
32
|
+
LESS_THAN_OR_EQUALS = "less_than_or_equals"
|
|
33
|
+
CONTAINS = "contains"
|
|
34
|
+
NOT_CONTAINS = "not_contains"
|
|
35
|
+
STARTS_WITH = "starts_with"
|
|
36
|
+
ENDS_WITH = "ends_with"
|
|
37
|
+
IS_NULL = "is_null"
|
|
38
|
+
IS_NOT_NULL = "is_not_null"
|
|
39
|
+
IN = "in"
|
|
40
|
+
NOT_IN = "not_in"
|
|
41
|
+
BETWEEN = "between"
|
|
42
|
+
|
|
43
|
+
def __str__(self) -> str:
|
|
44
|
+
return self.value
|
|
45
|
+
|
|
46
|
+
@classmethod
|
|
47
|
+
def from_symbol(cls, symbol: str) -> "FilterOperator":
|
|
48
|
+
"""Convert UI symbol to FilterOperator enum."""
|
|
49
|
+
symbol_mapping = {
|
|
50
|
+
"=": cls.EQUALS,
|
|
51
|
+
"==": cls.EQUALS,
|
|
52
|
+
"!=": cls.NOT_EQUALS,
|
|
53
|
+
"<>": cls.NOT_EQUALS,
|
|
54
|
+
">": cls.GREATER_THAN,
|
|
55
|
+
">=": cls.GREATER_THAN_OR_EQUALS,
|
|
56
|
+
"<": cls.LESS_THAN,
|
|
57
|
+
"<=": cls.LESS_THAN_OR_EQUALS,
|
|
58
|
+
"contains": cls.CONTAINS,
|
|
59
|
+
"not_contains": cls.NOT_CONTAINS,
|
|
60
|
+
"starts_with": cls.STARTS_WITH,
|
|
61
|
+
"ends_with": cls.ENDS_WITH,
|
|
62
|
+
"is_null": cls.IS_NULL,
|
|
63
|
+
"is_not_null": cls.IS_NOT_NULL,
|
|
64
|
+
"in": cls.IN,
|
|
65
|
+
"not_in": cls.NOT_IN,
|
|
66
|
+
"between": cls.BETWEEN,
|
|
67
|
+
}
|
|
68
|
+
if symbol in symbol_mapping:
|
|
69
|
+
return symbol_mapping[symbol]
|
|
70
|
+
# Try to match by value directly
|
|
71
|
+
try:
|
|
72
|
+
return cls(symbol)
|
|
73
|
+
except ValueError:
|
|
74
|
+
raise ValueError(f"Unknown filter operator symbol: {symbol}")
|
|
75
|
+
|
|
76
|
+
def to_symbol(self) -> str:
|
|
77
|
+
"""Convert FilterOperator to UI-friendly symbol."""
|
|
78
|
+
symbol_mapping = {
|
|
79
|
+
FilterOperator.EQUALS: "=",
|
|
80
|
+
FilterOperator.NOT_EQUALS: "!=",
|
|
81
|
+
FilterOperator.GREATER_THAN: ">",
|
|
82
|
+
FilterOperator.GREATER_THAN_OR_EQUALS: ">=",
|
|
83
|
+
FilterOperator.LESS_THAN: "<",
|
|
84
|
+
FilterOperator.LESS_THAN_OR_EQUALS: "<=",
|
|
85
|
+
FilterOperator.CONTAINS: "contains",
|
|
86
|
+
FilterOperator.NOT_CONTAINS: "not_contains",
|
|
87
|
+
FilterOperator.STARTS_WITH: "starts_with",
|
|
88
|
+
FilterOperator.ENDS_WITH: "ends_with",
|
|
89
|
+
FilterOperator.IS_NULL: "is_null",
|
|
90
|
+
FilterOperator.IS_NOT_NULL: "is_not_null",
|
|
91
|
+
FilterOperator.IN: "in",
|
|
92
|
+
FilterOperator.NOT_IN: "not_in",
|
|
93
|
+
FilterOperator.BETWEEN: "between",
|
|
94
|
+
}
|
|
95
|
+
return symbol_mapping.get(self, self.value)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
FilterModeLiteral = Literal["basic", "advanced"]
|
|
99
|
+
|
|
100
|
+
FuzzyMap = FuzzyMapping
|
|
101
|
+
|
|
102
|
+
AUTO_DATA_TYPE = "Auto"
|
|
10
103
|
|
|
11
|
-
FuzzyMap = FuzzyMapping # For backwards compatibility
|
|
12
104
|
|
|
13
105
|
def get_func_type_mapping(func: str):
|
|
14
106
|
"""Infers the output data type of common aggregation functions."""
|
|
15
107
|
if func in ["mean", "avg", "median", "std", "var"]:
|
|
16
108
|
return "Float64"
|
|
17
|
-
elif func in [
|
|
109
|
+
elif func in ["min", "max", "first", "last", "cumsum", "sum"]:
|
|
18
110
|
return None
|
|
19
|
-
elif func in [
|
|
111
|
+
elif func in ["count", "n_unique"]:
|
|
20
112
|
return "Int64"
|
|
21
|
-
elif func in [
|
|
113
|
+
elif func in ["concat"]:
|
|
22
114
|
return "Utf8"
|
|
23
115
|
|
|
24
116
|
|
|
25
117
|
def string_concat(*column: str):
|
|
26
118
|
"""A simple wrapper to concatenate string columns in Polars."""
|
|
27
|
-
return pl.col(column).cast(pl.Utf8).str.concat(delimiter=
|
|
119
|
+
return pl.col(column).cast(pl.Utf8).str.concat(delimiter=",")
|
|
28
120
|
|
|
29
121
|
|
|
30
122
|
SideLit = Literal["left", "right"]
|
|
31
|
-
JoinStrategy = Literal[
|
|
32
|
-
FuzzyTypeLiteral = Literal[
|
|
123
|
+
JoinStrategy = Literal["inner", "left", "right", "full", "semi", "anti", "cross", "outer"]
|
|
124
|
+
FuzzyTypeLiteral = Literal["levenshtein", "jaro", "jaro_winkler", "hamming", "damerau_levenshtein", "indel"]
|
|
33
125
|
|
|
34
126
|
|
|
35
127
|
def construct_join_key_name(side: SideLit, column_name: str) -> str:
|
|
@@ -39,452 +131,673 @@ def construct_join_key_name(side: SideLit, column_name: str) -> str:
|
|
|
39
131
|
|
|
40
132
|
class JoinKeyRename(NamedTuple):
|
|
41
133
|
"""Represents the renaming of a join key from its original to a temporary name."""
|
|
134
|
+
|
|
42
135
|
original_name: str
|
|
43
136
|
temp_name: str
|
|
44
137
|
|
|
45
138
|
|
|
46
139
|
class JoinKeyRenameResponse(NamedTuple):
|
|
47
140
|
"""Contains a list of join key renames for one side of a join."""
|
|
141
|
+
|
|
48
142
|
side: SideLit
|
|
49
|
-
join_key_renames:
|
|
143
|
+
join_key_renames: list[JoinKeyRename]
|
|
50
144
|
|
|
51
145
|
|
|
52
146
|
class FullJoinKeyResponse(NamedTuple):
|
|
53
147
|
"""Holds the join key rename responses for both sides of a join."""
|
|
148
|
+
|
|
54
149
|
left: JoinKeyRenameResponse
|
|
55
150
|
right: JoinKeyRenameResponse
|
|
56
151
|
|
|
57
152
|
|
|
58
|
-
|
|
59
|
-
class SelectInput:
|
|
153
|
+
class SelectInput(BaseModel):
|
|
60
154
|
"""Defines how a single column should be selected, renamed, or type-cast.
|
|
61
155
|
|
|
62
156
|
This is a core building block for any operation that involves column manipulation.
|
|
63
157
|
It holds all the configuration for a single field in a selection operation.
|
|
64
158
|
"""
|
|
159
|
+
|
|
160
|
+
model_config = ConfigDict(frozen=False)
|
|
161
|
+
|
|
65
162
|
old_name: str
|
|
66
|
-
original_position:
|
|
67
|
-
new_name:
|
|
68
|
-
data_type:
|
|
69
|
-
data_type_change:
|
|
70
|
-
join_key:
|
|
71
|
-
is_altered:
|
|
72
|
-
position:
|
|
73
|
-
is_available:
|
|
74
|
-
keep:
|
|
163
|
+
original_position: int | None = None
|
|
164
|
+
new_name: str | None = None
|
|
165
|
+
data_type: str | None = None
|
|
166
|
+
data_type_change: bool = False
|
|
167
|
+
join_key: bool = False
|
|
168
|
+
is_altered: bool = False
|
|
169
|
+
position: int | None = None
|
|
170
|
+
is_available: bool = True
|
|
171
|
+
keep: bool = True
|
|
172
|
+
|
|
173
|
+
def __init__(self, old_name: str = None, new_name: str = None, **data):
|
|
174
|
+
if old_name is not None:
|
|
175
|
+
data["old_name"] = old_name
|
|
176
|
+
if new_name is not None:
|
|
177
|
+
data["new_name"] = new_name
|
|
178
|
+
super().__init__(**data)
|
|
179
|
+
|
|
180
|
+
def to_yaml_dict(self) -> SelectInputYaml:
|
|
181
|
+
"""Serialize for YAML output - only user-relevant fields."""
|
|
182
|
+
result: SelectInputYaml = {"old_name": self.old_name}
|
|
183
|
+
if self.new_name != self.old_name:
|
|
184
|
+
result["new_name"] = self.new_name
|
|
185
|
+
if not self.keep:
|
|
186
|
+
result["keep"] = self.keep
|
|
187
|
+
if self.data_type_change and self.data_type:
|
|
188
|
+
result["data_type"] = self.data_type
|
|
189
|
+
return result
|
|
190
|
+
|
|
191
|
+
@classmethod
|
|
192
|
+
def from_yaml_dict(cls, data: dict) -> "SelectInput":
|
|
193
|
+
"""Load from slim YAML format."""
|
|
194
|
+
old_name = data["old_name"]
|
|
195
|
+
new_name = data.get("new_name", old_name)
|
|
196
|
+
return cls(
|
|
197
|
+
old_name=old_name,
|
|
198
|
+
new_name=new_name,
|
|
199
|
+
keep=data.get("keep", True),
|
|
200
|
+
data_type=data.get("data_type"),
|
|
201
|
+
data_type_change=data.get("data_type") is not None,
|
|
202
|
+
is_altered=old_name != new_name,
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
@model_validator(mode="after")
|
|
206
|
+
def set_default_new_name(self):
|
|
207
|
+
"""If new_name is None, default it to old_name."""
|
|
208
|
+
if self.new_name is None:
|
|
209
|
+
self.new_name = self.old_name
|
|
210
|
+
if self.old_name != self.new_name:
|
|
211
|
+
self.is_altered = True
|
|
212
|
+
return self
|
|
75
213
|
|
|
76
214
|
def __hash__(self):
|
|
215
|
+
"""Allow SelectInput to be used in sets and as dict keys."""
|
|
77
216
|
return hash(self.old_name)
|
|
78
217
|
|
|
79
|
-
def
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
new_name = old_name
|
|
85
|
-
self.new_name = new_name
|
|
86
|
-
self.keep = keep
|
|
87
|
-
self.data_type = data_type
|
|
88
|
-
self.data_type_change = data_type_change
|
|
89
|
-
self.join_key = join_key
|
|
90
|
-
self.is_altered = is_altered
|
|
91
|
-
self.is_available = is_available
|
|
92
|
-
self.position = position
|
|
218
|
+
def __eq__(self, other):
|
|
219
|
+
"""Required when implementing __hash__."""
|
|
220
|
+
if not isinstance(other, SelectInput):
|
|
221
|
+
return False
|
|
222
|
+
return self.old_name == other.old_name
|
|
93
223
|
|
|
94
224
|
@property
|
|
95
225
|
def polars_type(self) -> str:
|
|
96
226
|
"""Translates a user-friendly type name to a Polars data type string."""
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
227
|
+
data_type_lower = self.data_type.lower()
|
|
228
|
+
if data_type_lower == "string":
|
|
229
|
+
return "Utf8"
|
|
230
|
+
elif data_type_lower == "integer":
|
|
231
|
+
return "Int64"
|
|
232
|
+
elif data_type_lower == "double":
|
|
233
|
+
return "Float64"
|
|
103
234
|
return self.data_type
|
|
104
235
|
|
|
105
236
|
|
|
106
|
-
|
|
107
|
-
class FieldInput:
|
|
237
|
+
class FieldInput(BaseModel):
|
|
108
238
|
"""Represents a single field with its name and data type, typically for defining an output column."""
|
|
109
|
-
name: str
|
|
110
|
-
data_type: Optional[str] = None
|
|
111
239
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
self.data_type = data_type
|
|
240
|
+
name: str
|
|
241
|
+
data_type: DataType | Literal["Auto"] | DataTypeStr | None = AUTO_DATA_TYPE
|
|
115
242
|
|
|
116
243
|
|
|
117
|
-
|
|
118
|
-
class FunctionInput:
|
|
244
|
+
class FunctionInput(BaseModel):
|
|
119
245
|
"""Defines a formula to be applied, including the output field information."""
|
|
246
|
+
|
|
120
247
|
field: FieldInput
|
|
121
248
|
function: str
|
|
122
249
|
|
|
250
|
+
def __init__(self, field: FieldInput = None, function: str = None, **data):
|
|
251
|
+
if field is not None:
|
|
252
|
+
data["field"] = field
|
|
253
|
+
if function is not None:
|
|
254
|
+
data["function"] = function
|
|
255
|
+
super().__init__(**data)
|
|
123
256
|
|
|
124
|
-
@dataclass
|
|
125
|
-
class BasicFilter:
|
|
126
|
-
"""Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value')."""
|
|
127
|
-
field: str = ''
|
|
128
|
-
filter_type: str = ''
|
|
129
|
-
filter_value: str = ''
|
|
130
257
|
|
|
258
|
+
class BasicFilter(BaseModel):
|
|
259
|
+
"""Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value').
|
|
131
260
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
261
|
+
Attributes:
|
|
262
|
+
field: The column name to filter on.
|
|
263
|
+
operator: The comparison operator (FilterOperator enum value or symbol).
|
|
264
|
+
value: The value to compare against.
|
|
265
|
+
value2: Second value for BETWEEN operator (optional).
|
|
266
|
+
"""
|
|
138
267
|
|
|
268
|
+
field: str = ""
|
|
269
|
+
operator: FilterOperator | str = FilterOperator.EQUALS
|
|
270
|
+
value: str = ""
|
|
271
|
+
value2: str | None = None # For BETWEEN operator
|
|
272
|
+
|
|
273
|
+
# Keep old field names for backward compatibility
|
|
274
|
+
filter_type: str | None = None
|
|
275
|
+
filter_value: str | None = None
|
|
276
|
+
|
|
277
|
+
def __init__(
|
|
278
|
+
self,
|
|
279
|
+
field: str = None,
|
|
280
|
+
operator: FilterOperator | str = None,
|
|
281
|
+
value: str = None,
|
|
282
|
+
value2: str = None,
|
|
283
|
+
# Backward compatibility parameters
|
|
284
|
+
filter_type: str = None,
|
|
285
|
+
filter_value: str = None,
|
|
286
|
+
**data,
|
|
287
|
+
):
|
|
288
|
+
# Handle backward compatibility
|
|
289
|
+
if filter_type is not None and operator is None:
|
|
290
|
+
data["operator"] = filter_type
|
|
291
|
+
elif operator is not None:
|
|
292
|
+
data["operator"] = operator
|
|
293
|
+
|
|
294
|
+
if filter_value is not None and value is None:
|
|
295
|
+
data["value"] = filter_value
|
|
296
|
+
elif value is not None:
|
|
297
|
+
data["value"] = value
|
|
298
|
+
|
|
299
|
+
if field is not None:
|
|
300
|
+
data["field"] = field
|
|
301
|
+
if value2 is not None:
|
|
302
|
+
data["value2"] = value2
|
|
303
|
+
|
|
304
|
+
super().__init__(**data)
|
|
305
|
+
|
|
306
|
+
@model_validator(mode="after")
|
|
307
|
+
def normalize_operator(self):
|
|
308
|
+
"""Normalize the operator to FilterOperator enum."""
|
|
309
|
+
if isinstance(self.operator, str):
|
|
310
|
+
try:
|
|
311
|
+
self.operator = FilterOperator.from_symbol(self.operator)
|
|
312
|
+
except ValueError:
|
|
313
|
+
# Keep as string if conversion fails (for backward compat)
|
|
314
|
+
pass
|
|
315
|
+
return self
|
|
316
|
+
|
|
317
|
+
def get_operator(self) -> FilterOperator:
|
|
318
|
+
"""Get the operator as FilterOperator enum."""
|
|
319
|
+
if isinstance(self.operator, FilterOperator):
|
|
320
|
+
return self.operator
|
|
321
|
+
return FilterOperator.from_symbol(self.operator)
|
|
322
|
+
|
|
323
|
+
def to_yaml_dict(self) -> BasicFilterYaml:
|
|
324
|
+
"""Serialize for YAML output."""
|
|
325
|
+
result: BasicFilterYaml = {
|
|
326
|
+
"field": self.field,
|
|
327
|
+
"operator": self.operator.value if isinstance(self.operator, FilterOperator) else self.operator,
|
|
328
|
+
"value": self.value,
|
|
329
|
+
}
|
|
330
|
+
if self.value2:
|
|
331
|
+
result["value2"] = self.value2
|
|
332
|
+
return result
|
|
139
333
|
|
|
140
|
-
@
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
334
|
+
@classmethod
|
|
335
|
+
def from_yaml_dict(cls, data: dict) -> "BasicFilter":
|
|
336
|
+
"""Load from YAML format."""
|
|
337
|
+
return cls(
|
|
338
|
+
field=data.get("field", ""),
|
|
339
|
+
operator=data.get("operator", FilterOperator.EQUALS),
|
|
340
|
+
value=data.get("value", ""),
|
|
341
|
+
value2=data.get("value2"),
|
|
342
|
+
)
|
|
144
343
|
|
|
145
|
-
@property
|
|
146
|
-
def old_cols(self) -> Set:
|
|
147
|
-
"""Returns a set of original column names to be kept in the selection."""
|
|
148
|
-
return set(v.old_name for v in self.renames if v.keep)
|
|
149
344
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
"""Returns a set of new (renamed) column names to be kept in the selection."""
|
|
153
|
-
return set(v.new_name for v in self.renames if v.keep)
|
|
345
|
+
class FilterInput(BaseModel):
|
|
346
|
+
"""Defines the settings for a filter operation, supporting basic or advanced (expression-based) modes.
|
|
154
347
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
348
|
+
Attributes:
|
|
349
|
+
mode: The filter mode - "basic" or "advanced".
|
|
350
|
+
basic_filter: The basic filter configuration (used when mode="basic").
|
|
351
|
+
advanced_filter: The advanced filter expression string (used when mode="advanced").
|
|
352
|
+
"""
|
|
159
353
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
354
|
+
mode: FilterModeLiteral = "basic"
|
|
355
|
+
basic_filter: BasicFilter | None = None
|
|
356
|
+
advanced_filter: str = ""
|
|
357
|
+
|
|
358
|
+
# Keep old field name for backward compatibility
|
|
359
|
+
filter_type: str | None = None
|
|
360
|
+
|
|
361
|
+
def __init__(
|
|
362
|
+
self,
|
|
363
|
+
mode: FilterModeLiteral = None,
|
|
364
|
+
basic_filter: BasicFilter = None,
|
|
365
|
+
advanced_filter: str = None,
|
|
366
|
+
# Backward compatibility
|
|
367
|
+
filter_type: str = None,
|
|
368
|
+
**data,
|
|
369
|
+
):
|
|
370
|
+
# Handle backward compatibility: filter_type -> mode
|
|
371
|
+
if filter_type is not None and mode is None:
|
|
372
|
+
data["mode"] = filter_type
|
|
373
|
+
elif mode is not None:
|
|
374
|
+
data["mode"] = mode
|
|
375
|
+
|
|
376
|
+
if advanced_filter is not None:
|
|
377
|
+
data["advanced_filter"] = advanced_filter
|
|
378
|
+
if basic_filter is not None:
|
|
379
|
+
data["basic_filter"] = basic_filter
|
|
380
|
+
|
|
381
|
+
super().__init__(**data)
|
|
382
|
+
|
|
383
|
+
@model_validator(mode="after")
|
|
384
|
+
def ensure_basic_filter(self):
|
|
385
|
+
"""Ensure basic_filter exists when mode is basic."""
|
|
386
|
+
if self.mode == "basic" and self.basic_filter is None:
|
|
387
|
+
self.basic_filter = BasicFilter()
|
|
388
|
+
return self
|
|
389
|
+
|
|
390
|
+
def is_advanced(self) -> bool:
|
|
391
|
+
"""Check if filter is in advanced mode."""
|
|
392
|
+
return self.mode == "advanced"
|
|
393
|
+
|
|
394
|
+
def to_yaml_dict(self) -> FilterInputYaml:
|
|
395
|
+
"""Serialize for YAML output."""
|
|
396
|
+
result: FilterInputYaml = {"mode": self.mode}
|
|
397
|
+
if self.mode == "basic" and self.basic_filter:
|
|
398
|
+
result["basic_filter"] = self.basic_filter.to_yaml_dict()
|
|
399
|
+
elif self.mode == "advanced" and self.advanced_filter:
|
|
400
|
+
result["advanced_filter"] = self.advanced_filter
|
|
401
|
+
return result
|
|
163
402
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
403
|
+
@classmethod
|
|
404
|
+
def from_yaml_dict(cls, data: dict) -> "FilterInput":
|
|
405
|
+
"""Load from YAML format."""
|
|
406
|
+
mode = data.get("mode", "basic")
|
|
407
|
+
basic_filter = None
|
|
408
|
+
if "basic_filter" in data:
|
|
409
|
+
basic_filter = BasicFilter.from_yaml_dict(data["basic_filter"])
|
|
410
|
+
return cls(
|
|
411
|
+
mode=mode,
|
|
412
|
+
basic_filter=basic_filter,
|
|
413
|
+
advanced_filter=data.get("advanced_filter", ""),
|
|
414
|
+
)
|
|
167
415
|
|
|
168
|
-
@property
|
|
169
|
-
def drop_columns(self) -> List[SelectInput]:
|
|
170
|
-
"""Returns a list of column names that are marked to be dropped from the selection."""
|
|
171
|
-
return [v for v in self.renames if not v.keep and v.is_available]
|
|
172
416
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
return [v for v in self.renames if not v.keep and v.is_available and not v.join_key]
|
|
417
|
+
class SelectInputs(BaseModel):
|
|
418
|
+
"""A container for a list of `SelectInput` objects (pure data, no logic)."""
|
|
176
419
|
|
|
177
|
-
|
|
178
|
-
"""Allows adding a SelectInput using the '+' operator."""
|
|
179
|
-
self.renames.append(other)
|
|
420
|
+
renames: list[SelectInput] = Field(default_factory=list)
|
|
180
421
|
|
|
181
|
-
def
|
|
182
|
-
|
|
183
|
-
|
|
422
|
+
def __init__(self, renames: list[SelectInput] = None, **kwargs):
|
|
423
|
+
if renames is not None:
|
|
424
|
+
kwargs["renames"] = renames
|
|
425
|
+
else:
|
|
426
|
+
kwargs["renames"] = []
|
|
427
|
+
super().__init__(**kwargs)
|
|
184
428
|
|
|
185
|
-
def
|
|
186
|
-
"""
|
|
187
|
-
|
|
429
|
+
def to_yaml_dict(self) -> JoinInputsYaml:
|
|
430
|
+
"""Serialize for YAML output."""
|
|
431
|
+
return {"select": [r.to_yaml_dict() for r in self.renames]}
|
|
188
432
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
433
|
+
@classmethod
|
|
434
|
+
def from_yaml_dict(cls, data: dict) -> "SelectInputs":
|
|
435
|
+
"""Load from slim YAML format. Supports both 'select' (new) and 'renames' (internal)."""
|
|
436
|
+
items = data.get("select", data.get("renames", []))
|
|
437
|
+
return cls(renames=[SelectInput.from_yaml_dict(item) for item in items])
|
|
194
438
|
|
|
195
439
|
@classmethod
|
|
196
|
-
def create_from_list(cls, col_list:
|
|
440
|
+
def create_from_list(cls, col_list: list[str]) -> "SelectInputs":
|
|
197
441
|
"""Creates a SelectInputs object from a simple list of column names."""
|
|
198
|
-
return cls([SelectInput(c) for c in col_list])
|
|
442
|
+
return cls(renames=[SelectInput(old_name=c) for c in col_list])
|
|
199
443
|
|
|
200
444
|
@classmethod
|
|
201
|
-
def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame):
|
|
445
|
+
def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame) -> "SelectInputs":
|
|
202
446
|
"""Creates a SelectInputs object from a Polars DataFrame's columns."""
|
|
203
|
-
return cls([SelectInput(c) for c in df.columns])
|
|
447
|
+
return cls(renames=[SelectInput(old_name=c) for c in df.columns])
|
|
204
448
|
|
|
205
|
-
def
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def get_select_input_on_new_name(self, old_name: str) -> SelectInput | None:
|
|
209
|
-
return next((v for v in self.renames if v.new_name == old_name), None)
|
|
449
|
+
def remove_select_input(self, old_key: str) -> None:
|
|
450
|
+
"""Removes a SelectInput from the list based on its original name."""
|
|
451
|
+
self.renames = [rename for rename in self.renames if rename.old_name != old_key]
|
|
210
452
|
|
|
211
453
|
|
|
212
454
|
class JoinInputs(SelectInputs):
|
|
213
|
-
"""
|
|
214
|
-
|
|
215
|
-
def __init__(self, renames: List[SelectInput]):
|
|
216
|
-
self.renames = renames
|
|
217
|
-
|
|
218
|
-
@property
|
|
219
|
-
def join_key_selects(self) -> List[SelectInput]:
|
|
220
|
-
"""Returns only the `SelectInput` objects that are marked as join keys."""
|
|
221
|
-
return [v for v in self.renames if v.join_key]
|
|
222
|
-
|
|
223
|
-
def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
|
|
224
|
-
"""Gets the temporary rename mapping for all join keys on one side of a join."""
|
|
225
|
-
return JoinKeyRenameResponse(
|
|
226
|
-
side,
|
|
227
|
-
[JoinKeyRename(jk.new_name,
|
|
228
|
-
construct_join_key_name(side, jk.new_name))
|
|
229
|
-
for jk in self.join_key_selects if jk.keep or not filter_drop]
|
|
230
|
-
)
|
|
455
|
+
"""Data model for join-specific select inputs (extends SelectInputs)."""
|
|
231
456
|
|
|
232
|
-
def
|
|
233
|
-
|
|
234
|
-
|
|
457
|
+
def __init__(self, renames: list[SelectInput] = None, **kwargs):
|
|
458
|
+
if renames is not None:
|
|
459
|
+
kwargs["renames"] = renames
|
|
460
|
+
else:
|
|
461
|
+
kwargs["renames"] = []
|
|
462
|
+
super().__init__(**kwargs)
|
|
235
463
|
|
|
236
464
|
|
|
237
|
-
|
|
238
|
-
class JoinMap:
|
|
465
|
+
class JoinMap(BaseModel):
|
|
239
466
|
"""Defines a single mapping between a left and right column for a join key."""
|
|
240
|
-
left_col: str
|
|
241
|
-
right_col: str
|
|
242
467
|
|
|
468
|
+
left_col: str | None = None
|
|
469
|
+
right_col: str | None = None
|
|
243
470
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
471
|
+
def __init__(self, left_col: str = None, right_col: str = None, **data):
|
|
472
|
+
if left_col is not None:
|
|
473
|
+
data["left_col"] = left_col
|
|
474
|
+
if right_col is not None:
|
|
475
|
+
data["right_col"] = right_col
|
|
476
|
+
super().__init__(**data)
|
|
248
477
|
|
|
249
|
-
@
|
|
250
|
-
def
|
|
251
|
-
"""
|
|
252
|
-
if
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
return JoinInputs([SelectInput(**c.__dict__) for c in select])
|
|
256
|
-
elif isinstance(select, dict):
|
|
257
|
-
renames = select.get('renames')
|
|
258
|
-
if renames:
|
|
259
|
-
return JoinInputs([SelectInput(**c) for c in renames])
|
|
260
|
-
elif all(isinstance(c, str) for c in select):
|
|
261
|
-
return JoinInputs([SelectInput(s, s) for s in select])
|
|
262
|
-
|
|
263
|
-
def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
|
|
264
|
-
"""Generates a new, non-conflicting column name by adding a suffix if necessary."""
|
|
265
|
-
current_names = self.left_select.new_cols & self.right_select.new_cols
|
|
266
|
-
if old_col_name not in current_names:
|
|
267
|
-
return old_col_name
|
|
268
|
-
while True:
|
|
269
|
-
if old_col_name not in current_names:
|
|
270
|
-
return old_col_name
|
|
271
|
-
old_col_name = f'{side}_{old_col_name}'
|
|
478
|
+
@model_validator(mode="after")
|
|
479
|
+
def set_default_right_col(self):
|
|
480
|
+
"""If right_col is None, default it to left_col."""
|
|
481
|
+
if self.right_col is None:
|
|
482
|
+
self.right_col = self.left_col
|
|
483
|
+
return self
|
|
272
484
|
|
|
273
|
-
def add_new_select_column(self, select_input: SelectInput, side: str):
|
|
274
|
-
"""Adds a new column to the selection for either the left or right side."""
|
|
275
|
-
selects = self.right_select if side == 'right' else self.left_select
|
|
276
|
-
select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
|
|
277
|
-
selects.__add__(select_input)
|
|
278
485
|
|
|
486
|
+
class CrossJoinInput(BaseModel):
|
|
487
|
+
"""Data model for cross join operations."""
|
|
279
488
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
"""Defines the settings for a cross join operation, including column selections for both inputs."""
|
|
283
|
-
left_select: SelectInputs = None
|
|
284
|
-
right_select: SelectInputs = None
|
|
489
|
+
left_select: JoinInputs
|
|
490
|
+
right_select: JoinInputs
|
|
285
491
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
492
|
+
@model_validator(mode="before")
|
|
493
|
+
@classmethod
|
|
494
|
+
def parse_inputs(cls, data: Any) -> Any:
|
|
495
|
+
"""Parse flexible input formats before validation."""
|
|
496
|
+
if isinstance(data, dict):
|
|
497
|
+
# Parse join_mapping
|
|
498
|
+
if "join_mapping" in data:
|
|
499
|
+
data["join_mapping"] = cls._parse_join_mapping(data["join_mapping"])
|
|
291
500
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
return self.left_select.new_cols & self.right_select.new_cols
|
|
501
|
+
# Parse left_select
|
|
502
|
+
if "left_select" in data:
|
|
503
|
+
data["left_select"] = cls._parse_select(data["left_select"])
|
|
296
504
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
while len(overlapping_records) > 0:
|
|
301
|
-
for right_col in self.right_select.renames:
|
|
302
|
-
if right_col.new_name in overlapping_records:
|
|
303
|
-
right_col.new_name = 'right_' + right_col.new_name
|
|
304
|
-
overlapping_records = self.overlapping_records
|
|
505
|
+
# Parse right_select
|
|
506
|
+
if "right_select" in data:
|
|
507
|
+
data["right_select"] = cls._parse_select(data["right_select"])
|
|
305
508
|
|
|
509
|
+
return data
|
|
306
510
|
|
|
307
|
-
@
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
511
|
+
@staticmethod
|
|
512
|
+
def _parse_join_mapping(join_mapping: Any) -> list[JoinMap]:
|
|
513
|
+
"""Parse various join_mapping formats."""
|
|
514
|
+
# Already a list of JoinMaps
|
|
515
|
+
if isinstance(join_mapping, list):
|
|
516
|
+
result = []
|
|
517
|
+
for jm in join_mapping:
|
|
518
|
+
if isinstance(jm, JoinMap):
|
|
519
|
+
result.append(jm)
|
|
520
|
+
elif isinstance(jm, dict):
|
|
521
|
+
result.append(JoinMap(**jm))
|
|
522
|
+
elif isinstance(jm, (tuple, list)) and len(jm) == 2:
|
|
523
|
+
result.append(JoinMap(left_col=jm[0], right_col=jm[1]))
|
|
524
|
+
elif isinstance(jm, str):
|
|
525
|
+
result.append(JoinMap(left_col=jm, right_col=jm))
|
|
526
|
+
else:
|
|
527
|
+
raise ValueError(f"Invalid join mapping item: {jm}")
|
|
528
|
+
return result
|
|
529
|
+
|
|
530
|
+
# Single JoinMap
|
|
531
|
+
if isinstance(join_mapping, JoinMap):
|
|
532
|
+
return [join_mapping]
|
|
533
|
+
|
|
534
|
+
# String: same column on both sides
|
|
535
|
+
if isinstance(join_mapping, str):
|
|
536
|
+
return [JoinMap(left_col=join_mapping, right_col=join_mapping)]
|
|
537
|
+
|
|
538
|
+
# Tuple: (left, right)
|
|
539
|
+
if isinstance(join_mapping, tuple) and len(join_mapping) == 2:
|
|
540
|
+
return [JoinMap(left_col=join_mapping[0], right_col=join_mapping[1])]
|
|
541
|
+
|
|
542
|
+
raise ValueError(f"Invalid join_mapping format: {type(join_mapping)}")
|
|
314
543
|
|
|
315
544
|
@staticmethod
|
|
316
|
-
def
|
|
317
|
-
"""
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
self
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
545
|
+
def _parse_select(select: Any) -> JoinInputs:
|
|
546
|
+
"""Parse various select input formats."""
|
|
547
|
+
# Already JoinInputs
|
|
548
|
+
if isinstance(select, JoinInputs):
|
|
549
|
+
return select
|
|
550
|
+
|
|
551
|
+
# List of SelectInput objects
|
|
552
|
+
if isinstance(select, list):
|
|
553
|
+
if all(isinstance(s, SelectInput) for s in select):
|
|
554
|
+
return JoinInputs(renames=select)
|
|
555
|
+
elif all(isinstance(s, str) for s in select):
|
|
556
|
+
return JoinInputs(renames=[SelectInput(old_name=s) for s in select])
|
|
557
|
+
elif all(isinstance(s, dict) for s in select):
|
|
558
|
+
return JoinInputs(renames=[SelectInput(**s) for s in select])
|
|
559
|
+
|
|
560
|
+
# Dict with 'select' (new YAML) or 'renames' (internal) key
|
|
561
|
+
if isinstance(select, dict):
|
|
562
|
+
if "select" in select:
|
|
563
|
+
return JoinInputs(renames=[SelectInput.from_yaml_dict(s) for s in select["select"]])
|
|
564
|
+
if "renames" in select:
|
|
565
|
+
return JoinInputs(**select)
|
|
566
|
+
|
|
567
|
+
raise ValueError(f"Invalid select format: {type(select)}")
|
|
568
|
+
|
|
569
|
+
def __init__(
|
|
570
|
+
self,
|
|
571
|
+
left_select: JoinInputs | list[SelectInput] | list[str] = None,
|
|
572
|
+
right_select: JoinInputs | list[SelectInput] | list[str] = None,
|
|
573
|
+
**data,
|
|
574
|
+
):
|
|
575
|
+
"""Custom init for backward compatibility with positional arguments."""
|
|
576
|
+
if left_select is not None:
|
|
577
|
+
data["left_select"] = left_select
|
|
578
|
+
if right_select is not None:
|
|
579
|
+
data["right_select"] = right_select
|
|
580
|
+
super().__init__(**data)
|
|
581
|
+
|
|
582
|
+
def to_yaml_dict(self) -> CrossJoinInputYaml:
|
|
583
|
+
"""Serialize for YAML output."""
|
|
584
|
+
return {
|
|
585
|
+
"left_select": self.left_select.to_yaml_dict(),
|
|
586
|
+
"right_select": self.right_select.to_yaml_dict(),
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
|
|
590
|
+
"""Adds a new column to the selection for either the left or right side."""
|
|
591
|
+
target_input = self.right_select if side == "right" else self.left_select
|
|
592
|
+
if select_input.new_name is None:
|
|
593
|
+
select_input.new_name = select_input.old_name
|
|
594
|
+
target_input.renames.append(select_input)
|
|
346
595
|
|
|
347
|
-
def set_join_keys(self):
|
|
348
|
-
"""Marks the `SelectInput` objects corresponding to join keys."""
|
|
349
|
-
[setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
|
|
350
|
-
[setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
|
|
351
596
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
|
|
355
|
-
self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
|
|
356
|
-
|
|
357
|
-
def get_names_for_table_rename(self) -> List[JoinMap]:
|
|
358
|
-
new_mappings: List[JoinMap] = []
|
|
359
|
-
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
360
|
-
for join_map in self.join_mapping:
|
|
361
|
-
new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col, join_map.left_col),
|
|
362
|
-
right_rename_table.get(join_map.right_col, join_map.right_col)
|
|
363
|
-
)
|
|
364
|
-
)
|
|
365
|
-
return new_mappings
|
|
597
|
+
class JoinInput(BaseModel):
|
|
598
|
+
"""Data model for standard SQL-style join operations."""
|
|
366
599
|
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
600
|
+
join_mapping: list[JoinMap]
|
|
601
|
+
left_select: JoinInputs
|
|
602
|
+
right_select: JoinInputs
|
|
603
|
+
how: JoinStrategy = "inner"
|
|
371
604
|
|
|
372
|
-
@
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
605
|
+
@model_validator(mode="before")
|
|
606
|
+
@classmethod
|
|
607
|
+
def parse_inputs(cls, data: Any) -> Any:
|
|
608
|
+
"""Parse flexible input formats before validation."""
|
|
609
|
+
if isinstance(data, dict):
|
|
610
|
+
# Parse join_mapping
|
|
611
|
+
if "join_mapping" in data:
|
|
612
|
+
data["join_mapping"] = cls._parse_join_mapping(data["join_mapping"])
|
|
376
613
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
return [jm.left_col for jm in self.used_join_mapping]
|
|
614
|
+
# Parse left_select
|
|
615
|
+
if "left_select" in data:
|
|
616
|
+
data["left_select"] = cls._parse_select(data["left_select"])
|
|
381
617
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
return [jm.right_col for jm in self.used_join_mapping]
|
|
618
|
+
# Parse right_select
|
|
619
|
+
if "right_select" in data:
|
|
620
|
+
data["right_select"] = cls._parse_select(data["right_select"])
|
|
386
621
|
|
|
387
|
-
|
|
388
|
-
def overlapping_records(self):
|
|
389
|
-
if self.how in ('left', 'right', 'inner'):
|
|
390
|
-
return self.left_select.new_cols & self.right_select.new_cols
|
|
391
|
-
else:
|
|
392
|
-
return self.left_select.new_cols & self.right_select.new_cols
|
|
622
|
+
return data
|
|
393
623
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
624
|
+
@staticmethod
|
|
625
|
+
def _parse_join_mapping(join_mapping: Any) -> list[JoinMap]:
|
|
626
|
+
"""Parse various join_mapping formats."""
|
|
627
|
+
# Already a list of JoinMaps
|
|
628
|
+
if isinstance(join_mapping, list):
|
|
629
|
+
result = []
|
|
630
|
+
for jm in join_mapping:
|
|
631
|
+
if isinstance(jm, JoinMap):
|
|
632
|
+
result.append(jm)
|
|
633
|
+
elif isinstance(jm, dict):
|
|
634
|
+
result.append(JoinMap(**jm))
|
|
635
|
+
elif isinstance(jm, (tuple, list)) and len(jm) == 2:
|
|
636
|
+
result.append(JoinMap(left_col=jm[0], right_col=jm[1]))
|
|
637
|
+
elif isinstance(jm, str):
|
|
638
|
+
result.append(JoinMap(left_col=jm, right_col=jm))
|
|
639
|
+
else:
|
|
640
|
+
raise ValueError(f"Invalid join mapping item: {jm}")
|
|
641
|
+
return result
|
|
642
|
+
|
|
643
|
+
# Single JoinMap
|
|
644
|
+
if isinstance(join_mapping, JoinMap):
|
|
645
|
+
return [join_mapping]
|
|
646
|
+
|
|
647
|
+
# String: same column on both sides
|
|
648
|
+
if isinstance(join_mapping, str):
|
|
649
|
+
return [JoinMap(left_col=join_mapping, right_col=join_mapping)]
|
|
650
|
+
|
|
651
|
+
# Tuple: (left, right)
|
|
652
|
+
if isinstance(join_mapping, tuple) and len(join_mapping) == 2:
|
|
653
|
+
return [JoinMap(left_col=join_mapping[0], right_col=join_mapping[1])]
|
|
654
|
+
|
|
655
|
+
raise ValueError(f"Invalid join_mapping format: {type(join_mapping)}")
|
|
656
|
+
|
|
657
|
+
@staticmethod
|
|
658
|
+
def _parse_select(select: Any) -> JoinInputs:
|
|
659
|
+
"""Parse various select input formats."""
|
|
660
|
+
# Already JoinInputs
|
|
661
|
+
if isinstance(select, JoinInputs):
|
|
662
|
+
return select
|
|
663
|
+
|
|
664
|
+
# List of SelectInput objects
|
|
665
|
+
if isinstance(select, list):
|
|
666
|
+
if all(isinstance(s, SelectInput) for s in select):
|
|
667
|
+
return JoinInputs(renames=select)
|
|
668
|
+
elif all(isinstance(s, str) for s in select):
|
|
669
|
+
return JoinInputs(renames=[SelectInput(old_name=s) for s in select])
|
|
670
|
+
elif all(isinstance(s, dict) for s in select):
|
|
671
|
+
return JoinInputs(renames=[SelectInput(**s) for s in select])
|
|
672
|
+
|
|
673
|
+
# Dict with 'select' (new YAML) or 'renames' (internal) key
|
|
674
|
+
if isinstance(select, dict):
|
|
675
|
+
if "select" in select:
|
|
676
|
+
return JoinInputs(renames=[SelectInput.from_yaml_dict(s) for s in select["select"]])
|
|
677
|
+
if "renames" in select:
|
|
678
|
+
return JoinInputs(**select)
|
|
679
|
+
|
|
680
|
+
raise ValueError(f"Invalid select format: {type(select)}")
|
|
681
|
+
|
|
682
|
+
def __init__(
|
|
683
|
+
self,
|
|
684
|
+
join_mapping: list[JoinMap] | JoinMap | tuple[str, str] | str | list[tuple] | list[str] = None,
|
|
685
|
+
left_select: JoinInputs | list[SelectInput] | list[str] = None,
|
|
686
|
+
right_select: JoinInputs | list[SelectInput] | list[str] = None,
|
|
687
|
+
how: JoinStrategy = "inner",
|
|
688
|
+
**data,
|
|
689
|
+
):
|
|
690
|
+
"""Custom init for backward compatibility with positional arguments."""
|
|
691
|
+
if join_mapping is not None:
|
|
692
|
+
data["join_mapping"] = join_mapping
|
|
693
|
+
if left_select is not None:
|
|
694
|
+
data["left_select"] = left_select
|
|
695
|
+
if right_select is not None:
|
|
696
|
+
data["right_select"] = right_select
|
|
697
|
+
if how is not None:
|
|
698
|
+
data["how"] = how
|
|
699
|
+
|
|
700
|
+
super().__init__(**data)
|
|
701
|
+
|
|
702
|
+
def to_yaml_dict(self) -> JoinInputYaml:
|
|
703
|
+
"""Serialize for YAML output."""
|
|
704
|
+
return {
|
|
705
|
+
"join_mapping": [{"left_col": jm.left_col, "right_col": jm.right_col} for jm in self.join_mapping],
|
|
706
|
+
"left_select": self.left_select.to_yaml_dict(),
|
|
707
|
+
"right_select": self.right_select.to_yaml_dict(),
|
|
708
|
+
"how": self.how,
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
|
|
712
|
+
"""Adds a new column to the selection for either the left or right side."""
|
|
713
|
+
target_input = self.right_select if side == "right" else self.left_select
|
|
714
|
+
if select_input.new_name is None:
|
|
715
|
+
select_input.new_name = select_input.old_name
|
|
716
|
+
target_input.renames.append(select_input)
|
|
403
717
|
|
|
404
|
-
@property
|
|
405
|
-
def used_join_mapping(self) -> List[JoinMap]:
|
|
406
|
-
"""Returns the final join mapping after applying all renames and transformations."""
|
|
407
|
-
new_mappings: List[JoinMap] = []
|
|
408
|
-
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
409
|
-
left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
|
|
410
|
-
right_join_rename_mapping: Dict[str, str] = self.right_select.get_join_key_rename_mapping("right")
|
|
411
|
-
for join_map in self.join_mapping:
|
|
412
|
-
# del self.right_select.rename_table, self.left_select.rename_table
|
|
413
|
-
new_mappings.append(JoinMap(left_join_rename_mapping.get(left_rename_table.get(join_map.left_col, join_map.left_col)),
|
|
414
|
-
right_join_rename_mapping.get(right_rename_table.get(join_map.right_col, join_map.right_col))
|
|
415
|
-
)
|
|
416
|
-
)
|
|
417
|
-
return new_mappings
|
|
418
718
|
|
|
719
|
+
class FuzzyMatchInput(BaseModel):
|
|
720
|
+
"""Data model for fuzzy matching join operations."""
|
|
419
721
|
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
722
|
+
join_mapping: list[FuzzyMapping]
|
|
723
|
+
left_select: JoinInputs
|
|
724
|
+
right_select: JoinInputs
|
|
725
|
+
how: JoinStrategy = "inner"
|
|
424
726
|
aggregate_output: bool = False
|
|
425
727
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
728
|
+
def __init__(
|
|
729
|
+
self,
|
|
730
|
+
left_select: JoinInputs | list[SelectInput] | list[str] = None,
|
|
731
|
+
right_select: JoinInputs | list[SelectInput] | list[str] = None,
|
|
732
|
+
**data,
|
|
733
|
+
):
|
|
734
|
+
"""Custom init for backward compatibility with positional arguments."""
|
|
735
|
+
if left_select is not None:
|
|
736
|
+
data["left_select"] = left_select
|
|
737
|
+
if right_select is not None:
|
|
738
|
+
data["right_select"] = right_select
|
|
739
|
+
|
|
740
|
+
super().__init__(**data)
|
|
741
|
+
|
|
742
|
+
def to_yaml_dict(self) -> FuzzyMatchInputYaml:
|
|
743
|
+
"""Serialize for YAML output."""
|
|
744
|
+
return {
|
|
745
|
+
"join_mapping": [asdict(jm) for jm in self.join_mapping],
|
|
746
|
+
"left_select": self.left_select.to_yaml_dict(),
|
|
747
|
+
"right_select": self.right_select.to_yaml_dict(),
|
|
748
|
+
"how": self.how,
|
|
749
|
+
"aggregate_output": self.aggregate_output,
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
|
|
753
|
+
"""Adds a new column to the selection for either the left or right side."""
|
|
754
|
+
target_input = self.right_select if side == "right" else self.left_select
|
|
755
|
+
if select_input.new_name is None:
|
|
756
|
+
select_input.new_name = select_input.old_name
|
|
757
|
+
target_input.renames.append(select_input)
|
|
432
758
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
759
|
+
@staticmethod
def _parse_select(select: Any) -> JoinInputs:
    """Normalize one of the accepted select formats into a ``JoinInputs``.

    Accepted inputs: a ``JoinInputs`` (returned as-is); a list made up
    entirely of ``SelectInput`` objects, of column-name strings, or of
    keyword dicts; or a dict carrying a ``"select"`` key (YAML form) or a
    ``"renames"`` key (internal form).

    Raises:
        ValueError: If ``select`` matches none of these formats.
    """
    if isinstance(select, JoinInputs):
        return select

    if isinstance(select, list):

        def every(kind) -> bool:
            return all(isinstance(item, kind) for item in select)

        if every(SelectInput):
            return JoinInputs(renames=select)
        if every(str):
            return JoinInputs(renames=[SelectInput(old_name=name) for name in select])
        if every(dict):
            return JoinInputs(renames=[SelectInput(**fields) for fields in select])
    elif isinstance(select, dict):
        # The "select" (YAML) key takes precedence over "renames" (internal).
        if "select" in select:
            return JoinInputs(renames=[SelectInput.from_yaml_dict(item) for item in select["select"]])
        if "renames" in select:
            return JoinInputs(**select)

    raise ValueError(f"Invalid select format: {type(select)}")
|
|
783
|
+
|
|
784
|
+
@model_validator(mode="before")
@classmethod
def parse_inputs(cls, data: Any) -> Any:
    """Normalize ``left_select`` / ``right_select`` before field validation.

    Non-dict payloads are returned untouched. For dict payloads, each select
    key that is present is replaced in place by the result of
    ``cls._parse_select``.
    """
    if not isinstance(data, dict):
        return data
    for side in ("left_select", "right_select"):
        if side in data:
            data[side] = cls._parse_select(data[side])
    return data
|
|
484
798
|
|
|
485
799
|
|
|
486
|
-
|
|
487
|
-
class AggColl:
|
|
800
|
+
class AggColl(BaseModel):
|
|
488
801
|
"""
|
|
489
802
|
A data class that represents a single aggregation operation for a group by operation.
|
|
490
803
|
|
|
@@ -493,7 +806,7 @@ class AggColl:
|
|
|
493
806
|
old_name : str
|
|
494
807
|
The name of the column in the original DataFrame to be aggregated.
|
|
495
808
|
|
|
496
|
-
agg :
|
|
809
|
+
agg : str
|
|
497
810
|
The aggregation function to use. This can be a string representing a built-in function or a custom function.
|
|
498
811
|
|
|
499
812
|
new_name : Optional[str]
|
|
@@ -513,42 +826,57 @@ class AggColl:
|
|
|
513
826
|
output_type='float'
|
|
514
827
|
)
|
|
515
828
|
"""
|
|
829
|
+
|
|
516
830
|
old_name: str
|
|
517
831
|
agg: str
|
|
518
|
-
new_name:
|
|
519
|
-
output_type:
|
|
520
|
-
|
|
521
|
-
def __init__(self, old_name: str, agg: str, new_name: str = None, output_type: str = None):
|
|
522
|
-
""
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
832
|
+
new_name: str | None = None
|
|
833
|
+
output_type: str | None = None
|
|
834
|
+
|
|
835
|
+
def __init__(self, old_name: str, agg: str, new_name: str | None = None, output_type: str | None = None):
    """Initialize the aggregation, allowing positional arguments.

    Optional values that are ``None`` are not forwarded to the base
    initializer, so the fields keep their declared ``None`` defaults.
    """
    optional = {"new_name": new_name, "output_type": output_type}
    supplied = {field: value for field, value in optional.items() if value is not None}
    super().__init__(old_name=old_name, agg=agg, **supplied)
|
|
843
|
+
|
|
844
|
+
@model_validator(mode="after")
def set_defaults(self):
    """Fill in ``new_name`` and ``output_type`` when they were left unset.

    * ``new_name`` defaults to ``old_name`` for a ``"groupby"`` aggregation,
      otherwise to ``"<old_name>_<agg>"``.
    * ``output_type`` defaults to ``get_func_type_mapping(agg)``.
    * ``old_name`` is coerced to ``str``.
    """
    if self.new_name is None:
        self.new_name = self.old_name if self.agg == "groupby" else f"{self.old_name}_{self.agg}"

    if self.output_type is None:
        self.output_type = get_func_type_mapping(self.agg)

    self.old_name = str(self.old_name)
    return self
|
|
530
862
|
|
|
531
863
|
@property
def agg_func(self):
    """Resolve ``agg`` to the object used to perform the aggregation.

    ``"groupby"`` is returned unchanged, ``"concat"`` maps to
    ``string_concat``, any other string is looked up as an attribute of
    ``pl``, and a non-string ``agg`` is returned as-is.
    """
    requested = self.agg
    if requested == "groupby":
        return requested
    if requested == "concat":
        return string_concat
    if isinstance(requested, str):
        return getattr(pl, requested)
    return requested
|
|
540
872
|
|
|
541
873
|
|
|
542
|
-
|
|
543
|
-
class GroupByInput:
|
|
874
|
+
class GroupByInput(BaseModel):
|
|
544
875
|
"""
|
|
545
876
|
A data class that represents the input for a group by operation.
|
|
546
877
|
|
|
547
878
|
Attributes
|
|
548
879
|
----------
|
|
549
|
-
group_columns : List[str]
|
|
550
|
-
A list of column names to group the DataFrame by. These column(s) will be set as the DataFrame index.
|
|
551
|
-
|
|
552
880
|
agg_cols : List[AggColl]
|
|
553
881
|
A list of `AggColl` objects that specify the aggregation operations to perform on the DataFrame columns
|
|
554
882
|
after grouping. Each `AggColl` object should specify the column to be aggregated and the aggregation
|
|
@@ -557,32 +885,41 @@ class GroupByInput:
|
|
|
557
885
|
Example
|
|
558
886
|
--------
|
|
559
887
|
group_by_input = GroupByInput(
|
|
560
|
-
agg_cols=[AggColl(old_name='ix', agg='groupby'), AggColl(old_name='groups', agg='groupby'),
|
|
888
|
+
agg_cols=[AggColl(old_name='ix', agg='groupby'), AggColl(old_name='groups', agg='groupby'),
|
|
889
|
+
AggColl(old_name='col1', agg='sum'), AggColl(old_name='col2', agg='mean')]
|
|
561
890
|
)
|
|
562
891
|
"""
|
|
563
|
-
|
|
892
|
+
|
|
893
|
+
agg_cols: list[AggColl]
|
|
894
|
+
|
|
895
|
+
def __init__(self, agg_cols: list[AggColl]):
    """Accept ``agg_cols`` positionally (backward-compatible signature)."""
    fields = {"agg_cols": agg_cols}
    super().__init__(**fields)
|
|
564
898
|
|
|
565
899
|
|
|
566
|
-
|
|
567
|
-
class PivotInput:
|
|
900
|
+
class PivotInput(BaseModel):
|
|
568
901
|
"""Defines the settings for a pivot (long-to-wide) operation."""
|
|
569
|
-
|
|
902
|
+
|
|
903
|
+
index_columns: list[str]
|
|
570
904
|
pivot_column: str
|
|
571
905
|
value_col: str
|
|
572
|
-
aggregations:
|
|
906
|
+
aggregations: list[str]
|
|
573
907
|
|
|
574
908
|
@property
def grouped_columns(self) -> list[str]:
    """Index columns followed by the pivot column: the keys for the pre-pivot grouping."""
    return [*self.index_columns, self.pivot_column]
|
|
578
912
|
|
|
579
913
|
def get_group_by_input(self) -> GroupByInput:
    """Build the ``GroupByInput`` for the pre-aggregation step of the pivot.

    The grouped columns become ``"groupby"`` entries. Each configured
    aggregation is applied to ``value_col`` and named after the aggregation.
    """
    key_entries = [AggColl(old_name=column, agg="groupby") for column in self.grouped_columns]
    value_entries = [
        AggColl(old_name=self.value_col, agg=aggregation, new_name=aggregation)
        for aggregation in self.aggregations
    ]
    return GroupByInput(agg_cols=[*key_entries, *value_entries])
|
|
920
|
+
|
|
921
|
+
def get_index_columns(self) -> list[pl.col]:
    """Return one ``pl.col`` expression per configured index column."""
    expressions = []
    for column in self.index_columns:
        expressions.append(pl.col(column))
    return expressions
|
|
587
924
|
|
|
588
925
|
def get_pivot_column(self) -> pl.Expr:
|
|
@@ -591,87 +928,675 @@ class PivotInput:
|
|
|
591
928
|
|
|
592
929
|
def get_values_expr(self) -> pl.Expr:
    """Bundle the aggregation columns into a single struct expression aliased ``"vals"``."""
    members = [pl.col(aggregation) for aggregation in self.aggregations]
    bundled = pl.struct(members)
    return bundled.alias("vals")
|
|
595
932
|
|
|
596
933
|
|
|
597
|
-
|
|
598
|
-
class SortByInput:
|
|
934
|
+
class SortByInput(BaseModel):
    """A single sort condition: which column to sort on and in which direction.

    Attributes:
        column: Name of the column to sort by.
        how: Sort direction; defaults to ``"asc"``.
    """

    column: str
    how: str | None = "asc"
|
|
602
939
|
|
|
603
940
|
|
|
604
|
-
|
|
605
|
-
class RecordIdInput:
|
|
941
|
+
class RecordIdInput(BaseModel):
    """Settings for adding a record ID (row number) column to the data.

    Attributes:
        output_column_name: Name of the generated column; defaults to ``"record_id"``.
        offset: Defaults to ``1``; presumably the first ID handed out (consumer not visible here).
        group_by: Defaults to ``False``; presumably enables per-group numbering (confirm against consumer).
        group_by_columns: Defaults to a fresh empty list per instance.
    """

    output_column_name: str = "record_id"
    offset: int = 1
    group_by: bool | None = False
    group_by_columns: list[str] | None = Field(default_factory=list)
|
|
611
948
|
|
|
612
949
|
|
|
613
|
-
|
|
614
|
-
class TextToRowsInput:
|
|
950
|
+
class TextToRowsInput(BaseModel):
    """Settings for splitting a text column into multiple rows on a delimiter.

    Attributes:
        column_to_split: Name of the column whose text is split (required).
        output_column_name: Optional; defaults to ``None``.
        split_by_fixed_value: Defaults to ``True``; presumably chooses the fixed
            delimiter over ``split_by_column`` (confirm against consumer).
        split_fixed_value: Defaults to ``","``.
        split_by_column: Optional; defaults to ``None``.
    """

    column_to_split: str
    output_column_name: str | None = None
    split_by_fixed_value: bool | None = True
    split_fixed_value: str | None = ","
    split_by_column: str | None = None
|
|
621
958
|
|
|
622
959
|
|
|
623
|
-
|
|
624
|
-
class UnpivotInput:
|
|
960
|
+
class UnpivotInput(BaseModel):
    """Settings for an unpivot (wide-to-long) operation.

    Attributes:
        index_columns: Defaults to a fresh empty list per instance.
        value_columns: Defaults to a fresh empty list per instance.
        data_type_selector: Optional name of a selector, restricted to the
            listed literals; only consulted in ``"data_type"`` mode.
        data_type_selector_mode: ``"column"`` (default) or ``"data_type"``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    index_columns: list[str] = Field(default_factory=list)
    value_columns: list[str] = Field(default_factory=list)
    data_type_selector: Literal["float", "all", "date", "numeric", "string"] | None = None
    data_type_selector_mode: Literal["data_type", "column"] = "column"

    @property
    def data_type_selector_expr(self) -> Callable | None:
        """Return the selector function named by ``data_type_selector``.

        Returns:
            ``None`` unless the mode is ``"data_type"``. In that mode:
            ``selectors.all`` when no selector name is set or the name cannot
            be resolved on ``selectors``, otherwise the matching attribute.
        """
        if self.data_type_selector_mode != "data_type":
            return None
        if self.data_type_selector is None:
            return selectors.all
        try:
            return getattr(selectors, self.data_type_selector)
        except AttributeError:
            # A failed attribute lookup is the only expected failure here; the
            # previous blanket ``except Exception`` could hide unrelated bugs.
            print(f"Could not find the selector: {self.data_type_selector}")
            return selectors.all
|
|
651
982
|
|
|
652
983
|
|
|
653
|
-
|
|
654
|
-
class UnionInput:
|
|
984
|
+
class UnionInput(BaseModel):
    """Settings for a union (concatenation) operation.

    Attributes:
        mode: Either ``"selective"`` or ``"relaxed"``; defaults to ``"relaxed"``.
    """

    mode: Literal["selective", "relaxed"] = "relaxed"
|
|
657
988
|
|
|
658
989
|
|
|
659
|
-
|
|
660
|
-
class UniqueInput:
|
|
990
|
+
class UniqueInput(BaseModel):
    """Settings for a uniqueness operation: the columns to consider and which row to keep.

    Attributes:
        columns: Optional list of column names; defaults to ``None``.
        strategy: One of ``"first"``, ``"last"``, ``"any"`` or ``"none"``;
            defaults to ``"any"``.
    """

    columns: list[str] | None = None
    strategy: Literal["first", "last", "any", "none"] = "any"
|
|
664
995
|
|
|
665
996
|
|
|
666
|
-
|
|
667
|
-
class GraphSolverInput:
|
|
997
|
+
class GraphSolverInput(BaseModel):
    """Settings for a graph-solving operation (e.g. finding connected components).

    Attributes:
        col_from: Required column name; presumably the edge source (confirm against consumer).
        col_to: Required column name; presumably the edge target (confirm against consumer).
        output_column_name: Defaults to ``"graph_group"``.
    """

    col_from: str
    col_to: str
    output_column_name: str | None = "graph_group"
|
|
672
1003
|
|
|
673
1004
|
|
|
674
|
-
|
|
675
|
-
class PolarsCodeInput:
|
|
1005
|
+
class PolarsCodeInput(BaseModel):
    """Container for a string of user-provided Polars code to be executed.

    Attributes:
        polars_code: The code, stored verbatim as a string.
    """

    polars_code: str
|
|
1009
|
+
|
|
1010
|
+
|
|
1011
|
+
class SelectInputsManager:
    """Query and mutation helpers layered over a ``SelectInputs`` model.

    The manager keeps a reference to the model (no copy), so every mutation
    is applied directly to ``select_inputs.renames``.
    """

    def __init__(self, select_inputs: SelectInputs):
        self.select_inputs = select_inputs

    # === Query Methods (read-only) ===

    def get_old_cols(self) -> set[str]:
        """Original names of all columns flagged ``keep``."""
        return {entry.old_name for entry in self.select_inputs.renames if entry.keep}

    def get_new_cols(self) -> set[str]:
        """New (renamed) names of all columns flagged ``keep``."""
        return {entry.new_name for entry in self.select_inputs.renames if entry.keep}

    def get_rename_table(self) -> dict[str, str]:
        """Old-name to new-name mapping for available columns that are kept or are join keys."""
        table: dict[str, str] = {}
        for entry in self.select_inputs.renames:
            if entry.is_available and (entry.keep or entry.join_key):
                table[entry.old_name] = entry.new_name
        return table

    def get_select_cols(self, include_join_key: bool = True) -> list[str]:
        """Original names to select: kept columns, plus join keys when requested."""
        return [
            entry.old_name
            for entry in self.select_inputs.renames
            if entry.keep or (include_join_key and entry.join_key)
        ]

    def has_drop_cols(self) -> bool:
        """True when at least one column is not flagged ``keep``."""
        return not all(entry.keep for entry in self.select_inputs.renames)

    def get_drop_columns(self) -> list[SelectInput]:
        """Available columns that are not flagged ``keep``."""
        return [entry for entry in self.select_inputs.renames if not entry.keep and entry.is_available]

    def get_non_jk_drop_columns(self) -> list[SelectInput]:
        """Available, non-kept columns that are not join keys."""
        return [
            entry
            for entry in self.select_inputs.renames
            if not entry.keep and entry.is_available and not entry.join_key
        ]

    def find_by_old_name(self, old_name: str) -> SelectInput | None:
        """First entry whose ``old_name`` matches, or ``None``."""
        for entry in self.select_inputs.renames:
            if entry.old_name == old_name:
                return entry
        return None

    def find_by_new_name(self, new_name: str) -> SelectInput | None:
        """First entry whose ``new_name`` matches, or ``None``."""
        for entry in self.select_inputs.renames:
            if entry.new_name == new_name:
                return entry
        return None

    # === Mutation Methods ===

    def append(self, other: SelectInput) -> None:
        """Add ``other`` to the end of the renames list."""
        self.select_inputs.renames.append(other)

    def remove_select_input(self, old_key: str) -> None:
        """Drop every entry whose ``old_name`` equals ``old_key`` (the list is rebuilt)."""
        survivors = [entry for entry in self.select_inputs.renames if entry.old_name != old_key]
        self.select_inputs.renames = survivors

    def unselect_field(self, old_key: str) -> None:
        """Set ``keep`` to False on every entry whose ``old_name`` equals ``old_key``."""
        for entry in self.select_inputs.renames:
            if entry.old_name == old_key:
                entry.keep = False

    # === Backward Compatibility Properties ===

    @property
    def old_cols(self) -> set[str]:
        """Property form of ``get_old_cols``."""
        return self.get_old_cols()

    @property
    def new_cols(self) -> set[str]:
        """Property form of ``get_new_cols``."""
        return self.get_new_cols()

    @property
    def rename_table(self) -> dict[str, str]:
        """Property form of ``get_rename_table``."""
        return self.get_rename_table()

    @property
    def drop_columns(self) -> list[SelectInput]:
        """Property form of ``get_drop_columns``."""
        return self.get_drop_columns()

    @property
    def non_jk_drop_columns(self) -> list[SelectInput]:
        """Property form of ``get_non_jk_drop_columns``."""
        return self.get_non_jk_drop_columns()

    @property
    def renames(self) -> list[SelectInput]:
        """The underlying renames list itself (not a copy)."""
        return self.select_inputs.renames

    def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
        """Alias of ``find_by_old_name``."""
        return self.find_by_old_name(old_name)

    def get_select_input_on_new_name(self, new_name: str) -> SelectInput | None:
        """Alias of ``find_by_new_name``."""
        return self.find_by_new_name(new_name)

    def __add__(self, other: SelectInput) -> "SelectInputsManager":
        """Append ``other`` in place and return this same manager.

        There is no ``__iadd__``, so ``manager += item`` also lands here.
        """
        self.append(other)
        return self
|
|
1115
|
+
|
|
1116
|
+
|
|
1117
|
+
class JoinInputsManager(SelectInputsManager):
    """``SelectInputsManager`` extended with join-key specific queries.

    ``join_inputs`` refers to the same object the base class stores as
    ``select_inputs``.
    """

    def __init__(self, join_inputs: JoinInputs):
        super().__init__(join_inputs)
        self.join_inputs = join_inputs

    # === Query Methods ===

    def get_join_key_selects(self) -> list[SelectInput]:
        """Entries flagged as join keys, in their original order."""
        return [entry for entry in self.join_inputs.renames if entry.join_key]

    def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
        """Build the temporary rename mapping for this side's join keys.

        Args:
            side: Passed to ``construct_join_key_name`` and stored on the response.
            filter_drop: When True, join keys that are not flagged ``keep`` are left out.
        """
        temp_renames = []
        for key in self.get_join_key_selects():
            if filter_drop and not key.keep:
                continue
            temp_renames.append(JoinKeyRename(key.new_name, construct_join_key_name(side, key.new_name)))
        return JoinKeyRenameResponse(side, temp_renames)

    def get_join_key_rename_mapping(self, side: SideLit) -> dict[str, str]:
        """Map each join key's ``original_name`` to its ``temp_name`` for ``side``."""
        response = self.get_join_key_renames(side)
        return dict((rename.original_name, rename.temp_name) for rename in response.join_key_renames)

    @property
    def join_key_selects(self) -> list[SelectInput]:
        """Property form of ``get_join_key_selects``."""
        return self.get_join_key_selects()
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
class JoinSelectManagerMixin:
    """Shared behavior for managers of join-like operations.

    Concrete classes are expected to set ``input``, ``left_manager`` and
    ``right_manager``.
    """

    left_manager: JoinInputsManager
    right_manager: JoinInputsManager
    input: CrossJoinInput | JoinInput | FuzzyMatchInput

    @staticmethod
    def parse_select(select: list[SelectInput] | list[str] | list[dict] | dict) -> JoinInputs:
        """Convert the supported select formats into a ``JoinInputs``.

        An empty or falsy ``select`` yields an empty ``JoinInputs``. A dict is
        read through its ``"renames"`` key. Plain strings become
        ``SelectInput`` entries whose new name equals the old name.

        Raises:
            ValueError: If ``select`` matches none of the supported formats.
        """
        if not select:
            return JoinInputs(renames=[])

        def every(kind) -> bool:
            return all(isinstance(item, kind) for item in select)

        if every(SelectInput):
            return JoinInputs(renames=select)
        if every(dict):
            return JoinInputs(renames=[SelectInput(**fields) for fields in select])
        if isinstance(select, dict):
            entries = select.get("renames") or []
            return JoinInputs(renames=[SelectInput(**fields) for fields in entries])
        if every(str):
            return JoinInputs(renames=[SelectInput(old_name=name, new_name=name) for name in select])

        raise ValueError(f"Unable to parse select input: {type(select)}")

    def get_overlapping_columns(self) -> set[str]:
        """New column names that appear on both the left and the right side."""
        left_names = self.left_manager.get_new_cols()
        right_names = self.right_manager.get_new_cols()
        return left_names & right_names

    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
        """Prefix ``old_col_name`` with ``"<side>_"`` until it is not an overlapping name.

        Only the names currently overlapping between both sides are checked.
        """
        taken = self.get_overlapping_columns()
        candidate = old_col_name
        while candidate in taken:
            candidate = f"{side}_{candidate}"
        return candidate

    def add_new_select_column(self, select_input: SelectInput, side: str) -> None:
        """Append ``select_input`` to the chosen side after assigning it a generated new name.

        ``side == "right"`` targets the right select; anything else targets the left.
        """
        if side == "right":
            destination = self.input.right_select
        else:
            destination = self.input.left_select
        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
        destination.renames.append(select_input)
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
class CrossJoinInputManager(JoinSelectManagerMixin):
    """Manager for cross join operations.

    Works on a deep copy of the ``CrossJoinInput`` it is given, so the
    caller's object is not modified by ``auto_rename``.
    """

    def __init__(self, cross_join_input: CrossJoinInput):
        working_copy = deepcopy(cross_join_input)
        self.input = working_copy
        self.left_manager = JoinInputsManager(working_copy.left_select)
        self.right_manager = JoinInputsManager(working_copy.right_select)

    @classmethod
    def create(
        cls, left_select: list[SelectInput] | list[str], right_select: list[SelectInput] | list[str]
    ) -> "CrossJoinInputManager":
        """Build a manager from raw selects, normalizing each with ``parse_select``."""
        parsed_left = cls.parse_select(left_select)
        parsed_right = cls.parse_select(right_select)
        return cls(CrossJoinInput(left_select=parsed_left, right_select=parsed_right))

    def get_overlapping_records(self) -> set[str]:
        """Alias of ``get_overlapping_columns``."""
        return self.get_overlapping_columns()

    def auto_rename(self, rename_mode: Literal["suffix", "prefix"] = "prefix") -> None:
        """Rename right-side columns until no new name overlaps with the left side.

        Args:
            rename_mode: ``"prefix"`` prepends ``"right_"``; ``"suffix"``
                appends ``"_right"``.

        Raises:
            ValueError: If an overlap has to be resolved and ``rename_mode`` is
                neither value. With no overlap the mode is never inspected.
        """
        # Overlaps are re-evaluated after every full pass over the right side.
        while clashes := self.get_overlapping_records():
            for column in self.input.right_select.renames:
                if column.new_name not in clashes:
                    continue
                if rename_mode == "prefix":
                    column.new_name = "right_" + column.new_name
                elif rename_mode == "suffix":
                    column.new_name = column.new_name + "_right"
                else:
                    raise ValueError(f"Unknown rename_mode: {rename_mode}")

    # === Backward Compatibility Properties ===

    @property
    def left_select(self) -> JoinInputsManager:
        """The left-side manager, under its older attribute name."""
        return self.left_manager

    @property
    def right_select(self) -> JoinInputsManager:
        """The right-side manager, under its older attribute name."""
        return self.right_manager

    @property
    def overlapping_records(self) -> set[str]:
        """Property form of ``get_overlapping_records``."""
        return self.get_overlapping_records()

    def to_cross_join_input(self) -> CrossJoinInput:
        """Create a new ``CrossJoinInput`` reflecting the manager's current state.

        Useful after the manager has been modified (e.g. by ``auto_rename``).
        Each renames list is shallow-copied: the new input gets its own list
        objects, built from the same entries.

        Returns:
            A new ``CrossJoinInput`` instance with the current settings.
        """
        current = self.input
        return CrossJoinInput(
            left_select=JoinInputs(renames=list(current.left_select.renames)),
            right_select=JoinInputs(renames=list(current.right_select.renames)),
        )
|
|
1270
|
+
|
|
1271
|
+
|
|
1272
|
+
class JoinInputManager(JoinSelectManagerMixin):
|
|
1273
|
+
"""Manager for standard SQL-style join operations."""
|
|
1274
|
+
|
|
1275
|
+
def __init__(self, join_input: JoinInput):
    """Wrap a deep copy of ``join_input`` and flag its join-key columns.

    Working on a copy keeps later mutations (renames, join-key flags) away
    from the caller's object.
    """
    working_copy = deepcopy(join_input)
    self.input = working_copy
    self.left_manager = JoinInputsManager(working_copy.left_select)
    self.right_manager = JoinInputsManager(working_copy.right_select)
    self.set_join_keys()
|
|
1280
|
+
|
|
1281
|
+
@classmethod
def create(
    cls,
    join_mapping: list[JoinMap] | tuple[str, str] | str,
    left_select: list[SelectInput] | list[str],
    right_select: list[SelectInput] | list[str],
    how: JoinStrategy = "inner",
) -> "JoinInputManager":
    """Build a manager from loosely-typed join arguments.

    The raw arguments are handed to the ``JoinInput`` constructor, which is
    relied on to parse them (its parsing logic is not visible here).
    """
    settings = {
        "join_mapping": join_mapping,
        "left_select": left_select,
        "right_select": right_select,
        "how": how,
    }
    manager = cls(JoinInput(**settings))
    # __init__ already flags the join keys; the explicit call is retained as-is.
    manager.set_join_keys()
    return manager
|
|
1296
|
+
|
|
1297
|
+
def set_join_keys(self) -> None:
    """Set ``join_key`` on every select entry of both sides.

    An entry is a join key exactly when its ``old_name`` is among that
    side's join-mapping columns, so stale True flags are reset to False.
    """
    sides = (
        (self.input.left_select.renames, self._get_left_join_keys_set()),
        (self.input.right_select.renames, self._get_right_join_keys_set()),
    )
    for entries, key_names in sides:
        for entry in entries:
            entry.join_key = entry.old_name in key_names
|
|
1307
|
+
|
|
1308
|
+
def _get_left_join_keys_set(self) -> set[str]:
|
|
1309
|
+
"""Internal: Returns a set of the left-side join key column names."""
|
|
1310
|
+
return {jm.left_col for jm in self.input.join_mapping}
|
|
1311
|
+
|
|
1312
|
+
def _get_right_join_keys_set(self) -> set[str]:
|
|
1313
|
+
"""Internal: Returns a set of the right-side join key column names."""
|
|
1314
|
+
return {jm.right_col for jm in self.input.join_mapping}
|
|
1315
|
+
|
|
1316
|
+
def get_left_join_keys(self) -> set[str]:
    """Public accessor for the set of left-side join key column names."""
    left_keys = self._get_left_join_keys_set()
    return left_keys
|
|
1319
|
+
|
|
1320
|
+
def get_right_join_keys(self) -> set[str]:
    """Public accessor for the set of right-side join key column names."""
    right_keys = self._get_right_join_keys_set()
    return right_keys
|
|
1323
|
+
|
|
1324
|
+
def get_left_join_keys_list(self) -> list[str]:
    """Left-side join key names in mapping order, read from ``self.used_join_mapping``.

    NOTE(review): ``used_join_mapping`` is not defined in the visible part of
    this class; it is presumably a property wrapping ``get_used_join_mapping``.
    """
    ordered = []
    for mapping in self.used_join_mapping:
        ordered.append(mapping.left_col)
    return ordered
|
|
1327
|
+
|
|
1328
|
+
def get_right_join_keys_list(self) -> list[str]:
    """Right-side join key names in mapping order, read from ``self.used_join_mapping``.

    NOTE(review): ``used_join_mapping`` is not defined in the visible part of
    this class; it is presumably a property wrapping ``get_used_join_mapping``.
    """
    ordered = []
    for mapping in self.used_join_mapping:
        ordered.append(mapping.right_col)
    return ordered
|
|
1331
|
+
|
|
1332
|
+
def get_overlapping_records(self) -> set[str]:
    """Column names that would conflict after the join (alias of ``get_overlapping_columns``)."""
    clashes = self.get_overlapping_columns()
    return clashes
|
|
1335
|
+
|
|
1336
|
+
def auto_rename(self) -> None:
    """Append ``"_right"`` to clashing right-side names until no overlap remains.

    Join-key flags are refreshed first. Overlaps are re-evaluated after
    every full pass over the right side, so a name that still clashes after
    one suffix receives another.
    """
    self.set_join_keys()
    while clashes := self.get_overlapping_records():
        for column in self.input.right_select.renames:
            if column.new_name in clashes:
                column.new_name = column.new_name + "_right"
|
|
1346
|
+
|
|
1347
|
+
def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
    """Collect the temporary join-key rename mappings of both sides.

    Args:
        filter_drop: Forwarded unchanged to each side's manager.
    """
    managers = (("left", self.left_manager), ("right", self.right_manager))
    per_side = [
        manager.get_join_key_renames(side=side, filter_drop=filter_drop)
        for side, manager in managers
    ]
    return FullJoinKeyResponse(*per_side)
|
|
1352
|
+
|
|
1353
|
+
def get_names_for_table_rename(self) -> list[JoinMap]:
    """Return the join mapping with each side's rename table applied.

    A column missing from its side's rename table keeps its original name.
    """
    left_names = self.left_manager.get_rename_table()
    right_names = self.right_manager.get_rename_table()
    return [
        JoinMap(
            left_col=left_names.get(mapping.left_col, mapping.left_col),
            right_col=right_names.get(mapping.right_col, mapping.right_col),
        )
        for mapping in self.input.join_mapping
    ]
|
|
1365
|
+
|
|
1366
|
+
def get_used_join_mapping(self) -> list[JoinMap]:
    """Return the join mapping expressed in the temporary join-key names.

    Each mapped column is first translated through its side's rename table
    (falling back to its own name when absent), then looked up in that
    side's join-key rename mapping to obtain the temporary name.

    Returns:
        One ``JoinMap`` per entry of ``self.input.join_mapping``. A side is
        ``None`` when no temporary name is registered for that column.
    """
    left_rename_table = self.left_manager.get_rename_table()
    right_rename_table = self.right_manager.get_rename_table()
    left_join_rename_mapping = self.left_manager.get_join_key_rename_mapping("left")
    right_join_rename_mapping = self.right_manager.get_join_key_rename_mapping("right")

    new_mappings: list[JoinMap] = []
    for join_map in self.input.join_mapping:
        left_col = left_rename_table.get(join_map.left_col, join_map.left_col)
        # Fixed: the fallback used to be join_map.left_col, so a right column
        # absent from the rename table was looked up under the LEFT column's
        # name. It now falls back to its own name, as in
        # get_names_for_table_rename.
        right_col = right_rename_table.get(join_map.right_col, join_map.right_col)

        new_mappings.append(
            JoinMap(
                left_col=left_join_rename_mapping.get(left_col),
                right_col=right_join_rename_mapping.get(right_col),
            )
        )

    return new_mappings
|
|
1383
|
+
|
|
1384
|
+
def to_join_input(self) -> JoinInput:
    """Snapshot the manager's current settings as a new ``JoinInput``.

    Handy after the manager has been modified (for example via auto_rename)
    and a fresh ``JoinInput`` reflecting those settings is needed.

    Returns:
        A new ``JoinInput``. Each side's ``renames`` is duplicated with
        ``.copy()``; the join mapping and strategy are passed through from
        ``self.input`` unchanged.
    """
    current = self.input
    left_side = JoinInputs(renames=current.left_select.renames.copy())
    right_side = JoinInputs(renames=current.right_select.renames.copy())
    return JoinInput(
        join_mapping=current.join_mapping,
        left_select=left_side,
        right_select=right_side,
        how=current.how,
    )
|
|
1399
|
+
|
|
1400
|
+
@property
def left_select(self) -> JoinInputsManager:
    """Legacy alias for ``left_manager``.

    Kept for backward compatibility. The value is the MANAGER object, not
    the underlying data model, e.g. ``manager.left_select.join_key_selects``.
    """
    manager = self.left_manager
    return manager
|
|
1408
|
+
|
|
1409
|
+
@property
def right_select(self) -> JoinInputsManager:
    """Legacy alias for ``right_manager``.

    Kept for backward compatibility. The value is the MANAGER object, not
    the underlying data model, e.g. ``manager.right_select.join_key_selects``.
    """
    manager = self.right_manager
    return manager
|
|
1417
|
+
|
|
1418
|
+
@property
def how(self) -> JoinStrategy:
    """Legacy accessor for the join strategy held on ``self.input``."""
    strategy = self.input.how
    return strategy
|
|
1422
|
+
|
|
1423
|
+
@property
def join_mapping(self) -> list[JoinMap]:
    """Legacy accessor for the join mapping held on ``self.input``."""
    mapping = self.input.join_mapping
    return mapping
|
|
1427
|
+
|
|
1428
|
+
@property
def overlapping_records(self) -> set[str]:
    """Legacy property form of ``get_overlapping_records()``.

    Returns:
        The overlapping column names reported by that method.
    """
    overlap = self.get_overlapping_records()
    return overlap
|
|
1432
|
+
|
|
1433
|
+
@property
def used_join_mapping(self) -> list[JoinMap]:
    """Legacy property form of ``get_used_join_mapping()``.

    ``left_join_keys`` and ``right_join_keys`` are derived from this
    property, so it must stay in place.
    """
    mapping = self.get_used_join_mapping()
    return mapping
|
|
1440
|
+
|
|
1441
|
+
@property
def left_join_keys(self) -> list[str]:
    """Legacy accessor listing the left-side join key of every used mapping.

    Reads the ``used_join_mapping`` PROPERTY (attribute access, not a method
    call), preserving its order.
    """
    mappings = self.used_join_mapping
    return [pair.left_col for pair in mappings]
|
|
1448
|
+
|
|
1449
|
+
@property
def right_join_keys(self) -> list[str]:
    """Legacy accessor listing the right-side join key of every used mapping.

    Reads the ``used_join_mapping`` PROPERTY (attribute access, not a method
    call), preserving its order.
    """
    mappings = self.used_join_mapping
    return [pair.right_col for pair in mappings]
|
|
1456
|
+
|
|
1457
|
+
@property
def _left_join_keys(self) -> set[str]:
    """Legacy private property wrapping ``_get_left_join_keys_set()``."""
    key_set = self._get_left_join_keys_set()
    return key_set
|
|
1461
|
+
|
|
1462
|
+
@property
def _right_join_keys(self) -> set[str]:
    """Legacy private property wrapping ``_get_right_join_keys_set()``."""
    key_set = self._get_right_join_keys_set()
    return key_set
|
|
1466
|
+
|
|
1467
|
+
|
|
1468
|
+
class FuzzyMatchInputManager(JoinInputManager):
|
|
1469
|
+
"""Manager for fuzzy matching join operations."""
|
|
1470
|
+
|
|
1471
|
+
def __init__(self, fuzzy_input: FuzzyMatchInput):
    """Set up the manager from a fuzzy-match configuration.

    A deep copy of ``fuzzy_input`` is kept on ``self.fuzzy_input`` so the
    caller's object is not shared. The parent manager is then initialised
    with a ``JoinInput`` built from that copy: each fuzzy mapping is reduced
    to a ``JoinMap`` of its left/right columns, while the selects and join
    strategy are taken over as they are.
    """
    self.fuzzy_input = deepcopy(fuzzy_input)
    settings = self.fuzzy_input
    plain_pairs = [JoinMap(left_col=fm.left_col, right_col=fm.right_col) for fm in settings.join_mapping]
    join_input = JoinInput(
        join_mapping=plain_pairs,
        left_select=settings.left_select,
        right_select=settings.right_select,
        how=settings.how,
    )
    super().__init__(join_input)
|
|
1483
|
+
|
|
1484
|
+
@classmethod
def create(
    cls,
    join_mapping: list[FuzzyMapping] | tuple[str, str] | str,
    left_select: list[SelectInput] | list[str],
    right_select: list[SelectInput] | list[str],
    aggregate_output: bool = False,
    how: JoinStrategy = "inner",
) -> "FuzzyMatchInputManager":
    """Factory method building a manager from loosely-typed inputs.

    Args:
        join_mapping: Anything accepted by ``parse_fuzz_mapping``.
        left_select: Left-side selection, normalised through ``parse_select``.
        right_select: Right-side selection, normalised through ``parse_select``.
        aggregate_output: Forwarded to ``FuzzyMatchInput``.
        how: Join strategy forwarded to ``FuzzyMatchInput``.

    Returns:
        A manager whose selects contain every join column. Join columns
        missing from a side's selection are appended as
        ``keep=False, join_key=True`` entries before ``set_join_keys()`` runs.

    Raises:
        ValueError: Propagated from ``parse_fuzz_mapping`` for an empty or
            unsupported ``join_mapping``.
    """
    parsed_mapping = cls.parse_fuzz_mapping(join_mapping)
    left_inputs = cls.parse_select(left_select)
    right_inputs = cls.parse_select(right_select)

    fuzzy_input = FuzzyMatchInput(
        join_mapping=parsed_mapping,
        left_select=left_inputs,
        right_select=right_inputs,
        how=how,
        aggregate_output=aggregate_output,
    )

    manager = cls(fuzzy_input)

    right_old_names = {v.old_name for v in fuzzy_input.right_select.renames}
    left_old_names = {v.old_name for v in fuzzy_input.left_select.renames}

    for jm in parsed_mapping:
        if jm.right_col not in right_old_names:
            manager.right_manager.append(SelectInput(old_name=jm.right_col, keep=False, join_key=True))
            # Remember the column so a second mapping on the same column
            # does not append a duplicate select entry.
            right_old_names.add(jm.right_col)
        if jm.left_col not in left_old_names:
            manager.left_manager.append(SelectInput(old_name=jm.left_col, keep=False, join_key=True))
            left_old_names.add(jm.left_col)

    manager.set_join_keys()
    return manager
|
|
1519
|
+
|
|
1520
|
+
@staticmethod
def parse_fuzz_mapping(
    fuzz_mapping: list[FuzzyMapping] | tuple[str, str] | str | FuzzyMapping | list[dict],
) -> list[FuzzyMapping]:
    """Normalise the supported input shapes into ``FuzzyMapping`` objects.

    Accepted shapes:
        * a list/tuple of dicts -- each dict is expanded into ``FuzzyMapping(**d)``;
        * a list/tuple of ``FuzzyMapping`` -- returned as passed in;
        * a list/tuple of one or two strings -- one string is used for both
          sides, two strings are (left, right);
        * a single string -- used for both sides;
        * a single ``FuzzyMapping`` -- wrapped in a list.

    Raises:
        ValueError: If the sequence is empty or the input matches none of
            the shapes above.
    """
    if isinstance(fuzz_mapping, (tuple, list)):
        size = len(fuzz_mapping)
        if not size:
            raise ValueError("Fuzzy mapping cannot be empty")

        if all(isinstance(item, dict) for item in fuzz_mapping):
            return [FuzzyMapping(**item) for item in fuzz_mapping]

        if all(isinstance(item, FuzzyMapping) for item in fuzz_mapping):
            return fuzz_mapping

        # One or two plain column names: with a single name the last element
        # is also the first, so both sides share it.
        if size <= 2 and all(isinstance(item, str) for item in fuzz_mapping):
            return [FuzzyMapping(left_col=fuzz_mapping[0], right_col=fuzz_mapping[-1])]

    elif isinstance(fuzz_mapping, str):
        return [FuzzyMapping(left_col=fuzz_mapping, right_col=fuzz_mapping)]

    elif isinstance(fuzz_mapping, FuzzyMapping):
        return [fuzz_mapping]

    raise ValueError(f"No valid fuzzy mapping as input: {type(fuzz_mapping)}")
|
|
1549
|
+
|
|
1550
|
+
def get_fuzzy_maps(self) -> list[FuzzyMapping]:
    """Return the fuzzy mappings with each side's rename table applied.

    A mapping whose columns are unaffected by the rename tables is returned
    as the very same object. Otherwise a deep copy carrying the new column
    names is returned, so the stored mapping is never mutated.

    Returns:
        One mapping per entry of ``self.fuzzy_input.join_mapping``, in the
        same order.
    """
    left_lookup = self.left_manager.get_rename_table()
    right_lookup = self.right_manager.get_rename_table()

    resolved = []
    for original in self.fuzzy_input.join_mapping:
        renamed_right = right_lookup.get(original.right_col, original.right_col)
        renamed_left = left_lookup.get(original.left_col, original.left_col)

        if renamed_right == original.right_col and renamed_left == original.left_col:
            # Nothing changed: reuse the stored mapping.
            resolved.append(original)
            continue

        updated = deepcopy(original)
        updated.left_col = renamed_left
        updated.right_col = renamed_right
        resolved.append(updated)

    return resolved
|
|
1569
|
+
|
|
1570
|
+
# === Backward Compatibility Properties ===
|
|
1571
|
+
|
|
1572
|
+
@property
def fuzzy_maps(self) -> list[FuzzyMapping]:
    """Legacy property form of ``get_fuzzy_maps()``."""
    maps = self.get_fuzzy_maps()
    return maps
|
|
1576
|
+
|
|
1577
|
+
@property
def join_mapping(self) -> list[FuzzyMapping]:
    """Legacy accessor for the fuzzy join mapping.

    Delegates to ``get_fuzzy_maps()``, so the value is the same as
    ``fuzzy_maps``.
    """
    maps = self.get_fuzzy_maps()
    return maps
|
|
1581
|
+
|
|
1582
|
+
@property
def aggregate_output(self) -> bool:
    """Legacy accessor for ``aggregate_output`` on ``self.fuzzy_input``."""
    flag = self.fuzzy_input.aggregate_output
    return flag
|
|
1586
|
+
|
|
1587
|
+
def to_fuzzy_match_input(self) -> FuzzyMatchInput:
    """Snapshot the manager's current settings as a new ``FuzzyMatchInput``.

    Handy after the manager has been modified (for example via auto_rename)
    and a fresh ``FuzzyMatchInput`` reflecting those settings is needed.

    Returns:
        A new ``FuzzyMatchInput``. Each side's ``renames`` comes from
        ``self.input`` and is duplicated with ``.copy()``; the join mapping,
        strategy and ``aggregate_output`` are passed through from
        ``self.fuzzy_input`` unchanged.
    """
    settings = self.fuzzy_input
    left_side = JoinInputs(renames=self.input.left_select.renames.copy())
    right_side = JoinInputs(renames=self.input.right_select.renames.copy())
    return FuzzyMatchInput(
        join_mapping=settings.join_mapping,
        left_select=left_side,
        right_select=right_side,
        how=settings.how,
        aggregate_output=settings.aggregate_output,
    )
|