Flowfile 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backends/main.py +25 -22
- build_backends/main_prd.py +10 -19
- flowfile/__init__.py +194 -74
- flowfile/__main__.py +10 -7
- flowfile/api.py +51 -57
- flowfile/web/__init__.py +14 -9
- flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
- flowfile/web/static/assets/AdminView-f9847d67.js +713 -0
- flowfile/web/static/assets/CloudConnectionView-cf85f943.css +72 -0
- flowfile/web/static/assets/{CloudConnectionManager-0dfba9f2.js → CloudConnectionView-faace55b.js} +11 -11
- flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
- flowfile/web/static/assets/{CloudStorageReader-d5b1b6c9.js → CloudStorageReader-d86ecaa7.js} +10 -8
- flowfile/web/static/assets/{CloudStorageWriter-00d87aad.js → CloudStorageWriter-0f4d9a44.js} +10 -8
- flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
- flowfile/web/static/assets/ColumnActionInput-c44b7aee.css +159 -0
- flowfile/web/static/assets/ColumnActionInput-f4189ae0.js +330 -0
- flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
- flowfile/web/static/assets/{ColumnSelector-4685e75d.js → ColumnSelector-e66b33da.js} +3 -5
- flowfile/web/static/assets/ContextMenu-49463352.js +9 -0
- flowfile/web/static/assets/ContextMenu-dd5f3f25.js +9 -0
- flowfile/web/static/assets/ContextMenu-f709b884.js +9 -0
- flowfile/web/static/assets/ContextMenu.vue_vue_type_script_setup_true_lang-a1bd6314.js +59 -0
- flowfile/web/static/assets/{CrossJoin-702a3edd.js → CrossJoin-24694b8f.js} +12 -10
- flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
- flowfile/web/static/assets/{CustomNode-b1519993.js → CustomNode-569d45ff.js} +43 -24
- flowfile/web/static/assets/CustomNode-edb9b939.css +42 -0
- flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-c20a1e16.css} +23 -21
- flowfile/web/static/assets/{DatabaseConnectionSettings-6f3e4ea5.js → DatabaseConnectionSettings-cfc08938.js} +5 -4
- flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-5bf8c75b.css} +41 -46
- flowfile/web/static/assets/{DatabaseReader-d38c7295.js → DatabaseReader-701feabb.js} +25 -15
- flowfile/web/static/assets/{DatabaseManager-cf5ef661.js → DatabaseView-0482e5b5.js} +11 -11
- flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
- flowfile/web/static/assets/{DatabaseWriter-b04ef46a.js → DatabaseWriter-16721989.js} +17 -10
- flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-bdcf2c8b.css} +29 -27
- flowfile/web/static/assets/{designer-8da3ba3a.css → DesignerView-49abb835.css} +783 -663
- flowfile/web/static/assets/{designer-9633482a.js → DesignerView-f64749fb.js} +1292 -3253
- flowfile/web/static/assets/{documentation-ca400224.js → DocumentationView-61bd2990.js} +5 -5
- flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-9ea6e871.css} +9 -9
- flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
- flowfile/web/static/assets/{ExploreData-5fa10ed8.js → ExploreData-e2735b13.js} +18 -9
- flowfile/web/static/assets/{ExternalSource-d39af878.js → ExternalSource-2535c3b2.js} +9 -7
- flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-7ac7373f.css} +20 -20
- flowfile/web/static/assets/Filter-2cdbc93c.js +287 -0
- flowfile/web/static/assets/Filter-7494ea97.css +48 -0
- flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
- flowfile/web/static/assets/{Formula-6b04fb1d.js → Formula-fcda3c2c.js} +13 -11
- flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
- flowfile/web/static/assets/{FuzzyMatch-999521f4.js → FuzzyMatch-f8d3b7d3.js} +12 -10
- flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-4b4d7db9.css} +5 -5
- flowfile/web/static/assets/{GraphSolver-17dd2198.js → GraphSolver-72eaa695.js} +14 -12
- flowfile/web/static/assets/GroupBy-5792782d.css +9 -0
- flowfile/web/static/assets/{GroupBy-6b039e18.js → GroupBy-8aa0598b.js} +9 -7
- flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
- flowfile/web/static/assets/{Join-24d0f113.js → Join-e40f0ffa.js} +13 -11
- flowfile/web/static/assets/LoginView-5111c9ae.js +134 -0
- flowfile/web/static/assets/LoginView-d325d632.css +172 -0
- flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
- flowfile/web/static/assets/{ManualInput-34639209.js → ManualInput-9b6f3224.js} +170 -116
- flowfile/web/static/assets/{MultiSelect-0e8724a3.js → MultiSelect-ef28e19e.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js → MultiSelect.vue_vue_type_script_setup_true_lang-83b3bbfd.js} +1 -1
- flowfile/web/static/assets/NodeDesigner-94cd4dd3.css +1429 -0
- flowfile/web/static/assets/NodeDesigner-d2b7ee2b.js +2712 -0
- flowfile/web/static/assets/{NumericInput-3d63a470.js → NumericInput-1d789794.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js → NumericInput.vue_vue_type_script_setup_true_lang-7775f83e.js} +5 -2
- flowfile/web/static/assets/Output-692dd25d.css +37 -0
- flowfile/web/static/assets/{Output-edea9802.js → Output-cefef801.js} +14 -10
- flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
- flowfile/web/static/assets/{Pivot-61d19301.js → Pivot-bab1b75b.js} +12 -10
- flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
- flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
- flowfile/web/static/assets/{PivotValidation-f97fec5b.js → PivotValidation-e7941f91.js} +3 -3
- flowfile/web/static/assets/{PivotValidation-de9f43fe.js → PivotValidation-fba09336.js} +3 -3
- flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
- flowfile/web/static/assets/{PolarsCode-bc3c9984.js → PolarsCode-740e40fa.js} +18 -9
- flowfile/web/static/assets/PopOver-862d7e28.js +939 -0
- flowfile/web/static/assets/PopOver-d96599db.css +33 -0
- flowfile/web/static/assets/{Read-64a3f259.js → Read-225cc63f.js} +16 -12
- flowfile/web/static/assets/{Read-e808b239.css → Read-90f366bc.css} +15 -15
- flowfile/web/static/assets/{RecordCount-3d5039be.js → RecordCount-ffc71eca.js} +6 -4
- flowfile/web/static/assets/{RecordId-597510e0.js → RecordId-a70bb8df.js} +9 -7
- flowfile/web/static/assets/{SQLQueryComponent-df51adbe.js → SQLQueryComponent-15a421f5.js} +3 -3
- flowfile/web/static/assets/SQLQueryComponent-edb90b98.css +29 -0
- flowfile/web/static/assets/{Sample-4be0a507.js → Sample-6c26afc7.js} +6 -4
- flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
- flowfile/web/static/assets/SecretSelector-ceed9496.js +113 -0
- flowfile/web/static/assets/{SecretManager-4839be57.js → SecretsView-214d255a.js} +35 -36
- flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
- flowfile/web/static/assets/{Select-9b72f201.js → Select-8fc29999.js} +9 -7
- flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-7ded385d.js → SettingsSection-3f70e4c3.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-f0f75a42.js → SettingsSection-83090218.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-e1e9c953.js → SettingsSection-9f0d1725.js} +3 -3
- flowfile/web/static/assets/SetupView-3fa0aa03.js +160 -0
- flowfile/web/static/assets/SetupView-e2da3442.css +230 -0
- flowfile/web/static/assets/{SingleSelect-6c777aac.js → SingleSelect-a4a568cb.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js → SingleSelect.vue_vue_type_script_setup_true_lang-c8ebdd33.js} +1 -1
- flowfile/web/static/assets/{SliderInput-7cb93e62.js → SliderInput-be533e71.js} +7 -4
- flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
- flowfile/web/static/assets/{Sort-6cbde21a.js → Sort-154dad81.js} +9 -7
- flowfile/web/static/assets/Sort-4abb7fae.css +9 -0
- flowfile/web/static/assets/{TextInput-d9a40c11.js → TextInput-454e2bda.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-5896c375.js → TextInput.vue_vue_type_script_setup_true_lang-e86510d0.js} +5 -2
- flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
- flowfile/web/static/assets/{TextToRows-c4fcbf4d.js → TextToRows-ea73433d.js} +11 -10
- flowfile/web/static/assets/{ToggleSwitch-4ef91d19.js → ToggleSwitch-9d7b30f1.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-00f2580e.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-394a1f78.css} +14 -14
- flowfile/web/static/assets/{UnavailableFields-a03f512c.js → UnavailableFields-b72a2c72.js} +4 -4
- flowfile/web/static/assets/{Union-bfe9b996.js → Union-1e44f263.js} +8 -6
- flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
- flowfile/web/static/assets/Unique-2b705521.css +3 -0
- flowfile/web/static/assets/{Unique-5d023a27.js → Unique-a3bc6d0a.js} +13 -10
- flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-b6ad6427.css} +7 -7
- flowfile/web/static/assets/{Unpivot-91cc5354.js → Unpivot-e27935fc.js} +11 -9
- flowfile/web/static/assets/{UnpivotValidation-7ee2de44.js → UnpivotValidation-72497680.js} +3 -3
- flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
- flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
- flowfile/web/static/assets/{VueGraphicWalker-e51b9924.js → VueGraphicWalker-d9ab70a3.js} +4 -4
- flowfile/web/static/assets/{api-cf1221f0.js → api-a2102880.js} +1 -1
- flowfile/web/static/assets/{api-c1bad5ca.js → api-f75042b0.js} +1 -1
- flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-1d6acbd9.css} +41 -41
- flowfile/web/static/assets/{dropDown-614b998d.js → dropDown-2798a109.js} +3 -3
- flowfile/web/static/assets/{fullEditor-f7971590.js → fullEditor-cf7d7d93.js} +11 -10
- flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-fe9f7e18.css} +77 -65
- flowfile/web/static/assets/{genericNodeSettings-4fe5f36b.js → genericNodeSettings-14eac1c3.js} +5 -5
- flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
- flowfile/web/static/assets/{index-5429bbf8.js → index-387a6f18.js} +41806 -40958
- flowfile/web/static/assets/index-6b367bb5.js +38 -0
- flowfile/web/static/assets/{index-50508d4d.css → index-e96ab018.css} +2184 -569
- flowfile/web/static/assets/index-f0a6e5a5.js +2696 -0
- flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
- flowfile/web/static/assets/nodeInput-ed2ae8d7.js +2 -0
- flowfile/web/static/assets/{outputCsv-076b85ab.js → outputCsv-3c1757e8.js} +3 -3
- flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
- flowfile/web/static/assets/{outputExcel-0fd17dbe.js → outputExcel-686e1f48.js} +3 -3
- flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
- flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
- flowfile/web/static/assets/{outputParquet-b61e0847.js → outputParquet-df28faa7.js} +4 -4
- flowfile/web/static/assets/{readCsv-c767cb37.css → readCsv-3bfac4c3.css} +15 -15
- flowfile/web/static/assets/{readCsv-a8bb8b61.js → readCsv-e37eee21.js} +3 -3
- flowfile/web/static/assets/{readExcel-806d2826.css → readExcel-3db6b763.css} +13 -13
- flowfile/web/static/assets/{readExcel-67b4aee0.js → readExcel-a13f14bb.js} +5 -5
- flowfile/web/static/assets/{readParquet-92ce1dbc.js → readParquet-344cf746.js} +3 -3
- flowfile/web/static/assets/{readParquet-48c81530.css → readParquet-c5244ad5.css} +4 -4
- flowfile/web/static/assets/secrets.api-ae198c5c.js +65 -0
- flowfile/web/static/assets/{selectDynamic-92e25ee3.js → selectDynamic-6b4b0767.js} +5 -5
- flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
- flowfile/web/static/assets/{vue-codemirror.esm-41b0e0d7.js → vue-codemirror.esm-31ba0e0b.js} +31 -640
- flowfile/web/static/assets/{vue-content-loader.es-2c8e608f.js → vue-content-loader.es-4469c8ff.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/METADATA +3 -4
- flowfile-0.5.4.dist-info/RECORD +407 -0
- flowfile_core/__init__.py +13 -6
- flowfile_core/auth/jwt.py +51 -16
- flowfile_core/auth/models.py +32 -7
- flowfile_core/auth/password.py +89 -0
- flowfile_core/auth/secrets.py +64 -19
- flowfile_core/configs/__init__.py +9 -7
- flowfile_core/configs/flow_logger.py +15 -14
- flowfile_core/configs/node_store/__init__.py +72 -4
- flowfile_core/configs/node_store/nodes.py +155 -172
- flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
- flowfile_core/configs/settings.py +28 -15
- flowfile_core/database/connection.py +7 -6
- flowfile_core/database/init_db.py +96 -2
- flowfile_core/database/models.py +3 -1
- flowfile_core/fileExplorer/__init__.py +17 -0
- flowfile_core/fileExplorer/funcs.py +145 -57
- flowfile_core/fileExplorer/utils.py +10 -11
- flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
- flowfile_core/flowfile/analytics/analytics_processor.py +26 -24
- flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
- flowfile_core/flowfile/analytics/utils.py +1 -1
- flowfile_core/flowfile/code_generator/__init__.py +11 -0
- flowfile_core/flowfile/code_generator/code_generator.py +706 -247
- flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
- flowfile_core/flowfile/database_connection_manager/models.py +1 -1
- flowfile_core/flowfile/extensions.py +17 -12
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +115 -83
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +493 -423
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +31 -20
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +14 -15
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
- flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +190 -127
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
- flowfile_core/flowfile/flow_data_engine/utils.py +99 -67
- flowfile_core/flowfile/flow_graph.py +920 -571
- flowfile_core/flowfile/flow_graph_utils.py +31 -49
- flowfile_core/flowfile/flow_node/flow_node.py +379 -258
- flowfile_core/flowfile/flow_node/models.py +53 -41
- flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
- flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
- flowfile_core/flowfile/handler.py +80 -30
- flowfile_core/flowfile/manage/compatibility_enhancements.py +209 -126
- flowfile_core/flowfile/manage/io_flowfile.py +54 -57
- flowfile_core/flowfile/node_designer/__init__.py +19 -13
- flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
- flowfile_core/flowfile/node_designer/custom_node.py +162 -36
- flowfile_core/flowfile/node_designer/ui_components.py +278 -34
- flowfile_core/flowfile/schema_callbacks.py +71 -51
- flowfile_core/flowfile/setting_generator/__init__.py +0 -1
- flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
- flowfile_core/flowfile/setting_generator/settings.py +64 -53
- flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
- flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
- flowfile_core/flowfile/util/calculate_layout.py +9 -13
- flowfile_core/flowfile/util/execution_orderer.py +25 -17
- flowfile_core/flowfile/util/node_skipper.py +4 -4
- flowfile_core/flowfile/utils.py +19 -21
- flowfile_core/main.py +26 -19
- flowfile_core/routes/auth.py +284 -11
- flowfile_core/routes/cloud_connections.py +25 -25
- flowfile_core/routes/logs.py +21 -29
- flowfile_core/routes/public.py +46 -4
- flowfile_core/routes/routes.py +70 -34
- flowfile_core/routes/secrets.py +25 -27
- flowfile_core/routes/user_defined_components.py +483 -4
- flowfile_core/run_lock.py +0 -1
- flowfile_core/schemas/__init__.py +4 -6
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
- flowfile_core/schemas/cloud_storage_schemas.py +96 -66
- flowfile_core/schemas/input_schema.py +231 -144
- flowfile_core/schemas/output_model.py +49 -34
- flowfile_core/schemas/schemas.py +116 -89
- flowfile_core/schemas/transform_schema.py +518 -263
- flowfile_core/schemas/yaml_types.py +21 -7
- flowfile_core/secret_manager/secret_manager.py +123 -18
- flowfile_core/types.py +29 -9
- flowfile_core/utils/arrow_reader.py +7 -6
- flowfile_core/utils/excel_file_manager.py +3 -3
- flowfile_core/utils/fileManager.py +7 -7
- flowfile_core/utils/fl_executor.py +8 -10
- flowfile_core/utils/utils.py +4 -4
- flowfile_core/utils/validate_setup.py +5 -4
- flowfile_frame/__init__.py +117 -51
- flowfile_frame/adapters.py +2 -9
- flowfile_frame/adding_expr.py +73 -32
- flowfile_frame/cloud_storage/frame_helpers.py +27 -23
- flowfile_frame/cloud_storage/secret_manager.py +12 -26
- flowfile_frame/config.py +2 -5
- flowfile_frame/database/__init__.py +36 -0
- flowfile_frame/database/connection_manager.py +205 -0
- flowfile_frame/database/frame_helpers.py +249 -0
- flowfile_frame/expr.py +311 -218
- flowfile_frame/expr.pyi +160 -159
- flowfile_frame/expr_name.py +23 -23
- flowfile_frame/flow_frame.py +571 -476
- flowfile_frame/flow_frame.pyi +123 -104
- flowfile_frame/flow_frame_methods.py +227 -246
- flowfile_frame/group_frame.py +50 -20
- flowfile_frame/join.py +2 -2
- flowfile_frame/lazy.py +129 -87
- flowfile_frame/lazy_methods.py +83 -30
- flowfile_frame/list_name_space.py +55 -50
- flowfile_frame/selectors.py +148 -68
- flowfile_frame/series.py +9 -7
- flowfile_frame/utils.py +19 -21
- flowfile_worker/__init__.py +12 -7
- flowfile_worker/configs.py +41 -33
- flowfile_worker/create/__init__.py +14 -9
- flowfile_worker/create/funcs.py +114 -77
- flowfile_worker/create/models.py +46 -43
- flowfile_worker/create/pl_types.py +14 -15
- flowfile_worker/create/read_excel_tables.py +34 -41
- flowfile_worker/create/utils.py +22 -19
- flowfile_worker/external_sources/s3_source/main.py +18 -51
- flowfile_worker/external_sources/s3_source/models.py +34 -27
- flowfile_worker/external_sources/sql_source/main.py +8 -5
- flowfile_worker/external_sources/sql_source/models.py +13 -9
- flowfile_worker/flow_logger.py +10 -8
- flowfile_worker/funcs.py +214 -155
- flowfile_worker/main.py +11 -17
- flowfile_worker/models.py +35 -28
- flowfile_worker/process_manager.py +2 -3
- flowfile_worker/routes.py +121 -90
- flowfile_worker/secrets.py +114 -21
- flowfile_worker/spawner.py +89 -54
- flowfile_worker/utils.py +3 -2
- shared/__init__.py +2 -7
- shared/storage_config.py +25 -13
- test_utils/postgres/commands.py +3 -2
- test_utils/postgres/fixtures.py +9 -9
- test_utils/s3/commands.py +1 -1
- test_utils/s3/data_generator.py +3 -4
- test_utils/s3/demo_data_generator.py +4 -7
- test_utils/s3/fixtures.py +7 -5
- tools/migrate/__init__.py +1 -1
- tools/migrate/__main__.py +16 -29
- tools/migrate/legacy_schemas.py +251 -190
- tools/migrate/migrate.py +193 -181
- tools/migrate/tests/conftest.py +1 -3
- tools/migrate/tests/test_migrate.py +36 -41
- tools/migrate/tests/test_migration_e2e.py +28 -29
- tools/migrate/tests/test_node_migrations.py +50 -20
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
- flowfile/web/static/assets/ContextMenu-23e909da.js +0 -41
- flowfile/web/static/assets/ContextMenu-4c74eef1.css +0 -26
- flowfile/web/static/assets/ContextMenu-63cfa99b.css +0 -26
- flowfile/web/static/assets/ContextMenu-70ae0c79.js +0 -41
- flowfile/web/static/assets/ContextMenu-c13f91d0.css +0 -26
- flowfile/web/static/assets/ContextMenu-f149cf7c.js +0 -41
- flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
- flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
- flowfile/web/static/assets/Filter-9b6d08db.js +0 -164
- flowfile/web/static/assets/Filter-f62091b3.css +0 -20
- flowfile/web/static/assets/GroupBy-b9505323.css +0 -51
- flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
- flowfile/web/static/assets/Output-283fe388.css +0 -37
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
- flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
- flowfile/web/static/assets/SQLQueryComponent-36cef432.css +0 -27
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
- flowfile/web/static/assets/Sort-3643d625.css +0 -51
- flowfile/web/static/assets/Unique-f9fb0809.css +0 -51
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +0 -41
- flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
- flowfile/web/static/assets/secretApi-68435402.js +0 -46
- flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
- flowfile-0.5.1.dist-info/RECORD +0 -388
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/WHEEL +0 -0
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/entry_points.txt +0 -0
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/licenses/LICENSE +0 -0
flowfile_frame/flow_frame.py
CHANGED
|
@@ -1,34 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import inspect
|
|
2
4
|
import os
|
|
3
|
-
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable, get_args, get_origin
|
|
4
|
-
|
|
5
5
|
import re
|
|
6
|
+
from collections.abc import Iterable, Iterator, Mapping
|
|
7
|
+
from typing import Any, Literal, Optional, Union, get_args, get_origin
|
|
6
8
|
|
|
7
9
|
import polars as pl
|
|
8
|
-
from flowfile_frame.lazy_methods import add_lazyframe_methods
|
|
9
|
-
|
|
10
|
-
from polars._typing import (CsvEncoding, FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
|
|
11
|
-
from collections.abc import Iterator
|
|
12
|
-
|
|
13
10
|
from pl_fuzzy_frame_match import FuzzyMapping
|
|
11
|
+
from polars._typing import CsvEncoding, FrameInitTypes, Orientation, SchemaDefinition, SchemaDict
|
|
14
12
|
|
|
13
|
+
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
|
|
15
14
|
from flowfile_core.flowfile.flow_graph import FlowGraph, add_connection
|
|
16
15
|
from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
|
|
17
|
-
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
|
|
18
16
|
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
|
|
19
17
|
from flowfile_core.schemas import input_schema, transform_schema
|
|
20
|
-
|
|
21
|
-
from flowfile_frame.expr import Expr, Column, lit, col
|
|
22
|
-
from flowfile_frame.selectors import Selector
|
|
23
|
-
from flowfile_frame.group_frame import GroupByFrame
|
|
24
|
-
from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
|
|
25
|
-
ensure_inputs_as_iterable, generate_node_id, data as node_id_data)
|
|
26
|
-
from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
|
|
27
|
-
from flowfile_frame.utils import _check_if_convertible_to_code
|
|
28
|
-
from flowfile_frame.config import logger
|
|
29
18
|
from flowfile_frame.cloud_storage.frame_helpers import add_write_ff_to_cloud_storage
|
|
30
|
-
from
|
|
31
|
-
|
|
19
|
+
from flowfile_frame.config import logger
|
|
20
|
+
from flowfile_frame.expr import Column, Expr, col, lit
|
|
21
|
+
from flowfile_frame.group_frame import GroupByFrame
|
|
22
|
+
from flowfile_frame.join import _create_join_mappings, _normalize_columns_to_list
|
|
23
|
+
from flowfile_frame.lazy_methods import add_lazyframe_methods
|
|
24
|
+
from flowfile_frame.selectors import Selector
|
|
25
|
+
from flowfile_frame.utils import (
|
|
26
|
+
_check_if_convertible_to_code,
|
|
27
|
+
_parse_inputs_as_iterable,
|
|
28
|
+
create_flow_graph,
|
|
29
|
+
ensure_inputs_as_iterable,
|
|
30
|
+
generate_node_id,
|
|
31
|
+
stringify_values,
|
|
32
|
+
)
|
|
33
|
+
from flowfile_frame.utils import data as node_id_data
|
|
32
34
|
|
|
33
35
|
|
|
34
36
|
def can_be_expr(param: inspect.Parameter) -> bool:
|
|
@@ -83,7 +85,7 @@ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
|
|
|
83
85
|
raw_definitions = []
|
|
84
86
|
|
|
85
87
|
# Add function sources if any
|
|
86
|
-
if hasattr(expr_obj,
|
|
88
|
+
if hasattr(expr_obj, "_function_sources") and expr_obj._function_sources:
|
|
87
89
|
# Remove duplicates while preserving order
|
|
88
90
|
unique_sources = []
|
|
89
91
|
seen = set()
|
|
@@ -101,8 +103,9 @@ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
|
|
|
101
103
|
return pure_expr_str, raw_defs_str
|
|
102
104
|
|
|
103
105
|
|
|
104
|
-
def _check_ok_for_serialization(
|
|
105
|
-
|
|
106
|
+
def _check_ok_for_serialization(
|
|
107
|
+
method_name: str = None, polars_expr: pl.Expr | None = None, group_expr: pl.Expr | None = None
|
|
108
|
+
) -> None:
|
|
106
109
|
if method_name is None:
|
|
107
110
|
raise NotImplementedError("Cannot create a polars lambda expression without the method")
|
|
108
111
|
if polars_expr is None:
|
|
@@ -110,7 +113,7 @@ def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr |
|
|
|
110
113
|
method_ref = getattr(pl.LazyFrame, method_name)
|
|
111
114
|
if method_ref is None:
|
|
112
115
|
raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
|
|
113
|
-
if method_name ==
|
|
116
|
+
if method_name == "group_by":
|
|
114
117
|
if group_expr is None:
|
|
115
118
|
raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
|
|
116
119
|
if not all(isinstance(ge, pl.Expr) for ge in group_expr):
|
|
@@ -120,6 +123,7 @@ def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr |
|
|
|
120
123
|
@add_lazyframe_methods
|
|
121
124
|
class FlowFrame:
|
|
122
125
|
"""Main class that wraps FlowDataEngine and maintains the ETL graph."""
|
|
126
|
+
|
|
123
127
|
flow_graph: FlowGraph
|
|
124
128
|
data: pl.LazyFrame
|
|
125
129
|
|
|
@@ -197,8 +201,10 @@ class FlowFrame:
|
|
|
197
201
|
raise ValueError(f"Could not dconvert data to a polars DataFrame: {e}")
|
|
198
202
|
# Create a FlowDataEngine to get data in the right format for manual input
|
|
199
203
|
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
200
|
-
raw_data_format = input_schema.RawData(
|
|
201
|
-
|
|
204
|
+
raw_data_format = input_schema.RawData(
|
|
205
|
+
data=list(flow_table.to_dict().values()),
|
|
206
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema],
|
|
207
|
+
)
|
|
202
208
|
# Create a manual input node
|
|
203
209
|
input_node = input_schema.NodeManualInput(
|
|
204
210
|
flow_id=flow_id,
|
|
@@ -220,19 +226,19 @@ class FlowFrame:
|
|
|
220
226
|
)
|
|
221
227
|
|
|
222
228
|
def __new__(
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
229
|
+
cls,
|
|
230
|
+
data: pl.LazyFrame | FrameInitTypes = None,
|
|
231
|
+
schema: SchemaDefinition | None = None,
|
|
232
|
+
*,
|
|
233
|
+
schema_overrides: SchemaDict | None = None,
|
|
234
|
+
strict: bool = True,
|
|
235
|
+
orient: Orientation | None = None,
|
|
236
|
+
infer_schema_length: int | None = 100,
|
|
237
|
+
nan_to_null: bool = False,
|
|
238
|
+
flow_graph: FlowGraph | None = None,
|
|
239
|
+
node_id: int | None = None,
|
|
240
|
+
parent_node_id: int | None = None,
|
|
241
|
+
**kwargs, # Accept and ignore any other kwargs for API compatibility
|
|
236
242
|
) -> "FlowFrame":
|
|
237
243
|
"""
|
|
238
244
|
Unified constructor for FlowFrame.
|
|
@@ -252,11 +258,18 @@ class FlowFrame:
|
|
|
252
258
|
instance.parent_node_id = parent_node_id
|
|
253
259
|
return instance
|
|
254
260
|
elif flow_graph is not None and not isinstance(data, pl.LazyFrame):
|
|
255
|
-
instance = cls.create_from_any_type(
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
261
|
+
instance = cls.create_from_any_type(
|
|
262
|
+
data=data,
|
|
263
|
+
schema=schema,
|
|
264
|
+
schema_overrides=schema_overrides,
|
|
265
|
+
strict=strict,
|
|
266
|
+
orient=orient,
|
|
267
|
+
infer_schema_length=infer_schema_length,
|
|
268
|
+
nan_to_null=nan_to_null,
|
|
269
|
+
flow_graph=flow_graph,
|
|
270
|
+
node_id=node_id,
|
|
271
|
+
parent_node_id=parent_node_id,
|
|
272
|
+
)
|
|
260
273
|
return instance
|
|
261
274
|
|
|
262
275
|
source_graph = create_flow_graph()
|
|
@@ -265,37 +278,41 @@ class FlowFrame:
|
|
|
265
278
|
if data is None:
|
|
266
279
|
data = pl.LazyFrame()
|
|
267
280
|
if not isinstance(data, pl.LazyFrame):
|
|
268
|
-
|
|
269
281
|
description = "Data imported from Python object"
|
|
270
282
|
try:
|
|
271
283
|
pl_df = pl.DataFrame(
|
|
272
|
-
data,
|
|
273
|
-
|
|
274
|
-
|
|
284
|
+
data,
|
|
285
|
+
schema=schema,
|
|
286
|
+
schema_overrides=schema_overrides,
|
|
287
|
+
strict=strict,
|
|
288
|
+
orient=orient,
|
|
289
|
+
infer_schema_length=infer_schema_length,
|
|
290
|
+
nan_to_null=nan_to_null,
|
|
275
291
|
)
|
|
276
292
|
pl_data = pl_df.lazy()
|
|
277
293
|
except Exception as e:
|
|
278
294
|
raise ValueError(f"Could not convert data to a Polars DataFrame: {e}")
|
|
279
295
|
|
|
280
296
|
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
281
|
-
raw_data_format = input_schema.RawData(
|
|
282
|
-
|
|
297
|
+
raw_data_format = input_schema.RawData(
|
|
298
|
+
data=list(flow_table.to_dict().values()),
|
|
299
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema],
|
|
300
|
+
)
|
|
283
301
|
input_node = input_schema.NodeManualInput(
|
|
284
|
-
flow_id=source_graph.flow_id,
|
|
285
|
-
|
|
286
|
-
|
|
302
|
+
flow_id=source_graph.flow_id,
|
|
303
|
+
node_id=source_node_id,
|
|
304
|
+
raw_data_format=raw_data_format,
|
|
305
|
+
pos_x=100,
|
|
306
|
+
pos_y=100,
|
|
307
|
+
is_setup=True,
|
|
308
|
+
description=description,
|
|
287
309
|
)
|
|
288
310
|
source_graph.add_manual_input(input_node)
|
|
289
311
|
else:
|
|
290
312
|
source_graph.add_dependency_on_polars_lazy_frame(data, source_node_id)
|
|
291
313
|
|
|
292
314
|
final_data = source_graph.get_node(source_node_id).get_resulting_data().data_frame
|
|
293
|
-
return cls(
|
|
294
|
-
data=final_data,
|
|
295
|
-
flow_graph=source_graph,
|
|
296
|
-
node_id=source_node_id,
|
|
297
|
-
parent_node_id=parent_node_id
|
|
298
|
-
)
|
|
315
|
+
return cls(data=final_data, flow_graph=source_graph, node_id=source_node_id, parent_node_id=parent_node_id)
|
|
299
316
|
|
|
300
317
|
def __init__(self, *args, **kwargs):
|
|
301
318
|
"""
|
|
@@ -328,20 +345,20 @@ class FlowFrame:
|
|
|
328
345
|
parent_node_id=self.node_id,
|
|
329
346
|
)
|
|
330
347
|
except AttributeError:
|
|
331
|
-
raise ValueError(
|
|
348
|
+
raise ValueError("Could not execute the function")
|
|
332
349
|
|
|
333
350
|
@staticmethod
|
|
334
351
|
def _generate_sort_polars_code(
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
352
|
+
pure_sort_expr_strs: list[str],
|
|
353
|
+
descending_values: list[bool],
|
|
354
|
+
nulls_last_values: list[bool],
|
|
355
|
+
multithreaded: bool,
|
|
356
|
+
maintain_order: bool,
|
|
340
357
|
) -> str:
|
|
341
358
|
"""
|
|
342
359
|
Generates the `input_df.sort(...)` Polars code string using pure expression strings.
|
|
343
360
|
"""
|
|
344
|
-
kwargs_for_code:
|
|
361
|
+
kwargs_for_code: dict[str, Any] = {}
|
|
345
362
|
if any(descending_values):
|
|
346
363
|
kwargs_for_code["descending"] = descending_values[0] if len(descending_values) == 1 else descending_values
|
|
347
364
|
if any(nulls_last_values):
|
|
@@ -353,19 +370,20 @@ class FlowFrame:
|
|
|
353
370
|
|
|
354
371
|
kwargs_str_for_code = ", ".join(f"{k}={repr(v)}" for k, v in kwargs_for_code.items())
|
|
355
372
|
|
|
356
|
-
by_arg_for_code =
|
|
357
|
-
pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
|
|
373
|
+
by_arg_for_code = (
|
|
374
|
+
pure_sort_expr_strs[0] if len(pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
|
|
375
|
+
)
|
|
358
376
|
return f"input_df.sort({by_arg_for_code}{', ' + kwargs_str_for_code if kwargs_str_for_code else ''})"
|
|
359
377
|
|
|
360
378
|
def sort(
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
379
|
+
self,
|
|
380
|
+
by: list[Expr | str] | Expr | str,
|
|
381
|
+
*more_by: Expr | str,
|
|
382
|
+
descending: bool | list[bool] = False,
|
|
383
|
+
nulls_last: bool | list[bool] = False,
|
|
384
|
+
multithreaded: bool = True,
|
|
385
|
+
maintain_order: bool = False,
|
|
386
|
+
description: str | None = None,
|
|
369
387
|
) -> "FlowFrame":
|
|
370
388
|
"""
|
|
371
389
|
Sort the dataframe by the given columns.
|
|
@@ -377,10 +395,10 @@ class FlowFrame:
|
|
|
377
395
|
if more_by:
|
|
378
396
|
sort_expressions_input.extend(list(_parse_inputs_as_iterable(more_by)))
|
|
379
397
|
|
|
380
|
-
all_processed_expr_objects:
|
|
381
|
-
pure_polars_expr_strings_for_sort:
|
|
382
|
-
collected_raw_definitions:
|
|
383
|
-
column_names_for_native_node:
|
|
398
|
+
all_processed_expr_objects: list[Expr] = []
|
|
399
|
+
pure_polars_expr_strings_for_sort: list[str] = []
|
|
400
|
+
collected_raw_definitions: list[str] = []
|
|
401
|
+
column_names_for_native_node: list[str] = []
|
|
384
402
|
|
|
385
403
|
use_polars_code_path = False
|
|
386
404
|
|
|
@@ -429,10 +447,12 @@ class FlowFrame:
|
|
|
429
447
|
if not is_simple_col_for_native: # If it wasn't a simple string or unaltered Column
|
|
430
448
|
use_polars_code_path = True
|
|
431
449
|
|
|
432
|
-
desc_values =
|
|
433
|
-
all_processed_expr_objects)
|
|
434
|
-
|
|
435
|
-
|
|
450
|
+
desc_values = (
|
|
451
|
+
list(descending) if isinstance(descending, list) else [descending] * len(all_processed_expr_objects)
|
|
452
|
+
)
|
|
453
|
+
null_last_values = (
|
|
454
|
+
list(nulls_last) if isinstance(nulls_last, list) else [nulls_last] * len(all_processed_expr_objects)
|
|
455
|
+
)
|
|
436
456
|
|
|
437
457
|
if len(desc_values) != len(all_processed_expr_objects):
|
|
438
458
|
raise ValueError("Length of 'descending' does not match the number of sort expressions.")
|
|
@@ -448,23 +468,31 @@ class FlowFrame:
|
|
|
448
468
|
if collected_raw_definitions:
|
|
449
469
|
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions)) # Order-preserving unique
|
|
450
470
|
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
451
|
-
final_code_for_node =
|
|
452
|
-
|
|
453
|
-
|
|
471
|
+
final_code_for_node = (
|
|
472
|
+
definitions_section + "\\#─────SPLIT─────\n\n" + f"output_df = {polars_operation_code}"
|
|
473
|
+
)
|
|
454
474
|
else:
|
|
455
475
|
final_code_for_node = polars_operation_code
|
|
456
476
|
|
|
457
|
-
pl_expressions_for_fallback = [
|
|
458
|
-
|
|
477
|
+
pl_expressions_for_fallback = [
|
|
478
|
+
e.expr for e in all_processed_expr_objects if hasattr(e, "expr") and e.expr is not None
|
|
479
|
+
]
|
|
459
480
|
kwargs_for_fallback = {
|
|
460
481
|
"descending": desc_values[0] if len(desc_values) == 1 else desc_values,
|
|
461
482
|
"nulls_last": null_last_values[0] if len(null_last_values) == 1 else null_last_values,
|
|
462
|
-
"multithreaded": multithreaded,
|
|
483
|
+
"multithreaded": multithreaded,
|
|
484
|
+
"maintain_order": maintain_order,
|
|
485
|
+
}
|
|
463
486
|
|
|
464
|
-
self._add_polars_code(
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
487
|
+
self._add_polars_code(
|
|
488
|
+
new_node_id,
|
|
489
|
+
final_code_for_node,
|
|
490
|
+
description,
|
|
491
|
+
method_name="sort",
|
|
492
|
+
convertable_to_code=_check_if_convertible_to_code(all_processed_expr_objects),
|
|
493
|
+
polars_expr=pl_expressions_for_fallback,
|
|
494
|
+
kwargs_expr=kwargs_for_fallback,
|
|
495
|
+
)
|
|
468
496
|
else:
|
|
469
497
|
sort_inputs_for_node = []
|
|
470
498
|
for i, col_name_for_native in enumerate(column_names_for_native_node):
|
|
@@ -473,30 +501,44 @@ class FlowFrame:
|
|
|
473
501
|
# type: ignore
|
|
474
502
|
)
|
|
475
503
|
sort_settings = input_schema.NodeSort(
|
|
476
|
-
flow_id=self.flow_graph.flow_id,
|
|
477
|
-
|
|
478
|
-
|
|
504
|
+
flow_id=self.flow_graph.flow_id,
|
|
505
|
+
node_id=new_node_id,
|
|
506
|
+
sort_input=sort_inputs_for_node, # type: ignore
|
|
507
|
+
pos_x=200,
|
|
508
|
+
pos_y=150,
|
|
509
|
+
is_setup=True,
|
|
510
|
+
depending_on_id=self.node_id,
|
|
511
|
+
description=description or f"Sort by {', '.join(column_names_for_native_node)}",
|
|
512
|
+
)
|
|
479
513
|
self.flow_graph.add_sort(sort_settings)
|
|
480
514
|
|
|
481
515
|
return self._create_child_frame(new_node_id)
|
|
482
516
|
|
|
483
|
-
def _add_polars_code(
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
517
|
+
def _add_polars_code(
|
|
518
|
+
self,
|
|
519
|
+
new_node_id: int,
|
|
520
|
+
code: str,
|
|
521
|
+
description: str = None,
|
|
522
|
+
depending_on_ids: list[str] | None = None,
|
|
523
|
+
convertable_to_code: bool = True,
|
|
524
|
+
method_name: str = None,
|
|
525
|
+
polars_expr: Expr | list[Expr] | None = None,
|
|
526
|
+
group_expr: Expr | list[Expr] | None = None,
|
|
527
|
+
kwargs_expr: dict | None = None,
|
|
528
|
+
group_kwargs: dict | None = None,
|
|
529
|
+
):
|
|
489
530
|
polars_code_for_node: str
|
|
490
531
|
if not convertable_to_code or _contains_lambda_pattern(code):
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
532
|
+
effective_method_name = (
|
|
533
|
+
get_method_name_from_code(code) if method_name is None and "input_df." in code else method_name
|
|
534
|
+
)
|
|
494
535
|
|
|
495
536
|
pl_expr_list = ensure_inputs_as_iterable(polars_expr) if polars_expr is not None else []
|
|
496
537
|
group_expr_list = ensure_inputs_as_iterable(group_expr) if group_expr is not None else []
|
|
497
538
|
|
|
498
|
-
_check_ok_for_serialization(
|
|
499
|
-
|
|
539
|
+
_check_ok_for_serialization(
|
|
540
|
+
polars_expr=pl_expr_list, method_name=effective_method_name, group_expr=group_expr_list
|
|
541
|
+
)
|
|
500
542
|
|
|
501
543
|
current_kwargs_expr = kwargs_expr if kwargs_expr is not None else {}
|
|
502
544
|
result_lazyframe_or_expr: Any
|
|
@@ -508,22 +550,27 @@ class FlowFrame:
|
|
|
508
550
|
target_obj = getattr(self.data, effective_method_name)(*group_expr_list, **group_kwargs)
|
|
509
551
|
if not pl_expr_list:
|
|
510
552
|
raise ValueError(
|
|
511
|
-
"Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback."
|
|
553
|
+
"Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback."
|
|
554
|
+
)
|
|
512
555
|
result_lazyframe_or_expr = target_obj.agg(*pl_expr_list, **current_kwargs_expr)
|
|
513
556
|
elif effective_method_name:
|
|
514
|
-
result_lazyframe_or_expr = getattr(self.data, effective_method_name)(
|
|
515
|
-
|
|
557
|
+
result_lazyframe_or_expr = getattr(self.data, effective_method_name)(
|
|
558
|
+
*pl_expr_list, **current_kwargs_expr
|
|
559
|
+
)
|
|
516
560
|
else:
|
|
517
561
|
raise ValueError(
|
|
518
|
-
"Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback."
|
|
562
|
+
"Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback."
|
|
563
|
+
)
|
|
519
564
|
try:
|
|
520
565
|
if isinstance(result_lazyframe_or_expr, pl.LazyFrame):
|
|
521
|
-
serialized_value_for_code = result_lazyframe_or_expr.serialize(format=
|
|
522
|
-
polars_code_for_node = "\n".join(
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
566
|
+
serialized_value_for_code = result_lazyframe_or_expr.serialize(format="json")
|
|
567
|
+
polars_code_for_node = "\n".join(
|
|
568
|
+
[
|
|
569
|
+
f"serialized_value = r'''{serialized_value_for_code}'''",
|
|
570
|
+
"buffer = BytesIO(serialized_value.encode('utf-8'))",
|
|
571
|
+
"output_df = pl.LazyFrame.deserialize(buffer, format='json')",
|
|
572
|
+
]
|
|
573
|
+
)
|
|
527
574
|
logger.warning(
|
|
528
575
|
f"Transformation '{effective_method_name}' uses non-serializable elements. "
|
|
529
576
|
"Falling back to serializing the resulting Polars LazyFrame object."
|
|
@@ -556,18 +603,18 @@ class FlowFrame:
|
|
|
556
603
|
self.flow_graph.add_polars_code(polars_code_settings)
|
|
557
604
|
|
|
558
605
|
def join(
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
606
|
+
self,
|
|
607
|
+
other,
|
|
608
|
+
on: list[str | Column] | str | Column = None,
|
|
609
|
+
how: str = "inner",
|
|
610
|
+
left_on: list[str | Column] | str | Column = None,
|
|
611
|
+
right_on: list[str | Column] | str | Column = None,
|
|
612
|
+
suffix: str = "_right",
|
|
613
|
+
validate: str = None,
|
|
614
|
+
nulls_equal: bool = False,
|
|
615
|
+
coalesce: bool = None,
|
|
616
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"] = None,
|
|
617
|
+
description: str = None,
|
|
571
618
|
) -> "FlowFrame":
|
|
572
619
|
"""
|
|
573
620
|
Add a join operation to the Logical Plan.
|
|
@@ -613,9 +660,7 @@ class FlowFrame:
|
|
|
613
660
|
New FlowFrame with join operation applied.
|
|
614
661
|
"""
|
|
615
662
|
# Step 1: Determine if we need to use Polars code
|
|
616
|
-
use_polars_code = self._should_use_polars_code_for_join(
|
|
617
|
-
maintain_order, coalesce, nulls_equal, validate, suffix
|
|
618
|
-
)
|
|
663
|
+
use_polars_code = self._should_use_polars_code_for_join(maintain_order, coalesce, nulls_equal, validate, suffix)
|
|
619
664
|
# Step 2: Ensure both FlowFrames are in the same graph
|
|
620
665
|
self._ensure_same_graph(other)
|
|
621
666
|
|
|
@@ -623,11 +668,9 @@ class FlowFrame:
|
|
|
623
668
|
new_node_id = generate_node_id()
|
|
624
669
|
|
|
625
670
|
# Step 4: Parse and validate join columns
|
|
626
|
-
left_columns, right_columns = self._parse_join_columns(
|
|
627
|
-
on, left_on, right_on, how
|
|
628
|
-
)
|
|
671
|
+
left_columns, right_columns = self._parse_join_columns(on, left_on, right_on, how)
|
|
629
672
|
# Step 5: Validate column lists have same length (except for cross join)
|
|
630
|
-
if how !=
|
|
673
|
+
if how != "cross" and left_columns is not None and right_columns is not None:
|
|
631
674
|
if len(left_columns) != len(right_columns):
|
|
632
675
|
raise ValueError(
|
|
633
676
|
f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
|
|
@@ -635,42 +678,46 @@ class FlowFrame:
|
|
|
635
678
|
|
|
636
679
|
# Step 6: Create join mappings if not using Polars code
|
|
637
680
|
join_mappings = None
|
|
638
|
-
if not use_polars_code and how !=
|
|
639
|
-
join_mappings, use_polars_code = _create_join_mappings(
|
|
640
|
-
left_columns or [], right_columns or []
|
|
641
|
-
)
|
|
681
|
+
if not use_polars_code and how != "cross":
|
|
682
|
+
join_mappings, use_polars_code = _create_join_mappings(left_columns or [], right_columns or [])
|
|
642
683
|
|
|
643
684
|
# Step 7: Execute join based on approach
|
|
644
|
-
if use_polars_code or suffix !=
|
|
685
|
+
if use_polars_code or suffix != "_right":
|
|
645
686
|
return self._execute_polars_code_join(
|
|
646
|
-
other,
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
687
|
+
other,
|
|
688
|
+
new_node_id,
|
|
689
|
+
on,
|
|
690
|
+
left_on,
|
|
691
|
+
right_on,
|
|
692
|
+
left_columns,
|
|
693
|
+
right_columns,
|
|
694
|
+
how,
|
|
695
|
+
suffix,
|
|
696
|
+
validate,
|
|
697
|
+
nulls_equal,
|
|
698
|
+
coalesce,
|
|
699
|
+
maintain_order,
|
|
700
|
+
description,
|
|
652
701
|
)
|
|
702
|
+
elif join_mappings or how == "cross":
|
|
703
|
+
return self._execute_native_join(other, new_node_id, join_mappings, how, description)
|
|
653
704
|
else:
|
|
654
705
|
raise ValueError("Could not execute join")
|
|
655
706
|
|
|
656
|
-
def _should_use_polars_code_for_join(
|
|
657
|
-
self, maintain_order, coalesce, nulls_equal, validate, suffix
|
|
658
|
-
) -> bool:
|
|
707
|
+
def _should_use_polars_code_for_join(self, maintain_order, coalesce, nulls_equal, validate, suffix) -> bool:
|
|
659
708
|
"""Determine if we should use Polars code instead of native join."""
|
|
660
709
|
return not (
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
710
|
+
maintain_order is None
|
|
711
|
+
and coalesce is None
|
|
712
|
+
and nulls_equal is False
|
|
713
|
+
and validate is None
|
|
714
|
+
and suffix == "_right"
|
|
666
715
|
)
|
|
667
716
|
|
|
668
717
|
def _ensure_same_graph(self, other: "FlowFrame") -> None:
|
|
669
718
|
"""Ensure both FlowFrames are in the same graph, combining if necessary."""
|
|
670
719
|
if self.flow_graph.flow_id != other.flow_graph.flow_id:
|
|
671
|
-
combined_graph, node_mappings = combine_flow_graphs_with_mapping(
|
|
672
|
-
self.flow_graph, other.flow_graph
|
|
673
|
-
)
|
|
720
|
+
combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
|
|
674
721
|
|
|
675
722
|
new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
|
|
676
723
|
new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
|
|
@@ -685,19 +732,19 @@ class FlowFrame:
|
|
|
685
732
|
node_id_data["c"] = node_id_data["c"] + len(combined_graph.nodes)
|
|
686
733
|
|
|
687
734
|
def _parse_join_columns(
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
) -> tuple[
|
|
735
|
+
self,
|
|
736
|
+
on: list[str | Column] | str | Column,
|
|
737
|
+
left_on: list[str | Column] | str | Column,
|
|
738
|
+
right_on: list[str | Column] | str | Column,
|
|
739
|
+
how: str,
|
|
740
|
+
) -> tuple[list[str] | None, list[str] | None]:
|
|
694
741
|
"""Parse and validate join column specifications."""
|
|
695
742
|
if on is not None:
|
|
696
743
|
left_columns = right_columns = _normalize_columns_to_list(on)
|
|
697
744
|
elif left_on is not None and right_on is not None:
|
|
698
745
|
left_columns = _normalize_columns_to_list(left_on)
|
|
699
746
|
right_columns = _normalize_columns_to_list(right_on)
|
|
700
|
-
elif how ==
|
|
747
|
+
elif how == "cross" and left_on is None and right_on is None and on is None:
|
|
701
748
|
left_columns = None
|
|
702
749
|
right_columns = None
|
|
703
750
|
else:
|
|
@@ -706,37 +753,43 @@ class FlowFrame:
|
|
|
706
753
|
return left_columns, right_columns
|
|
707
754
|
|
|
708
755
|
def _execute_polars_code_join(
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
756
|
+
self,
|
|
757
|
+
other: "FlowFrame",
|
|
758
|
+
new_node_id: int,
|
|
759
|
+
on: list[str | Column] | str | Column,
|
|
760
|
+
left_on: list[str | Column] | str | Column,
|
|
761
|
+
right_on: list[str | Column] | str | Column,
|
|
762
|
+
left_columns: list[str] | None,
|
|
763
|
+
right_columns: list[str] | None,
|
|
764
|
+
how: str,
|
|
765
|
+
suffix: str,
|
|
766
|
+
validate: str,
|
|
767
|
+
nulls_equal: bool,
|
|
768
|
+
coalesce: bool,
|
|
769
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"],
|
|
770
|
+
description: str,
|
|
724
771
|
) -> "FlowFrame":
|
|
725
772
|
"""Execute join using Polars code approach."""
|
|
726
773
|
# Build the code arguments
|
|
727
774
|
code_kwargs = self._build_polars_join_kwargs(
|
|
728
|
-
on,
|
|
729
|
-
|
|
775
|
+
on,
|
|
776
|
+
left_on,
|
|
777
|
+
right_on,
|
|
778
|
+
left_columns,
|
|
779
|
+
right_columns,
|
|
780
|
+
how,
|
|
781
|
+
suffix,
|
|
782
|
+
validate,
|
|
783
|
+
nulls_equal,
|
|
784
|
+
coalesce,
|
|
785
|
+
maintain_order,
|
|
730
786
|
)
|
|
731
787
|
|
|
732
788
|
kwargs_str = ", ".join(f"{k}={v}" for k, v in code_kwargs.items() if v is not None)
|
|
733
789
|
code = f"input_df_1.join({kwargs_str})"
|
|
734
790
|
|
|
735
791
|
# Add the Polars code node
|
|
736
|
-
self._add_polars_code(
|
|
737
|
-
new_node_id, code, description,
|
|
738
|
-
depending_on_ids=[self.node_id, other.node_id]
|
|
739
|
-
)
|
|
792
|
+
self._add_polars_code(new_node_id, code, description, depending_on_ids=[self.node_id, other.node_id])
|
|
740
793
|
|
|
741
794
|
# Add connections
|
|
742
795
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
@@ -751,28 +804,29 @@ class FlowFrame:
|
|
|
751
804
|
)
|
|
752
805
|
|
|
753
806
|
def _build_polars_join_kwargs(
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
807
|
+
self,
|
|
808
|
+
on: list[str | Column] | str | Column,
|
|
809
|
+
left_on: list[str | Column] | str | Column,
|
|
810
|
+
right_on: list[str | Column] | str | Column,
|
|
811
|
+
left_columns: list[str] | None,
|
|
812
|
+
right_columns: list[str] | None,
|
|
813
|
+
how: str,
|
|
814
|
+
suffix: str,
|
|
815
|
+
validate: str,
|
|
816
|
+
nulls_equal: bool,
|
|
817
|
+
coalesce: bool,
|
|
818
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"],
|
|
766
819
|
) -> dict:
|
|
767
820
|
"""Build kwargs dictionary for Polars join code."""
|
|
768
821
|
|
|
769
822
|
def format_column_list(cols):
|
|
770
823
|
if cols is None:
|
|
771
824
|
return None
|
|
772
|
-
return
|
|
773
|
-
|
|
774
|
-
for v in _normalize_columns_to_list(cols)
|
|
775
|
-
|
|
825
|
+
return (
|
|
826
|
+
"["
|
|
827
|
+
+ ", ".join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(cols))
|
|
828
|
+
+ "]"
|
|
829
|
+
)
|
|
776
830
|
|
|
777
831
|
return {
|
|
778
832
|
"other": "input_df_2",
|
|
@@ -784,16 +838,16 @@ class FlowFrame:
|
|
|
784
838
|
"validate": _to_string_val(validate),
|
|
785
839
|
"nulls_equal": nulls_equal,
|
|
786
840
|
"coalesce": coalesce,
|
|
787
|
-
"maintain_order": _to_string_val(maintain_order)
|
|
841
|
+
"maintain_order": _to_string_val(maintain_order),
|
|
788
842
|
}
|
|
789
843
|
|
|
790
844
|
def _execute_native_join(
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
845
|
+
self,
|
|
846
|
+
other: "FlowFrame",
|
|
847
|
+
new_node_id: int,
|
|
848
|
+
join_mappings: list | None,
|
|
849
|
+
how: str,
|
|
850
|
+
description: str,
|
|
797
851
|
) -> "FlowFrame":
|
|
798
852
|
"""Execute join using native FlowFile join nodes."""
|
|
799
853
|
# Create select inputs for both frames
|
|
@@ -801,7 +855,7 @@ class FlowFrame:
|
|
|
801
855
|
left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
|
|
802
856
|
right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
|
|
803
857
|
# Create appropriate join input based on join type
|
|
804
|
-
if how ==
|
|
858
|
+
if how == "cross":
|
|
805
859
|
join_input = transform_schema.CrossJoinInput(
|
|
806
860
|
left_select=transform_schema.JoinInputs(renames=left_select.renames),
|
|
807
861
|
right_select=right_select.renames,
|
|
@@ -823,7 +877,7 @@ class FlowFrame:
|
|
|
823
877
|
right_column.keep = False
|
|
824
878
|
|
|
825
879
|
# Create and add appropriate node
|
|
826
|
-
if how ==
|
|
880
|
+
if how == "cross":
|
|
827
881
|
self._add_cross_join_node(new_node_id, join_input_manager.to_cross_join_input(), description, other)
|
|
828
882
|
else:
|
|
829
883
|
self._add_regular_join_node(new_node_id, join_input_manager.to_join_input(), description, other)
|
|
@@ -840,11 +894,11 @@ class FlowFrame:
|
|
|
840
894
|
)
|
|
841
895
|
|
|
842
896
|
def _add_cross_join_node(
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
897
|
+
self,
|
|
898
|
+
new_node_id: int,
|
|
899
|
+
join_input: "transform_schema.CrossJoinInput",
|
|
900
|
+
description: str,
|
|
901
|
+
other: "FlowFrame",
|
|
848
902
|
) -> None:
|
|
849
903
|
"""Add a cross join node to the graph."""
|
|
850
904
|
cross_join_settings = input_schema.NodeCrossJoin(
|
|
@@ -853,18 +907,18 @@ class FlowFrame:
|
|
|
853
907
|
cross_join_input=join_input,
|
|
854
908
|
is_setup=True,
|
|
855
909
|
depending_on_ids=[self.node_id, other.node_id],
|
|
856
|
-
description=description or
|
|
910
|
+
description=description or "Join with cross strategy",
|
|
857
911
|
auto_generate_selection=True,
|
|
858
912
|
verify_integrity=True,
|
|
859
913
|
)
|
|
860
914
|
self.flow_graph.add_cross_join(cross_join_settings)
|
|
861
915
|
|
|
862
916
|
def _add_regular_join_node(
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
917
|
+
self,
|
|
918
|
+
new_node_id: int,
|
|
919
|
+
join_input: "transform_schema.JoinInput",
|
|
920
|
+
description: str,
|
|
921
|
+
other: "FlowFrame",
|
|
868
922
|
) -> None:
|
|
869
923
|
"""Add a regular join node to the graph."""
|
|
870
924
|
join_settings = input_schema.NodeJoin(
|
|
@@ -889,34 +943,41 @@ class FlowFrame:
|
|
|
889
943
|
pos_y=100,
|
|
890
944
|
is_setup=True,
|
|
891
945
|
depending_on_id=self.node_id,
|
|
892
|
-
description=description
|
|
946
|
+
description=description,
|
|
893
947
|
)
|
|
894
948
|
self.flow_graph.add_record_count(node_number_of_records)
|
|
895
949
|
return self._create_child_frame(new_node_id)
|
|
896
950
|
|
|
897
|
-
def rename(self, mapping: Mapping[str, str], *, strict: bool = True,
|
|
898
|
-
description: str = None) -> "FlowFrame":
|
|
951
|
+
def rename(self, mapping: Mapping[str, str], *, strict: bool = True, description: str = None) -> "FlowFrame":
|
|
899
952
|
"""Rename columns based on a mapping or function."""
|
|
900
|
-
return self.select(
|
|
901
|
-
|
|
953
|
+
return self.select(
|
|
954
|
+
[col(old_name).alias(new_name) for old_name, new_name in mapping.items()],
|
|
955
|
+
description=description,
|
|
956
|
+
_keep_missing=True,
|
|
957
|
+
)
|
|
902
958
|
|
|
903
|
-
def select(
|
|
959
|
+
def select(
|
|
960
|
+
self, *columns: str | Expr | Selector, description: str | None = None, _keep_missing: bool = False
|
|
961
|
+
) -> "FlowFrame":
|
|
904
962
|
"""
|
|
905
963
|
Select columns from the frame.
|
|
906
964
|
"""
|
|
907
965
|
columns_iterable = list(_parse_inputs_as_iterable(columns))
|
|
908
966
|
new_node_id = generate_node_id()
|
|
909
|
-
if (
|
|
910
|
-
|
|
967
|
+
if (
|
|
968
|
+
len(columns_iterable) == 1
|
|
969
|
+
and isinstance(columns_iterable[0], Expr)
|
|
970
|
+
and str(columns_iterable[0]) == "pl.Expr(len()).alias('number_of_records')"
|
|
971
|
+
):
|
|
911
972
|
return self._add_number_of_records(new_node_id, description)
|
|
912
973
|
|
|
913
|
-
all_input_expr_objects:
|
|
914
|
-
pure_polars_expr_strings_for_select:
|
|
915
|
-
collected_raw_definitions:
|
|
916
|
-
selected_col_names_for_native:
|
|
974
|
+
all_input_expr_objects: list[Expr] = []
|
|
975
|
+
pure_polars_expr_strings_for_select: list[str] = []
|
|
976
|
+
collected_raw_definitions: list[str] = []
|
|
977
|
+
selected_col_names_for_native: list[transform_schema.SelectInput] = [] # For native node
|
|
917
978
|
|
|
918
979
|
can_use_native_node = True
|
|
919
|
-
if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] ==
|
|
980
|
+
if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] == "*":
|
|
920
981
|
effective_columns_iterable = [col(c_name) for c_name in self.columns]
|
|
921
982
|
else:
|
|
922
983
|
effective_columns_iterable = columns_iterable
|
|
@@ -950,13 +1011,17 @@ class FlowFrame:
|
|
|
950
1011
|
if can_use_native_node:
|
|
951
1012
|
existing_cols = self.columns
|
|
952
1013
|
selected_col_names = {select_col.old_name for select_col in selected_col_names_for_native}
|
|
953
|
-
not_selected_columns = [
|
|
954
|
-
|
|
1014
|
+
not_selected_columns = [
|
|
1015
|
+
transform_schema.SelectInput(c, keep=_keep_missing)
|
|
1016
|
+
for c in existing_cols
|
|
1017
|
+
if c not in selected_col_names
|
|
1018
|
+
]
|
|
955
1019
|
selected_col_names_for_native.extend(not_selected_columns)
|
|
956
1020
|
if _keep_missing:
|
|
957
1021
|
lookup_selection = {_col.old_name: _col for _col in selected_col_names_for_native}
|
|
958
|
-
selected_col_names_for_native = [
|
|
959
|
-
|
|
1022
|
+
selected_col_names_for_native = [
|
|
1023
|
+
lookup_selection.get(_col) for _col in existing_cols if _col in lookup_selection
|
|
1024
|
+
]
|
|
960
1025
|
select_settings = input_schema.NodeSelect(
|
|
961
1026
|
flow_id=self.flow_graph.flow_id,
|
|
962
1027
|
node_id=new_node_id,
|
|
@@ -966,7 +1031,7 @@ class FlowFrame:
|
|
|
966
1031
|
pos_y=100,
|
|
967
1032
|
is_setup=True,
|
|
968
1033
|
depending_on_id=self.node_id,
|
|
969
|
-
description=description
|
|
1034
|
+
description=description,
|
|
970
1035
|
)
|
|
971
1036
|
self.flow_graph.add_select(select_settings)
|
|
972
1037
|
else:
|
|
@@ -975,23 +1040,35 @@ class FlowFrame:
|
|
|
975
1040
|
if collected_raw_definitions:
|
|
976
1041
|
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
|
|
977
1042
|
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
978
|
-
final_code_for_node =
|
|
979
|
-
|
|
980
|
-
|
|
1043
|
+
final_code_for_node = (
|
|
1044
|
+
definitions_section + "\\#─────SPLIT─────\n\n" + f"output_df = {polars_operation_code}"
|
|
1045
|
+
)
|
|
981
1046
|
else:
|
|
982
1047
|
final_code_for_node = polars_operation_code
|
|
983
1048
|
|
|
984
|
-
pl_expressions_for_fallback = [
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
1049
|
+
pl_expressions_for_fallback = [
|
|
1050
|
+
e.expr
|
|
1051
|
+
for e in all_input_expr_objects
|
|
1052
|
+
if isinstance(e, Expr) and hasattr(e, "expr") and e.expr is not None
|
|
1053
|
+
]
|
|
1054
|
+
self._add_polars_code(
|
|
1055
|
+
new_node_id,
|
|
1056
|
+
final_code_for_node,
|
|
1057
|
+
description,
|
|
1058
|
+
method_name="select",
|
|
1059
|
+
convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
|
|
1060
|
+
polars_expr=pl_expressions_for_fallback,
|
|
1061
|
+
)
|
|
990
1062
|
|
|
991
1063
|
return self._create_child_frame(new_node_id)
|
|
992
1064
|
|
|
993
|
-
def filter(
|
|
994
|
-
|
|
1065
|
+
def filter(
|
|
1066
|
+
self,
|
|
1067
|
+
*predicates: Expr | Any,
|
|
1068
|
+
flowfile_formula: str | None = None,
|
|
1069
|
+
description: str | None = None,
|
|
1070
|
+
**constraints: Any,
|
|
1071
|
+
) -> "FlowFrame":
|
|
995
1072
|
"""
|
|
996
1073
|
Filter rows based on a predicate.
|
|
997
1074
|
"""
|
|
@@ -1000,9 +1077,9 @@ class FlowFrame:
|
|
|
1000
1077
|
available_columns = self.columns
|
|
1001
1078
|
new_node_id = generate_node_id()
|
|
1002
1079
|
if len(predicates) > 0 or len(constraints) > 0:
|
|
1003
|
-
all_input_expr_objects:
|
|
1004
|
-
pure_polars_expr_strings:
|
|
1005
|
-
collected_raw_definitions:
|
|
1080
|
+
all_input_expr_objects: list[Expr] = []
|
|
1081
|
+
pure_polars_expr_strings: list[str] = []
|
|
1082
|
+
collected_raw_definitions: list[str] = []
|
|
1006
1083
|
|
|
1007
1084
|
processed_predicates = []
|
|
1008
1085
|
for pred_item in predicates:
|
|
@@ -1031,10 +1108,11 @@ class FlowFrame:
|
|
|
1031
1108
|
collected_raw_definitions.append(raw_defs_str)
|
|
1032
1109
|
|
|
1033
1110
|
for k, v_val in constraints.items():
|
|
1034
|
-
constraint_expr_obj =
|
|
1111
|
+
constraint_expr_obj = col(k) == lit(v_val)
|
|
1035
1112
|
all_input_expr_objects.append(constraint_expr_obj)
|
|
1036
1113
|
pure_expr_str, raw_defs_str = _extract_expr_parts(
|
|
1037
|
-
constraint_expr_obj
|
|
1114
|
+
constraint_expr_obj
|
|
1115
|
+
) # Constraint exprs are unlikely to have defs
|
|
1038
1116
|
pure_polars_expr_strings.append(f"({pure_expr_str})")
|
|
1039
1117
|
if raw_defs_str and raw_defs_str not in collected_raw_definitions: # Should be rare here
|
|
1040
1118
|
collected_raw_definitions.append(raw_defs_str)
|
|
@@ -1046,31 +1124,36 @@ class FlowFrame:
|
|
|
1046
1124
|
if collected_raw_definitions:
|
|
1047
1125
|
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions)) # Order-preserving unique
|
|
1048
1126
|
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
1049
|
-
final_code_for_node =
|
|
1050
|
-
|
|
1051
|
-
|
|
1127
|
+
final_code_for_node = (
|
|
1128
|
+
definitions_section + "\\#─────SPLIT─────\n\n" + f"output_df = {polars_operation_code}"
|
|
1129
|
+
)
|
|
1052
1130
|
else:
|
|
1053
1131
|
final_code_for_node = polars_operation_code
|
|
1054
1132
|
|
|
1055
1133
|
convertable_to_code = _check_if_convertible_to_code(all_input_expr_objects)
|
|
1056
|
-
pl_expressions_for_fallback = [
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1134
|
+
pl_expressions_for_fallback = [
|
|
1135
|
+
e.expr
|
|
1136
|
+
for e in all_input_expr_objects
|
|
1137
|
+
if isinstance(e, Expr) and hasattr(e, "expr") and e.expr is not None
|
|
1138
|
+
]
|
|
1139
|
+
self._add_polars_code(
|
|
1140
|
+
new_node_id,
|
|
1141
|
+
final_code_for_node,
|
|
1142
|
+
description,
|
|
1143
|
+
method_name="filter",
|
|
1144
|
+
convertable_to_code=convertable_to_code,
|
|
1145
|
+
polars_expr=pl_expressions_for_fallback,
|
|
1146
|
+
)
|
|
1061
1147
|
elif flowfile_formula:
|
|
1062
1148
|
filter_settings = input_schema.NodeFilter(
|
|
1063
1149
|
flow_id=self.flow_graph.flow_id,
|
|
1064
1150
|
node_id=new_node_id,
|
|
1065
|
-
filter_input=transform_schema.FilterInput(
|
|
1066
|
-
advanced_filter=flowfile_formula,
|
|
1067
|
-
filter_type="advanced"
|
|
1068
|
-
),
|
|
1151
|
+
filter_input=transform_schema.FilterInput(advanced_filter=flowfile_formula, filter_type="advanced"),
|
|
1069
1152
|
pos_x=200,
|
|
1070
1153
|
pos_y=150,
|
|
1071
1154
|
is_setup=True,
|
|
1072
1155
|
depending_on_id=self.node_id,
|
|
1073
|
-
description=description
|
|
1156
|
+
description=description,
|
|
1074
1157
|
)
|
|
1075
1158
|
self.flow_graph.add_filter(filter_settings)
|
|
1076
1159
|
else:
|
|
@@ -1079,12 +1162,7 @@ class FlowFrame:
|
|
|
1079
1162
|
|
|
1080
1163
|
return self._create_child_frame(new_node_id)
|
|
1081
1164
|
|
|
1082
|
-
def sink_csv(self,
|
|
1083
|
-
file: str,
|
|
1084
|
-
*args,
|
|
1085
|
-
separator: str = ",",
|
|
1086
|
-
encoding: str = "utf-8",
|
|
1087
|
-
description: str = None):
|
|
1165
|
+
def sink_csv(self, file: str, *args, separator: str = ",", encoding: str = "utf-8", description: str = None):
|
|
1088
1166
|
"""
|
|
1089
1167
|
Write the data to a CSV file.
|
|
1090
1168
|
|
|
@@ -1100,12 +1178,12 @@ class FlowFrame:
|
|
|
1100
1178
|
return self.write_csv(file, *args, separator=separator, encoding=encoding, description=description)
|
|
1101
1179
|
|
|
1102
1180
|
def write_parquet(
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1181
|
+
self,
|
|
1182
|
+
path: str | os.PathLike,
|
|
1183
|
+
*,
|
|
1184
|
+
description: str = None,
|
|
1185
|
+
convert_to_absolute_path: bool = True,
|
|
1186
|
+
**kwargs: Any,
|
|
1109
1187
|
) -> "FlowFrame":
|
|
1110
1188
|
"""
|
|
1111
1189
|
Write the data to a Parquet file. Creates a standard Output node if only
|
|
@@ -1143,10 +1221,10 @@ class FlowFrame:
|
|
|
1143
1221
|
use_polars_code = bool(kwargs.items()) or not is_path_input
|
|
1144
1222
|
|
|
1145
1223
|
output_settings = input_schema.OutputSettings(
|
|
1146
|
-
file_type=
|
|
1224
|
+
file_type="parquet",
|
|
1147
1225
|
name=file_name,
|
|
1148
1226
|
directory=file_str if is_path_input else str(file_str),
|
|
1149
|
-
table_settings=input_schema.OutputParquetTable()
|
|
1227
|
+
table_settings=input_schema.OutputParquetTable(),
|
|
1150
1228
|
)
|
|
1151
1229
|
|
|
1152
1230
|
if is_path_input:
|
|
@@ -1163,7 +1241,7 @@ class FlowFrame:
|
|
|
1163
1241
|
node_id=new_node_id,
|
|
1164
1242
|
output_settings=output_settings,
|
|
1165
1243
|
depending_on_id=self.node_id,
|
|
1166
|
-
description=description
|
|
1244
|
+
description=description,
|
|
1167
1245
|
)
|
|
1168
1246
|
self.flow_graph.add_output(node_output)
|
|
1169
1247
|
else:
|
|
@@ -1189,16 +1267,15 @@ class FlowFrame:
|
|
|
1189
1267
|
return self._create_child_frame(new_node_id)
|
|
1190
1268
|
|
|
1191
1269
|
def write_csv(
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1270
|
+
self,
|
|
1271
|
+
file: str | os.PathLike,
|
|
1272
|
+
*,
|
|
1273
|
+
separator: str = ",",
|
|
1274
|
+
encoding: str = "utf-8",
|
|
1275
|
+
description: str = None,
|
|
1276
|
+
convert_to_absolute_path: bool = True,
|
|
1277
|
+
**kwargs: Any,
|
|
1200
1278
|
) -> "FlowFrame":
|
|
1201
|
-
|
|
1202
1279
|
new_node_id = generate_node_id()
|
|
1203
1280
|
is_path_input = isinstance(file, (str, os.PathLike))
|
|
1204
1281
|
if isinstance(file, os.PathLike):
|
|
@@ -1214,13 +1291,10 @@ class FlowFrame:
|
|
|
1214
1291
|
|
|
1215
1292
|
use_polars_code = bool(kwargs) or not is_path_input
|
|
1216
1293
|
output_settings = input_schema.OutputSettings(
|
|
1217
|
-
file_type=
|
|
1294
|
+
file_type="csv",
|
|
1218
1295
|
name=file_name,
|
|
1219
1296
|
directory=file_str if is_path_input else str(file_str),
|
|
1220
|
-
table_settings=input_schema.OutputCsvTable(
|
|
1221
|
-
delimiter=separator,
|
|
1222
|
-
encoding=encoding
|
|
1223
|
-
)
|
|
1297
|
+
table_settings=input_schema.OutputCsvTable(delimiter=separator, encoding=encoding),
|
|
1224
1298
|
)
|
|
1225
1299
|
if is_path_input:
|
|
1226
1300
|
try:
|
|
@@ -1236,7 +1310,7 @@ class FlowFrame:
|
|
|
1236
1310
|
node_id=new_node_id,
|
|
1237
1311
|
output_settings=output_settings,
|
|
1238
1312
|
depending_on_id=self.node_id,
|
|
1239
|
-
description=description
|
|
1313
|
+
description=description,
|
|
1240
1314
|
)
|
|
1241
1315
|
self.flow_graph.add_output(node_output)
|
|
1242
1316
|
else:
|
|
@@ -1250,9 +1324,9 @@ class FlowFrame:
|
|
|
1250
1324
|
path_arg_repr = repr(output_settings.directory)
|
|
1251
1325
|
|
|
1252
1326
|
all_kwargs_for_code = {
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
**kwargs # Add the extra kwargs
|
|
1327
|
+
"separator": separator,
|
|
1328
|
+
"encoding": encoding,
|
|
1329
|
+
**kwargs, # Add the extra kwargs
|
|
1256
1330
|
}
|
|
1257
1331
|
kwargs_repr = ", ".join(f"{k}={repr(v)}" for k, v in all_kwargs_for_code.items())
|
|
1258
1332
|
|
|
@@ -1266,42 +1340,47 @@ class FlowFrame:
|
|
|
1266
1340
|
|
|
1267
1341
|
return self._create_child_frame(new_node_id)
|
|
1268
1342
|
|
|
1269
|
-
def write_parquet_to_cloud_storage(
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1343
|
+
def write_parquet_to_cloud_storage(
|
|
1344
|
+
self,
|
|
1345
|
+
path: str,
|
|
1346
|
+
connection_name: str | None = None,
|
|
1347
|
+
compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
|
|
1348
|
+
description: str | None = None,
|
|
1349
|
+
) -> "FlowFrame":
|
|
1275
1350
|
"""
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1351
|
+
Write the data frame to cloud storage in Parquet format.
|
|
1352
|
+
|
|
1353
|
+
Args:
|
|
1354
|
+
path (str): The destination path in cloud storage where the Parquet file will be written.
|
|
1355
|
+
connection_name (Optional[str], optional): The name of the storage connection
|
|
1356
|
+
that a user can create. If None, uses the default connection. Defaults to None.
|
|
1357
|
+
compression (Literal["snappy", "gzip", "brotli", "lz4", "zstd"], optional):
|
|
1358
|
+
The compression algorithm to use for the Parquet file. Defaults to "snappy".
|
|
1359
|
+
description (Optional[str], optional): Description of this operation for the ETL graph.
|
|
1360
|
+
|
|
1361
|
+
Returns:
|
|
1362
|
+
FlowFrame: A new child data frame representing the written data.
|
|
1288
1363
|
"""
|
|
1289
1364
|
|
|
1290
|
-
new_node_id = add_write_ff_to_cloud_storage(
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1365
|
+
new_node_id = add_write_ff_to_cloud_storage(
|
|
1366
|
+
path,
|
|
1367
|
+
flow_graph=self.flow_graph,
|
|
1368
|
+
connection_name=connection_name,
|
|
1369
|
+
depends_on_node_id=self.node_id,
|
|
1370
|
+
parquet_compression=compression,
|
|
1371
|
+
file_format="parquet",
|
|
1372
|
+
description=description,
|
|
1373
|
+
)
|
|
1296
1374
|
return self._create_child_frame(new_node_id)
|
|
1297
1375
|
|
|
1298
|
-
def write_csv_to_cloud_storage(
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1376
|
+
def write_csv_to_cloud_storage(
|
|
1377
|
+
self,
|
|
1378
|
+
path: str,
|
|
1379
|
+
connection_name: str | None = None,
|
|
1380
|
+
delimiter: str = ";",
|
|
1381
|
+
encoding: CsvEncoding = "utf8",
|
|
1382
|
+
description: str | None = None,
|
|
1383
|
+
) -> "FlowFrame":
|
|
1305
1384
|
"""
|
|
1306
1385
|
Write the data frame to cloud storage in CSV format.
|
|
1307
1386
|
|
|
@@ -1318,21 +1397,25 @@ class FlowFrame:
|
|
|
1318
1397
|
Returns:
|
|
1319
1398
|
FlowFrame: A new child data frame representing the written data.
|
|
1320
1399
|
"""
|
|
1321
|
-
new_node_id = add_write_ff_to_cloud_storage(
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1400
|
+
new_node_id = add_write_ff_to_cloud_storage(
|
|
1401
|
+
path,
|
|
1402
|
+
flow_graph=self.flow_graph,
|
|
1403
|
+
connection_name=connection_name,
|
|
1404
|
+
depends_on_node_id=self.node_id,
|
|
1405
|
+
csv_delimiter=delimiter,
|
|
1406
|
+
csv_encoding=encoding,
|
|
1407
|
+
file_format="csv",
|
|
1408
|
+
description=description,
|
|
1409
|
+
)
|
|
1328
1410
|
return self._create_child_frame(new_node_id)
|
|
1329
1411
|
|
|
1330
|
-
def write_delta(
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1412
|
+
def write_delta(
|
|
1413
|
+
self,
|
|
1414
|
+
path: str,
|
|
1415
|
+
connection_name: str | None = None,
|
|
1416
|
+
write_mode: Literal["overwrite", "append"] = "overwrite",
|
|
1417
|
+
description: str | None = None,
|
|
1418
|
+
) -> "FlowFrame":
|
|
1336
1419
|
"""
|
|
1337
1420
|
Write the data frame to cloud storage in Delta Lake format.
|
|
1338
1421
|
|
|
@@ -1346,19 +1429,23 @@ class FlowFrame:
|
|
|
1346
1429
|
Returns:
|
|
1347
1430
|
FlowFrame: A new child data frame representing the written data.
|
|
1348
1431
|
"""
|
|
1349
|
-
new_node_id = add_write_ff_to_cloud_storage(
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1432
|
+
new_node_id = add_write_ff_to_cloud_storage(
|
|
1433
|
+
path,
|
|
1434
|
+
flow_graph=self.flow_graph,
|
|
1435
|
+
connection_name=connection_name,
|
|
1436
|
+
depends_on_node_id=self.node_id,
|
|
1437
|
+
write_mode=write_mode,
|
|
1438
|
+
file_format="delta",
|
|
1439
|
+
description=description,
|
|
1440
|
+
)
|
|
1355
1441
|
return self._create_child_frame(new_node_id)
|
|
1356
1442
|
|
|
1357
|
-
def write_json_to_cloud_storage(
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1443
|
+
def write_json_to_cloud_storage(
|
|
1444
|
+
self,
|
|
1445
|
+
path: str,
|
|
1446
|
+
connection_name: str | None = None,
|
|
1447
|
+
description: str | None = None,
|
|
1448
|
+
) -> "FlowFrame":
|
|
1362
1449
|
"""
|
|
1363
1450
|
Write the data frame to cloud storage in JSON format.
|
|
1364
1451
|
|
|
@@ -1370,11 +1457,14 @@ class FlowFrame:
|
|
|
1370
1457
|
Returns:
|
|
1371
1458
|
FlowFrame: A new child data frame representing the written data.
|
|
1372
1459
|
"""
|
|
1373
|
-
new_node_id = add_write_ff_to_cloud_storage(
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1460
|
+
new_node_id = add_write_ff_to_cloud_storage(
|
|
1461
|
+
path,
|
|
1462
|
+
flow_graph=self.flow_graph,
|
|
1463
|
+
connection_name=connection_name,
|
|
1464
|
+
depends_on_node_id=self.node_id,
|
|
1465
|
+
file_format="json",
|
|
1466
|
+
description=description,
|
|
1467
|
+
)
|
|
1378
1468
|
return self._create_child_frame(new_node_id)
|
|
1379
1469
|
|
|
1380
1470
|
def group_by(self, *by, description: str = None, maintain_order=False, **named_by) -> GroupByFrame:
|
|
@@ -1411,7 +1501,10 @@ class FlowFrame:
|
|
|
1411
1501
|
# Create a GroupByFrame
|
|
1412
1502
|
return GroupByFrame(
|
|
1413
1503
|
node_id=new_node_id,
|
|
1414
|
-
parent_frame=self,
|
|
1504
|
+
parent_frame=self,
|
|
1505
|
+
by_cols=by_cols,
|
|
1506
|
+
maintain_order=maintain_order,
|
|
1507
|
+
description=description,
|
|
1415
1508
|
)
|
|
1416
1509
|
|
|
1417
1510
|
def to_graph(self):
|
|
@@ -1419,7 +1512,7 @@ class FlowFrame:
|
|
|
1419
1512
|
return self.flow_graph
|
|
1420
1513
|
|
|
1421
1514
|
def save_graph(self, file_path: str, auto_arrange: bool = True):
|
|
1422
|
-
"""Save the graph
|
|
1515
|
+
"""Save the graph"""
|
|
1423
1516
|
if auto_arrange:
|
|
1424
1517
|
self.flow_graph.apply_layout()
|
|
1425
1518
|
self.flow_graph.save_flow(file_path)
|
|
@@ -1432,23 +1525,27 @@ class FlowFrame:
|
|
|
1432
1525
|
|
|
1433
1526
|
def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
|
|
1434
1527
|
new_node_id = generate_node_id()
|
|
1435
|
-
function_settings = (
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1528
|
+
function_settings = input_schema.NodeFormula(
|
|
1529
|
+
flow_id=self.flow_graph.flow_id,
|
|
1530
|
+
node_id=new_node_id,
|
|
1531
|
+
depending_on_id=self.node_id,
|
|
1532
|
+
function=transform_schema.FunctionInput(
|
|
1533
|
+
function=flowfile_formula, field=transform_schema.FieldInput(name=output_column_name, data_type="Auto")
|
|
1534
|
+
),
|
|
1535
|
+
description=description,
|
|
1536
|
+
)
|
|
1441
1537
|
self.flow_graph.add_formula(function_settings)
|
|
1442
1538
|
return self._create_child_frame(new_node_id)
|
|
1443
1539
|
|
|
1444
1540
|
def head(self, n: int, description: str = None):
|
|
1445
1541
|
new_node_id = generate_node_id()
|
|
1446
|
-
settings = input_schema.NodeSample(
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1542
|
+
settings = input_schema.NodeSample(
|
|
1543
|
+
flow_id=self.flow_graph.flow_id,
|
|
1544
|
+
node_id=new_node_id,
|
|
1545
|
+
depending_on_id=self.node_id,
|
|
1546
|
+
sample_size=n,
|
|
1547
|
+
description=description,
|
|
1548
|
+
)
|
|
1452
1549
|
self.flow_graph.add_sample(settings)
|
|
1453
1550
|
return self._create_child_frame(new_node_id)
|
|
1454
1551
|
|
|
@@ -1464,16 +1561,18 @@ class FlowFrame:
|
|
|
1464
1561
|
def get_node_settings(self) -> FlowNode:
|
|
1465
1562
|
return self.flow_graph.get_node(self.node_id)
|
|
1466
1563
|
|
|
1467
|
-
def pivot(
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1564
|
+
def pivot(
|
|
1565
|
+
self,
|
|
1566
|
+
on: str | list[str],
|
|
1567
|
+
*,
|
|
1568
|
+
index: str | list[str] | None = None,
|
|
1569
|
+
values: str | list[str] | None = None,
|
|
1570
|
+
aggregate_function: str | None = "first",
|
|
1571
|
+
maintain_order: bool = True,
|
|
1572
|
+
sort_columns: bool = False,
|
|
1573
|
+
separator: str = "_",
|
|
1574
|
+
description: str = None,
|
|
1575
|
+
) -> "FlowFrame":
|
|
1477
1576
|
"""
|
|
1478
1577
|
Pivot a DataFrame from long to wide format.
|
|
1479
1578
|
|
|
@@ -1522,17 +1621,14 @@ class FlowFrame:
|
|
|
1522
1621
|
value_col = values if isinstance(values, str) else values[0]
|
|
1523
1622
|
|
|
1524
1623
|
# Set valid aggregations
|
|
1525
|
-
valid_aggs = [
|
|
1624
|
+
valid_aggs = ["first", "last", "min", "max", "sum", "mean", "median", "count"]
|
|
1526
1625
|
if aggregate_function not in valid_aggs:
|
|
1527
|
-
raise ValueError(
|
|
1528
|
-
|
|
1626
|
+
raise ValueError(
|
|
1627
|
+
f"Invalid aggregate_function: {aggregate_function}. " f"Must be one of: {', '.join(valid_aggs)}"
|
|
1628
|
+
)
|
|
1529
1629
|
|
|
1530
1630
|
# Check if we can use the native implementation
|
|
1531
|
-
can_use_native = (
|
|
1532
|
-
isinstance(on_value, str) and
|
|
1533
|
-
isinstance(value_col, str) and
|
|
1534
|
-
aggregate_function in valid_aggs
|
|
1535
|
-
)
|
|
1631
|
+
can_use_native = isinstance(on_value, str) and isinstance(value_col, str) and aggregate_function in valid_aggs
|
|
1536
1632
|
|
|
1537
1633
|
if can_use_native:
|
|
1538
1634
|
# Create pivot input for native implementation
|
|
@@ -1540,7 +1636,7 @@ class FlowFrame:
|
|
|
1540
1636
|
index_columns=index_columns,
|
|
1541
1637
|
pivot_column=on_value,
|
|
1542
1638
|
value_col=value_col,
|
|
1543
|
-
aggregations=[aggregate_function]
|
|
1639
|
+
aggregations=[aggregate_function],
|
|
1544
1640
|
)
|
|
1545
1641
|
|
|
1546
1642
|
# Create node settings
|
|
@@ -1552,7 +1648,7 @@ class FlowFrame:
|
|
|
1552
1648
|
pos_y=150,
|
|
1553
1649
|
is_setup=True,
|
|
1554
1650
|
depending_on_id=self.node_id,
|
|
1555
|
-
description=description or f"Pivot {value_col} by {on_value}"
|
|
1651
|
+
description=description or f"Pivot {value_col} by {on_value}",
|
|
1556
1652
|
)
|
|
1557
1653
|
|
|
1558
1654
|
# Add to graph using native implementation
|
|
@@ -1580,8 +1676,9 @@ class FlowFrame:
|
|
|
1580
1676
|
# Generate description if not provided
|
|
1581
1677
|
if description is None:
|
|
1582
1678
|
on_str = on if isinstance(on, str) else ", ".join(on if isinstance(on, list) else [on])
|
|
1583
|
-
values_str =
|
|
1584
|
-
values if isinstance(values, list) else [values])
|
|
1679
|
+
values_str = (
|
|
1680
|
+
values if isinstance(values, str) else ", ".join(values if isinstance(values, list) else [values])
|
|
1681
|
+
)
|
|
1585
1682
|
description = f"Pivot {values_str} by {on_str}"
|
|
1586
1683
|
|
|
1587
1684
|
# Add polars code node
|
|
@@ -1589,13 +1686,15 @@ class FlowFrame:
|
|
|
1589
1686
|
|
|
1590
1687
|
return self._create_child_frame(new_node_id)
|
|
1591
1688
|
|
|
1592
|
-
def unpivot(
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1689
|
+
def unpivot(
|
|
1690
|
+
self,
|
|
1691
|
+
on: list[str | Selector] | str | None | Selector = None,
|
|
1692
|
+
*,
|
|
1693
|
+
index: list[str] | str | None = None,
|
|
1694
|
+
variable_name: str = "variable",
|
|
1695
|
+
value_name: str = "value",
|
|
1696
|
+
description: str = None,
|
|
1697
|
+
) -> "FlowFrame":
|
|
1599
1698
|
"""
|
|
1600
1699
|
Unpivot a DataFrame from wide to long format.
|
|
1601
1700
|
|
|
@@ -1642,13 +1741,13 @@ class FlowFrame:
|
|
|
1642
1741
|
value_columns = [on]
|
|
1643
1742
|
|
|
1644
1743
|
if can_use_native:
|
|
1645
|
-
can_use_native =
|
|
1744
|
+
can_use_native = variable_name == "variable" and value_name == "value"
|
|
1646
1745
|
if can_use_native:
|
|
1647
1746
|
unpivot_input = transform_schema.UnpivotInput(
|
|
1648
1747
|
index_columns=index_columns,
|
|
1649
1748
|
value_columns=value_columns,
|
|
1650
1749
|
data_type_selector=None,
|
|
1651
|
-
data_type_selector_mode=
|
|
1750
|
+
data_type_selector_mode="column",
|
|
1652
1751
|
)
|
|
1653
1752
|
|
|
1654
1753
|
# Create node settings
|
|
@@ -1660,7 +1759,7 @@ class FlowFrame:
|
|
|
1660
1759
|
pos_y=150,
|
|
1661
1760
|
is_setup=True,
|
|
1662
1761
|
depending_on_id=self.node_id,
|
|
1663
|
-
description=description or "Unpivot data from wide to long format"
|
|
1762
|
+
description=description or "Unpivot data from wide to long format",
|
|
1664
1763
|
)
|
|
1665
1764
|
|
|
1666
1765
|
# Add to graph using native implementation
|
|
@@ -1696,7 +1795,7 @@ class FlowFrame:
|
|
|
1696
1795
|
|
|
1697
1796
|
def concat(
|
|
1698
1797
|
self,
|
|
1699
|
-
other: "FlowFrame" |
|
|
1798
|
+
other: "FlowFrame" | list["FlowFrame"],
|
|
1700
1799
|
how: str = "vertical",
|
|
1701
1800
|
rechunk: bool = False,
|
|
1702
1801
|
parallel: bool = True,
|
|
@@ -1797,14 +1896,11 @@ class FlowFrame:
|
|
|
1797
1896
|
|
|
1798
1897
|
# Add polars code node with dependencies on all input frames
|
|
1799
1898
|
depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
|
|
1800
|
-
self._add_polars_code(
|
|
1801
|
-
new_node_id, code, description, depending_on_ids=depending_on_ids
|
|
1802
|
-
)
|
|
1899
|
+
self._add_polars_code(new_node_id, code, description, depending_on_ids=depending_on_ids)
|
|
1803
1900
|
# Add connections to ensure all frames are available
|
|
1804
1901
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
1805
1902
|
|
|
1806
1903
|
for other_frame in others:
|
|
1807
|
-
|
|
1808
1904
|
other_frame.flow_graph = combined_graph
|
|
1809
1905
|
other_frame._add_connection(other_frame.node_id, new_node_id, "main")
|
|
1810
1906
|
# Create and return the new frame
|
|
@@ -1816,8 +1912,8 @@ class FlowFrame:
|
|
|
1816
1912
|
)
|
|
1817
1913
|
|
|
1818
1914
|
def _detect_cum_count_record_id(
|
|
1819
|
-
self, expr: Any, new_node_id: int, description:
|
|
1820
|
-
) ->
|
|
1915
|
+
self, expr: Any, new_node_id: int, description: str | None = None
|
|
1916
|
+
) -> tuple[bool, Optional["FlowFrame"]]:
|
|
1821
1917
|
"""
|
|
1822
1918
|
Detect if the expression is a cum_count operation and use record_id if possible.
|
|
1823
1919
|
|
|
@@ -1838,8 +1934,12 @@ class FlowFrame:
|
|
|
1838
1934
|
- Optional[FlowFrame]: The new FlowFrame if detection was successful, otherwise None
|
|
1839
1935
|
"""
|
|
1840
1936
|
# Check if this is a cum_count operation
|
|
1841
|
-
if (
|
|
1842
|
-
|
|
1937
|
+
if (
|
|
1938
|
+
not isinstance(expr, Expr)
|
|
1939
|
+
or not expr._repr_str
|
|
1940
|
+
or "cum_count" not in expr._repr_str
|
|
1941
|
+
or not hasattr(expr, "name")
|
|
1942
|
+
):
|
|
1843
1943
|
return False, None
|
|
1844
1944
|
|
|
1845
1945
|
# Extract the output name
|
|
@@ -1926,24 +2026,24 @@ class FlowFrame:
|
|
|
1926
2026
|
return False, None
|
|
1927
2027
|
|
|
1928
2028
|
def with_columns(
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
2029
|
+
self,
|
|
2030
|
+
*exprs: Expr | Iterable[Expr] | Any, # Allow Any for implicit lit conversion
|
|
2031
|
+
flowfile_formulas: list[str] | None = None,
|
|
2032
|
+
output_column_names: list[str] | None = None,
|
|
2033
|
+
description: str | None = None,
|
|
2034
|
+
**named_exprs: Expr | Any, # Allow Any for implicit lit conversion
|
|
1935
2035
|
) -> "FlowFrame":
|
|
1936
2036
|
"""
|
|
1937
2037
|
Add or replace columns in the DataFrame.
|
|
1938
2038
|
"""
|
|
1939
2039
|
new_node_id = generate_node_id()
|
|
1940
2040
|
|
|
1941
|
-
all_input_expr_objects:
|
|
1942
|
-
pure_polars_expr_strings_for_wc:
|
|
1943
|
-
collected_raw_definitions:
|
|
2041
|
+
all_input_expr_objects: list[Expr] = []
|
|
2042
|
+
pure_polars_expr_strings_for_wc: list[str] = []
|
|
2043
|
+
collected_raw_definitions: list[str] = []
|
|
1944
2044
|
has_exprs_or_named_exprs = bool(exprs or named_exprs)
|
|
1945
2045
|
if has_exprs_or_named_exprs:
|
|
1946
|
-
actual_exprs_to_process:
|
|
2046
|
+
actual_exprs_to_process: list[Expr] = []
|
|
1947
2047
|
temp_exprs_iterable = list(_parse_inputs_as_iterable(exprs))
|
|
1948
2048
|
|
|
1949
2049
|
for item in temp_exprs_iterable:
|
|
@@ -1974,38 +2074,43 @@ class FlowFrame:
|
|
|
1974
2074
|
if collected_raw_definitions:
|
|
1975
2075
|
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
|
|
1976
2076
|
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
1977
|
-
final_code_for_node =
|
|
1978
|
-
|
|
1979
|
-
|
|
2077
|
+
final_code_for_node = (
|
|
2078
|
+
definitions_section + "\n#─────SPLIT─────\n\n" + f"output_df = {polars_operation_code}"
|
|
2079
|
+
)
|
|
1980
2080
|
else:
|
|
1981
2081
|
final_code_for_node = polars_operation_code
|
|
1982
2082
|
|
|
1983
|
-
pl_expressions_for_fallback = [
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
2083
|
+
pl_expressions_for_fallback = [
|
|
2084
|
+
e.expr
|
|
2085
|
+
for e in all_input_expr_objects
|
|
2086
|
+
if isinstance(e, Expr) and hasattr(e, "expr") and e.expr is not None
|
|
2087
|
+
]
|
|
2088
|
+
self._add_polars_code(
|
|
2089
|
+
new_node_id,
|
|
2090
|
+
final_code_for_node,
|
|
2091
|
+
description,
|
|
2092
|
+
method_name="with_columns",
|
|
2093
|
+
convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
|
|
2094
|
+
polars_expr=pl_expressions_for_fallback,
|
|
2095
|
+
)
|
|
1988
2096
|
return self._create_child_frame(new_node_id)
|
|
1989
2097
|
|
|
1990
2098
|
elif flowfile_formulas is not None and output_column_names is not None:
|
|
1991
|
-
|
|
1992
2099
|
if len(output_column_names) != len(flowfile_formulas):
|
|
1993
|
-
raise ValueError(
|
|
1994
|
-
"Length of both the formulas and the output columns names must be identical"
|
|
1995
|
-
)
|
|
2100
|
+
raise ValueError("Length of both the formulas and the output columns names must be identical")
|
|
1996
2101
|
|
|
1997
2102
|
if len(flowfile_formulas) == 1:
|
|
1998
2103
|
return self._with_flowfile_formula(flowfile_formulas[0], output_column_names[0], description)
|
|
1999
2104
|
ff = self
|
|
2000
|
-
for i, (flowfile_formula, output_column_name) in enumerate(
|
|
2105
|
+
for i, (flowfile_formula, output_column_name) in enumerate(
|
|
2106
|
+
zip(flowfile_formulas, output_column_names, strict=False)
|
|
2107
|
+
):
|
|
2001
2108
|
ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
|
|
2002
2109
|
return ff
|
|
2003
2110
|
else:
|
|
2004
2111
|
raise ValueError("Either exprs/named_exprs or flowfile_formulas with output_column_names must be provided")
|
|
2005
2112
|
|
|
2006
|
-
def with_row_index(
|
|
2007
|
-
self, name: str = "index", offset: int = 0, description: str = None
|
|
2008
|
-
) -> "FlowFrame":
|
|
2113
|
+
def with_row_index(self, name: str = "index", offset: int = 0, description: str = None) -> "FlowFrame":
|
|
2009
2114
|
"""
|
|
2010
2115
|
Add a row index as the first column in the DataFrame.
|
|
2011
2116
|
|
|
@@ -2052,9 +2157,7 @@ class FlowFrame:
|
|
|
2052
2157
|
else:
|
|
2053
2158
|
# Use the polars code approach for other cases
|
|
2054
2159
|
code = f"input_df.with_row_index(name='{name}', offset={offset})"
|
|
2055
|
-
self._add_polars_code(
|
|
2056
|
-
new_node_id, code, description or f"Add row index column '{name}'"
|
|
2057
|
-
)
|
|
2160
|
+
self._add_polars_code(new_node_id, code, description or f"Add row index column '{name}'")
|
|
2058
2161
|
|
|
2059
2162
|
return self._create_child_frame(new_node_id)
|
|
2060
2163
|
|
|
@@ -2088,9 +2191,7 @@ class FlowFrame:
|
|
|
2088
2191
|
all_columns = []
|
|
2089
2192
|
|
|
2090
2193
|
if isinstance(columns, (list, tuple)):
|
|
2091
|
-
all_columns.extend(
|
|
2092
|
-
[col.column_name if isinstance(col, Column) else col for col in columns]
|
|
2093
|
-
)
|
|
2194
|
+
all_columns.extend([col.column_name if isinstance(col, Column) else col for col in columns])
|
|
2094
2195
|
else:
|
|
2095
2196
|
all_columns.append(columns.column_name if isinstance(columns, Column) else columns)
|
|
2096
2197
|
|
|
@@ -2099,10 +2200,9 @@ class FlowFrame:
|
|
|
2099
2200
|
all_columns.append(col.column_name if isinstance(col, Column) else col)
|
|
2100
2201
|
|
|
2101
2202
|
if len(all_columns) == 1:
|
|
2102
|
-
|
|
2103
2203
|
columns_str = stringify_values(all_columns[0])
|
|
2104
2204
|
else:
|
|
2105
|
-
columns_str = "[" + ", ".join([
|
|
2205
|
+
columns_str = "[" + ", ".join([stringify_values(col) for col in all_columns]) + "]"
|
|
2106
2206
|
|
|
2107
2207
|
code = f"""
|
|
2108
2208
|
# Explode columns into multiple rows
|
|
@@ -2117,24 +2217,25 @@ class FlowFrame:
|
|
|
2117
2217
|
|
|
2118
2218
|
return self._create_child_frame(new_node_id)
|
|
2119
2219
|
|
|
2120
|
-
def fuzzy_match(
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2220
|
+
def fuzzy_match(
|
|
2221
|
+
self,
|
|
2222
|
+
other: "FlowFrame",
|
|
2223
|
+
fuzzy_mappings: list[FuzzyMapping],
|
|
2224
|
+
description: str = None,
|
|
2225
|
+
) -> "FlowFrame":
|
|
2125
2226
|
self._ensure_same_graph(other)
|
|
2126
2227
|
|
|
2127
2228
|
# Step 3: Generate new node ID
|
|
2128
2229
|
new_node_id = generate_node_id()
|
|
2129
|
-
node_fuzzy_match = input_schema.NodeFuzzyMatch(
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2230
|
+
node_fuzzy_match = input_schema.NodeFuzzyMatch(
|
|
2231
|
+
flow_id=self.flow_graph.flow_id,
|
|
2232
|
+
node_id=new_node_id,
|
|
2233
|
+
join_input=transform_schema.FuzzyMatchInput(
|
|
2234
|
+
join_mapping=fuzzy_mappings, left_select=self.columns, right_select=other.columns
|
|
2235
|
+
),
|
|
2236
|
+
description=description or "Fuzzy match between two FlowFrames",
|
|
2237
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
2238
|
+
)
|
|
2138
2239
|
self.flow_graph.add_fuzzy_match(node_fuzzy_match)
|
|
2139
2240
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
2140
2241
|
other._add_connection(other.node_id, new_node_id, "right")
|
|
@@ -2213,7 +2314,7 @@ class FlowFrame:
|
|
|
2213
2314
|
|
|
2214
2315
|
def unique(
|
|
2215
2316
|
self,
|
|
2216
|
-
subset: Union[str, "Expr",
|
|
2317
|
+
subset: Union[str, "Expr", list[Union[str, "Expr"]]] = None,
|
|
2217
2318
|
*,
|
|
2218
2319
|
keep: Literal["first", "last", "any", "none"] = "any",
|
|
2219
2320
|
maintain_order: bool = False,
|
|
@@ -2270,17 +2371,11 @@ class FlowFrame:
|
|
|
2270
2371
|
break
|
|
2271
2372
|
|
|
2272
2373
|
# Determine if we can use the native implementation
|
|
2273
|
-
can_use_native =
|
|
2274
|
-
can_use_native
|
|
2275
|
-
and keep in ["any", "first", "last", "none"]
|
|
2276
|
-
and not maintain_order
|
|
2277
|
-
)
|
|
2374
|
+
can_use_native = can_use_native and keep in ["any", "first", "last", "none"] and not maintain_order
|
|
2278
2375
|
|
|
2279
2376
|
if can_use_native:
|
|
2280
2377
|
# Use the native NodeUnique implementation
|
|
2281
|
-
unique_input = transform_schema.UniqueInput(
|
|
2282
|
-
columns=processed_subset, strategy=keep
|
|
2283
|
-
)
|
|
2378
|
+
unique_input = transform_schema.UniqueInput(columns=processed_subset, strategy=keep)
|
|
2284
2379
|
|
|
2285
2380
|
# Create node settings
|
|
2286
2381
|
unique_settings = input_schema.NodeUnique(
|
|
@@ -2333,12 +2428,12 @@ class FlowFrame:
|
|
|
2333
2428
|
return self._create_child_frame(new_node_id)
|
|
2334
2429
|
|
|
2335
2430
|
@property
|
|
2336
|
-
def columns(self) ->
|
|
2431
|
+
def columns(self) -> list[str]:
|
|
2337
2432
|
"""Get the column names."""
|
|
2338
2433
|
return self.data.collect_schema().names()
|
|
2339
2434
|
|
|
2340
2435
|
@property
|
|
2341
|
-
def dtypes(self) ->
|
|
2436
|
+
def dtypes(self) -> list[pl.DataType]:
|
|
2342
2437
|
"""Get the column data types."""
|
|
2343
2438
|
return self.data.dtypes
|
|
2344
2439
|
|