Flowfile 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backends/main.py +25 -22
- build_backends/main_prd.py +10 -19
- flowfile/__init__.py +178 -74
- flowfile/__main__.py +10 -7
- flowfile/api.py +51 -57
- flowfile/web/__init__.py +14 -9
- flowfile/web/static/assets/AdminView-49392a9a.js +713 -0
- flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
- flowfile/web/static/assets/CloudConnectionView-36bcd6df.css +72 -0
- flowfile/web/static/assets/{CloudConnectionManager-0dfba9f2.js → CloudConnectionView-f13f202b.js} +11 -11
- flowfile/web/static/assets/{CloudStorageReader-d5b1b6c9.js → CloudStorageReader-0023d4a5.js} +10 -8
- flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
- flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
- flowfile/web/static/assets/{CloudStorageWriter-00d87aad.js → CloudStorageWriter-8e781e11.js} +10 -8
- flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
- flowfile/web/static/assets/{ColumnSelector-4685e75d.js → ColumnSelector-8ad68ea9.js} +3 -5
- flowfile/web/static/assets/{ContextMenu-c13f91d0.css → ContextMenu-26d4dd27.css} +6 -6
- flowfile/web/static/assets/{ContextMenu-23e909da.js → ContextMenu-31ee57f0.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-70ae0c79.js → ContextMenu-69a74055.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-f149cf7c.js → ContextMenu-8e2051c6.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-4c74eef1.css → ContextMenu-8ec1729e.css} +6 -6
- flowfile/web/static/assets/{ContextMenu-63cfa99b.css → ContextMenu-9b310c60.css} +6 -6
- flowfile/web/static/assets/{CrossJoin-702a3edd.js → CrossJoin-03df6938.js} +12 -10
- flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
- flowfile/web/static/assets/CustomNode-59e99a86.css +32 -0
- flowfile/web/static/assets/{CustomNode-b1519993.js → CustomNode-8479239b.js} +36 -24
- flowfile/web/static/assets/{DatabaseConnectionSettings-6f3e4ea5.js → DatabaseConnectionSettings-869e3efd.js} +5 -4
- flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-e91df89a.css} +13 -13
- flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-36898a00.css} +24 -24
- flowfile/web/static/assets/{DatabaseReader-d38c7295.js → DatabaseReader-c58b9552.js} +25 -15
- flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
- flowfile/web/static/assets/{DatabaseManager-cf5ef661.js → DatabaseView-d26a9140.js} +11 -11
- flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-217a99f1.css} +19 -19
- flowfile/web/static/assets/{DatabaseWriter-b04ef46a.js → DatabaseWriter-4d05ddc7.js} +17 -10
- flowfile/web/static/assets/{designer-8da3ba3a.css → DesignerView-a6d0ee84.css} +614 -546
- flowfile/web/static/assets/{designer-9633482a.js → DesignerView-e6f5c0e8.js} +1107 -3170
- flowfile/web/static/assets/{documentation-ca400224.js → DocumentationView-2e78ef1b.js} +5 -5
- flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-fd46c656.css} +7 -7
- flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
- flowfile/web/static/assets/{ExploreData-5fa10ed8.js → ExploreData-7b54caca.js} +18 -9
- flowfile/web/static/assets/{ExternalSource-d39af878.js → ExternalSource-3fa399b2.js} +9 -7
- flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-47ab05a3.css} +17 -17
- flowfile/web/static/assets/Filter-7494ea97.css +48 -0
- flowfile/web/static/assets/Filter-8cbbdbf3.js +287 -0
- flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
- flowfile/web/static/assets/{Formula-6b04fb1d.js → Formula-aac42b1e.js} +13 -11
- flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
- flowfile/web/static/assets/{FuzzyMatch-999521f4.js → FuzzyMatch-cd9bbfca.js} +12 -10
- flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-c24dec17.css} +5 -5
- flowfile/web/static/assets/{GraphSolver-17dd2198.js → GraphSolver-c7e6780e.js} +13 -11
- flowfile/web/static/assets/{GroupBy-6b039e18.js → GroupBy-93c5d22b.js} +9 -7
- flowfile/web/static/assets/{GroupBy-b9505323.css → GroupBy-be7ac0bf.css} +10 -10
- flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
- flowfile/web/static/assets/{Join-24d0f113.js → Join-a19b2de2.js} +13 -11
- flowfile/web/static/assets/LoginView-0df4ed0a.js +134 -0
- flowfile/web/static/assets/LoginView-d325d632.css +172 -0
- flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
- flowfile/web/static/assets/{ManualInput-34639209.js → ManualInput-8d3374b2.js} +170 -116
- flowfile/web/static/assets/{MultiSelect-0e8724a3.js → MultiSelect-ad1b6243.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js → MultiSelect.vue_vue_type_script_setup_true_lang-e278950d.js} +1 -1
- flowfile/web/static/assets/NodeDesigner-40b647c9.js +2610 -0
- flowfile/web/static/assets/NodeDesigner-5f53be3f.css +1429 -0
- flowfile/web/static/assets/{NumericInput-3d63a470.js → NumericInput-7100234c.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js → NumericInput.vue_vue_type_script_setup_true_lang-5130219f.js} +5 -2
- flowfile/web/static/assets/{Output-283fe388.css → Output-35e97000.css} +6 -6
- flowfile/web/static/assets/{Output-edea9802.js → Output-f5efd2aa.js} +12 -9
- flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
- flowfile/web/static/assets/{Pivot-61d19301.js → Pivot-d981d23c.js} +11 -9
- flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
- flowfile/web/static/assets/{PivotValidation-f97fec5b.js → PivotValidation-39386e95.js} +3 -3
- flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
- flowfile/web/static/assets/{PivotValidation-de9f43fe.js → PivotValidation-63de1f73.js} +3 -3
- flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
- flowfile/web/static/assets/{PolarsCode-bc3c9984.js → PolarsCode-f9d69217.js} +18 -9
- flowfile/web/static/assets/PopOver-b22f049e.js +939 -0
- flowfile/web/static/assets/PopOver-d96599db.css +33 -0
- flowfile/web/static/assets/{Read-e808b239.css → Read-36e7bd51.css} +12 -12
- flowfile/web/static/assets/{Read-64a3f259.js → Read-aec2e377.js} +14 -11
- flowfile/web/static/assets/{RecordCount-3d5039be.js → RecordCount-78ed6845.js} +6 -4
- flowfile/web/static/assets/{RecordId-597510e0.js → RecordId-2156e890.js} +8 -6
- flowfile/web/static/assets/{SQLQueryComponent-36cef432.css → SQLQueryComponent-1c2f26b4.css} +5 -5
- flowfile/web/static/assets/{SQLQueryComponent-df51adbe.js → SQLQueryComponent-48c72f5b.js} +3 -3
- flowfile/web/static/assets/{Sample-4be0a507.js → Sample-1352ca74.js} +6 -4
- flowfile/web/static/assets/SecretSelector-22b5ff89.js +113 -0
- flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
- flowfile/web/static/assets/{SecretManager-4839be57.js → SecretsView-17df66ee.js} +35 -36
- flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
- flowfile/web/static/assets/{Select-9b72f201.js → Select-0aee4c54.js} +9 -7
- flowfile/web/static/assets/{SettingsSection-f0f75a42.js → SettingsSection-0784e157.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-e1e9c953.js → SettingsSection-cd341bb6.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-7ded385d.js → SettingsSection-f2002a6d.js} +3 -3
- flowfile/web/static/assets/{SingleSelect-6c777aac.js → SingleSelect-460cc0ea.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js → SingleSelect.vue_vue_type_script_setup_true_lang-30741bb2.js} +1 -1
- flowfile/web/static/assets/{SliderInput-7cb93e62.js → SliderInput-5d926864.js} +7 -4
- flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
- flowfile/web/static/assets/{Sort-6cbde21a.js → Sort-3cdc971b.js} +9 -7
- flowfile/web/static/assets/{Unique-f9fb0809.css → Sort-8a871341.css} +10 -10
- flowfile/web/static/assets/{TextInput-d9a40c11.js → TextInput-a2d0bfbd.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-5896c375.js → TextInput.vue_vue_type_script_setup_true_lang-abad1ca2.js} +5 -2
- flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
- flowfile/web/static/assets/{TextToRows-c4fcbf4d.js → TextToRows-918945f7.js} +11 -10
- flowfile/web/static/assets/{ToggleSwitch-4ef91d19.js → ToggleSwitch-f0ef5196.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-5605c793.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-54d2f518.css} +6 -6
- flowfile/web/static/assets/{UnavailableFields-a03f512c.js → UnavailableFields-bdad6144.js} +4 -4
- flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
- flowfile/web/static/assets/{Union-bfe9b996.js → Union-e8ab8c86.js} +8 -6
- flowfile/web/static/assets/{Unique-5d023a27.js → Unique-8cd4f976.js} +13 -10
- flowfile/web/static/assets/{Sort-3643d625.css → Unique-9fb2f567.css} +10 -10
- flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-710a2948.css} +7 -7
- flowfile/web/static/assets/{Unpivot-91cc5354.js → Unpivot-8da14095.js} +10 -8
- flowfile/web/static/assets/{UnpivotValidation-7ee2de44.js → UnpivotValidation-6f7d89ff.js} +3 -3
- flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
- flowfile/web/static/assets/{VueGraphicWalker-e51b9924.js → VueGraphicWalker-3fb312e1.js} +4 -4
- flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
- flowfile/web/static/assets/{api-cf1221f0.js → api-24483f0d.js} +1 -1
- flowfile/web/static/assets/{api-c1bad5ca.js → api-8b81fa73.js} +1 -1
- flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-3d8dc5fa.css} +40 -40
- flowfile/web/static/assets/{dropDown-614b998d.js → dropDown-ac0fda9d.js} +3 -3
- flowfile/web/static/assets/{fullEditor-f7971590.js → fullEditor-5497a84a.js} +11 -10
- flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-a0be62b3.css} +74 -62
- flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
- flowfile/web/static/assets/{genericNodeSettings-4fe5f36b.js → genericNodeSettings-99014e1d.js} +5 -5
- flowfile/web/static/assets/index-07dda503.js +38 -0
- flowfile/web/static/assets/index-3ba44389.js +2696 -0
- flowfile/web/static/assets/{index-50508d4d.css → index-e6289dd0.css} +1945 -569
- flowfile/web/static/assets/{index-5429bbf8.js → index-fb6493ae.js} +41626 -40867
- flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
- flowfile/web/static/assets/nodeInput-0eb13f1a.js +2 -0
- flowfile/web/static/assets/{outputCsv-076b85ab.js → outputCsv-8f8ba42d.js} +3 -3
- flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
- flowfile/web/static/assets/{outputExcel-0fd17dbe.js → outputExcel-393f4fef.js} +3 -3
- flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
- flowfile/web/static/assets/{outputParquet-b61e0847.js → outputParquet-07c81f65.js} +4 -4
- flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
- flowfile/web/static/assets/{readCsv-a8bb8b61.js → readCsv-07f6d9ad.js} +3 -3
- flowfile/web/static/assets/{readCsv-c767cb37.css → readCsv-3bfac4c3.css} +15 -15
- flowfile/web/static/assets/{readExcel-806d2826.css → readExcel-3db6b763.css} +13 -13
- flowfile/web/static/assets/{readExcel-67b4aee0.js → readExcel-ed69bc8f.js} +5 -5
- flowfile/web/static/assets/{readParquet-48c81530.css → readParquet-c5244ad5.css} +4 -4
- flowfile/web/static/assets/{readParquet-92ce1dbc.js → readParquet-e3ed4528.js} +3 -3
- flowfile/web/static/assets/secrets.api-002e7d7e.js +65 -0
- flowfile/web/static/assets/{selectDynamic-92e25ee3.js → selectDynamic-80b92899.js} +5 -5
- flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
- flowfile/web/static/assets/{vue-codemirror.esm-41b0e0d7.js → vue-codemirror.esm-0965f39f.js} +31 -640
- flowfile/web/static/assets/{vue-content-loader.es-2c8e608f.js → vue-content-loader.es-c506ad97.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/METADATA +2 -3
- flowfile-0.5.3.dist-info/RECORD +402 -0
- flowfile_core/__init__.py +13 -6
- flowfile_core/auth/jwt.py +51 -16
- flowfile_core/auth/models.py +32 -7
- flowfile_core/auth/password.py +89 -0
- flowfile_core/auth/secrets.py +8 -6
- flowfile_core/configs/__init__.py +9 -7
- flowfile_core/configs/flow_logger.py +15 -14
- flowfile_core/configs/node_store/__init__.py +72 -4
- flowfile_core/configs/node_store/nodes.py +155 -172
- flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
- flowfile_core/configs/settings.py +28 -15
- flowfile_core/database/connection.py +7 -6
- flowfile_core/database/init_db.py +96 -2
- flowfile_core/database/models.py +3 -1
- flowfile_core/fileExplorer/__init__.py +17 -0
- flowfile_core/fileExplorer/funcs.py +123 -57
- flowfile_core/fileExplorer/utils.py +10 -11
- flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
- flowfile_core/flowfile/analytics/analytics_processor.py +26 -24
- flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
- flowfile_core/flowfile/analytics/utils.py +1 -1
- flowfile_core/flowfile/code_generator/code_generator.py +358 -244
- flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
- flowfile_core/flowfile/database_connection_manager/models.py +1 -1
- flowfile_core/flowfile/extensions.py +17 -12
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +115 -83
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +481 -423
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +31 -20
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +14 -15
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
- flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +190 -127
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
- flowfile_core/flowfile/flow_data_engine/utils.py +99 -67
- flowfile_core/flowfile/flow_graph.py +918 -571
- flowfile_core/flowfile/flow_graph_utils.py +31 -49
- flowfile_core/flowfile/flow_node/flow_node.py +330 -233
- flowfile_core/flowfile/flow_node/models.py +53 -41
- flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
- flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
- flowfile_core/flowfile/handler.py +80 -30
- flowfile_core/flowfile/manage/compatibility_enhancements.py +209 -126
- flowfile_core/flowfile/manage/io_flowfile.py +54 -57
- flowfile_core/flowfile/node_designer/__init__.py +15 -13
- flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
- flowfile_core/flowfile/node_designer/custom_node.py +162 -36
- flowfile_core/flowfile/node_designer/ui_components.py +135 -34
- flowfile_core/flowfile/schema_callbacks.py +71 -51
- flowfile_core/flowfile/setting_generator/__init__.py +0 -1
- flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
- flowfile_core/flowfile/setting_generator/settings.py +64 -53
- flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
- flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
- flowfile_core/flowfile/util/calculate_layout.py +9 -13
- flowfile_core/flowfile/util/execution_orderer.py +25 -17
- flowfile_core/flowfile/util/node_skipper.py +4 -4
- flowfile_core/flowfile/utils.py +19 -21
- flowfile_core/main.py +26 -19
- flowfile_core/routes/auth.py +284 -11
- flowfile_core/routes/cloud_connections.py +25 -25
- flowfile_core/routes/logs.py +21 -29
- flowfile_core/routes/public.py +3 -3
- flowfile_core/routes/routes.py +70 -34
- flowfile_core/routes/secrets.py +25 -27
- flowfile_core/routes/user_defined_components.py +483 -4
- flowfile_core/run_lock.py +0 -1
- flowfile_core/schemas/__init__.py +4 -6
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
- flowfile_core/schemas/cloud_storage_schemas.py +59 -53
- flowfile_core/schemas/input_schema.py +231 -144
- flowfile_core/schemas/output_model.py +49 -34
- flowfile_core/schemas/schemas.py +116 -89
- flowfile_core/schemas/transform_schema.py +518 -263
- flowfile_core/schemas/yaml_types.py +21 -7
- flowfile_core/secret_manager/secret_manager.py +17 -13
- flowfile_core/types.py +29 -9
- flowfile_core/utils/arrow_reader.py +7 -6
- flowfile_core/utils/excel_file_manager.py +3 -3
- flowfile_core/utils/fileManager.py +7 -7
- flowfile_core/utils/fl_executor.py +8 -10
- flowfile_core/utils/utils.py +4 -4
- flowfile_core/utils/validate_setup.py +5 -4
- flowfile_frame/__init__.py +106 -51
- flowfile_frame/adapters.py +2 -9
- flowfile_frame/adding_expr.py +73 -32
- flowfile_frame/cloud_storage/frame_helpers.py +27 -23
- flowfile_frame/cloud_storage/secret_manager.py +12 -26
- flowfile_frame/config.py +2 -5
- flowfile_frame/expr.py +311 -218
- flowfile_frame/expr.pyi +160 -159
- flowfile_frame/expr_name.py +23 -23
- flowfile_frame/flow_frame.py +571 -476
- flowfile_frame/flow_frame.pyi +123 -104
- flowfile_frame/flow_frame_methods.py +227 -246
- flowfile_frame/group_frame.py +50 -20
- flowfile_frame/join.py +2 -2
- flowfile_frame/lazy.py +129 -87
- flowfile_frame/lazy_methods.py +83 -30
- flowfile_frame/list_name_space.py +55 -50
- flowfile_frame/selectors.py +148 -68
- flowfile_frame/series.py +9 -7
- flowfile_frame/utils.py +19 -21
- flowfile_worker/__init__.py +12 -7
- flowfile_worker/configs.py +11 -19
- flowfile_worker/create/__init__.py +14 -9
- flowfile_worker/create/funcs.py +114 -77
- flowfile_worker/create/models.py +46 -43
- flowfile_worker/create/pl_types.py +14 -15
- flowfile_worker/create/read_excel_tables.py +34 -41
- flowfile_worker/create/utils.py +22 -19
- flowfile_worker/external_sources/s3_source/main.py +18 -51
- flowfile_worker/external_sources/s3_source/models.py +34 -27
- flowfile_worker/external_sources/sql_source/main.py +8 -5
- flowfile_worker/external_sources/sql_source/models.py +13 -9
- flowfile_worker/flow_logger.py +10 -8
- flowfile_worker/funcs.py +214 -155
- flowfile_worker/main.py +11 -17
- flowfile_worker/models.py +35 -28
- flowfile_worker/process_manager.py +2 -3
- flowfile_worker/routes.py +121 -90
- flowfile_worker/secrets.py +9 -6
- flowfile_worker/spawner.py +80 -49
- flowfile_worker/utils.py +3 -2
- shared/__init__.py +2 -7
- shared/storage_config.py +25 -13
- test_utils/postgres/commands.py +3 -2
- test_utils/postgres/fixtures.py +9 -9
- test_utils/s3/commands.py +1 -1
- test_utils/s3/data_generator.py +3 -4
- test_utils/s3/demo_data_generator.py +4 -7
- test_utils/s3/fixtures.py +7 -5
- tools/migrate/__init__.py +1 -1
- tools/migrate/__main__.py +16 -29
- tools/migrate/legacy_schemas.py +251 -190
- tools/migrate/migrate.py +193 -181
- tools/migrate/tests/conftest.py +1 -3
- tools/migrate/tests/test_migrate.py +36 -41
- tools/migrate/tests/test_migration_e2e.py +28 -29
- tools/migrate/tests/test_node_migrations.py +50 -20
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
- flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
- flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
- flowfile/web/static/assets/Filter-9b6d08db.js +0 -164
- flowfile/web/static/assets/Filter-f62091b3.css +0 -20
- flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
- flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +0 -41
- flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
- flowfile/web/static/assets/secretApi-68435402.js +0 -46
- flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
- flowfile-0.5.1.dist-info/RECORD +0 -388
- {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/WHEEL +0 -0
- {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/entry_points.txt +0 -0
- {flowfile-0.5.1.dist-info → flowfile-0.5.3.dist-info}/licenses/LICENSE +0 -0
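
The remainder of this page is the diff of flowfile_core/flowfile/flow_data_engine/flow_data_engine.py (+481 -423), the largest Python change in this release. Among the touched helpers is _handle_duplication_join_keys, which temporarily copies join-key columns under a "__FL_TEMP__" prefix and records a reverse-rename map so the keys can be restored after the join. The snippet below is a standalone sketch of that rename/join/rename-back pattern in plain Polars; the frames, key names, and output names are invented for illustration and are not Flowfile code.

import polars as pl

left = pl.LazyFrame({"id": [1, 2, 3], "value_left": ["a", "b", "c"]})
right = pl.LazyFrame({"id": [2, 3, 4], "value_right": ["x", "y", "z"]})

join_keys = ["id"]
temp_names = {key: "__FL_TEMP__" + key for key in join_keys}

# Copy the right-hand join keys under temporary names so they survive the join.
right = right.with_columns([pl.col(key).alias(tmp) for key, tmp in temp_names.items()])

joined = left.join(right, on=join_keys, how="left")

# Reverse action: rename the temporary columns back to a public output name.
reverse_actions = {tmp: f"right_{key}" for key, tmp in temp_names.items()}
print(joined.rename(reverse_actions).collect())

Copying the keys before the join keeps them available afterwards even though Polars coalesces or drops the right-hand join columns in this kind of join.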
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py

@@ -1,52 +1,50 @@
 # Standard library imports
+from __future__ import annotations
+
 import logging
 import os
+from collections.abc import Callable, Generator, Iterable
 from copy import deepcopy
 from dataclasses import dataclass
 from math import ceil
-from typing import Any,
+from typing import Any, Literal, TypeVar, Union

-
+import polars as pl

 # Third-party imports
 from loky import Future
-import
+from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
 from polars.exceptions import PanicException
-from polars_grouper import graph_solver
 from polars_expr_transformer import simple_function_to_expr as to_expr
+from polars_grouper import graph_solver
 from pyarrow import Table as PaTable
 from pyarrow.parquet import ParquetFile

 # Local imports - Core
 from flowfile_core.configs import logger
-from flowfile_core.utils.utils import ensure_similarity_dicts
 from flowfile_core.configs.flow_logger import NodeLogger
-from flowfile_core.schemas import (
-    cloud_storage_schemas,
-    input_schema,
-    transform_schema as transform_schemas
-)
-from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location

 # Local imports - Flow File Components
 from flowfile_core.flowfile.flow_data_engine import utils
-from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
-
-
+from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
+    CloudStorageReader,
+    ensure_path_has_wildcard_pattern,
+    get_first_file_from_s3_dir,
+)
 from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
     assert_if_flowfile_schema,
-    convert_stats_to_column_info
+    convert_stats_to_column_info,
 )
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
-
-    verify_join_map_integrity,
-    rename_df_table_for_join,
+    get_col_name_to_delete,
     get_undo_rename_mapping_join,
-
+    rename_df_table_for_join,
+    verify_join_map_integrity,
+    verify_join_select_integrity,
 )
 from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
 from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data

@@ -55,19 +53,21 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
     ExternalDfFetcher,
     ExternalExecutorTracker,
     ExternalFuzzyMatchFetcher,
-    fetch_unique_values
-)
-from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
-    get_join_count,
-    write_threaded
+    fetch_unique_values,
 )
-
+from flowfile_core.flowfile.flow_data_engine.threaded_processes import write_threaded
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
+from flowfile_core.schemas import cloud_storage_schemas, input_schema
+from flowfile_core.schemas import transform_schema as transform_schemas
+from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
+from flowfile_core.utils.utils import ensure_similarity_dicts

-T = TypeVar(
+T = TypeVar("T", pl.DataFrame, pl.LazyFrame)


-def _handle_duplication_join_keys(
+def _handle_duplication_join_keys(
+    left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager
+) -> tuple[T, T, dict[str, str]]:
     """Temporarily renames join keys to avoid conflicts during a join.

     This helper function checks the join type and renames the join key columns

@@ -88,20 +88,26 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transfo
     """

     def _construct_temp_name(column_name: str) -> str:
-        return "__FL_TEMP__"+column_name
+        return "__FL_TEMP__" + column_name

-    if join_manager.how ==
-        left_df = left_df.with_columns(
-
+    if join_manager.how == "right":
+        left_df = left_df.with_columns(
+            pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+            for jk in join_manager.left_manager.get_join_key_selects()
+        )
         reverse_actions = {
             _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
-            for jk in join_manager.left_manager.get_join_key_selects()
-
-
-
+            for jk in join_manager.left_manager.get_join_key_selects()
+        }
+    elif join_manager.how in ("left", "inner"):
+        right_df = right_df.with_columns(
+            pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+            for jk in join_manager.right_manager.get_join_key_selects()
+        )
         reverse_actions = {
             _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
-            for jk in join_manager.right_manager.get_join_key_selects()
+            for jk in join_manager.right_manager.get_join_key_selects()
+        }
     else:
         reverse_actions = {}
     return left_df, right_df, reverse_actions

@@ -118,12 +124,12 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
     Args:
         join_input: The JoinInput settings object to modify.
     """
-    if join_input.how in (
+    if join_input.how in ("semi", "anti"):
         for jk in join_input.right_select.renames:
             jk.keep = False


-def get_select_columns(full_select_input:
+def get_select_columns(full_select_input: list[transform_schemas.SelectInput]) -> list[str]:
     """Extracts a list of column names to be selected from a SelectInput list.

     This function filters a list of `SelectInput` objects to return the names

@@ -156,15 +162,16 @@ class FlowDataEngine:
         errors: A list of errors encountered during operations.
         _schema: A cached list of `FlowfileColumn` objects representing the schema.
     """
+
     # Core attributes
-    _data_frame:
-    columns:
+    _data_frame: pl.DataFrame | pl.LazyFrame
+    columns: list[Any]

     # Metadata attributes
     name: str = None
     number_of_records: int = None
-    errors:
-    _schema:
+    errors: list = None
+    _schema: list["FlowfileColumn"] | None = None

     # Configuration attributes
     _optimize_memory: bool = False

@@ -173,16 +180,16 @@ class FlowDataEngine:
     _calculate_schema_stats: bool = False

     # Cache and optimization attributes
-    __col_name_idx_map:
-    __data_map:
-    __optimized_columns:
+    __col_name_idx_map: dict = None
+    __data_map: dict = None
+    __optimized_columns: list = None
     __sample__: str = None
     __number_of_fields: int = None
-    _col_idx:
+    _col_idx: dict[str, int] = None

     # Source tracking
-    _org_path:
-    _external_source:
+    _org_path: str | None = None
+    _external_source: ExternalDataSource | None = None

     # State tracking
     sorted_by: int = None

@@ -195,17 +202,21 @@ class FlowDataEngine:
     _number_of_records_callback: Callable = None
     _data_callback: Callable = None

-    def __init__(
-
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        raw_data: Union[
+            list[dict], list[Any], dict[str, Any], "ParquetFile", pl.DataFrame, pl.LazyFrame, input_schema.RawData
+        ] = None,
+        path_ref: str = None,
+        name: str = None,
+        optimize_memory: bool = True,
+        schema: list["FlowfileColumn"] | list[str] | pl.Schema = None,
+        number_of_records: int = None,
+        calculate_schema_stats: bool = False,
+        streamable: bool = True,
+        number_of_records_callback: Callable = None,
+        data_callback: Callable = None,
+    ):
         """Initializes the FlowDataEngine from various data sources.

         Args:

@@ -265,12 +276,12 @@ class FlowDataEngine:
         elif isinstance(raw_data, (list, dict)):
             self._handle_python_data(raw_data)

-    def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records:
+    def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: int | None):
         """(Internal) Initializes the engine from an eager Polars DataFrame."""
         self.data_frame = df
         self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

-    def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records:
+    def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: int | None, optimize_memory: bool):
         """(Internal) Initializes the engine from a Polars LazyFrame."""
         self.data_frame = lf
         self._lazy = True

@@ -281,14 +292,14 @@ class FlowDataEngine:
         else:
             self.number_of_records = lf.select(pl.len()).collect()[0, 0]

-    def _handle_python_data(self, data:
+    def _handle_python_data(self, data: list | dict):
         """(Internal) Dispatches Python collections to the correct handler."""
         if isinstance(data, dict):
             self._handle_dict_input(data)
         else:
             self._handle_list_input(data)

-    def _handle_dict_input(self, data:
+    def _handle_dict_input(self, data: dict):
         """(Internal) Initializes the engine from a Python dictionary."""
         if len(data) == 0:
             self.initialize_empty_fl()

@@ -312,8 +323,12 @@ class FlowDataEngine:
             raw_data: An instance of `RawData` containing the data and schema.
         """
         flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
-        polars_schema = pl.Schema(
-
+        polars_schema = pl.Schema(
+            [
+                (flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+                for flowfile_column in flowfile_schema
+            ]
+        )
         try:
             df = pl.DataFrame(raw_data.data, polars_schema, strict=False)
         except TypeError as e:

@@ -323,7 +338,7 @@ class FlowDataEngine:
             self.data_frame = df.lazy()
             self.lazy = True

-    def _handle_list_input(self, data:
+    def _handle_list_input(self, data: list):
         """(Internal) Initializes the engine from a list of records."""
         number_of_records = len(data)
         if number_of_records > 0:

@@ -336,19 +351,19 @@ class FlowDataEngine:
             self.number_of_records = 0

     @staticmethod
-    def _process_list_data(data:
+    def _process_list_data(data: list) -> list[dict]:
         """(Internal) Normalizes list data into a list of dictionaries.

         Ensures that a list of objects or non-dict items is converted into a
         uniform list of dictionaries suitable for Polars DataFrame creation.
         """
-        if not (isinstance(data[0], dict) or hasattr(data[0],
+        if not (isinstance(data[0], dict) or hasattr(data[0], "__dict__")):
             try:
                 return pl.DataFrame(data).to_dicts()
             except TypeError:
-                raise Exception(
+                raise Exception("Value must be able to be converted to dictionary")
             except Exception as e:
-                raise Exception(f
+                raise Exception(f"Value must be able to be converted to dictionary: {e}")

         if not isinstance(data[0], dict):
             data = [row.__dict__ for row in data]

@@ -375,49 +390,37 @@ class FlowDataEngine:

         logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")

-        if write_settings.write_mode ==
+        if write_settings.write_mode == "append" and write_settings.file_format != "delta":
             raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
         storage_options = CloudStorageReader.get_storage_options(connection)
         credential_provider = CloudStorageReader.get_credential_provider(connection)
         # Dispatch to the correct writer based on file format
         if write_settings.file_format == "parquet":
             self._write_parquet_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
+                write_settings.resource_path, storage_options, credential_provider, write_settings
             )
         elif write_settings.file_format == "delta":
             self._write_delta_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
+                write_settings.resource_path, storage_options, credential_provider, write_settings
             )
         elif write_settings.file_format == "csv":
-            self._write_csv_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
-            )
+            self._write_csv_to_cloud(write_settings.resource_path, storage_options, credential_provider, write_settings)
         elif write_settings.file_format == "json":
             self._write_json_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
+                write_settings.resource_path, storage_options, credential_provider, write_settings
             )
         else:
             raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")

         logger.info(f"Successfully wrote data to {write_settings.resource_path}")

-    def _write_parquet_to_cloud(
-
-
-
-
+    def _write_parquet_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a Parquet file in cloud storage.

         Uses `sink_parquet` for efficient streaming writes. Falls back to a

@@ -437,18 +440,20 @@ class FlowDataEngine:
         except Exception as e:
             logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
             pl_df = self.collect()
-            sink_kwargs[
+            sink_kwargs["file"] = sink_kwargs.pop("path")
             pl_df.write_parquet(**sink_kwargs)

         except Exception as e:
             logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
             raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")

-    def _write_delta_to_cloud(
-
-
-
+    def _write_delta_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.

         This operation requires collecting the data first, as `write_delta` operates

@@ -464,11 +469,13 @@ class FlowDataEngine:
             sink_kwargs["credential_provider"] = credential_provider
         self.collect().write_delta(**sink_kwargs)

-    def _write_csv_to_cloud(
-
-
-
-
+    def _write_csv_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a CSV file in cloud storage.

         Uses `sink_csv` for efficient, streaming writes of the data.

@@ -490,11 +497,13 @@ class FlowDataEngine:
             logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
             raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")

-    def _write_json_to_cloud(
-
-
-
-
+    def _write_json_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.

         Uses `sink_ndjson` for efficient, streaming writes.

@@ -512,7 +521,9 @@ class FlowDataEngine:
             raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")

     @classmethod
-    def from_cloud_storage_obj(
+    def from_cloud_storage_obj(
+        cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal
+    ) -> "FlowDataEngine":
         """Creates a FlowDataEngine from an object in cloud storage.

         This method supports reading from various cloud storage providers like AWS S3,

@@ -549,31 +560,22 @@ class FlowDataEngine:
             )
         elif read_settings.file_format == "delta":
             return cls._read_delta_from_cloud(
-                read_settings.resource_path,
-                storage_options,
-                credential_provider,
-                read_settings
+                read_settings.resource_path, storage_options, credential_provider, read_settings
             )
         elif read_settings.file_format == "csv":
             return cls._read_csv_from_cloud(
-                read_settings.resource_path,
-                storage_options,
-                credential_provider,
-                read_settings
+                read_settings.resource_path, storage_options, credential_provider, read_settings
             )
         elif read_settings.file_format == "json":
             return cls._read_json_from_cloud(
                 read_settings.resource_path,
                 storage_options,
                 credential_provider,
-                read_settings.scan_mode == "directory"
+                read_settings.scan_mode == "directory",
             )
         elif read_settings.file_format == "iceberg":
             return cls._read_iceberg_from_cloud(
-                read_settings.resource_path,
-                storage_options,
-                credential_provider,
-                read_settings
+                read_settings.resource_path, storage_options, credential_provider, read_settings
             )

         elif read_settings.file_format in ["delta", "iceberg"]:

@@ -583,33 +585,40 @@ class FlowDataEngine:
             raise ValueError(f"Unsupported file format: {read_settings.file_format}")

     @staticmethod
-    def _get_schema_from_first_file_in_dir(
-
+    def _get_schema_from_first_file_in_dir(
+        source: str, storage_options: dict[str, Any], file_format: Literal["csv", "parquet", "json", "delta"]
+    ) -> list[FlowfileColumn] | None:
         """Infers the schema by scanning the first file in a cloud directory."""
         try:
             scan_func = getattr(pl, "scan_" + file_format)
             first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
-            return convert_stats_to_column_info(
-
+            return convert_stats_to_column_info(
+                FlowDataEngine._create_schema_stats_from_pl_schema(
+                    scan_func(first_file_ref, storage_options=storage_options).collect_schema()
+                )
+            )
         except Exception as e:
             logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")

-
     @classmethod
-    def _read_iceberg_from_cloud(
-
-
-
-
+    def _read_iceberg_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+    ) -> "FlowDataEngine":
         """Reads Iceberg table(s) from cloud storage."""
-        raise NotImplementedError(
+        raise NotImplementedError("Failed to read Iceberg table from cloud storage: Not yet implemented")

     @classmethod
-    def _read_parquet_from_cloud(
-
-
-
-
+    def _read_parquet_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        is_directory: bool,
+    ) -> "FlowDataEngine":
         """Reads Parquet file(s) from cloud storage."""
         try:
             # Use scan_parquet for lazy evaluation

@@ -633,7 +642,7 @@ class FlowDataEngine:
                 number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
                 optimize_memory=True,
                 streamable=True,
-                schema=schema
+                schema=schema,
             )

         except Exception as e:

@@ -641,18 +650,20 @@ class FlowDataEngine:
             raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")

     @classmethod
-    def _read_delta_from_cloud(
-
-
-
-
+    def _read_delta_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+    ) -> "FlowDataEngine":
         """Reads a Delta Lake table from cloud storage."""
         try:
             logger.info("Reading Delta file from cloud storage...")
             logger.info(f"read_settings: {read_settings}")
             scan_kwargs = {"source": resource_path}
             if read_settings.delta_version:
-                scan_kwargs[
+                scan_kwargs["version"] = read_settings.delta_version
             if storage_options:
                 scan_kwargs["storage_options"] = storage_options
             if credential_provider:

@@ -663,18 +674,20 @@ class FlowDataEngine:
                 lf,
                 number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
                 optimize_memory=True,
-                streamable=True
+                streamable=True,
             )
         except Exception as e:
             logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
             raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")

     @classmethod
-    def _read_csv_from_cloud(
-
-
-
-
+    def _read_csv_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+    ) -> "FlowDataEngine":
         """Reads CSV file(s) from cloud storage."""
         try:
             scan_kwargs = {

@@ -703,7 +716,7 @@ class FlowDataEngine:
                 number_of_records=6_666_666,  # Will be calculated lazily
                 optimize_memory=True,
                 streamable=True,
-                schema=schema
+                schema=schema,
             )

         except Exception as e:

@@ -711,11 +724,13 @@ class FlowDataEngine:
             raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")

     @classmethod
-    def _read_json_from_cloud(
-
-
-
-
+    def _read_json_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        is_directory: bool,
+    ) -> "FlowDataEngine":
         """Reads JSON file(s) from cloud storage."""
         try:
             if is_directory:

@@ -755,8 +770,9 @@ class FlowDataEngine:
         else:
             self.data_frame = pl.read_parquet(path_ref)

-    def _finalize_initialization(
-
+    def _finalize_initialization(
+        self, name: str, optimize_memory: bool, schema: Any | None, calculate_schema_stats: bool
+    ):
         """Finalizes initialization by setting remaining attributes."""
         _ = calculate_schema_stats
         self.name = name

@@ -803,23 +819,20 @@ class FlowDataEngine:
     def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
         """Sets the underlying Polars DataFrame or LazyFrame."""
         if self.lazy and isinstance(df, pl.DataFrame):
-            raise Exception(
+            raise Exception("Cannot set a non-lazy dataframe to a lazy flowfile")
         self._data_frame = df

     @staticmethod
-    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) ->
+    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> list[dict]:
         """Converts a Polars Schema into a list of schema statistics dictionaries."""
-        return [
-            dict(column_name=k, pl_datatype=v, col_index=i)
-            for i, (k, v) in enumerate(pl_schema.items())
-        ]
+        return [dict(column_name=k, pl_datatype=v, col_index=i) for i, (k, v) in enumerate(pl_schema.items())]

-    def _add_schema_from_schema_stats(self, schema_stats:
+    def _add_schema_from_schema_stats(self, schema_stats: list[dict]):
         """Populates the schema from a list of schema statistics dictionaries."""
         self._schema = convert_stats_to_column_info(schema_stats)

     @property
-    def schema(self) ->
+    def schema(self) -> list[FlowfileColumn]:
         """The schema of the DataFrame as a list of `FlowfileColumn` objects.

         This property lazily calculates the schema if it hasn't been determined yet.

@@ -866,8 +879,10 @@ class FlowDataEngine:
         if n_records is None:
             logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
         else:
-            logger.info(
-
+            logger.info(
+                f'Fetching {n_records} record(s) for Table object "{id(self)}". '
+                f"Settings: streaming={self._streamable}"
+            )

         if not self.lazy:
             return self.data_frame

@@ -881,16 +896,15 @@ class FlowDataEngine:
     def _collect_data(self, n_records: int = None) -> pl.DataFrame:
         """Internal method to handle data collection logic."""
         if n_records is None:
-
             self.collect_external()
             if self._streamable:
                 try:
-                    logger.info(
+                    logger.info("Collecting data in streaming mode")
                     return self.data_frame.collect(engine="streaming")
                 except PanicException:
                     self._streamable = False

-            logger.info(
+            logger.info("Collecting data in non-streaming mode")
             return self.data_frame.collect()

         if self.external_source is not None:

@@ -919,7 +933,7 @@ class FlowDataEngine:
             return self._create_partial_dataframe(ok_cols, error_cols, n_records)
         return self._create_empty_dataframe(n_records)

-    def _identify_valid_columns(self, n_records: int) ->
+    def _identify_valid_columns(self, n_records: int) -> tuple[list[str], list[tuple[str, Any]]]:
         """Identifies which columns can be collected successfully."""
         ok_cols = []
         error_cols = []

@@ -931,30 +945,30 @@ class FlowDataEngine:
                 error_cols.append((c, self.data_frame.schema[c]))
         return ok_cols, error_cols

-    def _create_partial_dataframe(
-
+    def _create_partial_dataframe(
+        self, ok_cols: list[str], error_cols: list[tuple[str, Any]], n_records: int
+    ) -> pl.DataFrame:
         """Creates a DataFrame with partial data for columns that could be collected."""
         df = self.data_frame.select(ok_cols)
-        df = df.with_columns([
-            pl.lit(None).alias(column_name).cast(data_type)
-            for column_name, data_type in error_cols
-        ])
+        df = df.with_columns([pl.lit(None).alias(column_name).cast(data_type) for column_name, data_type in error_cols])
         return df.select(self.columns).head(n_records).collect()

     def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
         """Creates an empty DataFrame with the correct schema."""
         if self.number_of_records > 0:
-            return pl.DataFrame(
-
-
-
-
-
-
+            return pl.DataFrame(
+                {
+                    column_name: pl.Series(
+                        name=column_name, values=[None] * min(self.number_of_records, n_records)
+                    ).cast(data_type)
+                    for column_name, data_type in self.data_frame.schema.items()
+                }
+            )
         return pl.DataFrame(schema=self.data_frame.schema)

-    def do_group_by(
-
+    def do_group_by(
+        self, group_by_input: transform_schemas.GroupByInput, calculate_schema_stats: bool = True
+    ) -> "FlowDataEngine":
         """Performs a group-by operation on the DataFrame.

         Args:

@@ -966,27 +980,23 @@ class FlowDataEngine:
         Returns:
             A new `FlowDataEngine` instance with the grouped and aggregated data.
         """
-        aggregations = [c for c in group_by_input.agg_cols if c.agg !=
-        group_columns = [c for c in group_by_input.agg_cols if c.agg ==
+        aggregations = [c for c in group_by_input.agg_cols if c.agg != "groupby"]
+        group_columns = [c for c in group_by_input.agg_cols if c.agg == "groupby"]

         if len(group_columns) == 0:
             return FlowDataEngine(
-                self.data_frame.select(
-
-                ),
-                calculate_schema_stats=calculate_schema_stats
+                self.data_frame.select(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
+                calculate_schema_stats=calculate_schema_stats,
             )

         df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
         group_by_columns = [n_c.new_name for n_c in group_columns]
         return FlowDataEngine(
-            df.group_by(*group_by_columns).agg(
-
-            ),
-            calculate_schema_stats=calculate_schema_stats
+            df.group_by(*group_by_columns).agg(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
+            calculate_schema_stats=calculate_schema_stats,
         )

-    def do_sort(self, sorts:
+    def do_sort(self, sorts: list[transform_schemas.SortByInput]) -> "FlowDataEngine":
         """Sorts the DataFrame by one or more columns.

         Args:

@@ -999,12 +1009,13 @@ class FlowDataEngine:
         if not sorts:
             return self

-        descending = [s.how ==
+        descending = [s.how == "desc" or s.how.lower() == "descending" for s in sorts]
         df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
         return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)

-    def change_column_types(
-
+    def change_column_types(
+        self, transforms: list[transform_schemas.SelectInput], calculate_schema: bool = False
+    ) -> "FlowDataEngine":
         """Changes the data type of one or more columns.

         Args:

@@ -1018,7 +1029,8 @@ class FlowDataEngine:
         dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
         idx_mapping = list(
             (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
-            for transform in transforms
+            for transform in transforms
+            if transform.data_type is not None
         )

         actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]

@@ -1032,10 +1044,10 @@ class FlowDataEngine:
             df,
             number_of_records=self.number_of_records,
             calculate_schema_stats=calculate_schema,
-            streamable=self._streamable
+            streamable=self._streamable,
         )

-    def save(self, path: str, data_type: str =
+    def save(self, path: str, data_type: str = "parquet") -> Future:
         """Saves the DataFrame to a file in a separate thread.

         Args:

@@ -1049,7 +1061,7 @@ class FlowDataEngine:
         df = deepcopy(self.data_frame)
         return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)

-    def to_pylist(self) ->
+    def to_pylist(self) -> list[dict]:
         """Converts the DataFrame to a list of Python dictionaries.

         Returns:

@@ -1083,15 +1095,15 @@ class FlowDataEngine:
         data = list(self.to_dict().values())
         return input_schema.RawData(columns=columns, data=data)

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, list]:
         """Converts the DataFrame to a Python dictionary of columns.

-
-
+        Each key in the dictionary is a column name, and the corresponding value
+        is a list of the data in that column.

-
-
-
+        Returns:
+            A dictionary mapping column names to lists of their values.
+        """
         if self.lazy:
             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
         else:

@@ -1131,7 +1143,7 @@ class FlowDataEngine:
             return cls(pl.read_sql(sql, conn))

     @classmethod
-    def create_from_schema(cls, schema:
+    def create_from_schema(cls, schema: list[FlowfileColumn]) -> "FlowDataEngine":
         """Creates an empty FlowDataEngine from a schema definition.

         Args:

@@ -1162,14 +1174,14 @@ class FlowDataEngine:
|
"""
|
|
1163
1175
|
received_table.set_absolute_filepath()
|
|
1164
1176
|
file_type_handlers = {
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1177
|
+
"csv": create_funcs.create_from_path_csv,
|
|
1178
|
+
"parquet": create_funcs.create_from_path_parquet,
|
|
1179
|
+
"excel": create_funcs.create_from_path_excel,
|
|
1168
1180
|
}
|
|
1169
1181
|
|
|
1170
1182
|
handler = file_type_handlers.get(received_table.file_type)
|
|
1171
1183
|
if not handler:
|
|
1172
|
-
raise Exception(f
|
|
1184
|
+
raise Exception(f"Cannot create from {received_table.file_type}")
|
|
1173
1185
|
|
|
1174
1186
|
flow_file = cls(handler(received_table))
|
|
1175
1187
|
flow_file._org_path = received_table.abs_file_path
|
|
@@ -1190,7 +1202,7 @@ class FlowDataEngine:
|
|
|
1190
1202
|
return cls(create_fake_data(number_of_records))
|
|
1191
1203
|
|
|
1192
1204
|
@classmethod
|
|
1193
|
-
def generate_enumerator(cls, length: int = 1000, output_name: str =
|
|
1205
|
+
def generate_enumerator(cls, length: int = 1000, output_name: str = "output_column") -> "FlowDataEngine":
|
|
1194
1206
|
"""Generates a FlowDataEngine with a single column containing a sequence of integers.
|
|
1195
1207
|
|
|
1196
1208
|
Args:
|
|
@@ -1204,8 +1216,9 @@ class FlowDataEngine:
|
|
|
1204
1216
|
length = 10_000_000
|
|
1205
1217
|
return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
|
|
1206
1218
|
|
|
1207
|
-
def _handle_schema(
|
|
1208
|
-
|
|
1219
|
+
def _handle_schema(
|
|
1220
|
+
self, schema: list[FlowfileColumn] | list[str] | pl.Schema | None, pl_schema: pl.Schema
|
|
1221
|
+
) -> list[FlowfileColumn] | None:
|
|
1209
1222
|
"""Handles schema processing and validation during initialization."""
|
|
1210
1223
|
if schema is None and pl_schema is not None:
|
|
1211
1224
|
return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
|
|
@@ -1216,7 +1229,8 @@ class FlowDataEngine:
|
|
|
1216
1229
|
elif pl_schema is not None and schema is not None:
|
|
1217
1230
|
if schema.__len__() != pl_schema.__len__():
|
|
1218
1231
|
raise Exception(
|
|
1219
|
-
f
|
|
1232
|
+
f"Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}"
|
|
1233
|
+
)
|
|
1220
1234
|
if isinstance(schema, pl.Schema):
|
|
1221
1235
|
return self._handle_polars_schema(schema, pl_schema)
|
|
1222
1236
|
elif isinstance(schema, list) and len(schema) == 0:
|
|
@@ -1225,31 +1239,29 @@ class FlowDataEngine:
|
|
|
1225
1239
|
return self._handle_string_schema(schema, pl_schema)
|
|
1226
1240
|
return schema
|
|
1227
1241
|
|
|
1228
|
-
def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) ->
|
|
1242
|
+
def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> list[FlowfileColumn]:
|
|
1229
1243
|
"""Handles Polars schema conversion."""
|
|
1230
1244
|
flow_file_columns = [
|
|
1231
1245
|
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
1232
|
-
for col_name, dtype in zip(schema.names(), schema.dtypes())
|
|
1246
|
+
for col_name, dtype in zip(schema.names(), schema.dtypes(), strict=False)
|
|
1233
1247
|
]
|
|
1234
1248
|
|
|
1235
1249
|
select_arg = [
|
|
1236
1250
|
pl.col(o).alias(n).cast(schema_dtype)
|
|
1237
|
-
for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
|
|
1251
|
+
for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes(), strict=False)
|
|
1238
1252
|
]
|
|
1239
1253
|
|
|
1240
1254
|
self.data_frame = self.data_frame.select(select_arg)
|
|
1241
1255
|
return flow_file_columns
|
|
1242
1256
|
|
|
1243
|
-
def _handle_string_schema(self, schema:
|
|
1257
|
+
def _handle_string_schema(self, schema: list[str], pl_schema: pl.Schema) -> list[FlowfileColumn]:
|
|
1244
1258
|
"""Handles string-based schema conversion."""
|
|
1245
1259
|
flow_file_columns = [
|
|
1246
1260
|
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
1247
|
-
for col_name, dtype in zip(schema, pl_schema.dtypes())
|
|
1261
|
+
for col_name, dtype in zip(schema, pl_schema.dtypes(), strict=False)
|
|
1248
1262
|
]
|
|
1249
1263
|
|
|
1250
|
-
self.data_frame = self.data_frame.rename({
|
|
1251
|
-
o: n for o, n in zip(pl_schema.names(), schema)
|
|
1252
|
-
})
|
|
1264
|
+
self.data_frame = self.data_frame.rename({o: n for o, n in zip(pl_schema.names(), schema, strict=False)})
|
|
1253
1265
|
|
|
1254
1266
|
return flow_file_columns
|
|
1255
1267
|
|
|
@@ -1267,25 +1279,16 @@ class FlowDataEngine:
|
|
|
1267
1279
|
A new `FlowDataEngine` instance with the exploded rows.
|
|
1268
1280
|
"""
|
|
1269
1281
|
output_column_name = (
|
|
1270
|
-
split_input.output_column_name
|
|
1271
|
-
if split_input.output_column_name
|
|
1272
|
-
else split_input.column_to_split
|
|
1282
|
+
split_input.output_column_name if split_input.output_column_name else split_input.column_to_split
|
|
1273
1283
|
)
|
|
1274
1284
|
|
|
1275
1285
|
split_value = (
|
|
1276
|
-
split_input.split_fixed_value
|
|
1277
|
-
if split_input.split_by_fixed_value
|
|
1278
|
-
else pl.col(split_input.split_by_column)
|
|
1286
|
+
split_input.split_fixed_value if split_input.split_by_fixed_value else pl.col(split_input.split_by_column)
|
|
1279
1287
|
)
|
|
1280
1288
|
|
|
1281
|
-
df = (
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
.str.split(by=split_value)
|
|
1285
|
-
.alias(output_column_name)
|
|
1286
|
-
)
|
|
1287
|
-
.explode(output_column_name)
|
|
1288
|
-
)
|
|
1289
|
+
df = self.data_frame.with_columns(
|
|
1290
|
+
pl.col(split_input.column_to_split).str.split(by=split_value).alias(output_column_name)
|
|
1291
|
+
).explode(output_column_name)
|
|
1289
1292
|
|
|
1290
1293
|
return FlowDataEngine(df)
|
|
1291
1294
|
|
|
@@ -1305,15 +1308,9 @@ class FlowDataEngine:
|
|
|
1305
1308
|
lf = self.data_frame
|
|
1306
1309
|
|
|
1307
1310
|
if unpivot_input.data_type_selector_expr is not None:
|
|
1308
|
-
result = lf.unpivot(
|
|
1309
|
-
on=unpivot_input.data_type_selector_expr(),
|
|
1310
|
-
index=unpivot_input.index_columns
|
|
1311
|
-
)
|
|
1311
|
+
result = lf.unpivot(on=unpivot_input.data_type_selector_expr(), index=unpivot_input.index_columns)
|
|
1312
1312
|
elif unpivot_input.value_columns is not None:
|
|
1313
|
-
result = lf.unpivot(
|
|
1314
|
-
on=unpivot_input.value_columns,
|
|
1315
|
-
index=unpivot_input.index_columns
|
|
1316
|
-
)
|
|
1313
|
+
result = lf.unpivot(on=unpivot_input.value_columns, index=unpivot_input.index_columns)
|
|
1317
1314
|
else:
|
|
1318
1315
|
result = lf.unpivot()
|
|
1319
1316
|
|
|
@@ -1333,19 +1330,24 @@ class FlowDataEngine:
|
|
|
1333
1330
|
"""
|
|
1334
1331
|
# Get unique values for pivot columns
|
|
1335
1332
|
max_unique_vals = 200
|
|
1336
|
-
new_cols_unique = fetch_unique_values(
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1333
|
+
new_cols_unique = fetch_unique_values(
|
|
1334
|
+
self.data_frame.select(pivot_input.pivot_column)
|
|
1335
|
+
.unique()
|
|
1336
|
+
.sort(pivot_input.pivot_column)
|
|
1337
|
+
.limit(max_unique_vals)
|
|
1338
|
+
.cast(pl.String)
|
|
1339
|
+
)
|
|
1340
1340
|
if len(new_cols_unique) >= max_unique_vals:
|
|
1341
1341
|
if node_logger:
|
|
1342
|
-
node_logger.warning(
|
|
1343
|
-
|
|
1342
|
+
node_logger.warning(
|
|
1343
|
+
"Pivot column has too many unique values. Please consider using a different column."
|
|
1344
|
+
f" Max unique values: {max_unique_vals}"
|
|
1345
|
+
)
|
|
1344
1346
|
|
|
1345
1347
|
if len(pivot_input.index_columns) == 0:
|
|
1346
1348
|
no_index_cols = True
|
|
1347
|
-
pivot_input.index_columns = [
|
|
1348
|
-
ff = self.apply_flowfile_formula(
|
|
1349
|
+
pivot_input.index_columns = ["__temp__"]
|
|
1350
|
+
ff = self.apply_flowfile_formula("1", col_name="__temp__")
|
|
1349
1351
|
else:
|
|
1350
1352
|
no_index_cols = False
|
|
1351
1353
|
ff = self
|
|
@@ -1355,36 +1357,32 @@ class FlowDataEngine:
|
|
|
1355
1357
|
grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
|
|
1356
1358
|
pivot_column = pivot_input.get_pivot_column()
|
|
1357
1359
|
|
|
1358
|
-
input_df = grouped_ff.data_frame.with_columns(
|
|
1359
|
-
pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
|
|
1360
|
-
)
|
|
1360
|
+
input_df = grouped_ff.data_frame.with_columns(pivot_column.cast(pl.String).alias(pivot_input.pivot_column))
|
|
1361
1361
|
number_of_aggregations = len(pivot_input.aggregations)
|
|
1362
1362
|
df = (
|
|
1363
|
-
input_df.select(
|
|
1364
|
-
*index_columns,
|
|
1365
|
-
pivot_column,
|
|
1366
|
-
pivot_input.get_values_expr()
|
|
1367
|
-
)
|
|
1363
|
+
input_df.select(*index_columns, pivot_column, pivot_input.get_values_expr())
|
|
1368
1364
|
.group_by(*index_columns)
|
|
1369
|
-
.agg(
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1365
|
+
.agg(
|
|
1366
|
+
[
|
|
1367
|
+
(pl.col("vals").filter(pivot_column == new_col_value)).first().alias(new_col_value)
|
|
1368
|
+
for new_col_value in new_cols_unique
|
|
1369
|
+
]
|
|
1370
|
+
)
|
|
1375
1371
|
.select(
|
|
1376
1372
|
*index_columns,
|
|
1377
1373
|
*[
|
|
1378
|
-
pl.col(new_col)
|
|
1374
|
+
pl.col(new_col)
|
|
1375
|
+
.struct.field(agg)
|
|
1376
|
+
.alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
|
|
1379
1377
|
for new_col in new_cols_unique
|
|
1380
1378
|
for agg in pivot_input.aggregations
|
|
1381
|
-
]
|
|
1379
|
+
],
|
|
1382
1380
|
)
|
|
1383
1381
|
)
|
|
1384
1382
|
|
|
1385
1383
|
# Clean up temporary columns if needed
|
|
1386
1384
|
if no_index_cols:
|
|
1387
|
-
df = df.drop(
|
|
1385
|
+
df = df.drop("__temp__")
|
|
1388
1386
|
pivot_input.index_columns = []
|
|
1389
1387
|
|
|
1390
1388
|
return FlowDataEngine(df, calculate_schema_stats=False)
|
|
@@ -1403,7 +1401,7 @@ class FlowDataEngine:
|
|
|
1403
1401
|
try:
|
|
1404
1402
|
f = to_expr(predicate)
|
|
1405
1403
|
except Exception as e:
|
|
1406
|
-
logger.warning(f
|
|
1404
|
+
logger.warning(f"Error in filter expression: {e}")
|
|
1407
1405
|
f = to_expr("False")
|
|
1408
1406
|
df = self.data_frame.filter(f)
|
|
1409
1407
|
return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
|
|
@@ -1430,29 +1428,27 @@ class FlowDataEngine:
|
|
|
1430
1428
|
select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
|
|
1431
1429
|
|
|
1432
1430
|
df = (
|
|
1433
|
-
self.data_frame
|
|
1434
|
-
.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
|
|
1431
|
+
self.data_frame.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
|
|
1435
1432
|
.with_columns(
|
|
1436
|
-
(
|
|
1437
|
-
|
|
1438
|
-
|
|
1433
|
+
(
|
|
1434
|
+
pl.cum_count(record_id_settings.output_column_name).over(record_id_settings.group_by_columns)
|
|
1435
|
+
+ record_id_settings.offset
|
|
1436
|
+
- 1
|
|
1437
|
+
).alias(record_id_settings.output_column_name)
|
|
1439
1438
|
)
|
|
1440
1439
|
.select(select_cols)
|
|
1441
1440
|
)
|
|
1442
1441
|
|
|
1443
|
-
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name,
|
|
1442
|
+
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
|
|
1444
1443
|
output_schema.extend(self.schema)
|
|
1445
1444
|
|
|
1446
1445
|
return FlowDataEngine(df, schema=output_schema)
|
|
1447
1446
|
|
|
1448
1447
|
def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
1449
1448
|
"""Adds a simple sequential record ID column."""
|
|
1450
|
-
df = self.data_frame.with_row_index(
|
|
1451
|
-
record_id_settings.output_column_name,
|
|
1452
|
-
record_id_settings.offset
|
|
1453
|
-
)
|
|
1449
|
+
df = self.data_frame.with_row_index(record_id_settings.output_column_name, record_id_settings.offset)
|
|
1454
1450
|
|
|
1455
|
-
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name,
|
|
1451
|
+
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
|
|
1456
1452
|
output_schema.extend(self.schema)
|
|
1457
1453
|
|
|
1458
1454
|
return FlowDataEngine(df, schema=output_schema)
|
|
@@ -1484,7 +1480,7 @@ class FlowDataEngine:
|
|
|
1484
1480
|
|
|
1485
1481
|
def __repr__(self) -> str:
|
|
1486
1482
|
"""Returns a string representation of the FlowDataEngine."""
|
|
1487
|
-
return f
|
|
1483
|
+
return f"flow data engine\n{self.data_frame.__repr__()}"
|
|
1488
1484
|
|
|
1489
1485
|
def __call__(self) -> "FlowDataEngine":
|
|
1490
1486
|
"""Makes the class instance callable, returning itself."""
|
|
@@ -1504,16 +1500,16 @@ class FlowDataEngine:
|
|
|
1504
1500
|
Returns:
|
|
1505
1501
|
The same `FlowDataEngine` instance, now backed by the cached data.
|
|
1506
1502
|
"""
|
|
1507
|
-
edf = ExternalDfFetcher(
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
logger.info(
|
|
1503
|
+
edf = ExternalDfFetcher(
|
|
1504
|
+
lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False, flow_id=-1, node_id=-1
|
|
1505
|
+
)
|
|
1506
|
+
logger.info("Caching data in background")
|
|
1511
1507
|
result = edf.get_result()
|
|
1512
1508
|
if isinstance(result, pl.LazyFrame):
|
|
1513
|
-
logger.info(
|
|
1509
|
+
logger.info("Data cached")
|
|
1514
1510
|
del self._data_frame
|
|
1515
1511
|
self.data_frame = result
|
|
1516
|
-
logger.info(
|
|
1512
|
+
logger.info("Data loaded from cache")
|
|
1517
1513
|
return self
|
|
1518
1514
|
|
|
1519
1515
|
def collect_external(self):
|
|
@@ -1525,14 +1521,14 @@ class FlowDataEngine:
|
|
|
1525
1521
|
re-evaluated.
|
|
1526
1522
|
"""
|
|
1527
1523
|
if self._external_source is not None:
|
|
1528
|
-
logger.info(
|
|
1524
|
+
logger.info("Collecting external source")
|
|
1529
1525
|
if self.external_source.get_pl_df() is not None:
|
|
1530
1526
|
self.data_frame = self.external_source.get_pl_df().lazy()
|
|
1531
1527
|
else:
|
|
1532
1528
|
self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
|
|
1533
1529
|
self._schema = None # enforce reset schema
|
|
1534
1530
|
|
|
1535
|
-
def get_output_sample(self, n_rows: int = 10) ->
|
|
1531
|
+
def get_output_sample(self, n_rows: int = 10) -> list[dict]:
|
|
1536
1532
|
"""Gets a sample of the data as a list of dictionaries.
|
|
1537
1533
|
|
|
1538
1534
|
This is typically used to display a preview of the data in a UI.
|
|
@@ -1560,14 +1556,20 @@ class FlowDataEngine:
|
|
|
1560
1556
|
try:
|
|
1561
1557
|
df = df.head(n_rows).collect()
|
|
1562
1558
|
except Exception as e:
|
|
1563
|
-
logger.warning(f
|
|
1559
|
+
logger.warning(f"Error in getting sample: {e}")
|
|
1564
1560
|
df = df.head(n_rows).collect(engine="auto")
|
|
1565
1561
|
else:
|
|
1566
1562
|
df = self.collect()
|
|
1567
1563
|
return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
|
|
1568
1564
|
|
|
1569
|
-
def get_sample(
|
|
1570
|
-
|
|
1565
|
+
def get_sample(
|
|
1566
|
+
self,
|
|
1567
|
+
n_rows: int = 100,
|
|
1568
|
+
random: bool = False,
|
|
1569
|
+
shuffle: bool = False,
|
|
1570
|
+
seed: int = None,
|
|
1571
|
+
execution_location: ExecutionLocationsLiteral | None = None,
|
|
1572
|
+
) -> "FlowDataEngine":
|
|
1571
1573
|
"""Gets a sample of rows from the DataFrame.
|
|
1572
1574
|
|
|
1573
1575
|
Args:
|
|
@@ -1579,22 +1581,23 @@ class FlowDataEngine:
|
|
|
1579
1581
|
Returns:
|
|
1580
1582
|
A new `FlowDataEngine` instance containing the sampled data.
|
|
1581
1583
|
"""
|
|
1582
|
-
logging.info(f
|
|
1584
|
+
logging.info(f"Getting sample of {n_rows} rows")
|
|
1583
1585
|
if random:
|
|
1584
1586
|
if self.lazy and self.external_source is not None:
|
|
1585
1587
|
self.collect_external()
|
|
1586
1588
|
|
|
1587
1589
|
if self.lazy and shuffle:
|
|
1588
|
-
sample_df =
|
|
1589
|
-
|
|
1590
|
+
sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(
|
|
1591
|
+
n_rows, seed=seed, shuffle=shuffle
|
|
1592
|
+
)
|
|
1590
1593
|
elif shuffle:
|
|
1591
1594
|
sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
|
|
1592
1595
|
else:
|
|
1593
1596
|
if execution_location is None:
|
|
1594
1597
|
execution_location = get_global_execution_location()
|
|
1595
|
-
n_rows = min(
|
|
1596
|
-
calculate_in_worker_process=execution_location == "remote")
|
|
1597
|
-
|
|
1598
|
+
n_rows = min(
|
|
1599
|
+
n_rows, self.get_number_of_records(calculate_in_worker_process=execution_location == "remote")
|
|
1600
|
+
)
|
|
1598
1601
|
|
|
1599
1602
|
every_n_records = ceil(self.number_of_records / n_rows)
|
|
1600
1603
|
sample_df = self.data_frame.gather_every(every_n_records)
|
|
@@ -1619,8 +1622,9 @@ class FlowDataEngine:
|
|
|
1619
1622
|
else:
|
|
1620
1623
|
return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
|
|
1621
1624
|
|
|
1622
|
-
def iter_batches(
|
|
1623
|
-
|
|
1625
|
+
def iter_batches(
|
|
1626
|
+
self, batch_size: int = 1000, columns: list | tuple | str = None
|
|
1627
|
+
) -> Generator["FlowDataEngine", None, None]:
|
|
1624
1628
|
"""Iterates over the DataFrame in batches.
|
|
1625
1629
|
|
|
1626
1630
|
Args:
|
|
@@ -1638,9 +1642,14 @@ class FlowDataEngine:
|
|
|
1638
1642
|
for batch in batches:
|
|
1639
1643
|
yield FlowDataEngine(batch)
|
|
1640
1644
|
|
|
1641
|
-
def start_fuzzy_join(
|
|
1642
|
-
|
|
1643
|
-
|
|
1645
|
+
def start_fuzzy_join(
|
|
1646
|
+
self,
|
|
1647
|
+
fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1648
|
+
other: "FlowDataEngine",
|
|
1649
|
+
file_ref: str,
|
|
1650
|
+
flow_id: int = -1,
|
|
1651
|
+
node_id: int | str = -1,
|
|
1652
|
+
) -> ExternalFuzzyMatchFetcher:
|
|
1644
1653
|
"""Starts a fuzzy join operation in a background process.
|
|
1645
1654
|
|
|
1646
1655
|
This method prepares the data and initiates the fuzzy matching in a
|
|
@@ -1658,51 +1667,70 @@ class FlowDataEngine:
|
|
|
1658
1667
|
progress and retrieve the result of the fuzzy join.
|
|
1659
1668
|
"""
|
|
1660
1669
|
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1661
|
-
left_df, right_df = prepare_for_fuzzy_match(
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1670
|
+
left_df, right_df = prepare_for_fuzzy_match(
|
|
1671
|
+
left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
|
|
1672
|
+
)
|
|
1673
|
+
|
|
1674
|
+
return ExternalFuzzyMatchFetcher(
|
|
1675
|
+
left_df,
|
|
1676
|
+
right_df,
|
|
1677
|
+
fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
|
|
1678
|
+
file_ref=file_ref + "_fm",
|
|
1679
|
+
wait_on_completion=False,
|
|
1680
|
+
flow_id=flow_id,
|
|
1681
|
+
node_id=node_id,
|
|
1682
|
+
)
|
|
1683
|
+
|
|
1684
|
+
def fuzzy_join_external(
|
|
1685
|
+
self,
|
|
1686
|
+
fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1687
|
+
other: "FlowDataEngine",
|
|
1688
|
+
file_ref: str = None,
|
|
1689
|
+
flow_id: int = -1,
|
|
1690
|
+
node_id: int = -1,
|
|
1691
|
+
):
|
|
1678
1692
|
if file_ref is None:
|
|
1679
|
-
file_ref = str(id(self)) +
|
|
1693
|
+
file_ref = str(id(self)) + "_" + str(id(other))
|
|
1680
1694
|
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1681
1695
|
|
|
1682
|
-
left_df, right_df = prepare_for_fuzzy_match(
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1696
|
+
left_df, right_df = prepare_for_fuzzy_match(
|
|
1697
|
+
left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
|
|
1698
|
+
)
|
|
1699
|
+
external_tracker = ExternalFuzzyMatchFetcher(
|
|
1700
|
+
left_df,
|
|
1701
|
+
right_df,
|
|
1702
|
+
fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
|
|
1703
|
+
file_ref=file_ref + "_fm",
|
|
1704
|
+
wait_on_completion=False,
|
|
1705
|
+
flow_id=flow_id,
|
|
1706
|
+
node_id=node_id,
|
|
1707
|
+
)
|
|
1690
1708
|
return FlowDataEngine(external_tracker.get_result())
|
|
1691
1709
|
|
|
1692
|
-
def fuzzy_join(
|
|
1693
|
-
|
|
1694
|
-
|
|
1710
|
+
def fuzzy_join(
|
|
1711
|
+
self,
|
|
1712
|
+
fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1713
|
+
other: "FlowDataEngine",
|
|
1714
|
+
node_logger: NodeLogger = None,
|
|
1715
|
+
) -> "FlowDataEngine":
|
|
1695
1716
|
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1696
|
-
left_df, right_df = prepare_for_fuzzy_match(
|
|
1697
|
-
|
|
1717
|
+
left_df, right_df = prepare_for_fuzzy_match(
|
|
1718
|
+
left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
|
|
1719
|
+
)
|
|
1698
1720
|
fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
|
|
1699
|
-
return FlowDataEngine(
|
|
1700
|
-
|
|
1701
|
-
|
|
1721
|
+
return FlowDataEngine(
|
|
1722
|
+
fuzzy_match_dfs(
|
|
1723
|
+
left_df, right_df, fuzzy_maps=fuzzy_mappings, logger=node_logger.logger if node_logger else logger
|
|
1724
|
+
).lazy()
|
|
1725
|
+
)
|
|
1702
1726
|
|
|
1703
|
-
def do_cross_join(
|
|
1704
|
-
|
|
1705
|
-
|
|
1727
|
+
def do_cross_join(
|
|
1728
|
+
self,
|
|
1729
|
+
cross_join_input: transform_schemas.CrossJoinInput,
|
|
1730
|
+
auto_generate_selection: bool,
|
|
1731
|
+
verify_integrity: bool,
|
|
1732
|
+
other: "FlowDataEngine",
|
|
1733
|
+
) -> "FlowDataEngine":
|
|
1706
1734
|
"""Performs a cross join with another DataFrame.
|
|
1707
1735
|
|
|
1708
1736
|
A cross join produces the Cartesian product of the two DataFrames.
|
|
@@ -1723,26 +1751,41 @@ class FlowDataEngine:
|
|
|
1723
1751
|
self.lazy = True
|
|
1724
1752
|
other.lazy = True
|
|
1725
1753
|
cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
|
|
1726
|
-
verify_join_select_integrity(
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1754
|
+
verify_join_select_integrity(
|
|
1755
|
+
cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns
|
|
1756
|
+
)
|
|
1757
|
+
right_select = [
|
|
1758
|
+
v.old_name
|
|
1759
|
+
for v in cross_join_input_manager.right_select.renames
|
|
1760
|
+
if (v.keep or v.join_key) and v.is_available
|
|
1761
|
+
]
|
|
1762
|
+
left_select = [
|
|
1763
|
+
v.old_name
|
|
1764
|
+
for v in cross_join_input_manager.left_select.renames
|
|
1765
|
+
if (v.keep or v.join_key) and v.is_available
|
|
1766
|
+
]
|
|
1731
1767
|
cross_join_input_manager.auto_rename(rename_mode="suffix")
|
|
1732
1768
|
left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
|
|
1733
1769
|
right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)
|
|
1734
1770
|
|
|
1735
|
-
joined_df = left.join(right, how=
|
|
1771
|
+
joined_df = left.join(right, how="cross")
|
|
1736
1772
|
|
|
1737
|
-
cols_to_delete_after = [
|
|
1738
|
-
|
|
1739
|
-
|
|
1773
|
+
cols_to_delete_after = [
|
|
1774
|
+
col.new_name
|
|
1775
|
+
for col in cross_join_input_manager.left_select.renames + cross_join_input_manager.right_select.renames
|
|
1776
|
+
if col.join_key and not col.keep and col.is_available
|
|
1777
|
+
]
|
|
1740
1778
|
|
|
1741
1779
|
fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
|
|
1742
1780
|
return fl
|
|
1743
1781
|
|
|
1744
|
-
def join(
|
|
1745
|
-
|
|
1782
|
+
def join(
|
|
1783
|
+
self,
|
|
1784
|
+
join_input: transform_schemas.JoinInput,
|
|
1785
|
+
auto_generate_selection: bool,
|
|
1786
|
+
verify_integrity: bool,
|
|
1787
|
+
other: "FlowDataEngine",
|
|
1788
|
+
) -> "FlowDataEngine":
|
|
1746
1789
|
"""Performs a standard SQL-style join with another DataFrame."""
|
|
1747
1790
|
# Create manager from input
|
|
1748
1791
|
join_manager = transform_schemas.JoinInputManager(join_input)
|
|
@@ -1754,40 +1797,52 @@ class FlowDataEngine:
|
|
|
1754
1797
|
join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
|
|
1755
1798
|
verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
|
|
1756
1799
|
if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
|
|
1757
|
-
raise Exception(
|
|
1800
|
+
raise Exception("Join is not valid by the data fields")
|
|
1758
1801
|
|
|
1759
1802
|
if auto_generate_selection:
|
|
1760
1803
|
join_manager.auto_rename()
|
|
1761
1804
|
|
|
1762
1805
|
# Use manager properties throughout
|
|
1763
|
-
left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
|
|
1764
|
-
|
|
1806
|
+
left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
|
|
1807
|
+
join_manager.left_manager.get_rename_table()
|
|
1808
|
+
)
|
|
1809
|
+
right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(
|
|
1810
|
+
join_manager.right_manager.get_rename_table()
|
|
1811
|
+
)
|
|
1765
1812
|
|
|
1766
1813
|
left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
|
|
1767
1814
|
left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
|
|
1768
|
-
if join_manager.how ==
|
|
1815
|
+
if join_manager.how == "right":
|
|
1769
1816
|
joined_df = right.join(
|
|
1770
1817
|
other=left,
|
|
1771
1818
|
left_on=join_manager.right_join_keys,
|
|
1772
1819
|
right_on=join_manager.left_join_keys,
|
|
1773
1820
|
how="left",
|
|
1774
|
-
suffix=""
|
|
1821
|
+
suffix="",
|
|
1822
|
+
).rename(reverse_join_key_mapping)
|
|
1775
1823
|
else:
|
|
1776
1824
|
joined_df = left.join(
|
|
1777
1825
|
other=right,
|
|
1778
1826
|
left_on=join_manager.left_join_keys,
|
|
1779
1827
|
right_on=join_manager.right_join_keys,
|
|
1780
1828
|
how=join_manager.how,
|
|
1781
|
-
suffix=""
|
|
1829
|
+
suffix="",
|
|
1830
|
+
).rename(reverse_join_key_mapping)
|
|
1782
1831
|
|
|
1783
|
-
left_cols_to_delete_after = [
|
|
1784
|
-
|
|
1785
|
-
|
|
1832
|
+
left_cols_to_delete_after = [
|
|
1833
|
+
get_col_name_to_delete(col, "left")
|
|
1834
|
+
for col in join_manager.input.left_select.renames
|
|
1835
|
+
if not col.keep and col.is_available and col.join_key
|
|
1836
|
+
]
|
|
1786
1837
|
|
|
1787
|
-
right_cols_to_delete_after = [
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1838
|
+
right_cols_to_delete_after = [
|
|
1839
|
+
get_col_name_to_delete(col, "right")
|
|
1840
|
+
for col in join_manager.input.right_select.renames
|
|
1841
|
+
if not col.keep
|
|
1842
|
+
and col.is_available
|
|
1843
|
+
and col.join_key
|
|
1844
|
+
and join_manager.how in ("left", "right", "inner", "cross", "outer")
|
|
1845
|
+
]
|
|
1791
1846
|
|
|
1792
1847
|
if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
|
|
1793
1848
|
joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
|
|
@@ -1795,8 +1850,7 @@ class FlowDataEngine:
|
|
|
1795
1850
|
undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
|
|
1796
1851
|
joined_df = joined_df.rename(undo_join_key_remapping)
|
|
1797
1852
|
|
|
1798
|
-
return FlowDataEngine(joined_df, calculate_schema_stats=False,
|
|
1799
|
-
number_of_records=0, streamable=False)
|
|
1853
|
+
return FlowDataEngine(joined_df, calculate_schema_stats=False, number_of_records=0, streamable=False)
|
|
1800
1854
|
|
|
1801
1855
|
def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
|
|
1802
1856
|
"""Solves a graph problem represented by 'from' and 'to' columns.
|
|
@@ -1811,8 +1865,9 @@ class FlowDataEngine:
|
|
|
1811
1865
|
A new `FlowDataEngine` instance with the solved graph data.
|
|
1812
1866
|
"""
|
|
1813
1867
|
lf = self.data_frame.with_columns(
|
|
1814
|
-
graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
|
|
1815
|
-
|
|
1868
|
+
graph_solver(graph_solver_input.col_from, graph_solver_input.col_to).alias(
|
|
1869
|
+
graph_solver_input.output_column_name
|
|
1870
|
+
)
|
|
1816
1871
|
)
|
|
1817
1872
|
return FlowDataEngine(lf)
|
|
1818
1873
|
|
|
@@ -1827,7 +1882,7 @@ class FlowDataEngine:
|
|
|
1827
1882
|
A new `FlowDataEngine` instance with the added column.
|
|
1828
1883
|
"""
|
|
1829
1884
|
if col_name is None:
|
|
1830
|
-
col_name =
|
|
1885
|
+
col_name = "new_values"
|
|
1831
1886
|
return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
|
|
1832
1887
|
|
|
1833
1888
|
def get_record_count(self) -> "FlowDataEngine":
|
|
@@ -1837,7 +1892,7 @@ class FlowDataEngine:
|
|
|
1837
1892
|
Returns:
|
|
1838
1893
|
A new `FlowDataEngine` instance.
|
|
1839
1894
|
"""
|
|
1840
|
-
return FlowDataEngine(self.data_frame.select(pl.len().alias(
|
|
1895
|
+
return FlowDataEngine(self.data_frame.select(pl.len().alias("number_of_records")))
|
|
1841
1896
|
|
|
1842
1897
|
def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
|
|
1843
1898
|
"""Asserts that this DataFrame is equal to another.
|
|
@@ -1860,13 +1915,13 @@ class FlowDataEngine:
|
|
|
1860
1915
|
other = other.select_columns(self.columns)
|
|
1861
1916
|
|
|
1862
1917
|
if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
|
|
1863
|
-
raise Exception(
|
|
1918
|
+
raise Exception("Number of records is not equal")
|
|
1864
1919
|
|
|
1865
1920
|
if self.columns != other.columns:
|
|
1866
|
-
raise Exception(
|
|
1921
|
+
raise Exception("Schema is not equal")
|
|
1867
1922
|
|
|
1868
1923
|
if strict_schema:
|
|
1869
|
-
assert self.data_frame.schema == other.data_frame.schema,
|
|
1924
|
+
assert self.data_frame.schema == other.data_frame.schema, "Data types do not match"
|
|
1870
1925
|
|
|
1871
1926
|
if ordered:
|
|
1872
1927
|
self_lf = self.data_frame.sort(by=self.columns)
|
|
@@ -1876,7 +1931,7 @@ class FlowDataEngine:
|
|
|
1876
1931
|
other_lf = other.data_frame
|
|
1877
1932
|
|
|
1878
1933
|
self.lazy, other.lazy = org_laziness
|
|
1879
|
-
assert self_lf.equals(other_lf),
|
|
1934
|
+
assert self_lf.equals(other_lf), "Data is not equal"
|
|
1880
1935
|
|
|
1881
1936
|
def initialize_empty_fl(self):
|
|
1882
1937
|
"""Initializes an empty LazyFrame."""
|
|
@@ -1891,7 +1946,7 @@ class FlowDataEngine:
|
|
|
1891
1946
|
operation_type="calculate_number_of_records",
|
|
1892
1947
|
flow_id=-1,
|
|
1893
1948
|
node_id=-1,
|
|
1894
|
-
wait_on_completion=True
|
|
1949
|
+
wait_on_completion=True,
|
|
1895
1950
|
).result
|
|
1896
1951
|
return number_of_records
|
|
1897
1952
|
|
|
@@ -1907,8 +1962,9 @@ class FlowDataEngine:
|
|
|
1907
1962
|
"""
|
|
1908
1963
|
return self.get_number_of_records(force_calculate=force_calculate)
|
|
1909
1964
|
|
|
1910
|
-
def get_number_of_records(
|
|
1911
|
-
|
|
1965
|
+
def get_number_of_records(
|
|
1966
|
+
self, warn: bool = False, force_calculate: bool = False, calculate_in_worker_process: bool = False
|
|
1967
|
+
) -> int:
|
|
1912
1968
|
"""Gets the total number of records in the DataFrame.
|
|
1913
1969
|
|
|
1914
1970
|
For lazy frames, this may trigger a full data scan, which can be expensive.
|
|
@@ -1938,12 +1994,13 @@ class FlowDataEngine:
|
|
|
1938
1994
|
except Exception as e:
|
|
1939
1995
|
logger.error(f"Error: {e}")
|
|
1940
1996
|
if warn:
|
|
1941
|
-
logger.warning(
|
|
1997
|
+
logger.warning("Calculating the number of records this can be expensive on a lazy frame")
|
|
1942
1998
|
try:
|
|
1943
1999
|
self.number_of_records = self.data_frame.select(pl.len()).collect(
|
|
1944
|
-
engine="streaming" if self._streamable else "auto"
|
|
2000
|
+
engine="streaming" if self._streamable else "auto"
|
|
2001
|
+
)[0, 0]
|
|
1945
2002
|
except Exception:
|
|
1946
|
-
raise ValueError(
|
|
2003
|
+
raise ValueError("Could not get number of records")
|
|
1947
2004
|
else:
|
|
1948
2005
|
self.number_of_records = self.data_frame.__len__()
|
|
1949
2006
|
return self.number_of_records
|
|
@@ -1984,7 +2041,7 @@ class FlowDataEngine:
|
|
|
1984
2041
|
return self._external_source
|
|
1985
2042
|
|
|
1986
2043
|
@property
|
|
1987
|
-
def cols_idx(self) ->
|
|
2044
|
+
def cols_idx(self) -> dict[str, int]:
|
|
1988
2045
|
"""A dictionary mapping column names to their integer index."""
|
|
1989
2046
|
if self._col_idx is None:
|
|
1990
2047
|
self._col_idx = {c: i for i, c in enumerate(self.columns)}
|
|
@@ -2006,7 +2063,7 @@ class FlowDataEngine:
|
|
|
2006
2063
|
[transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
|
|
2007
2064
|
)
|
|
2008
2065
|
|
|
2009
|
-
def select_columns(self, list_select:
|
|
2066
|
+
def select_columns(self, list_select: list[str] | tuple[str] | str) -> "FlowDataEngine":
|
|
2010
2067
|
"""Selects a subset of columns from the DataFrame.
|
|
2011
2068
|
|
|
2012
2069
|
Args:
|
|
@@ -2019,17 +2076,17 @@ class FlowDataEngine:
|
|
|
2019
2076
|
list_select = [list_select]
|
|
2020
2077
|
|
|
2021
2078
|
idx_to_keep = [self.cols_idx.get(c) for c in list_select]
|
|
2022
|
-
selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
|
|
2079
|
+
selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep, strict=False) if id_to_keep is not None]
|
|
2023
2080
|
new_schema = [self.schema[i] for i in idx_to_keep if i is not None]
|
|
2024
2081
|
|
|
2025
2082
|
return FlowDataEngine(
|
|
2026
2083
|
self.data_frame.select(selects),
|
|
2027
2084
|
number_of_records=self.number_of_records,
|
|
2028
2085
|
schema=new_schema,
|
|
2029
|
-
streamable=self._streamable
|
|
2086
|
+
streamable=self._streamable,
|
|
2030
2087
|
)
|
|
2031
2088
|
|
|
2032
|
-
def drop_columns(self, columns:
|
|
2089
|
+
def drop_columns(self, columns: list[str]) -> "FlowDataEngine":
|
|
2033
2090
|
"""Drops specified columns from the DataFrame.
|
|
2034
2091
|
|
|
2035
2092
|
Args:
|
|
@@ -2043,12 +2100,10 @@ class FlowDataEngine:
|
|
|
2043
2100
|
new_schema = [self.schema[i] for i in idx_to_keep]
|
|
2044
2101
|
|
|
2045
2102
|
return FlowDataEngine(
|
|
2046
|
-
self.data_frame.select(cols_for_select),
|
|
2047
|
-
number_of_records=self.number_of_records,
|
|
2048
|
-
schema=new_schema
|
|
2103
|
+
self.data_frame.select(cols_for_select), number_of_records=self.number_of_records, schema=new_schema
|
|
2049
2104
|
)
|
|
2050
2105
|
|
|
2051
|
-
def reorganize_order(self, column_order:
|
|
2106
|
+
def reorganize_order(self, column_order: list[str]) -> "FlowDataEngine":
|
|
2052
2107
|
"""Reorganizes columns into a specified order.
|
|
2053
2108
|
|
|
2054
2109
|
Args:
|
|
@@ -2061,8 +2116,9 @@ class FlowDataEngine:
|
|
|
2061
2116
|
schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
|
|
2062
2117
|
return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)
|
|
2063
2118
|
|
|
2064
|
-
def apply_flowfile_formula(
|
|
2065
|
-
|
|
2119
|
+
def apply_flowfile_formula(
|
|
2120
|
+
self, func: str, col_name: str, output_data_type: pl.DataType = None
|
|
2121
|
+
) -> "FlowDataEngine":
|
|
2066
2122
|
"""Applies a formula to create a new column or transform an existing one.
|
|
2067
2123
|
|
|
2068
2124
|
Args:
|
|
@@ -2081,8 +2137,7 @@ class FlowDataEngine:
|
|
|
2081
2137
|
|
|
2082
2138
|
return FlowDataEngine(df2, number_of_records=self.number_of_records)
|
|
2083
2139
|
|
|
2084
|
-
def apply_sql_formula(self, func: str, col_name: str,
|
|
2085
|
-
output_data_type: pl.DataType = None) -> "FlowDataEngine":
|
|
2140
|
+
def apply_sql_formula(self, func: str, col_name: str, output_data_type: pl.DataType = None) -> "FlowDataEngine":
|
|
2086
2141
|
"""Applies an SQL-style formula using `pl.sql_expr`.
|
|
2087
2142
|
|
|
2088
2143
|
Args:
|
|
@@ -2101,8 +2156,9 @@ class FlowDataEngine:
|
|
|
2101
2156
|
|
|
2102
2157
|
return FlowDataEngine(df, number_of_records=self.number_of_records)
|
|
2103
2158
|
|
|
2104
|
-
def output(
|
|
2105
|
-
|
|
2159
|
+
def output(
|
|
2160
|
+
self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str, execute_remote: bool = True
|
|
2161
|
+
) -> "FlowDataEngine":
|
|
2106
2162
|
"""Writes the DataFrame to an output file.
|
|
2107
2163
|
|
|
2108
2164
|
Can execute the write operation locally or in a remote worker process.
|
|
@@ -2116,7 +2172,7 @@ class FlowDataEngine:
|
|
|
2116
2172
|
Returns:
|
|
2117
2173
|
The same `FlowDataEngine` instance for chaining.
|
|
2118
2174
|
"""
|
|
2119
|
-
logger.info(
|
|
2175
|
+
logger.info("Starting to write output")
|
|
2120
2176
|
if execute_remote:
|
|
2121
2177
|
status = utils.write_output(
|
|
2122
2178
|
self.data_frame,
|
|
@@ -2126,11 +2182,11 @@ class FlowDataEngine:
|
|
|
2126
2182
|
sheet_name=output_fs.sheet_name,
|
|
2127
2183
|
delimiter=output_fs.delimiter,
|
|
2128
2184
|
flow_id=flow_id,
|
|
2129
|
-
node_id=node_id
|
|
2185
|
+
node_id=node_id,
|
|
2130
2186
|
)
|
|
2131
2187
|
tracker = ExternalExecutorTracker(status)
|
|
2132
2188
|
tracker.get_result()
|
|
2133
|
-
logger.info(
|
|
2189
|
+
logger.info("Finished writing output")
|
|
2134
2190
|
else:
|
|
2135
2191
|
logger.info("Starting to write results locally")
|
|
2136
2192
|
utils.local_write_output(
|
|
@@ -2172,11 +2228,10 @@ class FlowDataEngine:
|
|
|
2172
2228
|
if isinstance(other, FlowDataEngine):
|
|
2173
2229
|
other = [other]
|
|
2174
2230
|
|
|
2175
|
-
dfs:
|
|
2176
|
-
return FlowDataEngine(pl.concat(dfs, how=
|
|
2231
|
+
dfs: list[pl.LazyFrame] | list[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
|
|
2232
|
+
return FlowDataEngine(pl.concat(dfs, how="diagonal_relaxed"))
|
|
2177
2233
|
|
|
2178
|
-
def do_select(self, select_inputs: transform_schemas.SelectInputs,
|
|
2179
|
-
keep_missing: bool = True) -> "FlowDataEngine":
|
|
2234
|
+
def do_select(self, select_inputs: transform_schemas.SelectInputs, keep_missing: bool = True) -> "FlowDataEngine":
|
|
2180
2235
|
"""Performs a complex column selection, renaming, and reordering operation.
|
|
2181
2236
|
|
|
2182
2237
|
Args:
|
|
@@ -2192,7 +2247,8 @@ class FlowDataEngine:
|
|
|
2192
2247
|
|
|
2193
2248
|
if not keep_missing:
|
|
2194
2249
|
drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
|
|
2195
|
-
set(r.old_name for r in renames if not r.keep)
|
|
2250
|
+
set(r.old_name for r in renames if not r.keep)
|
|
2251
|
+
)
|
|
2196
2252
|
keep_cols = []
|
|
2197
2253
|
else:
|
|
2198
2254
|
keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
|
|
@@ -2212,12 +2268,14 @@ class FlowDataEngine:
|
|
|
2212
2268
|
|
|
2213
2269
|
rename_dict = {r.old_name: r.new_name for r in available_renames}
|
|
2214
2270
|
fl = self.select_columns(
|
|
2215
|
-
list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
|
|
2271
|
+
list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
|
|
2272
|
+
)
|
|
2216
2273
|
fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
|
|
2217
2274
|
ndf = fl.data_frame.rename(rename_dict)
|
|
2218
2275
|
renames.sort(key=lambda r: 0 if r.position is None else r.position)
|
|
2219
|
-
sorted_cols = utils.match_order(
|
|
2220
|
-
|
|
2276
|
+
sorted_cols = utils.match_order(
|
|
2277
|
+
ndf.collect_schema().names(), [r.new_name for r in renames] + self.data_frame.collect_schema().names()
|
|
2278
|
+
)
|
|
2221
2279
|
output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
|
|
2222
2280
|
return output_file.reorganize_order(sorted_cols)
|
|
2223
2281
|
|
|
@@ -2225,10 +2283,9 @@ class FlowDataEngine:
|
|
|
2225
2283
|
"""Sets whether DataFrame operations should be streamable."""
|
|
2226
2284
|
self._streamable = streamable
|
|
2227
2285
|
|
|
2228
|
-
def _calculate_schema(self) ->
|
|
2286
|
+
def _calculate_schema(self) -> list[dict]:
|
|
2229
2287
|
"""Calculates schema statistics."""
|
|
2230
2288
|
if self.external_source is not None:
|
|
2231
|
-
|
|
2232
2289
|
self.collect_external()
|
|
2233
2290
|
v = utils.calculate_schema(self.data_frame)
|
|
2234
2291
|
return v
|
|
@@ -2247,8 +2304,9 @@ class FlowDataEngine:
|
|
|
2247
2304
|
"""Creates a FlowDataEngine from a path in a worker process."""
|
|
2248
2305
|
received_table.set_absolute_filepath()
|
|
2249
2306
|
|
|
2250
|
-
external_fetcher = ExternalCreateFetcher(
|
|
2251
|
-
|
|
2307
|
+
external_fetcher = ExternalCreateFetcher(
|
|
2308
|
+
received_table=received_table, file_type=received_table.file_type, flow_id=flow_id, node_id=node_id
|
|
2309
|
+
)
|
|
2252
2310
|
return cls(external_fetcher.get_result())
|
|
2253
2311
|
|
|
2254
2312
|
|
|
@@ -2271,10 +2329,10 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
|
|
|
2271
2329
|
if len(flowfile_tables) == 0:
|
|
2272
2330
|
kwargs = {}
|
|
2273
2331
|
elif len(flowfile_tables) == 1:
|
|
2274
|
-
kwargs = {
|
|
2332
|
+
kwargs = {"input_df": flowfile_tables[0].data_frame}
|
|
2275
2333
|
else:
|
|
2276
|
-
kwargs = {f
|
|
2334
|
+
kwargs = {f"input_df_{i+1}": flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
|
|
2277
2335
|
df = polars_executable(**kwargs)
|
|
2278
2336
|
if isinstance(df, pl.DataFrame):
|
|
2279
2337
|
logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
|
|
2280
|
-
return FlowDataEngine(df)
|
|
2338
|
+
return FlowDataEngine(df)
|