Flowfile 0.5.1-py3-none-any.whl → 0.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backends/main.py +25 -22
- build_backends/main_prd.py +10 -19
- flowfile/__init__.py +194 -74
- flowfile/__main__.py +10 -7
- flowfile/api.py +51 -57
- flowfile/web/__init__.py +14 -9
- flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
- flowfile/web/static/assets/AdminView-f9847d67.js +713 -0
- flowfile/web/static/assets/CloudConnectionView-cf85f943.css +72 -0
- flowfile/web/static/assets/{CloudConnectionManager-0dfba9f2.js → CloudConnectionView-faace55b.js} +11 -11
- flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
- flowfile/web/static/assets/{CloudStorageReader-d5b1b6c9.js → CloudStorageReader-d86ecaa7.js} +10 -8
- flowfile/web/static/assets/{CloudStorageWriter-00d87aad.js → CloudStorageWriter-0f4d9a44.js} +10 -8
- flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
- flowfile/web/static/assets/ColumnActionInput-c44b7aee.css +159 -0
- flowfile/web/static/assets/ColumnActionInput-f4189ae0.js +330 -0
- flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
- flowfile/web/static/assets/{ColumnSelector-4685e75d.js → ColumnSelector-e66b33da.js} +3 -5
- flowfile/web/static/assets/ContextMenu-49463352.js +9 -0
- flowfile/web/static/assets/ContextMenu-dd5f3f25.js +9 -0
- flowfile/web/static/assets/ContextMenu-f709b884.js +9 -0
- flowfile/web/static/assets/ContextMenu.vue_vue_type_script_setup_true_lang-a1bd6314.js +59 -0
- flowfile/web/static/assets/{CrossJoin-702a3edd.js → CrossJoin-24694b8f.js} +12 -10
- flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
- flowfile/web/static/assets/{CustomNode-b1519993.js → CustomNode-569d45ff.js} +43 -24
- flowfile/web/static/assets/CustomNode-edb9b939.css +42 -0
- flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-c20a1e16.css} +23 -21
- flowfile/web/static/assets/{DatabaseConnectionSettings-6f3e4ea5.js → DatabaseConnectionSettings-cfc08938.js} +5 -4
- flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-5bf8c75b.css} +41 -46
- flowfile/web/static/assets/{DatabaseReader-d38c7295.js → DatabaseReader-701feabb.js} +25 -15
- flowfile/web/static/assets/{DatabaseManager-cf5ef661.js → DatabaseView-0482e5b5.js} +11 -11
- flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
- flowfile/web/static/assets/{DatabaseWriter-b04ef46a.js → DatabaseWriter-16721989.js} +17 -10
- flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-bdcf2c8b.css} +29 -27
- flowfile/web/static/assets/{designer-8da3ba3a.css → DesignerView-49abb835.css} +783 -663
- flowfile/web/static/assets/{designer-9633482a.js → DesignerView-f64749fb.js} +1292 -3253
- flowfile/web/static/assets/{documentation-ca400224.js → DocumentationView-61bd2990.js} +5 -5
- flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-9ea6e871.css} +9 -9
- flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
- flowfile/web/static/assets/{ExploreData-5fa10ed8.js → ExploreData-e2735b13.js} +18 -9
- flowfile/web/static/assets/{ExternalSource-d39af878.js → ExternalSource-2535c3b2.js} +9 -7
- flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-7ac7373f.css} +20 -20
- flowfile/web/static/assets/Filter-2cdbc93c.js +287 -0
- flowfile/web/static/assets/Filter-7494ea97.css +48 -0
- flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
- flowfile/web/static/assets/{Formula-6b04fb1d.js → Formula-fcda3c2c.js} +13 -11
- flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
- flowfile/web/static/assets/{FuzzyMatch-999521f4.js → FuzzyMatch-f8d3b7d3.js} +12 -10
- flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-4b4d7db9.css} +5 -5
- flowfile/web/static/assets/{GraphSolver-17dd2198.js → GraphSolver-72eaa695.js} +14 -12
- flowfile/web/static/assets/GroupBy-5792782d.css +9 -0
- flowfile/web/static/assets/{GroupBy-6b039e18.js → GroupBy-8aa0598b.js} +9 -7
- flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
- flowfile/web/static/assets/{Join-24d0f113.js → Join-e40f0ffa.js} +13 -11
- flowfile/web/static/assets/LoginView-5111c9ae.js +134 -0
- flowfile/web/static/assets/LoginView-d325d632.css +172 -0
- flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
- flowfile/web/static/assets/{ManualInput-34639209.js → ManualInput-9b6f3224.js} +170 -116
- flowfile/web/static/assets/{MultiSelect-0e8724a3.js → MultiSelect-ef28e19e.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js → MultiSelect.vue_vue_type_script_setup_true_lang-83b3bbfd.js} +1 -1
- flowfile/web/static/assets/NodeDesigner-94cd4dd3.css +1429 -0
- flowfile/web/static/assets/NodeDesigner-d2b7ee2b.js +2712 -0
- flowfile/web/static/assets/{NumericInput-3d63a470.js → NumericInput-1d789794.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js → NumericInput.vue_vue_type_script_setup_true_lang-7775f83e.js} +5 -2
- flowfile/web/static/assets/Output-692dd25d.css +37 -0
- flowfile/web/static/assets/{Output-edea9802.js → Output-cefef801.js} +14 -10
- flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
- flowfile/web/static/assets/{Pivot-61d19301.js → Pivot-bab1b75b.js} +12 -10
- flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
- flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
- flowfile/web/static/assets/{PivotValidation-f97fec5b.js → PivotValidation-e7941f91.js} +3 -3
- flowfile/web/static/assets/{PivotValidation-de9f43fe.js → PivotValidation-fba09336.js} +3 -3
- flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
- flowfile/web/static/assets/{PolarsCode-bc3c9984.js → PolarsCode-740e40fa.js} +18 -9
- flowfile/web/static/assets/PopOver-862d7e28.js +939 -0
- flowfile/web/static/assets/PopOver-d96599db.css +33 -0
- flowfile/web/static/assets/{Read-64a3f259.js → Read-225cc63f.js} +16 -12
- flowfile/web/static/assets/{Read-e808b239.css → Read-90f366bc.css} +15 -15
- flowfile/web/static/assets/{RecordCount-3d5039be.js → RecordCount-ffc71eca.js} +6 -4
- flowfile/web/static/assets/{RecordId-597510e0.js → RecordId-a70bb8df.js} +9 -7
- flowfile/web/static/assets/{SQLQueryComponent-df51adbe.js → SQLQueryComponent-15a421f5.js} +3 -3
- flowfile/web/static/assets/SQLQueryComponent-edb90b98.css +29 -0
- flowfile/web/static/assets/{Sample-4be0a507.js → Sample-6c26afc7.js} +6 -4
- flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
- flowfile/web/static/assets/SecretSelector-ceed9496.js +113 -0
- flowfile/web/static/assets/{SecretManager-4839be57.js → SecretsView-214d255a.js} +35 -36
- flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
- flowfile/web/static/assets/{Select-9b72f201.js → Select-8fc29999.js} +9 -7
- flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-7ded385d.js → SettingsSection-3f70e4c3.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-f0f75a42.js → SettingsSection-83090218.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-e1e9c953.js → SettingsSection-9f0d1725.js} +3 -3
- flowfile/web/static/assets/SetupView-3fa0aa03.js +160 -0
- flowfile/web/static/assets/SetupView-e2da3442.css +230 -0
- flowfile/web/static/assets/{SingleSelect-6c777aac.js → SingleSelect-a4a568cb.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js → SingleSelect.vue_vue_type_script_setup_true_lang-c8ebdd33.js} +1 -1
- flowfile/web/static/assets/{SliderInput-7cb93e62.js → SliderInput-be533e71.js} +7 -4
- flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
- flowfile/web/static/assets/{Sort-6cbde21a.js → Sort-154dad81.js} +9 -7
- flowfile/web/static/assets/Sort-4abb7fae.css +9 -0
- flowfile/web/static/assets/{TextInput-d9a40c11.js → TextInput-454e2bda.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-5896c375.js → TextInput.vue_vue_type_script_setup_true_lang-e86510d0.js} +5 -2
- flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
- flowfile/web/static/assets/{TextToRows-c4fcbf4d.js → TextToRows-ea73433d.js} +11 -10
- flowfile/web/static/assets/{ToggleSwitch-4ef91d19.js → ToggleSwitch-9d7b30f1.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-00f2580e.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-394a1f78.css} +14 -14
- flowfile/web/static/assets/{UnavailableFields-a03f512c.js → UnavailableFields-b72a2c72.js} +4 -4
- flowfile/web/static/assets/{Union-bfe9b996.js → Union-1e44f263.js} +8 -6
- flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
- flowfile/web/static/assets/Unique-2b705521.css +3 -0
- flowfile/web/static/assets/{Unique-5d023a27.js → Unique-a3bc6d0a.js} +13 -10
- flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-b6ad6427.css} +7 -7
- flowfile/web/static/assets/{Unpivot-91cc5354.js → Unpivot-e27935fc.js} +11 -9
- flowfile/web/static/assets/{UnpivotValidation-7ee2de44.js → UnpivotValidation-72497680.js} +3 -3
- flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
- flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
- flowfile/web/static/assets/{VueGraphicWalker-e51b9924.js → VueGraphicWalker-d9ab70a3.js} +4 -4
- flowfile/web/static/assets/{api-cf1221f0.js → api-a2102880.js} +1 -1
- flowfile/web/static/assets/{api-c1bad5ca.js → api-f75042b0.js} +1 -1
- flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-1d6acbd9.css} +41 -41
- flowfile/web/static/assets/{dropDown-614b998d.js → dropDown-2798a109.js} +3 -3
- flowfile/web/static/assets/{fullEditor-f7971590.js → fullEditor-cf7d7d93.js} +11 -10
- flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-fe9f7e18.css} +77 -65
- flowfile/web/static/assets/{genericNodeSettings-4fe5f36b.js → genericNodeSettings-14eac1c3.js} +5 -5
- flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
- flowfile/web/static/assets/{index-5429bbf8.js → index-387a6f18.js} +41806 -40958
- flowfile/web/static/assets/index-6b367bb5.js +38 -0
- flowfile/web/static/assets/{index-50508d4d.css → index-e96ab018.css} +2184 -569
- flowfile/web/static/assets/index-f0a6e5a5.js +2696 -0
- flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
- flowfile/web/static/assets/nodeInput-ed2ae8d7.js +2 -0
- flowfile/web/static/assets/{outputCsv-076b85ab.js → outputCsv-3c1757e8.js} +3 -3
- flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
- flowfile/web/static/assets/{outputExcel-0fd17dbe.js → outputExcel-686e1f48.js} +3 -3
- flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
- flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
- flowfile/web/static/assets/{outputParquet-b61e0847.js → outputParquet-df28faa7.js} +4 -4
- flowfile/web/static/assets/{readCsv-c767cb37.css → readCsv-3bfac4c3.css} +15 -15
- flowfile/web/static/assets/{readCsv-a8bb8b61.js → readCsv-e37eee21.js} +3 -3
- flowfile/web/static/assets/{readExcel-806d2826.css → readExcel-3db6b763.css} +13 -13
- flowfile/web/static/assets/{readExcel-67b4aee0.js → readExcel-a13f14bb.js} +5 -5
- flowfile/web/static/assets/{readParquet-92ce1dbc.js → readParquet-344cf746.js} +3 -3
- flowfile/web/static/assets/{readParquet-48c81530.css → readParquet-c5244ad5.css} +4 -4
- flowfile/web/static/assets/secrets.api-ae198c5c.js +65 -0
- flowfile/web/static/assets/{selectDynamic-92e25ee3.js → selectDynamic-6b4b0767.js} +5 -5
- flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
- flowfile/web/static/assets/{vue-codemirror.esm-41b0e0d7.js → vue-codemirror.esm-31ba0e0b.js} +31 -640
- flowfile/web/static/assets/{vue-content-loader.es-2c8e608f.js → vue-content-loader.es-4469c8ff.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/METADATA +3 -4
- flowfile-0.5.4.dist-info/RECORD +407 -0
- flowfile_core/__init__.py +13 -6
- flowfile_core/auth/jwt.py +51 -16
- flowfile_core/auth/models.py +32 -7
- flowfile_core/auth/password.py +89 -0
- flowfile_core/auth/secrets.py +64 -19
- flowfile_core/configs/__init__.py +9 -7
- flowfile_core/configs/flow_logger.py +15 -14
- flowfile_core/configs/node_store/__init__.py +72 -4
- flowfile_core/configs/node_store/nodes.py +155 -172
- flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
- flowfile_core/configs/settings.py +28 -15
- flowfile_core/database/connection.py +7 -6
- flowfile_core/database/init_db.py +96 -2
- flowfile_core/database/models.py +3 -1
- flowfile_core/fileExplorer/__init__.py +17 -0
- flowfile_core/fileExplorer/funcs.py +145 -57
- flowfile_core/fileExplorer/utils.py +10 -11
- flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
- flowfile_core/flowfile/analytics/analytics_processor.py +26 -24
- flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
- flowfile_core/flowfile/analytics/utils.py +1 -1
- flowfile_core/flowfile/code_generator/__init__.py +11 -0
- flowfile_core/flowfile/code_generator/code_generator.py +706 -247
- flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
- flowfile_core/flowfile/database_connection_manager/models.py +1 -1
- flowfile_core/flowfile/extensions.py +17 -12
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +115 -83
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +493 -423
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +31 -20
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +14 -15
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
- flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +190 -127
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
- flowfile_core/flowfile/flow_data_engine/utils.py +99 -67
- flowfile_core/flowfile/flow_graph.py +920 -571
- flowfile_core/flowfile/flow_graph_utils.py +31 -49
- flowfile_core/flowfile/flow_node/flow_node.py +379 -258
- flowfile_core/flowfile/flow_node/models.py +53 -41
- flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
- flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
- flowfile_core/flowfile/handler.py +80 -30
- flowfile_core/flowfile/manage/compatibility_enhancements.py +209 -126
- flowfile_core/flowfile/manage/io_flowfile.py +54 -57
- flowfile_core/flowfile/node_designer/__init__.py +19 -13
- flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
- flowfile_core/flowfile/node_designer/custom_node.py +162 -36
- flowfile_core/flowfile/node_designer/ui_components.py +278 -34
- flowfile_core/flowfile/schema_callbacks.py +71 -51
- flowfile_core/flowfile/setting_generator/__init__.py +0 -1
- flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
- flowfile_core/flowfile/setting_generator/settings.py +64 -53
- flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
- flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
- flowfile_core/flowfile/util/calculate_layout.py +9 -13
- flowfile_core/flowfile/util/execution_orderer.py +25 -17
- flowfile_core/flowfile/util/node_skipper.py +4 -4
- flowfile_core/flowfile/utils.py +19 -21
- flowfile_core/main.py +26 -19
- flowfile_core/routes/auth.py +284 -11
- flowfile_core/routes/cloud_connections.py +25 -25
- flowfile_core/routes/logs.py +21 -29
- flowfile_core/routes/public.py +46 -4
- flowfile_core/routes/routes.py +70 -34
- flowfile_core/routes/secrets.py +25 -27
- flowfile_core/routes/user_defined_components.py +483 -4
- flowfile_core/run_lock.py +0 -1
- flowfile_core/schemas/__init__.py +4 -6
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
- flowfile_core/schemas/cloud_storage_schemas.py +96 -66
- flowfile_core/schemas/input_schema.py +231 -144
- flowfile_core/schemas/output_model.py +49 -34
- flowfile_core/schemas/schemas.py +116 -89
- flowfile_core/schemas/transform_schema.py +518 -263
- flowfile_core/schemas/yaml_types.py +21 -7
- flowfile_core/secret_manager/secret_manager.py +123 -18
- flowfile_core/types.py +29 -9
- flowfile_core/utils/arrow_reader.py +7 -6
- flowfile_core/utils/excel_file_manager.py +3 -3
- flowfile_core/utils/fileManager.py +7 -7
- flowfile_core/utils/fl_executor.py +8 -10
- flowfile_core/utils/utils.py +4 -4
- flowfile_core/utils/validate_setup.py +5 -4
- flowfile_frame/__init__.py +117 -51
- flowfile_frame/adapters.py +2 -9
- flowfile_frame/adding_expr.py +73 -32
- flowfile_frame/cloud_storage/frame_helpers.py +27 -23
- flowfile_frame/cloud_storage/secret_manager.py +12 -26
- flowfile_frame/config.py +2 -5
- flowfile_frame/database/__init__.py +36 -0
- flowfile_frame/database/connection_manager.py +205 -0
- flowfile_frame/database/frame_helpers.py +249 -0
- flowfile_frame/expr.py +311 -218
- flowfile_frame/expr.pyi +160 -159
- flowfile_frame/expr_name.py +23 -23
- flowfile_frame/flow_frame.py +571 -476
- flowfile_frame/flow_frame.pyi +123 -104
- flowfile_frame/flow_frame_methods.py +227 -246
- flowfile_frame/group_frame.py +50 -20
- flowfile_frame/join.py +2 -2
- flowfile_frame/lazy.py +129 -87
- flowfile_frame/lazy_methods.py +83 -30
- flowfile_frame/list_name_space.py +55 -50
- flowfile_frame/selectors.py +148 -68
- flowfile_frame/series.py +9 -7
- flowfile_frame/utils.py +19 -21
- flowfile_worker/__init__.py +12 -7
- flowfile_worker/configs.py +41 -33
- flowfile_worker/create/__init__.py +14 -9
- flowfile_worker/create/funcs.py +114 -77
- flowfile_worker/create/models.py +46 -43
- flowfile_worker/create/pl_types.py +14 -15
- flowfile_worker/create/read_excel_tables.py +34 -41
- flowfile_worker/create/utils.py +22 -19
- flowfile_worker/external_sources/s3_source/main.py +18 -51
- flowfile_worker/external_sources/s3_source/models.py +34 -27
- flowfile_worker/external_sources/sql_source/main.py +8 -5
- flowfile_worker/external_sources/sql_source/models.py +13 -9
- flowfile_worker/flow_logger.py +10 -8
- flowfile_worker/funcs.py +214 -155
- flowfile_worker/main.py +11 -17
- flowfile_worker/models.py +35 -28
- flowfile_worker/process_manager.py +2 -3
- flowfile_worker/routes.py +121 -90
- flowfile_worker/secrets.py +114 -21
- flowfile_worker/spawner.py +89 -54
- flowfile_worker/utils.py +3 -2
- shared/__init__.py +2 -7
- shared/storage_config.py +25 -13
- test_utils/postgres/commands.py +3 -2
- test_utils/postgres/fixtures.py +9 -9
- test_utils/s3/commands.py +1 -1
- test_utils/s3/data_generator.py +3 -4
- test_utils/s3/demo_data_generator.py +4 -7
- test_utils/s3/fixtures.py +7 -5
- tools/migrate/__init__.py +1 -1
- tools/migrate/__main__.py +16 -29
- tools/migrate/legacy_schemas.py +251 -190
- tools/migrate/migrate.py +193 -181
- tools/migrate/tests/conftest.py +1 -3
- tools/migrate/tests/test_migrate.py +36 -41
- tools/migrate/tests/test_migration_e2e.py +28 -29
- tools/migrate/tests/test_node_migrations.py +50 -20
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
- flowfile/web/static/assets/ContextMenu-23e909da.js +0 -41
- flowfile/web/static/assets/ContextMenu-4c74eef1.css +0 -26
- flowfile/web/static/assets/ContextMenu-63cfa99b.css +0 -26
- flowfile/web/static/assets/ContextMenu-70ae0c79.js +0 -41
- flowfile/web/static/assets/ContextMenu-c13f91d0.css +0 -26
- flowfile/web/static/assets/ContextMenu-f149cf7c.js +0 -41
- flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
- flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
- flowfile/web/static/assets/Filter-9b6d08db.js +0 -164
- flowfile/web/static/assets/Filter-f62091b3.css +0 -20
- flowfile/web/static/assets/GroupBy-b9505323.css +0 -51
- flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
- flowfile/web/static/assets/Output-283fe388.css +0 -37
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
- flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
- flowfile/web/static/assets/SQLQueryComponent-36cef432.css +0 -27
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
- flowfile/web/static/assets/Sort-3643d625.css +0 -51
- flowfile/web/static/assets/Unique-f9fb0809.css +0 -51
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +0 -41
- flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
- flowfile/web/static/assets/secretApi-68435402.js +0 -46
- flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
- flowfile-0.5.1.dist-info/RECORD +0 -388
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/WHEEL +0 -0
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/entry_points.txt +0 -0
- {flowfile-0.5.1.dist-info → flowfile-0.5.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,52 +1,50 @@
|
|
|
1
1
|
# Standard library imports
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
2
4
|
import logging
|
|
3
5
|
import os
|
|
6
|
+
from collections.abc import Callable, Generator, Iterable
|
|
4
7
|
from copy import deepcopy
|
|
5
8
|
from dataclasses import dataclass
|
|
6
9
|
from math import ceil
|
|
7
|
-
from typing import Any,
|
|
10
|
+
from typing import Any, Literal, TypeVar, Union
|
|
8
11
|
|
|
9
|
-
|
|
12
|
+
import polars as pl
|
|
10
13
|
|
|
11
14
|
# Third-party imports
|
|
12
15
|
from loky import Future
|
|
13
|
-
import
|
|
16
|
+
from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
|
|
14
17
|
from polars.exceptions import PanicException
|
|
15
|
-
from polars_grouper import graph_solver
|
|
16
18
|
from polars_expr_transformer import simple_function_to_expr as to_expr
|
|
19
|
+
from polars_grouper import graph_solver
|
|
17
20
|
from pyarrow import Table as PaTable
|
|
18
21
|
from pyarrow.parquet import ParquetFile
|
|
19
22
|
|
|
20
23
|
# Local imports - Core
|
|
21
24
|
from flowfile_core.configs import logger
|
|
22
|
-
from flowfile_core.utils.utils import ensure_similarity_dicts
|
|
23
25
|
from flowfile_core.configs.flow_logger import NodeLogger
|
|
24
|
-
from flowfile_core.schemas import (
|
|
25
|
-
cloud_storage_schemas,
|
|
26
|
-
input_schema,
|
|
27
|
-
transform_schema as transform_schemas
|
|
28
|
-
)
|
|
29
|
-
from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
|
|
30
26
|
|
|
31
27
|
# Local imports - Flow File Components
|
|
32
28
|
from flowfile_core.flowfile.flow_data_engine import utils
|
|
33
|
-
from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
|
|
34
|
-
|
|
35
|
-
|
|
29
|
+
from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
|
|
30
|
+
CloudStorageReader,
|
|
31
|
+
ensure_path_has_wildcard_pattern,
|
|
32
|
+
get_first_file_from_s3_dir,
|
|
33
|
+
)
|
|
36
34
|
from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
|
|
37
35
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
|
|
38
36
|
FlowfileColumn,
|
|
39
37
|
assert_if_flowfile_schema,
|
|
40
|
-
convert_stats_to_column_info
|
|
38
|
+
convert_stats_to_column_info,
|
|
41
39
|
)
|
|
42
40
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
|
|
43
41
|
from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
|
|
44
42
|
from flowfile_core.flowfile.flow_data_engine.join import (
|
|
45
|
-
|
|
46
|
-
verify_join_map_integrity,
|
|
47
|
-
rename_df_table_for_join,
|
|
43
|
+
get_col_name_to_delete,
|
|
48
44
|
get_undo_rename_mapping_join,
|
|
49
|
-
|
|
45
|
+
rename_df_table_for_join,
|
|
46
|
+
verify_join_map_integrity,
|
|
47
|
+
verify_join_select_integrity,
|
|
50
48
|
)
|
|
51
49
|
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
|
|
52
50
|
from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
|
|
@@ -55,19 +53,21 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
|
|
|
55
53
|
ExternalDfFetcher,
|
|
56
54
|
ExternalExecutorTracker,
|
|
57
55
|
ExternalFuzzyMatchFetcher,
|
|
58
|
-
fetch_unique_values
|
|
59
|
-
)
|
|
60
|
-
from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
|
|
61
|
-
get_join_count,
|
|
62
|
-
write_threaded
|
|
56
|
+
fetch_unique_values,
|
|
63
57
|
)
|
|
64
|
-
|
|
58
|
+
from flowfile_core.flowfile.flow_data_engine.threaded_processes import write_threaded
|
|
65
59
|
from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
|
|
60
|
+
from flowfile_core.schemas import cloud_storage_schemas, input_schema
|
|
61
|
+
from flowfile_core.schemas import transform_schema as transform_schemas
|
|
62
|
+
from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
|
|
63
|
+
from flowfile_core.utils.utils import ensure_similarity_dicts
|
|
66
64
|
|
|
67
|
-
T = TypeVar(
|
|
65
|
+
T = TypeVar("T", pl.DataFrame, pl.LazyFrame)
|
|
68
66
|
|
|
69
67
|
|
|
70
|
-
def _handle_duplication_join_keys(
|
|
68
|
+
def _handle_duplication_join_keys(
|
|
69
|
+
left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager
|
|
70
|
+
) -> tuple[T, T, dict[str, str]]:
|
|
71
71
|
"""Temporarily renames join keys to avoid conflicts during a join.
|
|
72
72
|
|
|
73
73
|
This helper function checks the join type and renames the join key columns
|
|
@@ -88,20 +88,26 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transfo
|
|
|
88
88
|
"""
|
|
89
89
|
|
|
90
90
|
def _construct_temp_name(column_name: str) -> str:
|
|
91
|
-
return "__FL_TEMP__"+column_name
|
|
91
|
+
return "__FL_TEMP__" + column_name
|
|
92
92
|
|
|
93
|
-
if join_manager.how ==
|
|
94
|
-
left_df = left_df.with_columns(
|
|
95
|
-
|
|
93
|
+
if join_manager.how == "right":
|
|
94
|
+
left_df = left_df.with_columns(
|
|
95
|
+
pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
|
|
96
|
+
for jk in join_manager.left_manager.get_join_key_selects()
|
|
97
|
+
)
|
|
96
98
|
reverse_actions = {
|
|
97
99
|
_construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
|
|
98
|
-
for jk in join_manager.left_manager.get_join_key_selects()
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
100
|
+
for jk in join_manager.left_manager.get_join_key_selects()
|
|
101
|
+
}
|
|
102
|
+
elif join_manager.how in ("left", "inner"):
|
|
103
|
+
right_df = right_df.with_columns(
|
|
104
|
+
pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
|
|
105
|
+
for jk in join_manager.right_manager.get_join_key_selects()
|
|
106
|
+
)
|
|
102
107
|
reverse_actions = {
|
|
103
108
|
_construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
|
|
104
|
-
for jk in join_manager.right_manager.get_join_key_selects()
|
|
109
|
+
for jk in join_manager.right_manager.get_join_key_selects()
|
|
110
|
+
}
|
|
105
111
|
else:
|
|
106
112
|
reverse_actions = {}
|
|
107
113
|
return left_df, right_df, reverse_actions
|
|
@@ -118,12 +124,12 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
|
|
|
118
124
|
Args:
|
|
119
125
|
join_input: The JoinInput settings object to modify.
|
|
120
126
|
"""
|
|
121
|
-
if join_input.how in (
|
|
127
|
+
if join_input.how in ("semi", "anti"):
|
|
122
128
|
for jk in join_input.right_select.renames:
|
|
123
129
|
jk.keep = False
|
|
124
130
|
|
|
125
131
|
|
|
126
|
-
def get_select_columns(full_select_input:
|
|
132
|
+
def get_select_columns(full_select_input: list[transform_schemas.SelectInput]) -> list[str]:
|
|
127
133
|
"""Extracts a list of column names to be selected from a SelectInput list.
|
|
128
134
|
|
|
129
135
|
This function filters a list of `SelectInput` objects to return the names
|
|
@@ -156,15 +162,16 @@ class FlowDataEngine:
|
|
|
156
162
|
errors: A list of errors encountered during operations.
|
|
157
163
|
_schema: A cached list of `FlowfileColumn` objects representing the schema.
|
|
158
164
|
"""
|
|
165
|
+
|
|
159
166
|
# Core attributes
|
|
160
|
-
_data_frame:
|
|
161
|
-
columns:
|
|
167
|
+
_data_frame: pl.DataFrame | pl.LazyFrame
|
|
168
|
+
columns: list[Any]
|
|
162
169
|
|
|
163
170
|
# Metadata attributes
|
|
164
171
|
name: str = None
|
|
165
172
|
number_of_records: int = None
|
|
166
|
-
errors:
|
|
167
|
-
_schema:
|
|
173
|
+
errors: list = None
|
|
174
|
+
_schema: list["FlowfileColumn"] | None = None
|
|
168
175
|
|
|
169
176
|
# Configuration attributes
|
|
170
177
|
_optimize_memory: bool = False
|
|
@@ -173,16 +180,16 @@ class FlowDataEngine:
|
|
|
173
180
|
_calculate_schema_stats: bool = False
|
|
174
181
|
|
|
175
182
|
# Cache and optimization attributes
|
|
176
|
-
__col_name_idx_map:
|
|
177
|
-
__data_map:
|
|
178
|
-
__optimized_columns:
|
|
183
|
+
__col_name_idx_map: dict = None
|
|
184
|
+
__data_map: dict = None
|
|
185
|
+
__optimized_columns: list = None
|
|
179
186
|
__sample__: str = None
|
|
180
187
|
__number_of_fields: int = None
|
|
181
|
-
_col_idx:
|
|
188
|
+
_col_idx: dict[str, int] = None
|
|
182
189
|
|
|
183
190
|
# Source tracking
|
|
184
|
-
_org_path:
|
|
185
|
-
_external_source:
|
|
191
|
+
_org_path: str | None = None
|
|
192
|
+
_external_source: ExternalDataSource | None = None
|
|
186
193
|
|
|
187
194
|
# State tracking
|
|
188
195
|
sorted_by: int = None
|
|
@@ -195,17 +202,21 @@ class FlowDataEngine:
|
|
|
195
202
|
_number_of_records_callback: Callable = None
|
|
196
203
|
_data_callback: Callable = None
|
|
197
204
|
|
|
198
|
-
def __init__(
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
205
|
+
def __init__(
|
|
206
|
+
self,
|
|
207
|
+
raw_data: Union[
|
|
208
|
+
list[dict], list[Any], dict[str, Any], "ParquetFile", pl.DataFrame, pl.LazyFrame, input_schema.RawData
|
|
209
|
+
] = None,
|
|
210
|
+
path_ref: str = None,
|
|
211
|
+
name: str = None,
|
|
212
|
+
optimize_memory: bool = True,
|
|
213
|
+
schema: list["FlowfileColumn"] | list[str] | pl.Schema = None,
|
|
214
|
+
number_of_records: int = None,
|
|
215
|
+
calculate_schema_stats: bool = False,
|
|
216
|
+
streamable: bool = True,
|
|
217
|
+
number_of_records_callback: Callable = None,
|
|
218
|
+
data_callback: Callable = None,
|
|
219
|
+
):
|
|
209
220
|
"""Initializes the FlowDataEngine from various data sources.
|
|
210
221
|
|
|
211
222
|
Args:
|
|
@@ -265,12 +276,12 @@ class FlowDataEngine:
|
|
|
265
276
|
elif isinstance(raw_data, (list, dict)):
|
|
266
277
|
self._handle_python_data(raw_data)
|
|
267
278
|
|
|
268
|
-
def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records:
|
|
279
|
+
def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: int | None):
|
|
269
280
|
"""(Internal) Initializes the engine from an eager Polars DataFrame."""
|
|
270
281
|
self.data_frame = df
|
|
271
282
|
self.number_of_records = number_of_records or df.select(pl.len())[0, 0]
|
|
272
283
|
|
|
273
|
-
def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records:
|
|
284
|
+
def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: int | None, optimize_memory: bool):
|
|
274
285
|
"""(Internal) Initializes the engine from a Polars LazyFrame."""
|
|
275
286
|
self.data_frame = lf
|
|
276
287
|
self._lazy = True
|
|
@@ -281,14 +292,14 @@ class FlowDataEngine:
|
|
|
281
292
|
else:
|
|
282
293
|
self.number_of_records = lf.select(pl.len()).collect()[0, 0]
|
|
283
294
|
|
|
284
|
-
def _handle_python_data(self, data:
|
|
295
|
+
def _handle_python_data(self, data: list | dict):
|
|
285
296
|
"""(Internal) Dispatches Python collections to the correct handler."""
|
|
286
297
|
if isinstance(data, dict):
|
|
287
298
|
self._handle_dict_input(data)
|
|
288
299
|
else:
|
|
289
300
|
self._handle_list_input(data)
|
|
290
301
|
|
|
291
|
-
def _handle_dict_input(self, data:
|
|
302
|
+
def _handle_dict_input(self, data: dict):
|
|
292
303
|
"""(Internal) Initializes the engine from a Python dictionary."""
|
|
293
304
|
if len(data) == 0:
|
|
294
305
|
self.initialize_empty_fl()
|
|
@@ -312,8 +323,12 @@ class FlowDataEngine:
|
|
|
312
323
|
raw_data: An instance of `RawData` containing the data and schema.
|
|
313
324
|
"""
|
|
314
325
|
flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
|
|
315
|
-
polars_schema = pl.Schema(
|
|
316
|
-
|
|
326
|
+
polars_schema = pl.Schema(
|
|
327
|
+
[
|
|
328
|
+
(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
|
|
329
|
+
for flowfile_column in flowfile_schema
|
|
330
|
+
]
|
|
331
|
+
)
|
|
317
332
|
try:
|
|
318
333
|
df = pl.DataFrame(raw_data.data, polars_schema, strict=False)
|
|
319
334
|
except TypeError as e:
|
|
@@ -323,7 +338,7 @@ class FlowDataEngine:
|
|
|
323
338
|
self.data_frame = df.lazy()
|
|
324
339
|
self.lazy = True
|
|
325
340
|
|
|
326
|
-
def _handle_list_input(self, data:
|
|
341
|
+
def _handle_list_input(self, data: list):
|
|
327
342
|
"""(Internal) Initializes the engine from a list of records."""
|
|
328
343
|
number_of_records = len(data)
|
|
329
344
|
if number_of_records > 0:
|
|
@@ -336,19 +351,19 @@ class FlowDataEngine:
|
|
|
336
351
|
self.number_of_records = 0
|
|
337
352
|
|
|
338
353
|
@staticmethod
|
|
339
|
-
def _process_list_data(data:
|
|
354
|
+
def _process_list_data(data: list) -> list[dict]:
|
|
340
355
|
"""(Internal) Normalizes list data into a list of dictionaries.
|
|
341
356
|
|
|
342
357
|
Ensures that a list of objects or non-dict items is converted into a
|
|
343
358
|
uniform list of dictionaries suitable for Polars DataFrame creation.
|
|
344
359
|
"""
|
|
345
|
-
if not (isinstance(data[0], dict) or hasattr(data[0],
|
|
360
|
+
if not (isinstance(data[0], dict) or hasattr(data[0], "__dict__")):
|
|
346
361
|
try:
|
|
347
362
|
return pl.DataFrame(data).to_dicts()
|
|
348
363
|
except TypeError:
|
|
349
|
-
raise Exception(
|
|
364
|
+
raise Exception("Value must be able to be converted to dictionary")
|
|
350
365
|
except Exception as e:
|
|
351
|
-
raise Exception(f
|
|
366
|
+
raise Exception(f"Value must be able to be converted to dictionary: {e}")
|
|
352
367
|
|
|
353
368
|
if not isinstance(data[0], dict):
|
|
354
369
|
data = [row.__dict__ for row in data]
|
|
@@ -375,49 +390,37 @@ class FlowDataEngine:
|
|
|
375
390
|
|
|
376
391
|
logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")
|
|
377
392
|
|
|
378
|
-
if write_settings.write_mode ==
|
|
393
|
+
if write_settings.write_mode == "append" and write_settings.file_format != "delta":
|
|
379
394
|
raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
|
|
380
395
|
storage_options = CloudStorageReader.get_storage_options(connection)
|
|
381
396
|
credential_provider = CloudStorageReader.get_credential_provider(connection)
|
|
382
397
|
# Dispatch to the correct writer based on file format
|
|
383
398
|
if write_settings.file_format == "parquet":
|
|
384
399
|
self._write_parquet_to_cloud(
|
|
385
|
-
write_settings.resource_path,
|
|
386
|
-
storage_options,
|
|
387
|
-
credential_provider,
|
|
388
|
-
write_settings
|
|
400
|
+
write_settings.resource_path, storage_options, credential_provider, write_settings
|
|
389
401
|
)
|
|
390
402
|
elif write_settings.file_format == "delta":
|
|
391
403
|
self._write_delta_to_cloud(
|
|
392
|
-
write_settings.resource_path,
|
|
393
|
-
storage_options,
|
|
394
|
-
credential_provider,
|
|
395
|
-
write_settings
|
|
404
|
+
write_settings.resource_path, storage_options, credential_provider, write_settings
|
|
396
405
|
)
|
|
397
406
|
elif write_settings.file_format == "csv":
|
|
398
|
-
self._write_csv_to_cloud(
|
|
399
|
-
write_settings.resource_path,
|
|
400
|
-
storage_options,
|
|
401
|
-
credential_provider,
|
|
402
|
-
write_settings
|
|
403
|
-
)
|
|
407
|
+
self._write_csv_to_cloud(write_settings.resource_path, storage_options, credential_provider, write_settings)
|
|
404
408
|
elif write_settings.file_format == "json":
|
|
405
409
|
self._write_json_to_cloud(
|
|
406
|
-
write_settings.resource_path,
|
|
407
|
-
storage_options,
|
|
408
|
-
credential_provider,
|
|
409
|
-
write_settings
|
|
410
|
+
write_settings.resource_path, storage_options, credential_provider, write_settings
|
|
410
411
|
)
|
|
411
412
|
else:
|
|
412
413
|
raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")
|
|
413
414
|
|
|
414
415
|
logger.info(f"Successfully wrote data to {write_settings.resource_path}")
|
|
415
416
|
|
|
416
|
-
def _write_parquet_to_cloud(
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
417
|
+
def _write_parquet_to_cloud(
|
|
418
|
+
self,
|
|
419
|
+
resource_path: str,
|
|
420
|
+
storage_options: dict[str, Any],
|
|
421
|
+
credential_provider: Callable | None,
|
|
422
|
+
write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
|
|
423
|
+
):
|
|
421
424
|
"""(Internal) Writes the DataFrame to a Parquet file in cloud storage.
|
|
422
425
|
|
|
423
426
|
Uses `sink_parquet` for efficient streaming writes. Falls back to a
|
|
@@ -437,18 +440,20 @@ class FlowDataEngine:
|
|
|
437
440
|
except Exception as e:
|
|
438
441
|
logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
|
|
439
442
|
pl_df = self.collect()
|
|
440
|
-
sink_kwargs[
|
|
443
|
+
sink_kwargs["file"] = sink_kwargs.pop("path")
|
|
441
444
|
pl_df.write_parquet(**sink_kwargs)
|
|
442
445
|
|
|
443
446
|
except Exception as e:
|
|
444
447
|
logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
|
|
445
448
|
raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
|
|
446
449
|
|
|
447
|
-
def _write_delta_to_cloud(
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
450
|
+
def _write_delta_to_cloud(
|
|
451
|
+
self,
|
|
452
|
+
resource_path: str,
|
|
453
|
+
storage_options: dict[str, Any],
|
|
454
|
+
credential_provider: Callable | None,
|
|
455
|
+
write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
|
|
456
|
+
):
|
|
452
457
|
"""(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.
|
|
453
458
|
|
|
454
459
|
This operation requires collecting the data first, as `write_delta` operates
|
|
@@ -464,11 +469,13 @@ class FlowDataEngine:
|
|
|
464
469
|
sink_kwargs["credential_provider"] = credential_provider
|
|
465
470
|
self.collect().write_delta(**sink_kwargs)
|
|
466
471
|
|
|
467
|
-
def _write_csv_to_cloud(
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
+
def _write_csv_to_cloud(
|
|
473
|
+
self,
|
|
474
|
+
resource_path: str,
|
|
475
|
+
storage_options: dict[str, Any],
|
|
476
|
+
credential_provider: Callable | None,
|
|
477
|
+
write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
|
|
478
|
+
):
|
|
472
479
|
"""(Internal) Writes the DataFrame to a CSV file in cloud storage.
|
|
473
480
|
|
|
474
481
|
Uses `sink_csv` for efficient, streaming writes of the data.
|
|
@@ -490,11 +497,13 @@ class FlowDataEngine:
|
|
|
490
497
|
logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
|
|
491
498
|
raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
|
|
492
499
|
|
|
493
|
-
def _write_json_to_cloud(
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
500
|
+
def _write_json_to_cloud(
|
|
501
|
+
self,
|
|
502
|
+
resource_path: str,
|
|
503
|
+
storage_options: dict[str, Any],
|
|
504
|
+
credential_provider: Callable | None,
|
|
505
|
+
write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
|
|
506
|
+
):
|
|
498
507
|
"""(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.
|
|
499
508
|
|
|
500
509
|
Uses `sink_ndjson` for efficient, streaming writes.
|
|
@@ -512,7 +521,9 @@ class FlowDataEngine:
|
|
|
512
521
|
raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
|
|
513
522
|
|
|
514
523
|
@classmethod
|
|
515
|
-
def from_cloud_storage_obj(
|
|
524
|
+
def from_cloud_storage_obj(
|
|
525
|
+
cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal
|
|
526
|
+
) -> "FlowDataEngine":
|
|
516
527
|
"""Creates a FlowDataEngine from an object in cloud storage.
|
|
517
528
|
|
|
518
529
|
This method supports reading from various cloud storage providers like AWS S3,
|
|
@@ -549,31 +560,22 @@ class FlowDataEngine:
|
|
|
549
560
|
)
|
|
550
561
|
elif read_settings.file_format == "delta":
|
|
551
562
|
return cls._read_delta_from_cloud(
|
|
552
|
-
read_settings.resource_path,
|
|
553
|
-
storage_options,
|
|
554
|
-
credential_provider,
|
|
555
|
-
read_settings
|
|
563
|
+
read_settings.resource_path, storage_options, credential_provider, read_settings
|
|
556
564
|
)
|
|
557
565
|
elif read_settings.file_format == "csv":
|
|
558
566
|
return cls._read_csv_from_cloud(
|
|
559
|
-
read_settings.resource_path,
|
|
560
|
-
storage_options,
|
|
561
|
-
credential_provider,
|
|
562
|
-
read_settings
|
|
567
|
+
read_settings.resource_path, storage_options, credential_provider, read_settings
|
|
563
568
|
)
|
|
564
569
|
elif read_settings.file_format == "json":
|
|
565
570
|
return cls._read_json_from_cloud(
|
|
566
571
|
read_settings.resource_path,
|
|
567
572
|
storage_options,
|
|
568
573
|
credential_provider,
|
|
569
|
-
read_settings.scan_mode == "directory"
|
|
574
|
+
read_settings.scan_mode == "directory",
|
|
570
575
|
)
|
|
571
576
|
elif read_settings.file_format == "iceberg":
|
|
572
577
|
return cls._read_iceberg_from_cloud(
|
|
573
|
-
read_settings.resource_path,
|
|
574
|
-
storage_options,
|
|
575
|
-
credential_provider,
|
|
576
|
-
read_settings
|
|
578
|
+
read_settings.resource_path, storage_options, credential_provider, read_settings
|
|
577
579
|
)
|
|
578
580
|
|
|
579
581
|
elif read_settings.file_format in ["delta", "iceberg"]:
|
|
@@ -583,33 +585,40 @@ class FlowDataEngine:
|
|
|
583
585
|
raise ValueError(f"Unsupported file format: {read_settings.file_format}")
|
|
584
586
|
|
|
585
587
|
@staticmethod
|
|
586
|
-
def _get_schema_from_first_file_in_dir(
|
|
587
|
-
|
|
588
|
+
def _get_schema_from_first_file_in_dir(
|
|
589
|
+
source: str, storage_options: dict[str, Any], file_format: Literal["csv", "parquet", "json", "delta"]
|
|
590
|
+
) -> list[FlowfileColumn] | None:
|
|
588
591
|
"""Infers the schema by scanning the first file in a cloud directory."""
|
|
589
592
|
try:
|
|
590
593
|
scan_func = getattr(pl, "scan_" + file_format)
|
|
591
594
|
first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
|
|
592
|
-
return convert_stats_to_column_info(
|
|
593
|
-
|
|
595
|
+
return convert_stats_to_column_info(
|
|
596
|
+
FlowDataEngine._create_schema_stats_from_pl_schema(
|
|
597
|
+
scan_func(first_file_ref, storage_options=storage_options).collect_schema()
|
|
598
|
+
)
|
|
599
|
+
)
|
|
594
600
|
except Exception as e:
|
|
595
601
|
logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
|
|
596
602
|
|
|
597
|
-
|
|
598
603
|
@classmethod
|
|
599
|
-
def _read_iceberg_from_cloud(
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
+
def _read_iceberg_from_cloud(
|
|
605
|
+
cls,
|
|
606
|
+
resource_path: str,
|
|
607
|
+
storage_options: dict[str, Any],
|
|
608
|
+
credential_provider: Callable | None,
|
|
609
|
+
read_settings: cloud_storage_schemas.CloudStorageReadSettings,
|
|
610
|
+
) -> "FlowDataEngine":
|
|
604
611
|
"""Reads Iceberg table(s) from cloud storage."""
|
|
605
|
-
raise NotImplementedError(
|
|
612
|
+
raise NotImplementedError("Failed to read Iceberg table from cloud storage: Not yet implemented")
|
|
606
613
|
|
|
607
614
|
@classmethod
|
|
608
|
-
def _read_parquet_from_cloud(
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
615
|
+
def _read_parquet_from_cloud(
|
|
616
|
+
cls,
|
|
617
|
+
resource_path: str,
|
|
618
|
+
storage_options: dict[str, Any],
|
|
619
|
+
credential_provider: Callable | None,
|
|
620
|
+
is_directory: bool,
|
|
621
|
+
) -> "FlowDataEngine":
|
|
613
622
|
"""Reads Parquet file(s) from cloud storage."""
|
|
614
623
|
try:
|
|
615
624
|
# Use scan_parquet for lazy evaluation
|
|
@@ -633,7 +642,7 @@ class FlowDataEngine:
|
|
|
633
642
|
number_of_records=6_666_666, # Set so the provider is not accessed for this stat
|
|
634
643
|
optimize_memory=True,
|
|
635
644
|
streamable=True,
|
|
636
|
-
schema=schema
|
|
645
|
+
schema=schema,
|
|
637
646
|
)
|
|
638
647
|
|
|
639
648
|
except Exception as e:
|
|
@@ -641,18 +650,20 @@ class FlowDataEngine:
|
|
|
641
650
|
raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")
|
|
642
651
|
|
|
643
652
|
@classmethod
|
|
644
|
-
def _read_delta_from_cloud(
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
653
|
+
def _read_delta_from_cloud(
|
|
654
|
+
cls,
|
|
655
|
+
resource_path: str,
|
|
656
|
+
storage_options: dict[str, Any],
|
|
657
|
+
credential_provider: Callable | None,
|
|
658
|
+
read_settings: cloud_storage_schemas.CloudStorageReadSettings,
|
|
659
|
+
) -> "FlowDataEngine":
|
|
649
660
|
"""Reads a Delta Lake table from cloud storage."""
|
|
650
661
|
try:
|
|
651
662
|
logger.info("Reading Delta file from cloud storage...")
|
|
652
663
|
logger.info(f"read_settings: {read_settings}")
|
|
653
664
|
scan_kwargs = {"source": resource_path}
|
|
654
665
|
if read_settings.delta_version:
|
|
655
|
-
scan_kwargs[
|
|
666
|
+
scan_kwargs["version"] = read_settings.delta_version
|
|
656
667
|
if storage_options:
|
|
657
668
|
scan_kwargs["storage_options"] = storage_options
|
|
658
669
|
if credential_provider:
|
|
@@ -663,18 +674,20 @@ class FlowDataEngine:
|
|
|
663
674
|
lf,
|
|
664
675
|
number_of_records=6_666_666, # Set so the provider is not accessed for this stat
|
|
665
676
|
optimize_memory=True,
|
|
666
|
-
streamable=True
|
|
677
|
+
streamable=True,
|
|
667
678
|
)
|
|
668
679
|
except Exception as e:
|
|
669
680
|
logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
|
|
670
681
|
raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")
|
|
671
682
|
|
|
672
683
|
@classmethod
|
|
673
|
-
def _read_csv_from_cloud(
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
684
|
+
def _read_csv_from_cloud(
|
|
685
|
+
cls,
|
|
686
|
+
resource_path: str,
|
|
687
|
+
storage_options: dict[str, Any],
|
|
688
|
+
credential_provider: Callable | None,
|
|
689
|
+
read_settings: cloud_storage_schemas.CloudStorageReadSettings,
|
|
690
|
+
) -> "FlowDataEngine":
|
|
678
691
|
"""Reads CSV file(s) from cloud storage."""
|
|
679
692
|
try:
|
|
680
693
|
scan_kwargs = {
|
|
@@ -703,7 +716,7 @@ class FlowDataEngine:
|
|
|
703
716
|
number_of_records=6_666_666, # Will be calculated lazily
|
|
704
717
|
optimize_memory=True,
|
|
705
718
|
streamable=True,
|
|
706
|
-
schema=schema
|
|
719
|
+
schema=schema,
|
|
707
720
|
)
|
|
708
721
|
|
|
709
722
|
except Exception as e:
|
|
@@ -711,11 +724,13 @@ class FlowDataEngine:
|
|
|
711
724
|
raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")
|
|
712
725
|
|
|
713
726
|
@classmethod
|
|
714
|
-
def _read_json_from_cloud(
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
727
|
+
def _read_json_from_cloud(
|
|
728
|
+
cls,
|
|
729
|
+
resource_path: str,
|
|
730
|
+
storage_options: dict[str, Any],
|
|
731
|
+
credential_provider: Callable | None,
|
|
732
|
+
is_directory: bool,
|
|
733
|
+
) -> "FlowDataEngine":
|
|
719
734
|
"""Reads JSON file(s) from cloud storage."""
|
|
720
735
|
try:
|
|
721
736
|
if is_directory:
|
|
@@ -755,8 +770,9 @@ class FlowDataEngine:
|
|
|
755
770
|
else:
|
|
756
771
|
self.data_frame = pl.read_parquet(path_ref)
|
|
757
772
|
|
|
758
|
-
def _finalize_initialization(
|
|
759
|
-
|
|
773
|
+
def _finalize_initialization(
|
|
774
|
+
self, name: str, optimize_memory: bool, schema: Any | None, calculate_schema_stats: bool
|
|
775
|
+
):
|
|
760
776
|
"""Finalizes initialization by setting remaining attributes."""
|
|
761
777
|
_ = calculate_schema_stats
|
|
762
778
|
self.name = name
|
|
@@ -803,23 +819,20 @@ class FlowDataEngine:
|
|
|
803
819
|
def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
|
|
804
820
|
"""Sets the underlying Polars DataFrame or LazyFrame."""
|
|
805
821
|
if self.lazy and isinstance(df, pl.DataFrame):
|
|
806
|
-
raise Exception(
|
|
822
|
+
raise Exception("Cannot set a non-lazy dataframe to a lazy flowfile")
|
|
807
823
|
self._data_frame = df
|
|
808
824
|
|
|
809
825
|
@staticmethod
|
|
810
|
-
def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) ->
|
|
826
|
+
def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> list[dict]:
|
|
811
827
|
"""Converts a Polars Schema into a list of schema statistics dictionaries."""
|
|
812
|
-
return [
|
|
813
|
-
dict(column_name=k, pl_datatype=v, col_index=i)
|
|
814
|
-
for i, (k, v) in enumerate(pl_schema.items())
|
|
815
|
-
]
|
|
828
|
+
return [dict(column_name=k, pl_datatype=v, col_index=i) for i, (k, v) in enumerate(pl_schema.items())]
|
|
816
829
|
|
|
817
|
-
def _add_schema_from_schema_stats(self, schema_stats:
|
|
830
|
+
def _add_schema_from_schema_stats(self, schema_stats: list[dict]):
|
|
818
831
|
"""Populates the schema from a list of schema statistics dictionaries."""
|
|
819
832
|
self._schema = convert_stats_to_column_info(schema_stats)
|
|
820
833
|
|
|
821
834
|
@property
|
|
822
|
-
def schema(self) ->
|
|
835
|
+
def schema(self) -> list[FlowfileColumn]:
|
|
823
836
|
"""The schema of the DataFrame as a list of `FlowfileColumn` objects.
|
|
824
837
|
|
|
825
838
|
This property lazily calculates the schema if it hasn't been determined yet.
|
|
@@ -866,8 +879,10 @@ class FlowDataEngine:
|
|
|
866
879
|
if n_records is None:
|
|
867
880
|
logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
|
|
868
881
|
else:
|
|
869
|
-
logger.info(
|
|
870
|
-
|
|
882
|
+
logger.info(
|
|
883
|
+
f'Fetching {n_records} record(s) for Table object "{id(self)}". '
|
|
884
|
+
f"Settings: streaming={self._streamable}"
|
|
885
|
+
)
|
|
871
886
|
|
|
872
887
|
if not self.lazy:
|
|
873
888
|
return self.data_frame
|
|
@@ -881,16 +896,15 @@ class FlowDataEngine:
|
|
|
881
896
|
def _collect_data(self, n_records: int = None) -> pl.DataFrame:
|
|
882
897
|
"""Internal method to handle data collection logic."""
|
|
883
898
|
if n_records is None:
|
|
884
|
-
|
|
885
899
|
self.collect_external()
|
|
886
900
|
if self._streamable:
|
|
887
901
|
try:
|
|
888
|
-
logger.info(
|
|
902
|
+
logger.info("Collecting data in streaming mode")
|
|
889
903
|
return self.data_frame.collect(engine="streaming")
|
|
890
904
|
except PanicException:
|
|
891
905
|
self._streamable = False
|
|
892
906
|
|
|
893
|
-
logger.info(
|
|
907
|
+
logger.info("Collecting data in non-streaming mode")
|
|
894
908
|
             return self.data_frame.collect()

         if self.external_source is not None:

@@ -919,7 +933,7 @@ class FlowDataEngine:
             return self._create_partial_dataframe(ok_cols, error_cols, n_records)
         return self._create_empty_dataframe(n_records)

-    def _identify_valid_columns(self, n_records: int) ->
+    def _identify_valid_columns(self, n_records: int) -> tuple[list[str], list[tuple[str, Any]]]:
         """Identifies which columns can be collected successfully."""
         ok_cols = []
         error_cols = []

@@ -931,30 +945,30 @@ class FlowDataEngine:
                 error_cols.append((c, self.data_frame.schema[c]))
         return ok_cols, error_cols

-    def _create_partial_dataframe(
-
+    def _create_partial_dataframe(
+        self, ok_cols: list[str], error_cols: list[tuple[str, Any]], n_records: int
+    ) -> pl.DataFrame:
         """Creates a DataFrame with partial data for columns that could be collected."""
         df = self.data_frame.select(ok_cols)
-        df = df.with_columns([
-            pl.lit(None).alias(column_name).cast(data_type)
-            for column_name, data_type in error_cols
-        ])
+        df = df.with_columns([pl.lit(None).alias(column_name).cast(data_type) for column_name, data_type in error_cols])
         return df.select(self.columns).head(n_records).collect()

     def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
         """Creates an empty DataFrame with the correct schema."""
         if self.number_of_records > 0:
-            return pl.DataFrame(
-
-
-
-
-
-
+            return pl.DataFrame(
+                {
+                    column_name: pl.Series(
+                        name=column_name, values=[None] * min(self.number_of_records, n_records)
+                    ).cast(data_type)
+                    for column_name, data_type in self.data_frame.schema.items()
+                }
+            )
         return pl.DataFrame(schema=self.data_frame.schema)

-    def do_group_by(
-
+    def do_group_by(
+        self, group_by_input: transform_schemas.GroupByInput, calculate_schema_stats: bool = True
+    ) -> "FlowDataEngine":
         """Performs a group-by operation on the DataFrame.

         Args:

@@ -966,27 +980,35 @@ class FlowDataEngine:
         Returns:
             A new `FlowDataEngine` instance with the grouped and aggregated data.
         """
-        aggregations = [c for c in group_by_input.agg_cols if c.agg !=
-        group_columns = [c for c in group_by_input.agg_cols if c.agg ==
+        aggregations = [c for c in group_by_input.agg_cols if c.agg != "groupby"]
+        group_columns = [c for c in group_by_input.agg_cols if c.agg == "groupby"]

         if len(group_columns) == 0:
             return FlowDataEngine(
-                self.data_frame.select(
-
-                ),
-                calculate_schema_stats=calculate_schema_stats
+                self.data_frame.select(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
+                calculate_schema_stats=calculate_schema_stats,
             )

         df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
         group_by_columns = [n_c.new_name for n_c in group_columns]
+
+        # Handle case where there are no aggregations - just get unique combinations of group columns
+        if len(aggregations) == 0:
+            return FlowDataEngine(
+                df.select(group_by_columns).unique(),
+                calculate_schema_stats=calculate_schema_stats,
+            )
+
+        grouped_df = df.group_by(*group_by_columns)
+        agg_exprs = [ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations]
+        result_df = grouped_df.agg(agg_exprs)
+
         return FlowDataEngine(
-
-
-            ),
-            calculate_schema_stats=calculate_schema_stats
+            result_df,
+            calculate_schema_stats=calculate_schema_stats,
         )

-    def do_sort(self, sorts:
+    def do_sort(self, sorts: list[transform_schemas.SortByInput]) -> "FlowDataEngine":
         """Sorts the DataFrame by one or more columns.

         Args:
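The reworked `do_group_by` above adds an explicit path for the case where only group-by columns are supplied: it returns the unique combinations of those columns instead of calling `agg` with an empty list. A minimal stand-alone sketch of that behaviour in plain Polars (hypothetical column names, not Flowfile's API):

    import polars as pl

    lf = pl.LazyFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

    group_by_columns = ["group"]
    aggregations = []  # e.g. [pl.col("value").sum().alias("value_sum")] when aggregations exist

    if not aggregations:
        # no aggregations: just the unique combinations of the group columns
        result = lf.select(group_by_columns).unique()
    else:
        result = lf.group_by(*group_by_columns).agg(aggregations)

    print(result.collect())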
@@ -999,12 +1021,13 @@ class FlowDataEngine:
         if not sorts:
             return self

-        descending = [s.how ==
+        descending = [s.how == "desc" or s.how.lower() == "descending" for s in sorts]
         df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
         return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)

-    def change_column_types(
-
+    def change_column_types(
+        self, transforms: list[transform_schemas.SelectInput], calculate_schema: bool = False
+    ) -> "FlowDataEngine":
         """Changes the data type of one or more columns.

         Args:

@@ -1018,7 +1041,8 @@ class FlowDataEngine:
         dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
         idx_mapping = list(
             (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
-            for transform in transforms
+            for transform in transforms
+            if transform.data_type is not None
         )

         actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]

@@ -1032,10 +1056,10 @@ class FlowDataEngine:
             df,
             number_of_records=self.number_of_records,
             calculate_schema_stats=calculate_schema,
-            streamable=self._streamable
+            streamable=self._streamable,
         )

-    def save(self, path: str, data_type: str =
+    def save(self, path: str, data_type: str = "parquet") -> Future:
         """Saves the DataFrame to a file in a separate thread.

         Args:

@@ -1049,7 +1073,7 @@ class FlowDataEngine:
         df = deepcopy(self.data_frame)
         return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)

-    def to_pylist(self) ->
+    def to_pylist(self) -> list[dict]:
         """Converts the DataFrame to a list of Python dictionaries.

         Returns:

@@ -1083,15 +1107,15 @@ class FlowDataEngine:
         data = list(self.to_dict().values())
         return input_schema.RawData(columns=columns, data=data)

-    def to_dict(self) ->
+    def to_dict(self) -> dict[str, list]:
         """Converts the DataFrame to a Python dictionary of columns.

-
-
+        Each key in the dictionary is a column name, and the corresponding value
+        is a list of the data in that column.

-
-
-
+        Returns:
+            A dictionary mapping column names to lists of their values.
+        """
         if self.lazy:
             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
         else:

@@ -1131,7 +1155,7 @@ class FlowDataEngine:
         return cls(pl.read_sql(sql, conn))

     @classmethod
-    def create_from_schema(cls, schema:
+    def create_from_schema(cls, schema: list[FlowfileColumn]) -> "FlowDataEngine":
         """Creates an empty FlowDataEngine from a schema definition.

         Args:

@@ -1162,14 +1186,14 @@ class FlowDataEngine:
         """
         received_table.set_absolute_filepath()
         file_type_handlers = {
-
-
-
+            "csv": create_funcs.create_from_path_csv,
+            "parquet": create_funcs.create_from_path_parquet,
+            "excel": create_funcs.create_from_path_excel,
         }

         handler = file_type_handlers.get(received_table.file_type)
         if not handler:
-            raise Exception(f
+            raise Exception(f"Cannot create from {received_table.file_type}")

         flow_file = cls(handler(received_table))
         flow_file._org_path = received_table.abs_file_path

@@ -1190,7 +1214,7 @@ class FlowDataEngine:
         return cls(create_fake_data(number_of_records))

     @classmethod
-    def generate_enumerator(cls, length: int = 1000, output_name: str =
+    def generate_enumerator(cls, length: int = 1000, output_name: str = "output_column") -> "FlowDataEngine":
         """Generates a FlowDataEngine with a single column containing a sequence of integers.

         Args:

@@ -1204,8 +1228,9 @@ class FlowDataEngine:
             length = 10_000_000
         return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))

-    def _handle_schema(
-
+    def _handle_schema(
+        self, schema: list[FlowfileColumn] | list[str] | pl.Schema | None, pl_schema: pl.Schema
+    ) -> list[FlowfileColumn] | None:
         """Handles schema processing and validation during initialization."""
         if schema is None and pl_schema is not None:
             return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))

@@ -1216,7 +1241,8 @@ class FlowDataEngine:
         elif pl_schema is not None and schema is not None:
             if schema.__len__() != pl_schema.__len__():
                 raise Exception(
-                    f
+                    f"Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}"
+                )
             if isinstance(schema, pl.Schema):
                 return self._handle_polars_schema(schema, pl_schema)
             elif isinstance(schema, list) and len(schema) == 0:

@@ -1225,31 +1251,29 @@ class FlowDataEngine:
                 return self._handle_string_schema(schema, pl_schema)
         return schema

-    def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) ->
+    def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> list[FlowfileColumn]:
         """Handles Polars schema conversion."""
         flow_file_columns = [
             FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
-            for col_name, dtype in zip(schema.names(), schema.dtypes())
+            for col_name, dtype in zip(schema.names(), schema.dtypes(), strict=False)
         ]

         select_arg = [
             pl.col(o).alias(n).cast(schema_dtype)
-            for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
+            for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes(), strict=False)
         ]

         self.data_frame = self.data_frame.select(select_arg)
         return flow_file_columns

-    def _handle_string_schema(self, schema:
+    def _handle_string_schema(self, schema: list[str], pl_schema: pl.Schema) -> list[FlowfileColumn]:
         """Handles string-based schema conversion."""
         flow_file_columns = [
             FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
-            for col_name, dtype in zip(schema, pl_schema.dtypes())
+            for col_name, dtype in zip(schema, pl_schema.dtypes(), strict=False)
         ]

-        self.data_frame = self.data_frame.rename({
-            o: n for o, n in zip(pl_schema.names(), schema)
-        })
+        self.data_frame = self.data_frame.rename({o: n for o, n in zip(pl_schema.names(), schema, strict=False)})

         return flow_file_columns


@@ -1267,25 +1291,16 @@ class FlowDataEngine:
             A new `FlowDataEngine` instance with the exploded rows.
         """
         output_column_name = (
-            split_input.output_column_name
-            if split_input.output_column_name
-            else split_input.column_to_split
+            split_input.output_column_name if split_input.output_column_name else split_input.column_to_split
         )

         split_value = (
-            split_input.split_fixed_value
-            if split_input.split_by_fixed_value
-            else pl.col(split_input.split_by_column)
+            split_input.split_fixed_value if split_input.split_by_fixed_value else pl.col(split_input.split_by_column)
         )

-        df = (
-
-
-            .str.split(by=split_value)
-            .alias(output_column_name)
-            )
-            .explode(output_column_name)
-        )
+        df = self.data_frame.with_columns(
+            pl.col(split_input.column_to_split).str.split(by=split_value).alias(output_column_name)
+        ).explode(output_column_name)

         return FlowDataEngine(df)

@@ -1305,15 +1320,9 @@ class FlowDataEngine:
         lf = self.data_frame

         if unpivot_input.data_type_selector_expr is not None:
-            result = lf.unpivot(
-                on=unpivot_input.data_type_selector_expr(),
-                index=unpivot_input.index_columns
-            )
+            result = lf.unpivot(on=unpivot_input.data_type_selector_expr(), index=unpivot_input.index_columns)
         elif unpivot_input.value_columns is not None:
-            result = lf.unpivot(
-                on=unpivot_input.value_columns,
-                index=unpivot_input.index_columns
-            )
+            result = lf.unpivot(on=unpivot_input.value_columns, index=unpivot_input.index_columns)
         else:
             result = lf.unpivot()

@@ -1333,19 +1342,24 @@ class FlowDataEngine:
         """
         # Get unique values for pivot columns
         max_unique_vals = 200
-        new_cols_unique = fetch_unique_values(
-
-
-
+        new_cols_unique = fetch_unique_values(
+            self.data_frame.select(pivot_input.pivot_column)
+            .unique()
+            .sort(pivot_input.pivot_column)
+            .limit(max_unique_vals)
+            .cast(pl.String)
+        )
         if len(new_cols_unique) >= max_unique_vals:
             if node_logger:
-                node_logger.warning(
-
+                node_logger.warning(
+                    "Pivot column has too many unique values. Please consider using a different column."
+                    f" Max unique values: {max_unique_vals}"
+                )

         if len(pivot_input.index_columns) == 0:
             no_index_cols = True
-            pivot_input.index_columns = [
-            ff = self.apply_flowfile_formula(
+            pivot_input.index_columns = ["__temp__"]
+            ff = self.apply_flowfile_formula("1", col_name="__temp__")
         else:
             no_index_cols = False
             ff = self

@@ -1355,36 +1369,32 @@ class FlowDataEngine:
         grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
         pivot_column = pivot_input.get_pivot_column()

-        input_df = grouped_ff.data_frame.with_columns(
-            pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
-        )
+        input_df = grouped_ff.data_frame.with_columns(pivot_column.cast(pl.String).alias(pivot_input.pivot_column))
         number_of_aggregations = len(pivot_input.aggregations)
         df = (
-            input_df.select(
-                *index_columns,
-                pivot_column,
-                pivot_input.get_values_expr()
-            )
+            input_df.select(*index_columns, pivot_column, pivot_input.get_values_expr())
             .group_by(*index_columns)
-            .agg(
-
-
-
-
-
+            .agg(
+                [
+                    (pl.col("vals").filter(pivot_column == new_col_value)).first().alias(new_col_value)
+                    for new_col_value in new_cols_unique
+                ]
+            )
             .select(
                 *index_columns,
                 *[
-                    pl.col(new_col)
+                    pl.col(new_col)
+                    .struct.field(agg)
+                    .alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
                     for new_col in new_cols_unique
                     for agg in pivot_input.aggregations
-                ]
+                ],
             )
         )

         # Clean up temporary columns if needed
         if no_index_cols:
-            df = df.drop(
+            df = df.drop("__temp__")
             pivot_input.index_columns = []

         return FlowDataEngine(df, calculate_schema_stats=False)
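The pivot rewrite above caps the number of pivot values and then builds one output column per value with a filter-and-first aggregation. A rough stand-alone sketch of that pattern in plain Polars (illustrative data and column names, not Flowfile's API):

    import polars as pl

    lf = pl.LazyFrame({"idx": [1, 1, 2], "kind": ["a", "b", "a"], "vals": [10, 20, 30]})

    max_unique_vals = 200  # same cap idea as in the diff above
    pivot_values = (
        lf.select("kind").unique().sort("kind").limit(max_unique_vals).collect()["kind"].to_list()
    )

    # one column per pivot value: take the first matching value within each index group
    wide = lf.group_by("idx").agg(
        [pl.col("vals").filter(pl.col("kind") == v).first().alias(v) for v in pivot_values]
    )
    print(wide.collect())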
@@ -1403,7 +1413,7 @@ class FlowDataEngine:
         try:
             f = to_expr(predicate)
         except Exception as e:
-            logger.warning(f
+            logger.warning(f"Error in filter expression: {e}")
             f = to_expr("False")
         df = self.data_frame.filter(f)
         return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)

@@ -1430,29 +1440,27 @@ class FlowDataEngine:
         select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]

         df = (
-            self.data_frame
-            .with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
+            self.data_frame.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
             .with_columns(
-                (
-
-
+                (
+                    pl.cum_count(record_id_settings.output_column_name).over(record_id_settings.group_by_columns)
+                    + record_id_settings.offset
+                    - 1
+                ).alias(record_id_settings.output_column_name)
             )
             .select(select_cols)
         )

-        output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name,
+        output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
         output_schema.extend(self.schema)

         return FlowDataEngine(df, schema=output_schema)

     def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
         """Adds a simple sequential record ID column."""
-        df = self.data_frame.with_row_index(
-            record_id_settings.output_column_name,
-            record_id_settings.offset
-        )
+        df = self.data_frame.with_row_index(record_id_settings.output_column_name, record_id_settings.offset)

-        output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name,
+        output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
         output_schema.extend(self.schema)

         return FlowDataEngine(df, schema=output_schema)
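The grouped record-id variant above is a cumulative count over the group columns, shifted by the configured offset. An equivalent stand-alone expression (toy data and an assumed offset of 1, not the Flowfile settings object):

    import polars as pl

    lf = pl.LazyFrame({"grp": ["a", "a", "b", "a"], "x": [10, 20, 30, 40]})
    offset = 1  # assumed value of record_id_settings.offset

    # cumulative count within each group, shifted so the first row gets `offset`
    with_id = lf.with_columns(
        (pl.col("x").cum_count().over("grp") + offset - 1).alias("record_id")
    )
    print(with_id.collect())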
@@ -1484,7 +1492,7 @@ class FlowDataEngine:

     def __repr__(self) -> str:
         """Returns a string representation of the FlowDataEngine."""
-        return f
+        return f"flow data engine\n{self.data_frame.__repr__()}"

     def __call__(self) -> "FlowDataEngine":
         """Makes the class instance callable, returning itself."""

@@ -1504,16 +1512,16 @@ class FlowDataEngine:
         Returns:
             The same `FlowDataEngine` instance, now backed by the cached data.
         """
-        edf = ExternalDfFetcher(
-
-
-        logger.info(
+        edf = ExternalDfFetcher(
+            lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False, flow_id=-1, node_id=-1
+        )
+        logger.info("Caching data in background")
         result = edf.get_result()
         if isinstance(result, pl.LazyFrame):
-            logger.info(
+            logger.info("Data cached")
             del self._data_frame
             self.data_frame = result
-            logger.info(
+            logger.info("Data loaded from cache")
         return self

     def collect_external(self):

@@ -1525,14 +1533,14 @@ class FlowDataEngine:
         re-evaluated.
         """
         if self._external_source is not None:
-            logger.info(
+            logger.info("Collecting external source")
             if self.external_source.get_pl_df() is not None:
                 self.data_frame = self.external_source.get_pl_df().lazy()
             else:
                 self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
             self._schema = None  # enforce reset schema

-    def get_output_sample(self, n_rows: int = 10) ->
+    def get_output_sample(self, n_rows: int = 10) -> list[dict]:
         """Gets a sample of the data as a list of dictionaries.

         This is typically used to display a preview of the data in a UI.

@@ -1560,14 +1568,20 @@ class FlowDataEngine:
             try:
                 df = df.head(n_rows).collect()
             except Exception as e:
-                logger.warning(f
+                logger.warning(f"Error in getting sample: {e}")
                 df = df.head(n_rows).collect(engine="auto")
         else:
             df = self.collect()
         return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)

-    def get_sample(
-
+    def get_sample(
+        self,
+        n_rows: int = 100,
+        random: bool = False,
+        shuffle: bool = False,
+        seed: int = None,
+        execution_location: ExecutionLocationsLiteral | None = None,
+    ) -> "FlowDataEngine":
         """Gets a sample of rows from the DataFrame.

         Args:

@@ -1579,22 +1593,23 @@ class FlowDataEngine:
         Returns:
             A new `FlowDataEngine` instance containing the sampled data.
         """
-        logging.info(f
+        logging.info(f"Getting sample of {n_rows} rows")
         if random:
             if self.lazy and self.external_source is not None:
                 self.collect_external()

             if self.lazy and shuffle:
-                sample_df =
-
+                sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(
+                    n_rows, seed=seed, shuffle=shuffle
+                )
             elif shuffle:
                 sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
         else:
             if execution_location is None:
                 execution_location = get_global_execution_location()
-            n_rows = min(
-                calculate_in_worker_process=execution_location == "remote")
-
+            n_rows = min(
+                n_rows, self.get_number_of_records(calculate_in_worker_process=execution_location == "remote")
+            )

             every_n_records = ceil(self.number_of_records / n_rows)
             sample_df = self.data_frame.gather_every(every_n_records)

@@ -1619,8 +1634,9 @@ class FlowDataEngine:
         else:
             return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)

-    def iter_batches(
-
+    def iter_batches(
+        self, batch_size: int = 1000, columns: list | tuple | str = None
+    ) -> Generator["FlowDataEngine", None, None]:
         """Iterates over the DataFrame in batches.

         Args:

@@ -1638,9 +1654,14 @@ class FlowDataEngine:
         for batch in batches:
             yield FlowDataEngine(batch)

-    def start_fuzzy_join(
-
-
+    def start_fuzzy_join(
+        self,
+        fuzzy_match_input: transform_schemas.FuzzyMatchInput,
+        other: "FlowDataEngine",
+        file_ref: str,
+        flow_id: int = -1,
+        node_id: int | str = -1,
+    ) -> ExternalFuzzyMatchFetcher:
         """Starts a fuzzy join operation in a background process.

         This method prepares the data and initiates the fuzzy matching in a

@@ -1658,51 +1679,70 @@ class FlowDataEngine:
         progress and retrieve the result of the fuzzy join.
         """
         fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
-        left_df, right_df = prepare_for_fuzzy_match(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        left_df, right_df = prepare_for_fuzzy_match(
+            left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
+        )
+
+        return ExternalFuzzyMatchFetcher(
+            left_df,
+            right_df,
+            fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
+            file_ref=file_ref + "_fm",
+            wait_on_completion=False,
+            flow_id=flow_id,
+            node_id=node_id,
+        )
+
+    def fuzzy_join_external(
+        self,
+        fuzzy_match_input: transform_schemas.FuzzyMatchInput,
+        other: "FlowDataEngine",
+        file_ref: str = None,
+        flow_id: int = -1,
+        node_id: int = -1,
+    ):
         if file_ref is None:
-            file_ref = str(id(self)) +
+            file_ref = str(id(self)) + "_" + str(id(other))
         fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)

-        left_df, right_df = prepare_for_fuzzy_match(
-
-
-
-
-
-
-
+        left_df, right_df = prepare_for_fuzzy_match(
+            left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
+        )
+        external_tracker = ExternalFuzzyMatchFetcher(
+            left_df,
+            right_df,
+            fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
+            file_ref=file_ref + "_fm",
+            wait_on_completion=False,
+            flow_id=flow_id,
+            node_id=node_id,
+        )
         return FlowDataEngine(external_tracker.get_result())

-    def fuzzy_join(
-
-
+    def fuzzy_join(
+        self,
+        fuzzy_match_input: transform_schemas.FuzzyMatchInput,
+        other: "FlowDataEngine",
+        node_logger: NodeLogger = None,
+    ) -> "FlowDataEngine":
         fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
-        left_df, right_df = prepare_for_fuzzy_match(
-
+        left_df, right_df = prepare_for_fuzzy_match(
+            left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
+        )
         fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
-        return FlowDataEngine(
-
-
+        return FlowDataEngine(
+            fuzzy_match_dfs(
+                left_df, right_df, fuzzy_maps=fuzzy_mappings, logger=node_logger.logger if node_logger else logger
+            ).lazy()
+        )

-    def do_cross_join(
-
-
+    def do_cross_join(
+        self,
+        cross_join_input: transform_schemas.CrossJoinInput,
+        auto_generate_selection: bool,
+        verify_integrity: bool,
+        other: "FlowDataEngine",
+    ) -> "FlowDataEngine":
         """Performs a cross join with another DataFrame.

         A cross join produces the Cartesian product of the two DataFrames.

@@ -1723,26 +1763,41 @@ class FlowDataEngine:
         self.lazy = True
         other.lazy = True
         cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
-        verify_join_select_integrity(
-
-
-
-
+        verify_join_select_integrity(
+            cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns
+        )
+        right_select = [
+            v.old_name
+            for v in cross_join_input_manager.right_select.renames
+            if (v.keep or v.join_key) and v.is_available
+        ]
+        left_select = [
+            v.old_name
+            for v in cross_join_input_manager.left_select.renames
+            if (v.keep or v.join_key) and v.is_available
+        ]
         cross_join_input_manager.auto_rename(rename_mode="suffix")
         left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
         right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)

-        joined_df = left.join(right, how=
+        joined_df = left.join(right, how="cross")

-        cols_to_delete_after = [
-
-
+        cols_to_delete_after = [
+            col.new_name
+            for col in cross_join_input_manager.left_select.renames + cross_join_input_manager.right_select.renames
+            if col.join_key and not col.keep and col.is_available
+        ]

         fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
         return fl

-    def join(
-
+    def join(
+        self,
+        join_input: transform_schemas.JoinInput,
+        auto_generate_selection: bool,
+        verify_integrity: bool,
+        other: "FlowDataEngine",
+    ) -> "FlowDataEngine":
         """Performs a standard SQL-style join with another DataFrame."""
         # Create manager from input
         join_manager = transform_schemas.JoinInputManager(join_input)

@@ -1754,40 +1809,52 @@ class FlowDataEngine:
             join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
         verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
         if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
-            raise Exception(
+            raise Exception("Join is not valid by the data fields")

         if auto_generate_selection:
             join_manager.auto_rename()

         # Use manager properties throughout
-        left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
-
+        left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
+            join_manager.left_manager.get_rename_table()
+        )
+        right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(
+            join_manager.right_manager.get_rename_table()
+        )

         left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
         left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
-        if join_manager.how ==
+        if join_manager.how == "right":
             joined_df = right.join(
                 other=left,
                 left_on=join_manager.right_join_keys,
                 right_on=join_manager.left_join_keys,
                 how="left",
-                suffix=""
+                suffix="",
+            ).rename(reverse_join_key_mapping)
         else:
             joined_df = left.join(
                 other=right,
                 left_on=join_manager.left_join_keys,
                 right_on=join_manager.right_join_keys,
                 how=join_manager.how,
-                suffix=""
+                suffix="",
+            ).rename(reverse_join_key_mapping)

-        left_cols_to_delete_after = [
-
-
+        left_cols_to_delete_after = [
+            get_col_name_to_delete(col, "left")
+            for col in join_manager.input.left_select.renames
+            if not col.keep and col.is_available and col.join_key
+        ]

-        right_cols_to_delete_after = [
-
-
-
+        right_cols_to_delete_after = [
+            get_col_name_to_delete(col, "right")
+            for col in join_manager.input.right_select.renames
+            if not col.keep
+            and col.is_available
+            and col.join_key
+            and join_manager.how in ("left", "right", "inner", "cross", "outer")
+        ]

         if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
             joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
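The join path above handles `how == "right"` by swapping the frames and issuing a left join from the right side, then renaming the join keys back. A small illustration of that swap with toy frames (not the Flowfile join managers):

    import polars as pl

    left = pl.LazyFrame({"k": [1, 2], "l": ["a", "b"]})
    right = pl.LazyFrame({"k": [2, 3], "r": ["x", "y"]})

    # a right join of `left` with `right`, expressed as a left join from the right frame
    flipped = right.join(left, on="k", how="left")
    print(flipped.collect())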
@@ -1795,8 +1862,7 @@ class FlowDataEngine:
         undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
         joined_df = joined_df.rename(undo_join_key_remapping)

-        return FlowDataEngine(joined_df, calculate_schema_stats=False,
-                              number_of_records=0, streamable=False)
+        return FlowDataEngine(joined_df, calculate_schema_stats=False, number_of_records=0, streamable=False)

     def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
         """Solves a graph problem represented by 'from' and 'to' columns.

@@ -1811,8 +1877,9 @@ class FlowDataEngine:
             A new `FlowDataEngine` instance with the solved graph data.
         """
         lf = self.data_frame.with_columns(
-            graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
-
+            graph_solver(graph_solver_input.col_from, graph_solver_input.col_to).alias(
+                graph_solver_input.output_column_name
+            )
         )
         return FlowDataEngine(lf)

@@ -1827,7 +1894,7 @@ class FlowDataEngine:
             A new `FlowDataEngine` instance with the added column.
         """
         if col_name is None:
-            col_name =
+            col_name = "new_values"
         return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))

     def get_record_count(self) -> "FlowDataEngine":

@@ -1837,7 +1904,7 @@ class FlowDataEngine:
         Returns:
             A new `FlowDataEngine` instance.
         """
-        return FlowDataEngine(self.data_frame.select(pl.len().alias(
+        return FlowDataEngine(self.data_frame.select(pl.len().alias("number_of_records")))

     def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
         """Asserts that this DataFrame is equal to another.

@@ -1860,13 +1927,13 @@ class FlowDataEngine:
         other = other.select_columns(self.columns)

         if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
-            raise Exception(
+            raise Exception("Number of records is not equal")

         if self.columns != other.columns:
-            raise Exception(
+            raise Exception("Schema is not equal")

         if strict_schema:
-            assert self.data_frame.schema == other.data_frame.schema,
+            assert self.data_frame.schema == other.data_frame.schema, "Data types do not match"

         if ordered:
             self_lf = self.data_frame.sort(by=self.columns)

@@ -1876,7 +1943,7 @@ class FlowDataEngine:
             other_lf = other.data_frame

         self.lazy, other.lazy = org_laziness
-        assert self_lf.equals(other_lf),
+        assert self_lf.equals(other_lf), "Data is not equal"

     def initialize_empty_fl(self):
         """Initializes an empty LazyFrame."""

@@ -1891,7 +1958,7 @@ class FlowDataEngine:
             operation_type="calculate_number_of_records",
             flow_id=-1,
             node_id=-1,
-            wait_on_completion=True
+            wait_on_completion=True,
         ).result
         return number_of_records

@@ -1907,8 +1974,9 @@ class FlowDataEngine:
         """
         return self.get_number_of_records(force_calculate=force_calculate)

-    def get_number_of_records(
-
+    def get_number_of_records(
+        self, warn: bool = False, force_calculate: bool = False, calculate_in_worker_process: bool = False
+    ) -> int:
         """Gets the total number of records in the DataFrame.

         For lazy frames, this may trigger a full data scan, which can be expensive.

@@ -1938,12 +2006,13 @@ class FlowDataEngine:
             except Exception as e:
                 logger.error(f"Error: {e}")
             if warn:
-                logger.warning(
+                logger.warning("Calculating the number of records this can be expensive on a lazy frame")
             try:
                 self.number_of_records = self.data_frame.select(pl.len()).collect(
-                    engine="streaming" if self._streamable else "auto"
+                    engine="streaming" if self._streamable else "auto"
+                )[0, 0]
             except Exception:
-                raise ValueError(
+                raise ValueError("Could not get number of records")
         else:
             self.number_of_records = self.data_frame.__len__()
         return self.number_of_records

@@ -1984,7 +2053,7 @@ class FlowDataEngine:
         return self._external_source

     @property
-    def cols_idx(self) ->
+    def cols_idx(self) -> dict[str, int]:
         """A dictionary mapping column names to their integer index."""
         if self._col_idx is None:
             self._col_idx = {c: i for i, c in enumerate(self.columns)}

@@ -2006,7 +2075,7 @@ class FlowDataEngine:
             [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
         )

-    def select_columns(self, list_select:
+    def select_columns(self, list_select: list[str] | tuple[str] | str) -> "FlowDataEngine":
         """Selects a subset of columns from the DataFrame.

         Args:

@@ -2019,17 +2088,17 @@ class FlowDataEngine:
             list_select = [list_select]

         idx_to_keep = [self.cols_idx.get(c) for c in list_select]
-        selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
+        selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep, strict=False) if id_to_keep is not None]
         new_schema = [self.schema[i] for i in idx_to_keep if i is not None]

         return FlowDataEngine(
             self.data_frame.select(selects),
             number_of_records=self.number_of_records,
             schema=new_schema,
-            streamable=self._streamable
+            streamable=self._streamable,
         )

-    def drop_columns(self, columns:
+    def drop_columns(self, columns: list[str]) -> "FlowDataEngine":
         """Drops specified columns from the DataFrame.

         Args:

@@ -2043,12 +2112,10 @@ class FlowDataEngine:
         new_schema = [self.schema[i] for i in idx_to_keep]

         return FlowDataEngine(
-            self.data_frame.select(cols_for_select),
-            number_of_records=self.number_of_records,
-            schema=new_schema
+            self.data_frame.select(cols_for_select), number_of_records=self.number_of_records, schema=new_schema
         )

-    def reorganize_order(self, column_order:
+    def reorganize_order(self, column_order: list[str]) -> "FlowDataEngine":
         """Reorganizes columns into a specified order.

         Args:

@@ -2061,8 +2128,9 @@ class FlowDataEngine:
         schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
         return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)

-    def apply_flowfile_formula(
-
+    def apply_flowfile_formula(
+        self, func: str, col_name: str, output_data_type: pl.DataType = None
+    ) -> "FlowDataEngine":
         """Applies a formula to create a new column or transform an existing one.

         Args:

@@ -2081,8 +2149,7 @@ class FlowDataEngine:

         return FlowDataEngine(df2, number_of_records=self.number_of_records)

-    def apply_sql_formula(self, func: str, col_name: str,
-                          output_data_type: pl.DataType = None) -> "FlowDataEngine":
+    def apply_sql_formula(self, func: str, col_name: str, output_data_type: pl.DataType = None) -> "FlowDataEngine":
         """Applies an SQL-style formula using `pl.sql_expr`.

         Args:

@@ -2101,8 +2168,9 @@ class FlowDataEngine:

         return FlowDataEngine(df, number_of_records=self.number_of_records)

-    def output(
-
+    def output(
+        self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str, execute_remote: bool = True
+    ) -> "FlowDataEngine":
         """Writes the DataFrame to an output file.

         Can execute the write operation locally or in a remote worker process.

@@ -2116,7 +2184,7 @@ class FlowDataEngine:
         Returns:
             The same `FlowDataEngine` instance for chaining.
         """
-        logger.info(
+        logger.info("Starting to write output")
         if execute_remote:
             status = utils.write_output(
                 self.data_frame,

@@ -2126,11 +2194,11 @@ class FlowDataEngine:
                 sheet_name=output_fs.sheet_name,
                 delimiter=output_fs.delimiter,
                 flow_id=flow_id,
-                node_id=node_id
+                node_id=node_id,
             )
             tracker = ExternalExecutorTracker(status)
             tracker.get_result()
-            logger.info(
+            logger.info("Finished writing output")
         else:
             logger.info("Starting to write results locally")
             utils.local_write_output(

@@ -2172,11 +2240,10 @@ class FlowDataEngine:
         if isinstance(other, FlowDataEngine):
             other = [other]

-        dfs:
-        return FlowDataEngine(pl.concat(dfs, how=
+        dfs: list[pl.LazyFrame] | list[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
+        return FlowDataEngine(pl.concat(dfs, how="diagonal_relaxed"))

-    def do_select(self, select_inputs: transform_schemas.SelectInputs,
-                  keep_missing: bool = True) -> "FlowDataEngine":
+    def do_select(self, select_inputs: transform_schemas.SelectInputs, keep_missing: bool = True) -> "FlowDataEngine":
         """Performs a complex column selection, renaming, and reordering operation.

         Args:

@@ -2192,7 +2259,8 @@ class FlowDataEngine:

         if not keep_missing:
             drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
-                set(r.old_name for r in renames if not r.keep)
+                set(r.old_name for r in renames if not r.keep)
+            )
             keep_cols = []
         else:
             keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))

@@ -2212,12 +2280,14 @@ class FlowDataEngine:

         rename_dict = {r.old_name: r.new_name for r in available_renames}
         fl = self.select_columns(
-            list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
+            list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
+        )
         fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
         ndf = fl.data_frame.rename(rename_dict)
         renames.sort(key=lambda r: 0 if r.position is None else r.position)
-        sorted_cols = utils.match_order(
-
+        sorted_cols = utils.match_order(
+            ndf.collect_schema().names(), [r.new_name for r in renames] + self.data_frame.collect_schema().names()
+        )
         output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
         return output_file.reorganize_order(sorted_cols)

@@ -2225,10 +2295,9 @@ class FlowDataEngine:
         """Sets whether DataFrame operations should be streamable."""
         self._streamable = streamable

-    def _calculate_schema(self) ->
+    def _calculate_schema(self) -> list[dict]:
         """Calculates schema statistics."""
         if self.external_source is not None:
-
             self.collect_external()
         v = utils.calculate_schema(self.data_frame)
         return v

@@ -2247,8 +2316,9 @@ class FlowDataEngine:
         """Creates a FlowDataEngine from a path in a worker process."""
         received_table.set_absolute_filepath()

-        external_fetcher = ExternalCreateFetcher(
-
+        external_fetcher = ExternalCreateFetcher(
+            received_table=received_table, file_type=received_table.file_type, flow_id=flow_id, node_id=node_id
+        )
         return cls(external_fetcher.get_result())


@@ -2271,10 +2341,10 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
    if len(flowfile_tables) == 0:
        kwargs = {}
    elif len(flowfile_tables) == 1:
-        kwargs = {
+        kwargs = {"input_df": flowfile_tables[0].data_frame}
    else:
-        kwargs = {f
+        kwargs = {f"input_df_{i+1}": flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
    df = polars_executable(**kwargs)
    if isinstance(df, pl.DataFrame):
        logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
-    return FlowDataEngine(df)
+    return FlowDataEngine(df)