Flowfile 0.4.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backends/main.py +25 -22
- build_backends/main_prd.py +10 -19
- flowfile/__init__.py +179 -73
- flowfile/__main__.py +10 -7
- flowfile/api.py +52 -59
- flowfile/web/__init__.py +14 -9
- flowfile/web/static/assets/AdminView-49392a9a.js +713 -0
- flowfile/web/static/assets/AdminView-f53bad23.css +129 -0
- flowfile/web/static/assets/CloudConnectionView-36bcd6df.css +72 -0
- flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionView-f13f202b.js} +11 -11
- flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-0023d4a5.js} +10 -8
- flowfile/web/static/assets/{CloudStorageReader-29d14fcc.css → CloudStorageReader-24c54524.css} +27 -27
- flowfile/web/static/assets/{CloudStorageWriter-b0ee067f.css → CloudStorageWriter-60547855.css} +26 -26
- flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-8e781e11.js} +10 -8
- flowfile/web/static/assets/{ColumnSelector-47996a16.css → ColumnSelector-371637fb.css} +2 -2
- flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-8ad68ea9.js} +3 -5
- flowfile/web/static/assets/{ContextMenu-c13f91d0.css → ContextMenu-26d4dd27.css} +6 -6
- flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-31ee57f0.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-69a74055.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-8e2051c6.js} +3 -3
- flowfile/web/static/assets/{ContextMenu-4c74eef1.css → ContextMenu-8ec1729e.css} +6 -6
- flowfile/web/static/assets/{ContextMenu-63cfa99b.css → ContextMenu-9b310c60.css} +6 -6
- flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-03df6938.js} +12 -10
- flowfile/web/static/assets/{CrossJoin-1119d18e.css → CrossJoin-71b4cc10.css} +20 -20
- flowfile/web/static/assets/CustomNode-59e99a86.css +32 -0
- flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-8479239b.js} +36 -24
- flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-869e3efd.js} +5 -4
- flowfile/web/static/assets/{DatabaseConnectionSettings-0c04b2e5.css → DatabaseConnectionSettings-e91df89a.css} +13 -13
- flowfile/web/static/assets/{DatabaseReader-ae61773c.css → DatabaseReader-36898a00.css} +24 -24
- flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-c58b9552.js} +25 -15
- flowfile/web/static/assets/DatabaseView-6655afd6.css +57 -0
- flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseView-d26a9140.js} +11 -11
- flowfile/web/static/assets/{DatabaseWriter-2f570e53.css → DatabaseWriter-217a99f1.css} +19 -19
- flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-4d05ddc7.js} +17 -10
- flowfile/web/static/assets/{designer-e3c150ec.css → DesignerView-a6d0ee84.css} +629 -538
- flowfile/web/static/assets/{designer-f3656d8c.js → DesignerView-e6f5c0e8.js} +1214 -3209
- flowfile/web/static/assets/{documentation-52b241e7.js → DocumentationView-2e78ef1b.js} +5 -5
- flowfile/web/static/assets/{documentation-12216a74.css → DocumentationView-fd46c656.css} +7 -7
- flowfile/web/static/assets/{ExploreData-2d0cf4db.css → ExploreData-10c5acc8.css} +13 -12
- flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-7b54caca.js} +18 -9
- flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-3fa399b2.js} +9 -7
- flowfile/web/static/assets/{ExternalSource-e37b6275.css → ExternalSource-47ab05a3.css} +17 -17
- flowfile/web/static/assets/Filter-7494ea97.css +48 -0
- flowfile/web/static/assets/Filter-8cbbdbf3.js +287 -0
- flowfile/web/static/assets/{Formula-bb96803d.css → Formula-53d58c43.css} +7 -7
- flowfile/web/static/assets/{Formula-71472193.js → Formula-aac42b1e.js} +13 -11
- flowfile/web/static/assets/{FuzzyMatch-1010f966.css → FuzzyMatch-ad6361d6.css} +68 -69
- flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-cd9bbfca.js} +12 -10
- flowfile/web/static/assets/{Pivot-cf333e3d.css → GraphSolver-c24dec17.css} +5 -5
- flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-c7e6780e.js} +13 -11
- flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-93c5d22b.js} +9 -7
- flowfile/web/static/assets/{GroupBy-b9505323.css → GroupBy-be7ac0bf.css} +10 -10
- flowfile/web/static/assets/{Join-fd79b451.css → Join-28b5e18f.css} +22 -22
- flowfile/web/static/assets/{Join-a1b800be.js → Join-a19b2de2.js} +13 -11
- flowfile/web/static/assets/LoginView-0df4ed0a.js +134 -0
- flowfile/web/static/assets/LoginView-d325d632.css +172 -0
- flowfile/web/static/assets/ManualInput-3702e677.css +293 -0
- flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-8d3374b2.js} +170 -116
- flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-ad1b6243.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-e278950d.js} +1 -1
- flowfile/web/static/assets/NodeDesigner-40b647c9.js +2610 -0
- flowfile/web/static/assets/NodeDesigner-5f53be3f.css +1429 -0
- flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-7100234c.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-5130219f.js} +5 -2
- flowfile/web/static/assets/{Output-ddc9079f.css → Output-35e97000.css} +6 -6
- flowfile/web/static/assets/{Output-76750610.js → Output-f5efd2aa.js} +60 -38
- flowfile/web/static/assets/{GraphSolver-f0cb7bfb.css → Pivot-0eda81b4.css} +5 -5
- flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-d981d23c.js} +11 -9
- flowfile/web/static/assets/PivotValidation-0e905b1a.css +13 -0
- flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-39386e95.js} +3 -3
- flowfile/web/static/assets/PivotValidation-41b57ad6.css +13 -0
- flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-63de1f73.js} +3 -3
- flowfile/web/static/assets/{PolarsCode-650322d1.css → PolarsCode-2b1f1f23.css} +4 -4
- flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-f9d69217.js} +18 -9
- flowfile/web/static/assets/PopOver-b22f049e.js +939 -0
- flowfile/web/static/assets/PopOver-d96599db.css +33 -0
- flowfile/web/static/assets/{Read-6b17491f.css → Read-36e7bd51.css} +12 -12
- flowfile/web/static/assets/{Read-637b72a7.js → Read-aec2e377.js} +83 -105
- flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-78ed6845.js} +6 -4
- flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-2156e890.js} +8 -6
- flowfile/web/static/assets/{SQLQueryComponent-36cef432.css → SQLQueryComponent-1c2f26b4.css} +5 -5
- flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-48c72f5b.js} +3 -3
- flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-1352ca74.js} +6 -4
- flowfile/web/static/assets/SecretSelector-22b5ff89.js +113 -0
- flowfile/web/static/assets/SecretSelector-6329f743.css +43 -0
- flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretsView-17df66ee.js} +35 -36
- flowfile/web/static/assets/SecretsView-aa291340.css +38 -0
- flowfile/web/static/assets/{Select-850215fd.js → Select-0aee4c54.js} +9 -7
- flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-0784e157.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-71e6b7e3.css → SettingsSection-07fbbc39.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-5c696bee.css → SettingsSection-26fe48d4.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-2e4d03c4.css → SettingsSection-8f980839.css} +4 -4
- flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-cd341bb6.js} +3 -3
- flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-f2002a6d.js} +3 -3
- flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-460cc0ea.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-30741bb2.js} +1 -1
- flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-5d926864.js} +7 -4
- flowfile/web/static/assets/SliderInput-f2e4f23c.css +4 -0
- flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-3cdc971b.js} +9 -7
- flowfile/web/static/assets/{Unique-f9fb0809.css → Sort-8a871341.css} +10 -10
- flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-a2d0bfbd.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-abad1ca2.js} +5 -2
- flowfile/web/static/assets/{TextToRows-5d2c1190.css → TextToRows-12afb4f4.css} +10 -10
- flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-918945f7.js} +11 -10
- flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-f0ef5196.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-5605c793.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-5edd5322.css → UnavailableFields-54d2f518.css} +6 -6
- flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-bdad6144.js} +4 -4
- flowfile/web/static/assets/{Union-af6c3d9b.css → Union-d6a8d7d5.css} +7 -7
- flowfile/web/static/assets/{Union-b563478a.js → Union-e8ab8c86.js} +8 -6
- flowfile/web/static/assets/{Unique-f90db5db.js → Unique-8cd4f976.js} +13 -22
- flowfile/web/static/assets/{Sort-3643d625.css → Unique-9fb2f567.css} +10 -10
- flowfile/web/static/assets/{Unpivot-1e422df3.css → Unpivot-710a2948.css} +7 -7
- flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-8da14095.js} +10 -8
- flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-6f7d89ff.js} +3 -3
- flowfile/web/static/assets/UnpivotValidation-d5ca3b7b.css +13 -0
- flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-3fb312e1.js} +4 -4
- flowfile/web/static/assets/{VueGraphicWalker-ed5ab88b.css → VueGraphicWalker-430f0b86.css} +1 -1
- flowfile/web/static/assets/{api-4c8e3822.js → api-24483f0d.js} +1 -1
- flowfile/web/static/assets/{api-2d6adc4f.js → api-8b81fa73.js} +1 -1
- flowfile/web/static/assets/{dropDown-35135ba8.css → dropDown-3d8dc5fa.css} +40 -40
- flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-ac0fda9d.js} +3 -3
- flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-5497a84a.js} +11 -10
- flowfile/web/static/assets/{fullEditor-178376bb.css → fullEditor-a0be62b3.css} +74 -62
- flowfile/web/static/assets/{genericNodeSettings-924759c7.css → genericNodeSettings-3b2507ea.css} +10 -10
- flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-99014e1d.js} +5 -5
- flowfile/web/static/assets/index-07dda503.js +38 -0
- flowfile/web/static/assets/index-3ba44389.js +2696 -0
- flowfile/web/static/assets/{index-50508d4d.css → index-e6289dd0.css} +1945 -569
- flowfile/web/static/assets/{index-246f201c.js → index-fb6493ae.js} +41626 -40869
- flowfile/web/static/assets/node.types-2c15bb7e.js +82 -0
- flowfile/web/static/assets/nodeInput-0eb13f1a.js +2 -0
- flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-8f8ba42d.js} +3 -3
- flowfile/web/static/assets/outputCsv-b9a072af.css +2499 -0
- flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-393f4fef.js} +3 -3
- flowfile/web/static/assets/{outputExcel-b41305c0.css → outputExcel-f5d272b2.css} +26 -26
- flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-07c81f65.js} +4 -4
- flowfile/web/static/assets/outputParquet-54597c3c.css +4 -0
- flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-07f6d9ad.js} +21 -20
- flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-3bfac4c3.css} +15 -15
- flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-3db6b763.css} +13 -13
- flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-ed69bc8f.js} +10 -12
- flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-c5244ad5.css} +4 -4
- flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-e3ed4528.js} +4 -7
- flowfile/web/static/assets/secrets.api-002e7d7e.js +65 -0
- flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-80b92899.js} +5 -5
- flowfile/web/static/assets/{selectDynamic-aa913ff4.css → selectDynamic-f2fb394f.css} +21 -20
- flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-0965f39f.js} +31 -637
- flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-c506ad97.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/METADATA +4 -4
- flowfile-0.5.3.dist-info/RECORD +402 -0
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/WHEEL +1 -1
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/entry_points.txt +1 -0
- flowfile_core/__init__.py +13 -3
- flowfile_core/auth/jwt.py +51 -16
- flowfile_core/auth/models.py +32 -7
- flowfile_core/auth/password.py +89 -0
- flowfile_core/auth/secrets.py +8 -6
- flowfile_core/configs/__init__.py +9 -7
- flowfile_core/configs/flow_logger.py +15 -14
- flowfile_core/configs/node_store/__init__.py +72 -4
- flowfile_core/configs/node_store/nodes.py +155 -172
- flowfile_core/configs/node_store/user_defined_node_registry.py +108 -27
- flowfile_core/configs/settings.py +28 -15
- flowfile_core/database/connection.py +7 -6
- flowfile_core/database/init_db.py +96 -2
- flowfile_core/database/models.py +3 -1
- flowfile_core/fileExplorer/__init__.py +17 -0
- flowfile_core/fileExplorer/funcs.py +123 -57
- flowfile_core/fileExplorer/utils.py +10 -11
- flowfile_core/flowfile/_extensions/real_time_interface.py +10 -8
- flowfile_core/flowfile/analytics/analytics_processor.py +27 -24
- flowfile_core/flowfile/analytics/graphic_walker.py +11 -12
- flowfile_core/flowfile/analytics/utils.py +1 -1
- flowfile_core/flowfile/code_generator/code_generator.py +391 -279
- flowfile_core/flowfile/connection_manager/_connection_manager.py +6 -5
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +60 -44
- flowfile_core/flowfile/database_connection_manager/models.py +1 -1
- flowfile_core/flowfile/extensions.py +17 -12
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +34 -32
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +152 -103
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +526 -477
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +2 -2
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +92 -52
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +12 -11
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +6 -6
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +26 -30
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +43 -32
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +11 -9
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +15 -11
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +5 -7
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +95 -82
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +66 -65
- flowfile_core/flowfile/flow_data_engine/sample_data.py +27 -21
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +13 -11
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +360 -191
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +8 -8
- flowfile_core/flowfile/flow_data_engine/utils.py +101 -67
- flowfile_core/flowfile/flow_graph.py +1011 -561
- flowfile_core/flowfile/flow_graph_utils.py +31 -49
- flowfile_core/flowfile/flow_node/flow_node.py +332 -232
- flowfile_core/flowfile/flow_node/models.py +54 -41
- flowfile_core/flowfile/flow_node/schema_callback.py +14 -19
- flowfile_core/flowfile/graph_tree/graph_tree.py +41 -41
- flowfile_core/flowfile/handler.py +82 -32
- flowfile_core/flowfile/manage/compatibility_enhancements.py +493 -47
- flowfile_core/flowfile/manage/io_flowfile.py +391 -0
- flowfile_core/flowfile/node_designer/__init__.py +15 -13
- flowfile_core/flowfile/node_designer/_type_registry.py +34 -37
- flowfile_core/flowfile/node_designer/custom_node.py +162 -36
- flowfile_core/flowfile/node_designer/ui_components.py +136 -35
- flowfile_core/flowfile/schema_callbacks.py +77 -54
- flowfile_core/flowfile/setting_generator/__init__.py +0 -1
- flowfile_core/flowfile/setting_generator/setting_generator.py +6 -5
- flowfile_core/flowfile/setting_generator/settings.py +72 -55
- flowfile_core/flowfile/sources/external_sources/base_class.py +12 -10
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +27 -17
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +9 -9
- flowfile_core/flowfile/sources/external_sources/factory.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +45 -31
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +198 -73
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +250 -196
- flowfile_core/flowfile/util/calculate_layout.py +9 -13
- flowfile_core/flowfile/util/execution_orderer.py +25 -17
- flowfile_core/flowfile/util/node_skipper.py +4 -4
- flowfile_core/flowfile/utils.py +19 -21
- flowfile_core/main.py +26 -19
- flowfile_core/routes/auth.py +284 -11
- flowfile_core/routes/cloud_connections.py +25 -25
- flowfile_core/routes/logs.py +21 -29
- flowfile_core/routes/public.py +3 -3
- flowfile_core/routes/routes.py +77 -43
- flowfile_core/routes/secrets.py +25 -27
- flowfile_core/routes/user_defined_components.py +483 -4
- flowfile_core/run_lock.py +0 -1
- flowfile_core/schemas/__init__.py +4 -6
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +55 -55
- flowfile_core/schemas/cloud_storage_schemas.py +59 -55
- flowfile_core/schemas/input_schema.py +398 -154
- flowfile_core/schemas/output_model.py +50 -35
- flowfile_core/schemas/schemas.py +207 -67
- flowfile_core/schemas/transform_schema.py +1360 -435
- flowfile_core/schemas/yaml_types.py +117 -0
- flowfile_core/secret_manager/secret_manager.py +17 -13
- flowfile_core/{flowfile/node_designer/data_types.py → types.py} +33 -3
- flowfile_core/utils/arrow_reader.py +7 -6
- flowfile_core/utils/excel_file_manager.py +3 -3
- flowfile_core/utils/fileManager.py +7 -7
- flowfile_core/utils/fl_executor.py +8 -10
- flowfile_core/utils/utils.py +4 -4
- flowfile_core/utils/validate_setup.py +5 -4
- flowfile_frame/__init__.py +107 -50
- flowfile_frame/adapters.py +2 -9
- flowfile_frame/adding_expr.py +73 -32
- flowfile_frame/cloud_storage/frame_helpers.py +27 -23
- flowfile_frame/cloud_storage/secret_manager.py +12 -26
- flowfile_frame/config.py +2 -5
- flowfile_frame/expr.py +311 -218
- flowfile_frame/expr.pyi +160 -159
- flowfile_frame/expr_name.py +23 -23
- flowfile_frame/flow_frame.py +581 -489
- flowfile_frame/flow_frame.pyi +123 -104
- flowfile_frame/flow_frame_methods.py +236 -252
- flowfile_frame/group_frame.py +50 -20
- flowfile_frame/join.py +2 -2
- flowfile_frame/lazy.py +129 -87
- flowfile_frame/lazy_methods.py +83 -30
- flowfile_frame/list_name_space.py +55 -50
- flowfile_frame/selectors.py +148 -68
- flowfile_frame/series.py +9 -7
- flowfile_frame/utils.py +19 -21
- flowfile_worker/__init__.py +12 -4
- flowfile_worker/configs.py +11 -19
- flowfile_worker/create/__init__.py +14 -27
- flowfile_worker/create/funcs.py +143 -94
- flowfile_worker/create/models.py +139 -68
- flowfile_worker/create/pl_types.py +14 -15
- flowfile_worker/create/read_excel_tables.py +34 -41
- flowfile_worker/create/utils.py +22 -19
- flowfile_worker/external_sources/s3_source/main.py +18 -51
- flowfile_worker/external_sources/s3_source/models.py +34 -27
- flowfile_worker/external_sources/sql_source/main.py +8 -5
- flowfile_worker/external_sources/sql_source/models.py +13 -9
- flowfile_worker/flow_logger.py +10 -8
- flowfile_worker/funcs.py +214 -155
- flowfile_worker/main.py +11 -17
- flowfile_worker/models.py +35 -28
- flowfile_worker/process_manager.py +2 -3
- flowfile_worker/routes.py +121 -93
- flowfile_worker/secrets.py +9 -6
- flowfile_worker/spawner.py +80 -49
- flowfile_worker/utils.py +3 -2
- shared/__init__.py +2 -7
- shared/storage_config.py +25 -13
- test_utils/postgres/commands.py +3 -2
- test_utils/postgres/fixtures.py +9 -9
- test_utils/s3/commands.py +1 -1
- test_utils/s3/data_generator.py +3 -4
- test_utils/s3/demo_data_generator.py +4 -7
- test_utils/s3/fixtures.py +7 -5
- tools/migrate/README.md +56 -0
- tools/migrate/__init__.py +12 -0
- tools/migrate/__main__.py +118 -0
- tools/migrate/legacy_schemas.py +682 -0
- tools/migrate/migrate.py +610 -0
- tools/migrate/tests/__init__.py +0 -0
- tools/migrate/tests/conftest.py +21 -0
- tools/migrate/tests/test_migrate.py +622 -0
- tools/migrate/tests/test_migration_e2e.py +1009 -0
- tools/migrate/tests/test_node_migrations.py +843 -0
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +0 -86
- flowfile/web/static/assets/CustomNode-74a37f74.css +0 -32
- flowfile/web/static/assets/DatabaseManager-30fa27e5.css +0 -64
- flowfile/web/static/assets/Filter-812dcbca.js +0 -164
- flowfile/web/static/assets/Filter-f62091b3.css +0 -20
- flowfile/web/static/assets/ManualInput-3246a08d.css +0 -96
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +0 -13
- flowfile/web/static/assets/PivotValidation-c46cd420.css +0 -13
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +0 -4
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +0 -13
- flowfile/web/static/assets/outputCsv-9cc59e0b.css +0 -2499
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +0 -4
- flowfile/web/static/assets/secretApi-538058f3.js +0 -46
- flowfile/web/static/assets/vue-codemirror-bccfde04.css +0 -32
- flowfile-0.4.1.dist-info/RECORD +0 -376
- flowfile_core/flowfile/manage/open_flowfile.py +0 -143
- {flowfile-0.4.1.dist-info → flowfile-0.5.3.dist-info}/licenses/LICENSE +0 -0
- /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0

@@ -1,52 +1,50 @@
 # Standard library imports
+from __future__ import annotations
+
 import logging
 import os
+from collections.abc import Callable, Generator, Iterable
 from copy import deepcopy
 from dataclasses import dataclass
 from math import ceil
-from typing import Any,
+from typing import Any, Literal, TypeVar, Union
 
-
+import polars as pl
 
 # Third-party imports
 from loky import Future
-import
+from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
 from polars.exceptions import PanicException
-from polars_grouper import graph_solver
 from polars_expr_transformer import simple_function_to_expr as to_expr
+from polars_grouper import graph_solver
 from pyarrow import Table as PaTable
 from pyarrow.parquet import ParquetFile
 
 # Local imports - Core
 from flowfile_core.configs import logger
-from flowfile_core.utils.utils import ensure_similarity_dicts
 from flowfile_core.configs.flow_logger import NodeLogger
-from flowfile_core.schemas import (
-    cloud_storage_schemas,
-    input_schema,
-    transform_schema as transform_schemas
-)
-from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
 
 # Local imports - Flow File Components
 from flowfile_core.flowfile.flow_data_engine import utils
-from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
-
-
+from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
+    CloudStorageReader,
+    ensure_path_has_wildcard_pattern,
+    get_first_file_from_s3_dir,
+)
 from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
     assert_if_flowfile_schema,
-    convert_stats_to_column_info
+    convert_stats_to_column_info,
 )
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
-
-    verify_join_map_integrity,
-    rename_df_table_for_join,
+    get_col_name_to_delete,
     get_undo_rename_mapping_join,
-
+    rename_df_table_for_join,
+    verify_join_map_integrity,
+    verify_join_select_integrity,
 )
 from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
 from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -55,19 +53,21 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
     ExternalDfFetcher,
     ExternalExecutorTracker,
     ExternalFuzzyMatchFetcher,
-    fetch_unique_values
+    fetch_unique_values,
 )
-from flowfile_core.flowfile.flow_data_engine.threaded_processes import
-    get_join_count,
-    write_threaded
-)
-
+from flowfile_core.flowfile.flow_data_engine.threaded_processes import write_threaded
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
+from flowfile_core.schemas import cloud_storage_schemas, input_schema
+from flowfile_core.schemas import transform_schema as transform_schemas
+from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
+from flowfile_core.utils.utils import ensure_similarity_dicts
 
-T = TypeVar(
+T = TypeVar("T", pl.DataFrame, pl.LazyFrame)
 
 
-def _handle_duplication_join_keys(
+def _handle_duplication_join_keys(
+    left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager
+) -> tuple[T, T, dict[str, str]]:
     """Temporarily renames join keys to avoid conflicts during a join.
 
     This helper function checks the join type and renames the join key columns
@@ -86,20 +86,28 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform
         - The (potentially modified) right DataFrame.
         - A dictionary mapping the temporary names back to their desired final names.
     """
+
     def _construct_temp_name(column_name: str) -> str:
-        return "__FL_TEMP__"+column_name
-
-
-
+        return "__FL_TEMP__" + column_name
+
+    if join_manager.how == "right":
+        left_df = left_df.with_columns(
+            pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+            for jk in join_manager.left_manager.get_join_key_selects()
+        )
         reverse_actions = {
             _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
-            for jk in
-
-
-
+            for jk in join_manager.left_manager.get_join_key_selects()
+        }
+    elif join_manager.how in ("left", "inner"):
+        right_df = right_df.with_columns(
+            pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+            for jk in join_manager.right_manager.get_join_key_selects()
+        )
         reverse_actions = {
             _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
-            for jk in
+            for jk in join_manager.right_manager.get_join_key_selects()
+        }
     else:
         reverse_actions = {}
     return left_df, right_df, reverse_actions
@@ -116,12 +124,12 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
     Args:
         join_input: The JoinInput settings object to modify.
     """
-    if join_input.how in (
+    if join_input.how in ("semi", "anti"):
         for jk in join_input.right_select.renames:
             jk.keep = False
 
 
-def get_select_columns(full_select_input:
+def get_select_columns(full_select_input: list[transform_schemas.SelectInput]) -> list[str]:
     """Extracts a list of column names to be selected from a SelectInput list.
 
     This function filters a list of `SelectInput` objects to return the names
@@ -154,15 +162,16 @@ class FlowDataEngine:
         errors: A list of errors encountered during operations.
         _schema: A cached list of `FlowfileColumn` objects representing the schema.
     """
+
     # Core attributes
-    _data_frame:
-    columns:
+    _data_frame: pl.DataFrame | pl.LazyFrame
+    columns: list[Any]
 
     # Metadata attributes
     name: str = None
     number_of_records: int = None
-    errors:
-    _schema:
+    errors: list = None
+    _schema: list["FlowfileColumn"] | None = None
 
     # Configuration attributes
     _optimize_memory: bool = False
@@ -171,16 +180,16 @@ class FlowDataEngine:
     _calculate_schema_stats: bool = False
 
     # Cache and optimization attributes
-    __col_name_idx_map:
-    __data_map:
-    __optimized_columns:
+    __col_name_idx_map: dict = None
+    __data_map: dict = None
+    __optimized_columns: list = None
     __sample__: str = None
     __number_of_fields: int = None
-    _col_idx:
+    _col_idx: dict[str, int] = None
 
     # Source tracking
-    _org_path:
-    _external_source:
+    _org_path: str | None = None
+    _external_source: ExternalDataSource | None = None
 
     # State tracking
     sorted_by: int = None
@@ -193,18 +202,21 @@ class FlowDataEngine:
     _number_of_records_callback: Callable = None
     _data_callback: Callable = None
 
-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        raw_data: Union[
+            list[dict], list[Any], dict[str, Any], "ParquetFile", pl.DataFrame, pl.LazyFrame, input_schema.RawData
+        ] = None,
+        path_ref: str = None,
+        name: str = None,
+        optimize_memory: bool = True,
+        schema: list["FlowfileColumn"] | list[str] | pl.Schema = None,
+        number_of_records: int = None,
+        calculate_schema_stats: bool = False,
+        streamable: bool = True,
+        number_of_records_callback: Callable = None,
+        data_callback: Callable = None,
+    ):
         """Initializes the FlowDataEngine from various data sources.
 
         Args:
@@ -264,12 +276,12 @@ class FlowDataEngine:
         elif isinstance(raw_data, (list, dict)):
             self._handle_python_data(raw_data)
 
-    def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records:
+    def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: int | None):
         """(Internal) Initializes the engine from an eager Polars DataFrame."""
         self.data_frame = df
         self.number_of_records = number_of_records or df.select(pl.len())[0, 0]
 
-    def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records:
+    def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: int | None, optimize_memory: bool):
         """(Internal) Initializes the engine from a Polars LazyFrame."""
         self.data_frame = lf
         self._lazy = True
@@ -280,14 +292,14 @@ class FlowDataEngine:
         else:
             self.number_of_records = lf.select(pl.len()).collect()[0, 0]
 
-    def _handle_python_data(self, data:
+    def _handle_python_data(self, data: list | dict):
         """(Internal) Dispatches Python collections to the correct handler."""
         if isinstance(data, dict):
             self._handle_dict_input(data)
         else:
             self._handle_list_input(data)
 
-    def _handle_dict_input(self, data:
+    def _handle_dict_input(self, data: dict):
         """(Internal) Initializes the engine from a Python dictionary."""
         if len(data) == 0:
             self.initialize_empty_fl()
@@ -311,8 +323,12 @@ class FlowDataEngine:
             raw_data: An instance of `RawData` containing the data and schema.
         """
         flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
-        polars_schema = pl.Schema(
-
+        polars_schema = pl.Schema(
+            [
+                (flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+                for flowfile_column in flowfile_schema
+            ]
+        )
         try:
             df = pl.DataFrame(raw_data.data, polars_schema, strict=False)
         except TypeError as e:
@@ -322,7 +338,7 @@ class FlowDataEngine:
         self.data_frame = df.lazy()
         self.lazy = True
 
-    def _handle_list_input(self, data:
+    def _handle_list_input(self, data: list):
         """(Internal) Initializes the engine from a list of records."""
         number_of_records = len(data)
         if number_of_records > 0:
@@ -335,19 +351,19 @@ class FlowDataEngine:
             self.number_of_records = 0
 
     @staticmethod
-    def _process_list_data(data:
+    def _process_list_data(data: list) -> list[dict]:
         """(Internal) Normalizes list data into a list of dictionaries.
 
         Ensures that a list of objects or non-dict items is converted into a
         uniform list of dictionaries suitable for Polars DataFrame creation.
         """
-        if not (isinstance(data[0], dict) or hasattr(data[0],
+        if not (isinstance(data[0], dict) or hasattr(data[0], "__dict__")):
             try:
                 return pl.DataFrame(data).to_dicts()
             except TypeError:
-                raise Exception(
+                raise Exception("Value must be able to be converted to dictionary")
             except Exception as e:
-                raise Exception(f
+                raise Exception(f"Value must be able to be converted to dictionary: {e}")
 
         if not isinstance(data[0], dict):
             data = [row.__dict__ for row in data]
@@ -374,49 +390,37 @@ class FlowDataEngine:
 
         logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")
 
-        if write_settings.write_mode ==
+        if write_settings.write_mode == "append" and write_settings.file_format != "delta":
             raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
         storage_options = CloudStorageReader.get_storage_options(connection)
         credential_provider = CloudStorageReader.get_credential_provider(connection)
         # Dispatch to the correct writer based on file format
         if write_settings.file_format == "parquet":
             self._write_parquet_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
+                write_settings.resource_path, storage_options, credential_provider, write_settings
             )
         elif write_settings.file_format == "delta":
             self._write_delta_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
+                write_settings.resource_path, storage_options, credential_provider, write_settings
             )
         elif write_settings.file_format == "csv":
-            self._write_csv_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
-            )
+            self._write_csv_to_cloud(write_settings.resource_path, storage_options, credential_provider, write_settings)
         elif write_settings.file_format == "json":
             self._write_json_to_cloud(
-                write_settings.resource_path,
-                storage_options,
-                credential_provider,
-                write_settings
+                write_settings.resource_path, storage_options, credential_provider, write_settings
             )
         else:
             raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")
 
         logger.info(f"Successfully wrote data to {write_settings.resource_path}")
 
-    def _write_parquet_to_cloud(
-
-
-
-
+    def _write_parquet_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a Parquet file in cloud storage.
 
         Uses `sink_parquet` for efficient streaming writes. Falls back to a
@@ -436,18 +440,20 @@ class FlowDataEngine:
             except Exception as e:
                 logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
                 pl_df = self.collect()
-                sink_kwargs[
+                sink_kwargs["file"] = sink_kwargs.pop("path")
                 pl_df.write_parquet(**sink_kwargs)
 
         except Exception as e:
             logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
             raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
 
-    def _write_delta_to_cloud(
-
-
-
-
+    def _write_delta_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.
 
         This operation requires collecting the data first, as `write_delta` operates
@@ -463,11 +469,13 @@ class FlowDataEngine:
             sink_kwargs["credential_provider"] = credential_provider
         self.collect().write_delta(**sink_kwargs)
 
-    def _write_csv_to_cloud(
-
-
-
-
+    def _write_csv_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a CSV file in cloud storage.
 
         Uses `sink_csv` for efficient, streaming writes of the data.
@@ -489,11 +497,13 @@ class FlowDataEngine:
             logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
             raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
 
-    def _write_json_to_cloud(
-
-
-
-
+    def _write_json_to_cloud(
+        self,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        write_settings: cloud_storage_schemas.CloudStorageWriteSettings,
+    ):
         """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.
 
         Uses `sink_ndjson` for efficient, streaming writes.
@@ -511,7 +521,9 @@ class FlowDataEngine:
             raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
 
     @classmethod
-    def from_cloud_storage_obj(
+    def from_cloud_storage_obj(
+        cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal
+    ) -> "FlowDataEngine":
         """Creates a FlowDataEngine from an object in cloud storage.
 
         This method supports reading from various cloud storage providers like AWS S3,
@@ -548,31 +560,22 @@ class FlowDataEngine:
             )
         elif read_settings.file_format == "delta":
             return cls._read_delta_from_cloud(
-                read_settings.resource_path,
-                storage_options,
-                credential_provider,
-                read_settings
+                read_settings.resource_path, storage_options, credential_provider, read_settings
             )
         elif read_settings.file_format == "csv":
             return cls._read_csv_from_cloud(
-                read_settings.resource_path,
-                storage_options,
-                credential_provider,
-                read_settings
+                read_settings.resource_path, storage_options, credential_provider, read_settings
             )
         elif read_settings.file_format == "json":
             return cls._read_json_from_cloud(
                 read_settings.resource_path,
                 storage_options,
                 credential_provider,
-                read_settings.scan_mode == "directory"
+                read_settings.scan_mode == "directory",
             )
         elif read_settings.file_format == "iceberg":
             return cls._read_iceberg_from_cloud(
-                read_settings.resource_path,
-                storage_options,
-                credential_provider,
-                read_settings
+                read_settings.resource_path, storage_options, credential_provider, read_settings
             )
 
         elif read_settings.file_format in ["delta", "iceberg"]:
@@ -582,33 +585,40 @@ class FlowDataEngine:
             raise ValueError(f"Unsupported file format: {read_settings.file_format}")
 
     @staticmethod
-    def _get_schema_from_first_file_in_dir(
-
+    def _get_schema_from_first_file_in_dir(
+        source: str, storage_options: dict[str, Any], file_format: Literal["csv", "parquet", "json", "delta"]
+    ) -> list[FlowfileColumn] | None:
         """Infers the schema by scanning the first file in a cloud directory."""
         try:
             scan_func = getattr(pl, "scan_" + file_format)
             first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
-            return convert_stats_to_column_info(
-
+            return convert_stats_to_column_info(
+                FlowDataEngine._create_schema_stats_from_pl_schema(
+                    scan_func(first_file_ref, storage_options=storage_options).collect_schema()
+                )
+            )
         except Exception as e:
             logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
 
-
     @classmethod
-    def _read_iceberg_from_cloud(
-
-
-
-
+    def _read_iceberg_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+    ) -> "FlowDataEngine":
         """Reads Iceberg table(s) from cloud storage."""
-        raise NotImplementedError(
+        raise NotImplementedError("Failed to read Iceberg table from cloud storage: Not yet implemented")
 
     @classmethod
-    def _read_parquet_from_cloud(
-
-
-
-
+    def _read_parquet_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        is_directory: bool,
+    ) -> "FlowDataEngine":
         """Reads Parquet file(s) from cloud storage."""
         try:
             # Use scan_parquet for lazy evaluation
@@ -632,7 +642,7 @@ class FlowDataEngine:
                 number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
                 optimize_memory=True,
                 streamable=True,
-                schema=schema
+                schema=schema,
             )
 
         except Exception as e:
@@ -640,18 +650,20 @@ class FlowDataEngine:
             raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")
 
     @classmethod
-    def _read_delta_from_cloud(
-
-
-
-
+    def _read_delta_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+    ) -> "FlowDataEngine":
         """Reads a Delta Lake table from cloud storage."""
         try:
             logger.info("Reading Delta file from cloud storage...")
             logger.info(f"read_settings: {read_settings}")
             scan_kwargs = {"source": resource_path}
             if read_settings.delta_version:
-                scan_kwargs[
+                scan_kwargs["version"] = read_settings.delta_version
             if storage_options:
                 scan_kwargs["storage_options"] = storage_options
             if credential_provider:
@@ -662,18 +674,20 @@ class FlowDataEngine:
                 lf,
                 number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
                 optimize_memory=True,
-                streamable=True
+                streamable=True,
             )
         except Exception as e:
             logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
             raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")
 
     @classmethod
-    def _read_csv_from_cloud(
-
-
-
-
+    def _read_csv_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        read_settings: cloud_storage_schemas.CloudStorageReadSettings,
+    ) -> "FlowDataEngine":
         """Reads CSV file(s) from cloud storage."""
         try:
             scan_kwargs = {
@@ -702,7 +716,7 @@ class FlowDataEngine:
                 number_of_records=6_666_666,  # Will be calculated lazily
                 optimize_memory=True,
                 streamable=True,
-                schema=schema
+                schema=schema,
             )
 
         except Exception as e:
@@ -710,11 +724,13 @@ class FlowDataEngine:
             raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")
 
     @classmethod
-    def _read_json_from_cloud(
-
-
-
-
+    def _read_json_from_cloud(
+        cls,
+        resource_path: str,
+        storage_options: dict[str, Any],
+        credential_provider: Callable | None,
+        is_directory: bool,
+    ) -> "FlowDataEngine":
         """Reads JSON file(s) from cloud storage."""
         try:
             if is_directory:
@@ -754,8 +770,9 @@ class FlowDataEngine:
         else:
             self.data_frame = pl.read_parquet(path_ref)
 
-    def _finalize_initialization(
-
+    def _finalize_initialization(
+        self, name: str, optimize_memory: bool, schema: Any | None, calculate_schema_stats: bool
+    ):
         """Finalizes initialization by setting remaining attributes."""
         _ = calculate_schema_stats
         self.name = name
@@ -802,23 +819,20 @@ class FlowDataEngine:
     def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
         """Sets the underlying Polars DataFrame or LazyFrame."""
        if self.lazy and isinstance(df, pl.DataFrame):
-            raise Exception(
+            raise Exception("Cannot set a non-lazy dataframe to a lazy flowfile")
         self._data_frame = df
 
     @staticmethod
-    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) ->
+    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> list[dict]:
         """Converts a Polars Schema into a list of schema statistics dictionaries."""
-        return [
-            dict(column_name=k, pl_datatype=v, col_index=i)
-            for i, (k, v) in enumerate(pl_schema.items())
-        ]
+        return [dict(column_name=k, pl_datatype=v, col_index=i) for i, (k, v) in enumerate(pl_schema.items())]
 
-    def _add_schema_from_schema_stats(self, schema_stats:
+    def _add_schema_from_schema_stats(self, schema_stats: list[dict]):
         """Populates the schema from a list of schema statistics dictionaries."""
         self._schema = convert_stats_to_column_info(schema_stats)
 
     @property
-    def schema(self) ->
+    def schema(self) -> list[FlowfileColumn]:
         """The schema of the DataFrame as a list of `FlowfileColumn` objects.
 
         This property lazily calculates the schema if it hasn't been determined yet.
@@ -865,8 +879,10 @@ class FlowDataEngine:
         if n_records is None:
             logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
         else:
-            logger.info(
-
+            logger.info(
+                f'Fetching {n_records} record(s) for Table object "{id(self)}". '
+                f"Settings: streaming={self._streamable}"
+            )
 
         if not self.lazy:
             return self.data_frame
@@ -880,16 +896,15 @@ class FlowDataEngine:
     def _collect_data(self, n_records: int = None) -> pl.DataFrame:
         """Internal method to handle data collection logic."""
         if n_records is None:
-
             self.collect_external()
             if self._streamable:
                 try:
-                    logger.info(
+                    logger.info("Collecting data in streaming mode")
                     return self.data_frame.collect(engine="streaming")
                 except PanicException:
                     self._streamable = False
 
-            logger.info(
+            logger.info("Collecting data in non-streaming mode")
             return self.data_frame.collect()
 
         if self.external_source is not None:
@@ -918,7 +933,7 @@ class FlowDataEngine:
             return self._create_partial_dataframe(ok_cols, error_cols, n_records)
         return self._create_empty_dataframe(n_records)
 
-    def _identify_valid_columns(self, n_records: int) ->
+    def _identify_valid_columns(self, n_records: int) -> tuple[list[str], list[tuple[str, Any]]]:
         """Identifies which columns can be collected successfully."""
         ok_cols = []
         error_cols = []
@@ -930,30 +945,30 @@ class FlowDataEngine:
                 error_cols.append((c, self.data_frame.schema[c]))
         return ok_cols, error_cols
 
-    def _create_partial_dataframe(
-
+    def _create_partial_dataframe(
+        self, ok_cols: list[str], error_cols: list[tuple[str, Any]], n_records: int
+    ) -> pl.DataFrame:
         """Creates a DataFrame with partial data for columns that could be collected."""
         df = self.data_frame.select(ok_cols)
-        df = df.with_columns([
-            pl.lit(None).alias(column_name).cast(data_type)
-            for column_name, data_type in error_cols
-        ])
+        df = df.with_columns([pl.lit(None).alias(column_name).cast(data_type) for column_name, data_type in error_cols])
         return df.select(self.columns).head(n_records).collect()
 
     def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
         """Creates an empty DataFrame with the correct schema."""
         if self.number_of_records > 0:
-            return pl.DataFrame(
-
-
-
-
-
-
+            return pl.DataFrame(
+                {
+                    column_name: pl.Series(
+                        name=column_name, values=[None] * min(self.number_of_records, n_records)
+                    ).cast(data_type)
+                    for column_name, data_type in self.data_frame.schema.items()
+                }
+            )
         return pl.DataFrame(schema=self.data_frame.schema)
 
-    def do_group_by(
-
+    def do_group_by(
+        self, group_by_input: transform_schemas.GroupByInput, calculate_schema_stats: bool = True
+    ) -> "FlowDataEngine":
         """Performs a group-by operation on the DataFrame.
 
         Args:
@@ -965,27 +980,23 @@ class FlowDataEngine:
         Returns:
             A new `FlowDataEngine` instance with the grouped and aggregated data.
         """
-        aggregations = [c for c in group_by_input.agg_cols if c.agg !=
-        group_columns = [c for c in group_by_input.agg_cols if c.agg ==
+        aggregations = [c for c in group_by_input.agg_cols if c.agg != "groupby"]
+        group_columns = [c for c in group_by_input.agg_cols if c.agg == "groupby"]
 
         if len(group_columns) == 0:
             return FlowDataEngine(
-                self.data_frame.select(
-
-                ),
-                calculate_schema_stats=calculate_schema_stats
+                self.data_frame.select(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
+                calculate_schema_stats=calculate_schema_stats,
             )
 
         df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
         group_by_columns = [n_c.new_name for n_c in group_columns]
         return FlowDataEngine(
-            df.group_by(*group_by_columns).agg(
-
-            ),
-            calculate_schema_stats=calculate_schema_stats
+            df.group_by(*group_by_columns).agg(ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations),
+            calculate_schema_stats=calculate_schema_stats,
         )
 
-    def do_sort(self, sorts:
+    def do_sort(self, sorts: list[transform_schemas.SortByInput]) -> "FlowDataEngine":
         """Sorts the DataFrame by one or more columns.
 
         Args:
@@ -998,12 +1009,13 @@ class FlowDataEngine:
|
|
|
998
1009
|
if not sorts:
|
|
999
1010
|
return self
|
|
1000
1011
|
|
|
1001
|
-
descending = [s.how ==
|
|
1012
|
+
descending = [s.how == "desc" or s.how.lower() == "descending" for s in sorts]
|
|
1002
1013
|
df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
|
|
1003
1014
|
return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)
|
|
1004
1015
|
|
|
1005
|
-
def change_column_types(
|
|
1006
|
-
|
|
1016
|
+
def change_column_types(
|
|
1017
|
+
self, transforms: list[transform_schemas.SelectInput], calculate_schema: bool = False
|
|
1018
|
+
) -> "FlowDataEngine":
|
|
1007
1019
|
"""Changes the data type of one or more columns.
|
|
1008
1020
|
|
|
1009
1021
|
Args:
|
|
@@ -1017,7 +1029,8 @@ class FlowDataEngine:
|
|
|
1017
1029
|
dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
|
|
1018
1030
|
idx_mapping = list(
|
|
1019
1031
|
(transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
|
|
1020
|
-
for transform in transforms
|
|
1032
|
+
for transform in transforms
|
|
1033
|
+
if transform.data_type is not None
|
|
1021
1034
|
)
|
|
1022
1035
|
|
|
1023
1036
|
actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]
|
|
@@ -1031,10 +1044,10 @@ class FlowDataEngine:
|
|
|
1031
1044
|
df,
|
|
1032
1045
|
number_of_records=self.number_of_records,
|
|
1033
1046
|
calculate_schema_stats=calculate_schema,
|
|
1034
|
-
streamable=self._streamable
|
|
1047
|
+
streamable=self._streamable,
|
|
1035
1048
|
)
|
|
1036
1049
|
|
|
1037
|
-
def save(self, path: str, data_type: str =
|
|
1050
|
+
def save(self, path: str, data_type: str = "parquet") -> Future:
|
|
1038
1051
|
"""Saves the DataFrame to a file in a separate thread.
|
|
1039
1052
|
|
|
1040
1053
|
Args:
|
|
@@ -1048,7 +1061,7 @@ class FlowDataEngine:
|
|
|
1048
1061
|
df = deepcopy(self.data_frame)
|
|
1049
1062
|
return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
|
|
1050
1063
|
|
|
1051
|
-
def to_pylist(self) ->
|
|
1064
|
+
def to_pylist(self) -> list[dict]:
|
|
1052
1065
|
"""Converts the DataFrame to a list of Python dictionaries.
|
|
1053
1066
|
|
|
1054
1067
|
Returns:
|
|
@@ -1082,15 +1095,15 @@ class FlowDataEngine:
|
|
|
1082
1095
|
data = list(self.to_dict().values())
|
|
1083
1096
|
return input_schema.RawData(columns=columns, data=data)
|
|
1084
1097
|
|
|
1085
|
-
def to_dict(self) ->
|
|
1098
|
+
def to_dict(self) -> dict[str, list]:
|
|
1086
1099
|
"""Converts the DataFrame to a Python dictionary of columns.
|
|
1087
1100
|
|
|
1088
|
-
|
|
1089
|
-
|
|
1101
|
+
Each key in the dictionary is a column name, and the corresponding value
|
|
1102
|
+
is a list of the data in that column.
|
|
1090
1103
|
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1104
|
+
Returns:
|
|
1105
|
+
A dictionary mapping column names to lists of their values.
|
|
1106
|
+
"""
|
|
1094
1107
|
if self.lazy:
|
|
1095
1108
|
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
|
|
1096
1109
|
else:
|
|
@@ -1130,7 +1143,7 @@ class FlowDataEngine:
|
|
|
1130
1143
|
return cls(pl.read_sql(sql, conn))
|
|
1131
1144
|
|
|
1132
1145
|
@classmethod
|
|
1133
|
-
def create_from_schema(cls, schema:
|
|
1146
|
+
def create_from_schema(cls, schema: list[FlowfileColumn]) -> "FlowDataEngine":
|
|
1134
1147
|
"""Creates an empty FlowDataEngine from a schema definition.
|
|
1135
1148
|
|
|
1136
1149
|
Args:
|
|
@@ -1147,7 +1160,7 @@ class FlowDataEngine:
|
|
|
1147
1160
|
return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
|
|
1148
1161
|
|
|
1149
1162
|
@classmethod
|
|
1150
|
-
def create_from_path(cls, received_table: input_schema.
|
|
1163
|
+
def create_from_path(cls, received_table: input_schema.ReceivedTable) -> "FlowDataEngine":
|
|
1151
1164
|
"""Creates a FlowDataEngine from a local file path.
|
|
1152
1165
|
|
|
1153
1166
|
Supports various file types like CSV, Parquet, and Excel.
|
|
@@ -1161,14 +1174,14 @@ class FlowDataEngine:
|
|
|
1161
1174
|
"""
|
|
1162
1175
|
received_table.set_absolute_filepath()
|
|
1163
1176
|
file_type_handlers = {
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1177
|
+
"csv": create_funcs.create_from_path_csv,
|
|
1178
|
+
"parquet": create_funcs.create_from_path_parquet,
|
|
1179
|
+
"excel": create_funcs.create_from_path_excel,
|
|
1167
1180
|
}
|
|
1168
1181
|
|
|
1169
1182
|
handler = file_type_handlers.get(received_table.file_type)
|
|
1170
1183
|
if not handler:
|
|
1171
|
-
raise Exception(f
|
|
1184
|
+
raise Exception(f"Cannot create from {received_table.file_type}")
|
|
1172
1185
|
|
|
1173
1186
|
flow_file = cls(handler(received_table))
|
|
1174
1187
|
flow_file._org_path = received_table.abs_file_path
|
|
@@ -1189,7 +1202,7 @@ class FlowDataEngine:
|
|
|
1189
1202
|
return cls(create_fake_data(number_of_records))
|
|
1190
1203
|
|
|
1191
1204
|
@classmethod
|
|
1192
|
-
def generate_enumerator(cls, length: int = 1000, output_name: str =
|
|
1205
|
+
def generate_enumerator(cls, length: int = 1000, output_name: str = "output_column") -> "FlowDataEngine":
|
|
1193
1206
|
"""Generates a FlowDataEngine with a single column containing a sequence of integers.
|
|
1194
1207
|
|
|
1195
1208
|
Args:
|
|
@@ -1203,8 +1216,9 @@ class FlowDataEngine:
|
|
|
1203
1216
|
length = 10_000_000
|
|
1204
1217
|
return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
|
|
1205
1218
|
|
|
1206
|
-
def _handle_schema(
|
|
1207
|
-
|
|
1219
|
+
def _handle_schema(
|
|
1220
|
+
self, schema: list[FlowfileColumn] | list[str] | pl.Schema | None, pl_schema: pl.Schema
|
|
1221
|
+
) -> list[FlowfileColumn] | None:
|
|
1208
1222
|
"""Handles schema processing and validation during initialization."""
|
|
1209
1223
|
if schema is None and pl_schema is not None:
|
|
1210
1224
|
return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
|
|
@@ -1215,7 +1229,8 @@ class FlowDataEngine:
|
|
|
1215
1229
|
elif pl_schema is not None and schema is not None:
|
|
1216
1230
|
if schema.__len__() != pl_schema.__len__():
|
|
1217
1231
|
raise Exception(
|
|
1218
|
-
f
|
|
1232
|
+
f"Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}"
|
|
1233
|
+
)
|
|
1219
1234
|
if isinstance(schema, pl.Schema):
|
|
1220
1235
|
return self._handle_polars_schema(schema, pl_schema)
|
|
1221
1236
|
elif isinstance(schema, list) and len(schema) == 0:
|
|
@@ -1224,31 +1239,29 @@ class FlowDataEngine:
|
|
|
1224
1239
|
return self._handle_string_schema(schema, pl_schema)
|
|
1225
1240
|
return schema
|
|
1226
1241
|
|
|
1227
|
-
def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) ->
|
|
1242
|
+
def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> list[FlowfileColumn]:
|
|
1228
1243
|
"""Handles Polars schema conversion."""
|
|
1229
1244
|
flow_file_columns = [
|
|
1230
1245
|
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
1231
|
-
for col_name, dtype in zip(schema.names(), schema.dtypes())
|
|
1246
|
+
for col_name, dtype in zip(schema.names(), schema.dtypes(), strict=False)
|
|
1232
1247
|
]
|
|
1233
1248
|
|
|
1234
1249
|
select_arg = [
|
|
1235
1250
|
pl.col(o).alias(n).cast(schema_dtype)
|
|
1236
|
-
for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
|
|
1251
|
+
for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes(), strict=False)
|
|
1237
1252
|
]
|
|
1238
1253
|
|
|
1239
1254
|
self.data_frame = self.data_frame.select(select_arg)
|
|
1240
1255
|
return flow_file_columns
|
|
1241
1256
|
|
|
1242
|
-
def _handle_string_schema(self, schema:
|
|
1257
|
+
def _handle_string_schema(self, schema: list[str], pl_schema: pl.Schema) -> list[FlowfileColumn]:
|
|
1243
1258
|
"""Handles string-based schema conversion."""
|
|
1244
1259
|
flow_file_columns = [
|
|
1245
1260
|
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
1246
|
-
for col_name, dtype in zip(schema, pl_schema.dtypes())
|
|
1261
|
+
for col_name, dtype in zip(schema, pl_schema.dtypes(), strict=False)
|
|
1247
1262
|
]
|
|
1248
1263
|
|
|
1249
|
-
self.data_frame = self.data_frame.rename({
|
|
1250
|
-
o: n for o, n in zip(pl_schema.names(), schema)
|
|
1251
|
-
})
|
|
1264
|
+
self.data_frame = self.data_frame.rename({o: n for o, n in zip(pl_schema.names(), schema, strict=False)})
|
|
1252
1265
|
|
|
1253
1266
|
return flow_file_columns
|
|
1254
1267
|
|
|
@@ -1266,25 +1279,16 @@ class FlowDataEngine:
|
|
|
1266
1279
|
A new `FlowDataEngine` instance with the exploded rows.
|
|
1267
1280
|
"""
|
|
1268
1281
|
output_column_name = (
|
|
1269
|
-
split_input.output_column_name
|
|
1270
|
-
if split_input.output_column_name
|
|
1271
|
-
else split_input.column_to_split
|
|
1282
|
+
split_input.output_column_name if split_input.output_column_name else split_input.column_to_split
|
|
1272
1283
|
)
|
|
1273
1284
|
|
|
1274
1285
|
split_value = (
|
|
1275
|
-
split_input.split_fixed_value
|
|
1276
|
-
if split_input.split_by_fixed_value
|
|
1277
|
-
else pl.col(split_input.split_by_column)
|
|
1286
|
+
split_input.split_fixed_value if split_input.split_by_fixed_value else pl.col(split_input.split_by_column)
|
|
1278
1287
|
)
|
|
1279
1288
|
|
|
1280
|
-
df = (
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
.str.split(by=split_value)
|
|
1284
|
-
.alias(output_column_name)
|
|
1285
|
-
)
|
|
1286
|
-
.explode(output_column_name)
|
|
1287
|
-
)
|
|
1289
|
+
df = self.data_frame.with_columns(
|
|
1290
|
+
pl.col(split_input.column_to_split).str.split(by=split_value).alias(output_column_name)
|
|
1291
|
+
).explode(output_column_name)
|
|
1288
1292
|
|
|
1289
1293
|
return FlowDataEngine(df)
|
|
1290
1294
|
|
|
@@ -1304,15 +1308,9 @@ class FlowDataEngine:
|
|
|
1304
1308
|
lf = self.data_frame
|
|
1305
1309
|
|
|
1306
1310
|
if unpivot_input.data_type_selector_expr is not None:
|
|
1307
|
-
result = lf.unpivot(
|
|
1308
|
-
on=unpivot_input.data_type_selector_expr(),
|
|
1309
|
-
index=unpivot_input.index_columns
|
|
1310
|
-
)
|
|
1311
|
+
result = lf.unpivot(on=unpivot_input.data_type_selector_expr(), index=unpivot_input.index_columns)
|
|
1311
1312
|
elif unpivot_input.value_columns is not None:
|
|
1312
|
-
result = lf.unpivot(
|
|
1313
|
-
on=unpivot_input.value_columns,
|
|
1314
|
-
index=unpivot_input.index_columns
|
|
1315
|
-
)
|
|
1313
|
+
result = lf.unpivot(on=unpivot_input.value_columns, index=unpivot_input.index_columns)
|
|
1316
1314
|
else:
|
|
1317
1315
|
result = lf.unpivot()
|
|
1318
1316
|
|
|
@@ -1332,19 +1330,24 @@ class FlowDataEngine:
|
|
|
1332
1330
|
"""
|
|
1333
1331
|
# Get unique values for pivot columns
|
|
1334
1332
|
max_unique_vals = 200
|
|
1335
|
-
new_cols_unique = fetch_unique_values(
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1333
|
+
new_cols_unique = fetch_unique_values(
|
|
1334
|
+
self.data_frame.select(pivot_input.pivot_column)
|
|
1335
|
+
.unique()
|
|
1336
|
+
.sort(pivot_input.pivot_column)
|
|
1337
|
+
.limit(max_unique_vals)
|
|
1338
|
+
.cast(pl.String)
|
|
1339
|
+
)
|
|
1339
1340
|
if len(new_cols_unique) >= max_unique_vals:
|
|
1340
1341
|
if node_logger:
|
|
1341
|
-
node_logger.warning(
|
|
1342
|
-
|
|
1342
|
+
node_logger.warning(
|
|
1343
|
+
"Pivot column has too many unique values. Please consider using a different column."
|
|
1344
|
+
f" Max unique values: {max_unique_vals}"
|
|
1345
|
+
)
|
|
1343
1346
|
|
|
1344
1347
|
if len(pivot_input.index_columns) == 0:
|
|
1345
1348
|
no_index_cols = True
|
|
1346
|
-
pivot_input.index_columns = [
|
|
1347
|
-
ff = self.apply_flowfile_formula(
|
|
1349
|
+
pivot_input.index_columns = ["__temp__"]
|
|
1350
|
+
ff = self.apply_flowfile_formula("1", col_name="__temp__")
|
|
1348
1351
|
else:
|
|
1349
1352
|
no_index_cols = False
|
|
1350
1353
|
ff = self
|
|
@@ -1354,36 +1357,32 @@ class FlowDataEngine:
|
|
|
1354
1357
|
grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
|
|
1355
1358
|
pivot_column = pivot_input.get_pivot_column()
|
|
1356
1359
|
|
|
1357
|
-
input_df = grouped_ff.data_frame.with_columns(
|
|
1358
|
-
pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
|
|
1359
|
-
)
|
|
1360
|
+
input_df = grouped_ff.data_frame.with_columns(pivot_column.cast(pl.String).alias(pivot_input.pivot_column))
|
|
1360
1361
|
number_of_aggregations = len(pivot_input.aggregations)
|
|
1361
1362
|
df = (
|
|
1362
|
-
input_df.select(
|
|
1363
|
-
*index_columns,
|
|
1364
|
-
pivot_column,
|
|
1365
|
-
pivot_input.get_values_expr()
|
|
1366
|
-
)
|
|
1363
|
+
input_df.select(*index_columns, pivot_column, pivot_input.get_values_expr())
|
|
1367
1364
|
.group_by(*index_columns)
|
|
1368
|
-
.agg(
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1365
|
+
.agg(
|
|
1366
|
+
[
|
|
1367
|
+
(pl.col("vals").filter(pivot_column == new_col_value)).first().alias(new_col_value)
|
|
1368
|
+
for new_col_value in new_cols_unique
|
|
1369
|
+
]
|
|
1370
|
+
)
|
|
1374
1371
|
.select(
|
|
1375
1372
|
*index_columns,
|
|
1376
1373
|
*[
|
|
1377
|
-
pl.col(new_col)
|
|
1374
|
+
pl.col(new_col)
|
|
1375
|
+
.struct.field(agg)
|
|
1376
|
+
.alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
|
|
1378
1377
|
for new_col in new_cols_unique
|
|
1379
1378
|
for agg in pivot_input.aggregations
|
|
1380
|
-
]
|
|
1379
|
+
],
|
|
1381
1380
|
)
|
|
1382
1381
|
)
|
|
1383
1382
|
|
|
1384
1383
|
# Clean up temporary columns if needed
|
|
1385
1384
|
if no_index_cols:
|
|
1386
|
-
df = df.drop(
|
|
1385
|
+
df = df.drop("__temp__")
|
|
1387
1386
|
pivot_input.index_columns = []
|
|
1388
1387
|
|
|
1389
1388
|
return FlowDataEngine(df, calculate_schema_stats=False)
|
|
@@ -1402,7 +1401,7 @@ class FlowDataEngine:
|
|
|
1402
1401
|
try:
|
|
1403
1402
|
f = to_expr(predicate)
|
|
1404
1403
|
except Exception as e:
|
|
1405
|
-
logger.warning(f
|
|
1404
|
+
logger.warning(f"Error in filter expression: {e}")
|
|
1406
1405
|
f = to_expr("False")
|
|
1407
1406
|
df = self.data_frame.filter(f)
|
|
1408
1407
|
return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
|
|
@@ -1429,29 +1428,27 @@ class FlowDataEngine:
|
|
|
1429
1428
|
select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
|
|
1430
1429
|
|
|
1431
1430
|
df = (
|
|
1432
|
-
self.data_frame
|
|
1433
|
-
.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
|
|
1431
|
+
self.data_frame.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
|
|
1434
1432
|
.with_columns(
|
|
1435
|
-
(
|
|
1436
|
-
|
|
1437
|
-
|
|
1433
|
+
(
|
|
1434
|
+
pl.cum_count(record_id_settings.output_column_name).over(record_id_settings.group_by_columns)
|
|
1435
|
+
+ record_id_settings.offset
|
|
1436
|
+
- 1
|
|
1437
|
+
).alias(record_id_settings.output_column_name)
|
|
1438
1438
|
)
|
|
1439
1439
|
.select(select_cols)
|
|
1440
1440
|
)
|
|
1441
1441
|
|
|
1442
|
-
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name,
|
|
1442
|
+
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
|
|
1443
1443
|
output_schema.extend(self.schema)
|
|
1444
1444
|
|
|
1445
1445
|
return FlowDataEngine(df, schema=output_schema)
|
|
1446
1446
|
|
|
1447
1447
|
def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
1448
1448
|
"""Adds a simple sequential record ID column."""
|
|
1449
|
-
df = self.data_frame.with_row_index(
|
|
1450
|
-
record_id_settings.output_column_name,
|
|
1451
|
-
record_id_settings.offset
|
|
1452
|
-
)
|
|
1449
|
+
df = self.data_frame.with_row_index(record_id_settings.output_column_name, record_id_settings.offset)
|
|
1453
1450
|
|
|
1454
|
-
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name,
|
|
1451
|
+
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, "UInt64")]
|
|
1455
1452
|
output_schema.extend(self.schema)
|
|
1456
1453
|
|
|
1457
1454
|
return FlowDataEngine(df, schema=output_schema)
|
|
@@ -1483,7 +1480,7 @@ class FlowDataEngine:
|
|
|
1483
1480
|
|
|
1484
1481
|
def __repr__(self) -> str:
|
|
1485
1482
|
"""Returns a string representation of the FlowDataEngine."""
|
|
1486
|
-
return f
|
|
1483
|
+
return f"flow data engine\n{self.data_frame.__repr__()}"
|
|
1487
1484
|
|
|
1488
1485
|
def __call__(self) -> "FlowDataEngine":
|
|
1489
1486
|
"""Makes the class instance callable, returning itself."""
|
|
@@ -1503,16 +1500,16 @@ class FlowDataEngine:
|
|
|
1503
1500
|
Returns:
|
|
1504
1501
|
The same `FlowDataEngine` instance, now backed by the cached data.
|
|
1505
1502
|
"""
|
|
1506
|
-
edf = ExternalDfFetcher(
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
logger.info(
|
|
1503
|
+
edf = ExternalDfFetcher(
|
|
1504
|
+
lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False, flow_id=-1, node_id=-1
|
|
1505
|
+
)
|
|
1506
|
+
logger.info("Caching data in background")
|
|
1510
1507
|
result = edf.get_result()
|
|
1511
1508
|
if isinstance(result, pl.LazyFrame):
|
|
1512
|
-
logger.info(
|
|
1509
|
+
logger.info("Data cached")
|
|
1513
1510
|
del self._data_frame
|
|
1514
1511
|
self.data_frame = result
|
|
1515
|
-
logger.info(
|
|
1512
|
+
logger.info("Data loaded from cache")
|
|
1516
1513
|
return self
|
|
1517
1514
|
|
|
1518
1515
|
def collect_external(self):
|
|
@@ -1524,14 +1521,14 @@ class FlowDataEngine:
|
|
|
1524
1521
|
re-evaluated.
|
|
1525
1522
|
"""
|
|
1526
1523
|
if self._external_source is not None:
|
|
1527
|
-
logger.info(
|
|
1524
|
+
logger.info("Collecting external source")
|
|
1528
1525
|
if self.external_source.get_pl_df() is not None:
|
|
1529
1526
|
self.data_frame = self.external_source.get_pl_df().lazy()
|
|
1530
1527
|
else:
|
|
1531
1528
|
self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
|
|
1532
1529
|
self._schema = None # enforce reset schema
|
|
1533
1530
|
|
|
1534
|
-
def get_output_sample(self, n_rows: int = 10) ->
|
|
1531
|
+
def get_output_sample(self, n_rows: int = 10) -> list[dict]:
|
|
1535
1532
|
"""Gets a sample of the data as a list of dictionaries.
|
|
1536
1533
|
|
|
1537
1534
|
This is typically used to display a preview of the data in a UI.
|
|
@@ -1559,14 +1556,20 @@ class FlowDataEngine:
|
|
|
1559
1556
|
try:
|
|
1560
1557
|
df = df.head(n_rows).collect()
|
|
1561
1558
|
except Exception as e:
|
|
1562
|
-
logger.warning(f
|
|
1559
|
+
logger.warning(f"Error in getting sample: {e}")
|
|
1563
1560
|
df = df.head(n_rows).collect(engine="auto")
|
|
1564
1561
|
else:
|
|
1565
1562
|
df = self.collect()
|
|
1566
1563
|
return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
|
|
1567
1564
|
|
|
1568
|
-
def get_sample(
|
|
1569
|
-
|
|
1565
|
+
def get_sample(
|
|
1566
|
+
self,
|
|
1567
|
+
n_rows: int = 100,
|
|
1568
|
+
random: bool = False,
|
|
1569
|
+
shuffle: bool = False,
|
|
1570
|
+
seed: int = None,
|
|
1571
|
+
execution_location: ExecutionLocationsLiteral | None = None,
|
|
1572
|
+
) -> "FlowDataEngine":
|
|
1570
1573
|
"""Gets a sample of rows from the DataFrame.
|
|
1571
1574
|
|
|
1572
1575
|
Args:
|
|
@@ -1578,23 +1581,23 @@ class FlowDataEngine:
|
|
|
1578
1581
|
Returns:
|
|
1579
1582
|
A new `FlowDataEngine` instance containing the sampled data.
|
|
1580
1583
|
"""
|
|
1581
|
-
logging.info(f
|
|
1582
|
-
|
|
1584
|
+
logging.info(f"Getting sample of {n_rows} rows")
|
|
1583
1585
|
if random:
|
|
1584
1586
|
if self.lazy and self.external_source is not None:
|
|
1585
1587
|
self.collect_external()
|
|
1586
1588
|
|
|
1587
1589
|
if self.lazy and shuffle:
|
|
1588
|
-
sample_df =
|
|
1589
|
-
|
|
1590
|
+
sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(
|
|
1591
|
+
n_rows, seed=seed, shuffle=shuffle
|
|
1592
|
+
)
|
|
1590
1593
|
elif shuffle:
|
|
1591
1594
|
sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
|
|
1592
1595
|
else:
|
|
1593
1596
|
if execution_location is None:
|
|
1594
1597
|
execution_location = get_global_execution_location()
|
|
1595
|
-
n_rows = min(
|
|
1596
|
-
calculate_in_worker_process=execution_location == "remote")
|
|
1597
|
-
|
|
1598
|
+
n_rows = min(
|
|
1599
|
+
n_rows, self.get_number_of_records(calculate_in_worker_process=execution_location == "remote")
|
|
1600
|
+
)
|
|
1598
1601
|
|
|
1599
1602
|
every_n_records = ceil(self.number_of_records / n_rows)
|
|
1600
1603
|
sample_df = self.data_frame.gather_every(every_n_records)
|
|
@@ -1619,8 +1622,9 @@ class FlowDataEngine:
|
|
|
1619
1622
|
else:
|
|
1620
1623
|
return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
|
|
1621
1624
|
|
|
1622
|
-
def iter_batches(
|
|
1623
|
-
|
|
1625
|
+
def iter_batches(
|
|
1626
|
+
self, batch_size: int = 1000, columns: list | tuple | str = None
|
|
1627
|
+
) -> Generator["FlowDataEngine", None, None]:
|
|
1624
1628
|
"""Iterates over the DataFrame in batches.
|
|
1625
1629
|
|
|
1626
1630
|
Args:
|
|
@@ -1638,9 +1642,14 @@ class FlowDataEngine:
|
|
|
1638
1642
|
for batch in batches:
|
|
1639
1643
|
yield FlowDataEngine(batch)
|
|
1640
1644
|
|
|
1641
|
-
def start_fuzzy_join(
|
|
1642
|
-
|
|
1643
|
-
|
|
1645
|
+
def start_fuzzy_join(
|
|
1646
|
+
self,
|
|
1647
|
+
fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1648
|
+
other: "FlowDataEngine",
|
|
1649
|
+
file_ref: str,
|
|
1650
|
+
flow_id: int = -1,
|
|
1651
|
+
node_id: int | str = -1,
|
|
1652
|
+
) -> ExternalFuzzyMatchFetcher:
|
|
1644
1653
|
"""Starts a fuzzy join operation in a background process.
|
|
1645
1654
|
|
|
1646
1655
|
This method prepares the data and initiates the fuzzy matching in a
|
|
@@ -1657,45 +1666,71 @@ class FlowDataEngine:
|
|
|
1657
1666
|
An `ExternalFuzzyMatchFetcher` object that can be used to track the
|
|
1658
1667
|
progress and retrieve the result of the fuzzy join.
|
|
1659
1668
|
"""
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1669
|
+
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1670
|
+
left_df, right_df = prepare_for_fuzzy_match(
|
|
1671
|
+
left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
|
|
1672
|
+
)
|
|
1673
|
+
|
|
1674
|
+
return ExternalFuzzyMatchFetcher(
|
|
1675
|
+
left_df,
|
|
1676
|
+
right_df,
|
|
1677
|
+
fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
|
|
1678
|
+
file_ref=file_ref + "_fm",
|
|
1679
|
+
wait_on_completion=False,
|
|
1680
|
+
flow_id=flow_id,
|
|
1681
|
+
node_id=node_id,
|
|
1682
|
+
)
|
|
1683
|
+
|
|
1684
|
+
def fuzzy_join_external(
|
|
1685
|
+
self,
|
|
1686
|
+
fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1687
|
+
other: "FlowDataEngine",
|
|
1688
|
+
file_ref: str = None,
|
|
1689
|
+
flow_id: int = -1,
|
|
1690
|
+
node_id: int = -1,
|
|
1691
|
+
):
|
|
1675
1692
|
if file_ref is None:
|
|
1676
|
-
file_ref = str(id(self)) +
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1693
|
+
file_ref = str(id(self)) + "_" + str(id(other))
|
|
1694
|
+
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1695
|
+
|
|
1696
|
+
left_df, right_df = prepare_for_fuzzy_match(
|
|
1697
|
+
left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
|
|
1698
|
+
)
|
|
1699
|
+
external_tracker = ExternalFuzzyMatchFetcher(
|
|
1700
|
+
left_df,
|
|
1701
|
+
right_df,
|
|
1702
|
+
fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
|
|
1703
|
+
file_ref=file_ref + "_fm",
|
|
1704
|
+
wait_on_completion=False,
|
|
1705
|
+
flow_id=flow_id,
|
|
1706
|
+
node_id=node_id,
|
|
1707
|
+
)
|
|
1685
1708
|
return FlowDataEngine(external_tracker.get_result())
|
|
1686
1709
|
|
|
1687
|
-
def fuzzy_join(
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1710
|
+
def fuzzy_join(
|
|
1711
|
+
self,
|
|
1712
|
+
fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1713
|
+
other: "FlowDataEngine",
|
|
1714
|
+
node_logger: NodeLogger = None,
|
|
1715
|
+
) -> "FlowDataEngine":
|
|
1716
|
+
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1717
|
+
left_df, right_df = prepare_for_fuzzy_match(
|
|
1718
|
+
left=self, right=other, fuzzy_match_input_manager=fuzzy_match_input_manager
|
|
1719
|
+
)
|
|
1720
|
+
fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
|
|
1721
|
+
return FlowDataEngine(
|
|
1722
|
+
fuzzy_match_dfs(
|
|
1723
|
+
left_df, right_df, fuzzy_maps=fuzzy_mappings, logger=node_logger.logger if node_logger else logger
|
|
1724
|
+
).lazy()
|
|
1725
|
+
)
|
|
1726
|
+
|
|
1727
|
+
def do_cross_join(
|
|
1728
|
+
self,
|
|
1729
|
+
cross_join_input: transform_schemas.CrossJoinInput,
|
|
1730
|
+
auto_generate_selection: bool,
|
|
1731
|
+
verify_integrity: bool,
|
|
1732
|
+
other: "FlowDataEngine",
|
|
1733
|
+
) -> "FlowDataEngine":
|
|
1699
1734
|
"""Performs a cross join with another DataFrame.
|
|
1700
1735
|
|
|
1701
1736
|
A cross join produces the Cartesian product of the two DataFrames.
|
|
@@ -1713,101 +1748,109 @@ class FlowDataEngine:
|
|
|
1713
1748
|
Exception: If `verify_integrity` is True and the join would result in
|
|
1714
1749
|
an excessively large number of records.
|
|
1715
1750
|
"""
|
|
1716
|
-
|
|
1717
1751
|
self.lazy = True
|
|
1718
|
-
|
|
1719
1752
|
other.lazy = True
|
|
1753
|
+
cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
|
|
1754
|
+
verify_join_select_integrity(
|
|
1755
|
+
cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns
|
|
1756
|
+
)
|
|
1757
|
+
right_select = [
|
|
1758
|
+
v.old_name
|
|
1759
|
+
for v in cross_join_input_manager.right_select.renames
|
|
1760
|
+
if (v.keep or v.join_key) and v.is_available
|
|
1761
|
+
]
|
|
1762
|
+
left_select = [
|
|
1763
|
+
v.old_name
|
|
1764
|
+
for v in cross_join_input_manager.left_select.renames
|
|
1765
|
+
if (v.keep or v.join_key) and v.is_available
|
|
1766
|
+
]
|
|
1767
|
+
cross_join_input_manager.auto_rename(rename_mode="suffix")
|
|
1768
|
+
left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
|
|
1769
|
+
right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)
|
|
1720
1770
|
|
|
1721
|
-
|
|
1722
|
-
right_select = [v.old_name for v in cross_join_input.right_select.renames
|
|
1723
|
-
if (v.keep or v.join_key) and v.is_available]
|
|
1724
|
-
left_select = [v.old_name for v in cross_join_input.left_select.renames
|
|
1725
|
-
if (v.keep or v.join_key) and v.is_available]
|
|
1726
|
-
|
|
1727
|
-
left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
|
|
1728
|
-
right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
|
|
1729
|
-
|
|
1730
|
-
joined_df = left.join(right, how='cross')
|
|
1771
|
+
joined_df = left.join(right, how="cross")
|
|
1731
1772
|
|
|
1732
|
-
cols_to_delete_after = [
|
|
1733
|
-
|
|
1734
|
-
|
|
1773
|
+
cols_to_delete_after = [
|
|
1774
|
+
col.new_name
|
|
1775
|
+
for col in cross_join_input_manager.left_select.renames + cross_join_input_manager.right_select.renames
|
|
1776
|
+
if col.join_key and not col.keep and col.is_available
|
|
1777
|
+
]
|
|
1735
1778
|
|
|
1736
1779
|
fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
|
|
1737
1780
|
return fl
|
|
1738
1781
|
|
|
1739
|
-
def join(
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1782
|
+
def join(
|
|
1783
|
+
self,
|
|
1784
|
+
join_input: transform_schemas.JoinInput,
|
|
1785
|
+
auto_generate_selection: bool,
|
|
1786
|
+
verify_integrity: bool,
|
|
1787
|
+
other: "FlowDataEngine",
|
|
1788
|
+
) -> "FlowDataEngine":
|
|
1789
|
+
"""Performs a standard SQL-style join with another DataFrame."""
|
|
1790
|
+
# Create manager from input
|
|
1791
|
+
join_manager = transform_schemas.JoinInputManager(join_input)
|
|
1792
|
+
ensure_right_unselect_for_semi_and_anti_joins(join_manager.input)
|
|
1793
|
+
for jk in join_manager.join_mapping:
|
|
1794
|
+
if jk.left_col not in {c.old_name for c in join_manager.left_select.renames}:
|
|
1795
|
+
join_manager.left_select.append(transform_schemas.SelectInput(jk.left_col, keep=False))
|
|
1796
|
+
if jk.right_col not in {c.old_name for c in join_manager.right_select.renames}:
|
|
1797
|
+
join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
|
|
1798
|
+
verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
|
|
1799
|
+
if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
|
|
1800
|
+
raise Exception("Join is not valid by the data fields")
|
|
1744
1801
|
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
and column selections.
|
|
1748
|
-
auto_generate_selection: If True, automatically handles column renaming.
|
|
1749
|
-
verify_integrity: If True, performs checks to prevent excessively large joins.
|
|
1750
|
-
other: The right `FlowDataEngine` to join with.
|
|
1802
|
+
if auto_generate_selection:
|
|
1803
|
+
join_manager.auto_rename()
|
|
1751
1804
|
|
|
1752
|
-
|
|
1753
|
-
|
|
1805
|
+
# Use manager properties throughout
|
|
1806
|
+
left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(
|
|
1807
|
+
join_manager.left_manager.get_rename_table()
|
|
1808
|
+
)
|
|
1809
|
+
right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(
|
|
1810
|
+
join_manager.right_manager.get_rename_table()
|
|
1811
|
+
)
|
|
1754
1812
|
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
"""
|
|
1759
|
-
ensure_right_unselect_for_semi_and_anti_joins(join_input)
|
|
1760
|
-
verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
|
|
1761
|
-
if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
|
|
1762
|
-
raise Exception('Join is not valid by the data fields')
|
|
1763
|
-
if auto_generate_selection:
|
|
1764
|
-
join_input.auto_rename()
|
|
1765
|
-
left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
|
|
1766
|
-
right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
|
|
1767
|
-
if verify_integrity and join_input.how != 'right':
|
|
1768
|
-
n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
|
|
1769
|
-
right_on_keys=join_input.right_join_keys, how=join_input.how)
|
|
1770
|
-
if n_records > 1_000_000_000:
|
|
1771
|
-
raise Exception("Join will result in too many records, ending process")
|
|
1772
|
-
else:
|
|
1773
|
-
n_records = -1
|
|
1774
|
-
left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
|
|
1775
|
-
left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
|
|
1776
|
-
if join_input.how == 'right':
|
|
1813
|
+
left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
|
|
1814
|
+
left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
|
|
1815
|
+
if join_manager.how == "right":
|
|
1777
1816
|
joined_df = right.join(
|
|
1778
1817
|
other=left,
|
|
1779
|
-
left_on=
|
|
1780
|
-
right_on=
|
|
1818
|
+
left_on=join_manager.right_join_keys,
|
|
1819
|
+
right_on=join_manager.left_join_keys,
|
|
1781
1820
|
how="left",
|
|
1782
|
-
suffix=""
|
|
1821
|
+
suffix="",
|
|
1822
|
+
).rename(reverse_join_key_mapping)
|
|
1783
1823
|
else:
|
|
1784
1824
|
joined_df = left.join(
|
|
1785
1825
|
other=right,
|
|
1786
|
-
left_on=
|
|
1787
|
-
right_on=
|
|
1788
|
-
how=
|
|
1789
|
-
suffix=""
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1826
|
+
left_on=join_manager.left_join_keys,
|
|
1827
|
+
right_on=join_manager.right_join_keys,
|
|
1828
|
+
how=join_manager.how,
|
|
1829
|
+
suffix="",
|
|
1830
|
+
).rename(reverse_join_key_mapping)
|
|
1831
|
+
|
|
1832
|
+
left_cols_to_delete_after = [
|
|
1833
|
+
get_col_name_to_delete(col, "left")
|
|
1834
|
+
for col in join_manager.input.left_select.renames
|
|
1835
|
+
if not col.keep and col.is_available and col.join_key
|
|
1836
|
+
]
|
|
1837
|
+
|
|
1838
|
+
right_cols_to_delete_after = [
|
|
1839
|
+
get_col_name_to_delete(col, "right")
|
|
1840
|
+
for col in join_manager.input.right_select.renames
|
|
1841
|
+
if not col.keep
|
|
1842
|
+
and col.is_available
|
|
1843
|
+
and col.join_key
|
|
1844
|
+
and join_manager.how in ("left", "right", "inner", "cross", "outer")
|
|
1845
|
+
]
|
|
1846
|
+
|
|
1799
1847
|
if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
|
|
1800
1848
|
joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
|
|
1801
|
-
|
|
1849
|
+
|
|
1850
|
+
undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
|
|
1802
1851
|
joined_df = joined_df.rename(undo_join_key_remapping)
|
|
1803
1852
|
|
|
1804
|
-
|
|
1805
|
-
return FlowDataEngine(joined_df, calculate_schema_stats=True,
|
|
1806
|
-
number_of_records=n_records, streamable=False)
|
|
1807
|
-
else:
|
|
1808
|
-
fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
|
|
1809
|
-
number_of_records=0, streamable=False)
|
|
1810
|
-
return fl
|
|
1853
|
+
return FlowDataEngine(joined_df, calculate_schema_stats=False, number_of_records=0, streamable=False)
|
|
1811
1854
|
|
|
1812
1855
|
def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
|
|
1813
1856
|
"""Solves a graph problem represented by 'from' and 'to' columns.
|
|
@@ -1822,8 +1865,9 @@ class FlowDataEngine:
|
|
|
1822
1865
|
A new `FlowDataEngine` instance with the solved graph data.
|
|
1823
1866
|
"""
|
|
1824
1867
|
lf = self.data_frame.with_columns(
|
|
1825
|
-
graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
|
|
1826
|
-
|
|
1868
|
+
graph_solver(graph_solver_input.col_from, graph_solver_input.col_to).alias(
|
|
1869
|
+
graph_solver_input.output_column_name
|
|
1870
|
+
)
|
|
1827
1871
|
)
|
|
1828
1872
|
return FlowDataEngine(lf)
|
|
1829
1873
|
|
|
@@ -1838,7 +1882,7 @@ class FlowDataEngine:
|
|
|
1838
1882
|
A new `FlowDataEngine` instance with the added column.
|
|
1839
1883
|
"""
|
|
1840
1884
|
if col_name is None:
|
|
1841
|
-
col_name =
|
|
1885
|
+
col_name = "new_values"
|
|
1842
1886
|
return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
|
|
1843
1887
|
|
|
1844
1888
|
def get_record_count(self) -> "FlowDataEngine":
|
|
@@ -1848,7 +1892,7 @@ class FlowDataEngine:
|
|
|
1848
1892
|
Returns:
|
|
1849
1893
|
A new `FlowDataEngine` instance.
|
|
1850
1894
|
"""
|
|
1851
|
-
return FlowDataEngine(self.data_frame.select(pl.len().alias(
|
|
1895
|
+
return FlowDataEngine(self.data_frame.select(pl.len().alias("number_of_records")))
|
|
1852
1896
|
|
|
1853
1897
|
def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
|
|
1854
1898
|
"""Asserts that this DataFrame is equal to another.
|
|
@@ -1871,13 +1915,13 @@ class FlowDataEngine:
|
|
|
1871
1915
|
other = other.select_columns(self.columns)
|
|
1872
1916
|
|
|
1873
1917
|
if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
|
|
1874
|
-
raise Exception(
|
|
1918
|
+
raise Exception("Number of records is not equal")
|
|
1875
1919
|
|
|
1876
1920
|
if self.columns != other.columns:
|
|
1877
|
-
raise Exception(
|
|
1921
|
+
raise Exception("Schema is not equal")
|
|
1878
1922
|
|
|
1879
1923
|
if strict_schema:
|
|
1880
|
-
assert self.data_frame.schema == other.data_frame.schema,
|
|
1924
|
+
assert self.data_frame.schema == other.data_frame.schema, "Data types do not match"
|
|
1881
1925
|
|
|
1882
1926
|
if ordered:
|
|
1883
1927
|
self_lf = self.data_frame.sort(by=self.columns)
|
|
@@ -1887,7 +1931,7 @@ class FlowDataEngine:
|
|
|
1887
1931
|
other_lf = other.data_frame
|
|
1888
1932
|
|
|
1889
1933
|
self.lazy, other.lazy = org_laziness
|
|
1890
|
-
assert self_lf.equals(other_lf),
|
|
1934
|
+
assert self_lf.equals(other_lf), "Data is not equal"
|
|
1891
1935
|
|
|
1892
1936
|
def initialize_empty_fl(self):
|
|
1893
1937
|
"""Initializes an empty LazyFrame."""
|
|
@@ -1902,7 +1946,7 @@ class FlowDataEngine:
|
|
|
1902
1946
|
operation_type="calculate_number_of_records",
|
|
1903
1947
|
flow_id=-1,
|
|
1904
1948
|
node_id=-1,
|
|
1905
|
-
wait_on_completion=True
|
|
1949
|
+
wait_on_completion=True,
|
|
1906
1950
|
).result
|
|
1907
1951
|
return number_of_records
|
|
1908
1952
|
|
|
@@ -1918,8 +1962,9 @@ class FlowDataEngine:
|
|
|
1918
1962
|
"""
|
|
1919
1963
|
return self.get_number_of_records(force_calculate=force_calculate)
|
|
1920
1964
|
|
|
1921
|
-
def get_number_of_records(
|
|
1922
|
-
|
|
1965
|
+
def get_number_of_records(
|
|
1966
|
+
self, warn: bool = False, force_calculate: bool = False, calculate_in_worker_process: bool = False
|
|
1967
|
+
) -> int:
|
|
1923
1968
|
"""Gets the total number of records in the DataFrame.
|
|
1924
1969
|
|
|
1925
1970
|
For lazy frames, this may trigger a full data scan, which can be expensive.
|
|
@@ -1949,12 +1994,13 @@ class FlowDataEngine:
|
|
|
1949
1994
|
except Exception as e:
|
|
1950
1995
|
logger.error(f"Error: {e}")
|
|
1951
1996
|
if warn:
|
|
1952
|
-
logger.warning(
|
|
1997
|
+
logger.warning("Calculating the number of records this can be expensive on a lazy frame")
|
|
1953
1998
|
try:
|
|
1954
1999
|
self.number_of_records = self.data_frame.select(pl.len()).collect(
|
|
1955
|
-
engine="streaming" if self._streamable else "auto"
|
|
2000
|
+
engine="streaming" if self._streamable else "auto"
|
|
2001
|
+
)[0, 0]
|
|
1956
2002
|
except Exception:
|
|
1957
|
-
raise ValueError(
|
|
2003
|
+
raise ValueError("Could not get number of records")
|
|
1958
2004
|
else:
|
|
1959
2005
|
self.number_of_records = self.data_frame.__len__()
|
|
1960
2006
|
return self.number_of_records
|
|
@@ -1995,7 +2041,7 @@ class FlowDataEngine:
|
|
|
1995
2041
|
return self._external_source
|
|
1996
2042
|
|
|
1997
2043
|
@property
|
|
1998
|
-
def cols_idx(self) ->
|
|
2044
|
+
def cols_idx(self) -> dict[str, int]:
|
|
1999
2045
|
"""A dictionary mapping column names to their integer index."""
|
|
2000
2046
|
if self._col_idx is None:
|
|
2001
2047
|
self._col_idx = {c: i for i, c in enumerate(self.columns)}
|
|
@@ -2017,7 +2063,7 @@ class FlowDataEngine:
|
|
|
2017
2063
|
[transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
|
|
2018
2064
|
)
|
|
2019
2065
|
|
|
2020
|
-
def select_columns(self, list_select:
|
|
2066
|
+
def select_columns(self, list_select: list[str] | tuple[str] | str) -> "FlowDataEngine":
|
|
2021
2067
|
"""Selects a subset of columns from the DataFrame.
|
|
2022
2068
|
|
|
2023
2069
|
Args:
|
|
@@ -2030,17 +2076,17 @@ class FlowDataEngine:
|
|
|
2030
2076
|
list_select = [list_select]
|
|
2031
2077
|
|
|
2032
2078
|
idx_to_keep = [self.cols_idx.get(c) for c in list_select]
|
|
2033
|
-
selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
|
|
2079
|
+
selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep, strict=False) if id_to_keep is not None]
|
|
2034
2080
|
new_schema = [self.schema[i] for i in idx_to_keep if i is not None]
|
|
2035
2081
|
|
|
2036
2082
|
return FlowDataEngine(
|
|
2037
2083
|
self.data_frame.select(selects),
|
|
2038
2084
|
number_of_records=self.number_of_records,
|
|
2039
2085
|
schema=new_schema,
|
|
2040
|
-
streamable=self._streamable
|
|
2086
|
+
streamable=self._streamable,
|
|
2041
2087
|
)
|
|
2042
2088
|
|
|
2043
|
-
def drop_columns(self, columns:
|
|
2089
|
+
def drop_columns(self, columns: list[str]) -> "FlowDataEngine":
|
|
2044
2090
|
"""Drops specified columns from the DataFrame.
|
|
2045
2091
|
|
|
2046
2092
|
Args:
|
|
@@ -2054,12 +2100,10 @@ class FlowDataEngine:
|
|
|
2054
2100
|
new_schema = [self.schema[i] for i in idx_to_keep]
|
|
2055
2101
|
|
|
2056
2102
|
return FlowDataEngine(
|
|
2057
|
-
self.data_frame.select(cols_for_select),
|
|
2058
|
-
number_of_records=self.number_of_records,
|
|
2059
|
-
schema=new_schema
|
|
2103
|
+
self.data_frame.select(cols_for_select), number_of_records=self.number_of_records, schema=new_schema
|
|
2060
2104
|
)
|
|
2061
2105
|
|
|
2062
|
-
def reorganize_order(self, column_order:
|
|
2106
|
+
def reorganize_order(self, column_order: list[str]) -> "FlowDataEngine":
|
|
2063
2107
|
"""Reorganizes columns into a specified order.
|
|
2064
2108
|
|
|
2065
2109
|
Args:
|
|
@@ -2072,8 +2116,9 @@ class FlowDataEngine:
|
|
|
2072
2116
|
schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
|
|
2073
2117
|
return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)
|
|
2074
2118
|
|
|
2075
|
-
def apply_flowfile_formula(
|
|
2076
|
-
|
|
2119
|
+
def apply_flowfile_formula(
|
|
2120
|
+
self, func: str, col_name: str, output_data_type: pl.DataType = None
|
|
2121
|
+
) -> "FlowDataEngine":
|
|
2077
2122
|
"""Applies a formula to create a new column or transform an existing one.
|
|
2078
2123
|
|
|
2079
2124
|
Args:
|
|
@@ -2092,8 +2137,7 @@ class FlowDataEngine:
|
|
|
2092
2137
|
|
|
2093
2138
|
return FlowDataEngine(df2, number_of_records=self.number_of_records)
|
|
2094
2139
|
|
|
2095
|
-
def apply_sql_formula(self, func: str, col_name: str,
|
|
2096
|
-
output_data_type: pl.DataType = None) -> "FlowDataEngine":
|
|
2140
|
+
def apply_sql_formula(self, func: str, col_name: str, output_data_type: pl.DataType = None) -> "FlowDataEngine":
|
|
2097
2141
|
"""Applies an SQL-style formula using `pl.sql_expr`.
|
|
2098
2142
|
|
|
2099
2143
|
Args:
|
|
@@ -2105,15 +2149,16 @@ class FlowDataEngine:
|
|
|
2105
2149
|
A new `FlowDataEngine` instance with the applied formula.
|
|
2106
2150
|
"""
|
|
2107
2151
|
expr = to_expr(func)
|
|
2108
|
-
if output_data_type not in (None,
|
|
2152
|
+
if output_data_type not in (None, transform_schemas.AUTO_DATA_TYPE):
|
|
2109
2153
|
df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
|
|
2110
2154
|
else:
|
|
2111
2155
|
df = self.data_frame.with_columns(expr.alias(col_name))
|
|
2112
2156
|
|
|
2113
2157
|
return FlowDataEngine(df, number_of_records=self.number_of_records)
|
|
2114
2158
|
|
|
2115
|
-
def output(
|
|
2116
|
-
|
|
2159
|
+
def output(
|
|
2160
|
+
self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str, execute_remote: bool = True
|
|
2161
|
+
) -> "FlowDataEngine":
|
|
2117
2162
|
"""Writes the DataFrame to an output file.
|
|
2118
2163
|
|
|
2119
2164
|
Can execute the write operation locally or in a remote worker process.
|
|
@@ -2127,21 +2172,21 @@ class FlowDataEngine:
|
|
|
2127
2172
|
Returns:
|
|
2128
2173
|
The same `FlowDataEngine` instance for chaining.
|
|
2129
2174
|
"""
|
|
2130
|
-
logger.info(
|
|
2175
|
+
logger.info("Starting to write output")
|
|
2131
2176
|
if execute_remote:
|
|
2132
2177
|
status = utils.write_output(
|
|
2133
2178
|
self.data_frame,
|
|
2134
2179
|
data_type=output_fs.file_type,
|
|
2135
2180
|
path=output_fs.abs_file_path,
|
|
2136
2181
|
write_mode=output_fs.write_mode,
|
|
2137
|
-
sheet_name=output_fs.
|
|
2138
|
-
delimiter=output_fs.
|
|
2182
|
+
sheet_name=output_fs.sheet_name,
|
|
2183
|
+
delimiter=output_fs.delimiter,
|
|
2139
2184
|
flow_id=flow_id,
|
|
2140
|
-
node_id=node_id
|
|
2185
|
+
node_id=node_id,
|
|
2141
2186
|
)
|
|
2142
2187
|
tracker = ExternalExecutorTracker(status)
|
|
2143
2188
|
tracker.get_result()
|
|
2144
|
-
logger.info(
|
|
2189
|
+
logger.info("Finished writing output")
|
|
2145
2190
|
else:
|
|
2146
2191
|
logger.info("Starting to write results locally")
|
|
2147
2192
|
utils.local_write_output(
|
|
@@ -2149,8 +2194,8 @@ class FlowDataEngine:
|
|
|
2149
2194
|
data_type=output_fs.file_type,
|
|
2150
2195
|
path=output_fs.abs_file_path,
|
|
2151
2196
|
write_mode=output_fs.write_mode,
|
|
2152
|
-
sheet_name=output_fs.
|
|
2153
|
-
delimiter=output_fs.
|
|
2197
|
+
sheet_name=output_fs.sheet_name,
|
|
2198
|
+
delimiter=output_fs.delimiter,
|
|
2154
2199
|
flow_id=flow_id,
|
|
2155
2200
|
node_id=node_id,
|
|
2156
2201
|
)
|
|
@@ -2183,11 +2228,10 @@ class FlowDataEngine:
|
|
|
2183
2228
|
if isinstance(other, FlowDataEngine):
|
|
2184
2229
|
other = [other]
|
|
2185
2230
|
|
|
2186
|
-
dfs:
|
|
2187
|
-
return FlowDataEngine(pl.concat(dfs, how=
|
|
2231
|
+
dfs: list[pl.LazyFrame] | list[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
|
|
2232
|
+
return FlowDataEngine(pl.concat(dfs, how="diagonal_relaxed"))
|
|
2188
2233
|
|
|
2189
|
-
def do_select(self, select_inputs: transform_schemas.SelectInputs,
|
|
2190
|
-
keep_missing: bool = True) -> "FlowDataEngine":
|
|
2234
|
+
def do_select(self, select_inputs: transform_schemas.SelectInputs, keep_missing: bool = True) -> "FlowDataEngine":
|
|
2191
2235
|
"""Performs a complex column selection, renaming, and reordering operation.
|
|
2192
2236
|
|
|
2193
2237
|
Args:
|
|
@@ -2203,7 +2247,8 @@ class FlowDataEngine:
|
|
|
2203
2247
|
|
|
2204
2248
|
if not keep_missing:
|
|
2205
2249
|
drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
|
|
2206
|
-
set(r.old_name for r in renames if not r.keep)
|
|
2250
|
+
set(r.old_name for r in renames if not r.keep)
|
|
2251
|
+
)
|
|
2207
2252
|
keep_cols = []
|
|
2208
2253
|
else:
|
|
2209
2254
|
keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
|
|
@@ -2223,12 +2268,14 @@ class FlowDataEngine:
|
|
|
2223
2268
|
|
|
2224
2269
|
rename_dict = {r.old_name: r.new_name for r in available_renames}
|
|
2225
2270
|
fl = self.select_columns(
|
|
2226
|
-
list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
|
|
2271
|
+
list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols
|
|
2272
|
+
)
|
|
2227
2273
|
fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
|
|
2228
2274
|
ndf = fl.data_frame.rename(rename_dict)
|
|
2229
2275
|
renames.sort(key=lambda r: 0 if r.position is None else r.position)
|
|
2230
|
-
sorted_cols = utils.match_order(
|
|
2231
|
-
|
|
2276
|
+
sorted_cols = utils.match_order(
|
|
2277
|
+
ndf.collect_schema().names(), [r.new_name for r in renames] + self.data_frame.collect_schema().names()
|
|
2278
|
+
)
|
|
2232
2279
|
output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
|
|
2233
2280
|
return output_file.reorganize_order(sorted_cols)
|
|
2234
2281
|
|
|
@@ -2236,7 +2283,7 @@ class FlowDataEngine:
|
|
|
2236
2283
|
"""Sets whether DataFrame operations should be streamable."""
|
|
2237
2284
|
self._streamable = streamable
|
|
2238
2285
|
|
|
2239
|
-
def _calculate_schema(self) ->
|
|
2286
|
+
def _calculate_schema(self) -> list[dict]:
|
|
2240
2287
|
"""Calculates schema statistics."""
|
|
2241
2288
|
if self.external_source is not None:
|
|
2242
2289
|
self.collect_external()
|
|
@@ -2256,8 +2303,10 @@ class FlowDataEngine:
|
|
|
2256
2303
|
def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
|
|
2257
2304
|
"""Creates a FlowDataEngine from a path in a worker process."""
|
|
2258
2305
|
received_table.set_absolute_filepath()
|
|
2259
|
-
|
|
2260
|
-
|
|
2306
|
+
|
|
2307
|
+
external_fetcher = ExternalCreateFetcher(
|
|
2308
|
+
received_table=received_table, file_type=received_table.file_type, flow_id=flow_id, node_id=node_id
|
|
2309
|
+
)
|
|
2261
2310
|
return cls(external_fetcher.get_result())
|
|
2262
2311
|
|
|
2263
2312
|
|
|
@@ -2280,10 +2329,10 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
|
|
|
2280
2329
|
if len(flowfile_tables) == 0:
|
|
2281
2330
|
kwargs = {}
|
|
2282
2331
|
elif len(flowfile_tables) == 1:
|
|
2283
|
-
kwargs = {
|
|
2332
|
+
kwargs = {"input_df": flowfile_tables[0].data_frame}
|
|
2284
2333
|
else:
|
|
2285
|
-
kwargs = {f
|
|
2334
|
+
kwargs = {f"input_df_{i+1}": flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
|
|
2286
2335
|
df = polars_executable(**kwargs)
|
|
2287
2336
|
if isinstance(df, pl.DataFrame):
|
|
2288
2337
|
logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
|
|
2289
|
-
return FlowDataEngine(df)
|
|
2338
|
+
return FlowDataEngine(df)
|