Flowfile 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile might be problematic.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py
@@ -4,7 +4,7 @@ import os
 from copy import deepcopy
 from dataclasses import dataclass
 from math import ceil
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator

 # Third-party imports
 from loky import Future
@@ -12,29 +12,39 @@ import polars as pl
 from polars.exceptions import PanicException
 from polars_grouper import graph_solver
 from polars_expr_transformer import simple_function_to_expr as to_expr
+from pyarrow import Table as PaTable
 from pyarrow.parquet import ParquetFile

 # Local imports - Core
 from flowfile_core.configs import logger
+from flowfile_core.utils.utils import ensure_similarity_dicts
 from flowfile_core.configs.flow_logger import NodeLogger
 from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.schemas import (
+    cloud_storage_schemas,
     input_schema,
     transform_schema as transform_schemas
 )

 # Local imports - Flow File Components
 from flowfile_core.flowfile.flow_data_engine import utils
+from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
+                                                                          ensure_path_has_wildcard_pattern,
+                                                                          get_first_file_from_s3_dir)
 from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
+    assert_if_flowfile_schema,
     convert_stats_to_column_info
 )
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
     verify_join_select_integrity,
-    verify_join_map_integrity
+    verify_join_map_integrity,
+    rename_df_table_for_join,
+    get_undo_rename_mapping_join,
+    get_col_name_to_delete
 )
 from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
 from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -52,20 +62,95 @@ from flowfile_core.flowfile.flow_data_engine.threaded_processes import (

 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource

+T = TypeVar('T', pl.DataFrame, pl.LazyFrame)

-
-
+def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
+    """Temporarily renames join keys to avoid conflicts during a join.
+
+    This helper function checks the join type and renames the join key columns
+    in either the left or right DataFrame to a temporary name (`__FL_TEMP__...`).
+    This prevents Polars from automatically suffixing columns with `_right` when
+    join keys have the same name.
+
+    Args:
+        left_df: The left Polars DataFrame or LazyFrame.
+        right_df: The right Polars DataFrame or LazyFrame.
+        join_input: The JoinInput settings object defining the join.
+
+    Returns:
+        A tuple containing:
+        - The (potentially modified) left DataFrame.
+        - The (potentially modified) right DataFrame.
+        - A dictionary mapping the temporary names back to their desired final names.
     """
-
-
-
-
-
-
-
-
+    def _construct_temp_name(column_name: str) -> str:
+        return "__FL_TEMP__"+column_name
+    if join_input.how == 'right':
+        left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                       for jk in join_input.left_select.join_key_selects)
+        reverse_actions = {
+            _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
+            for jk in join_input.left_select.join_key_selects}
+    elif join_input.how in ('left', 'inner'):
+        right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                         for jk in join_input.right_select.join_key_selects)
+        reverse_actions = {
+            _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
+            for jk in join_input.right_select.join_key_selects}
+    else:
+        reverse_actions = {}
+    return left_df, right_df, reverse_actions
+
+
+def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.JoinInput) -> None:
+    """Modifies JoinInput for semi/anti joins to not keep right-side columns.
+
+    For 'semi' and 'anti' joins, Polars only returns columns from the left
+    DataFrame. This function enforces that behavior by modifying the `join_input`
+    in-place, setting the `keep` flag to `False` for all columns in the
+    right-side selection.
+
+    Args:
+        join_input: The JoinInput settings object to modify.
+    """
+    if join_input.how in ('semi', 'anti'):
+        for jk in join_input.right_select.renames:
+            jk.keep = False
+
+
+def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+    """Extracts a list of column names to be selected from a SelectInput list.
+
+    This function filters a list of `SelectInput` objects to return the names
+    of columns that are marked as available and are either a join key or
+    explicitly marked to be kept.
+
+    Args:
+        full_select_input: A list of SelectInput objects.
+
+    Returns:
+        A list of column names to be selected.
     """
+    return [v.old_name for v in full_select_input if (v.keep or v.join_key) and v.is_available]

+
+@dataclass
+class FlowDataEngine:
+    """The core data handling engine for Flowfile.
+
+    This class acts as a high-level wrapper around a Polars DataFrame or
+    LazyFrame, providing a unified API for data ingestion, transformation,
+    and output. It manages data state (lazy vs. eager), schema information,
+    and execution logic.
+
+    Attributes:
+        _data_frame: The underlying Polars DataFrame or LazyFrame.
+        columns: A list of column names in the current data frame.
+        name: An optional name for the data engine instance.
+        number_of_records: The number of records. Can be -1 for lazy frames.
+        errors: A list of errors encountered during operations.
+        _schema: A cached list of `FlowfileColumn` objects representing the schema.
+    """
     # Core attributes
     _data_frame: Union[pl.DataFrame, pl.LazyFrame]
     columns: List[Any]
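Note: the `_handle_duplication_join_keys` helper added above works around a join behavior that can be reproduced in plain Polars. A minimal sketch of the idea, outside Flowfile (the sample data, the way the `__FL_TEMP__` prefix is applied, and the final `right_id` name are illustrative only):

    import polars as pl

    left = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    right = pl.DataFrame({"id": [1, 2], "score": [10, 20]})

    # In a left/inner join Polars coalesces the right-hand join key, so it would
    # otherwise disappear (or come back suffixed with "_right" on a name clash).
    # Copying it to a temporary name keeps it addressable after the join.
    right_tmp = right.with_columns(pl.col("id").alias("__FL_TEMP__id"))
    joined = left.join(right_tmp, on="id", how="inner")
    result = joined.rename({"__FL_TEMP__id": "right_id"})  # the "reverse_actions" step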
@@ -105,12 +190,9 @@ class FlowDataEngine:
     _number_of_records_callback: Callable = None
     _data_callback: Callable = None

-    # Tracking info
-    # node_id: int = None  # TODO: Implement node_id
-    # flow_id: int = None  # TODO: Implement flow_id

     def __init__(self,
-                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
+                 raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
                  path_ref: str = None,
                  name: str = None,
                  optimize_memory: bool = True,
@@ -120,7 +202,22 @@ class FlowDataEngine:
                  streamable: bool = True,
                  number_of_records_callback: Callable = None,
                  data_callback: Callable = None):
-        """
+        """Initializes the FlowDataEngine from various data sources.
+
+        Args:
+            raw_data: The input data. Can be a list of dicts, a Polars DataFrame/LazyFrame,
+                or a `RawData` schema object.
+            path_ref: A string path to a Parquet file.
+            name: An optional name for the data engine instance.
+            optimize_memory: If True, prefers lazy operations to conserve memory.
+            schema: An optional schema definition. Can be a list of `FlowfileColumn` objects,
+                a list of column names, or a Polars `Schema`.
+            number_of_records: The number of records, if known.
+            calculate_schema_stats: If True, computes detailed statistics for each column.
+            streamable: If True, allows for streaming operations when possible.
+            number_of_records_callback: A callback function to retrieve the number of records.
+            data_callback: A callback function to retrieve the data.
+        """
         self._initialize_attributes(number_of_records_callback, data_callback, streamable)

         if raw_data is not None:
@@ -129,11 +226,14 @@ class FlowDataEngine:
             self._handle_path_ref(path_ref, optimize_memory)
         else:
             self.initialize_empty_fl()
-
         self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)

     def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
-        """
+        """(Internal) Sets the initial default attributes for a new instance.
+
+        This helper is called first during initialization to ensure all state-tracking
+        and configuration attributes have a clean default value before data is processed.
+        """
         self._external_source = None
         self._number_of_records_callback = number_of_records_callback
         self._data_callback = data_callback
@@ -147,8 +247,11 @@ class FlowDataEngine:
         self.is_future = False

     def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
-        """
+        """(Internal) Dispatches raw data to the appropriate handler based on its type.

+        This acts as a router during initialization, inspecting the type of `raw_data`
+        and calling the corresponding specialized `_handle_*` method to process it.
+        """
         if isinstance(raw_data, input_schema.RawData):
             self._handle_raw_data_format(raw_data)
         elif isinstance(raw_data, pl.DataFrame):
@@ -159,12 +262,12 @@ class FlowDataEngine:
             self._handle_python_data(raw_data)

     def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
-        """
+        """(Internal) Initializes the engine from an eager Polars DataFrame."""
         self.data_frame = df
         self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

     def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
-        """
+        """(Internal) Initializes the engine from a Polars LazyFrame."""
         self.data_frame = lf
         self._lazy = True
         if number_of_records is not None:
@@ -175,27 +278,35 @@ class FlowDataEngine:
             self.number_of_records = lf.select(pl.len()).collect()[0, 0]

     def _handle_python_data(self, data: Union[List, Dict]):
-        """
+        """(Internal) Dispatches Python collections to the correct handler."""
         if isinstance(data, dict):
             self._handle_dict_input(data)
         else:
             self._handle_list_input(data)

     def _handle_dict_input(self, data: Dict):
-        """
+        """(Internal) Initializes the engine from a Python dictionary."""
         if len(data) == 0:
             self.initialize_empty_fl()
         lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]

-        if len(set(lengths)) == 1 and lengths[0]>1:
+        if len(set(lengths)) == 1 and lengths[0] > 1:
             self.number_of_records = lengths[0]
             self.data_frame = pl.DataFrame(data)
         else:
             self.number_of_records = 1
             self.data_frame = pl.DataFrame([data])
+        self.lazy = True

     def _handle_raw_data_format(self, raw_data: input_schema.RawData):
-        """
+        """(Internal) Initializes the engine from a `RawData` schema object.
+
+        This method uses the schema provided in the `RawData` object to correctly
+        infer data types when creating the Polars DataFrame.
+
+        Args:
+            raw_data: An instance of `RawData` containing the data and schema.
+        """
         flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
         polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
                                    for flowfile_column in flowfile_schema])
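Note: the dictionary branch above mirrors how Polars itself interprets these two input shapes; a small illustration with hypothetical data:

    import polars as pl

    pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).height  # 3 rows: equal-length lists
    pl.DataFrame([{"a": 1, "b": "x"}]).height              # 1 row: a single scalar mapping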
@@ -209,7 +320,7 @@ class FlowDataEngine:
         self.lazy = True

     def _handle_list_input(self, data: List):
-        """
+        """(Internal) Initializes the engine from a list of records."""
         number_of_records = len(data)
         if number_of_records > 0:
             processed_data = self._process_list_data(data)
@@ -222,20 +333,411 @@ class FlowDataEngine:

     @staticmethod
     def _process_list_data(data: List) -> List[Dict]:
-        """
+        """(Internal) Normalizes list data into a list of dictionaries.
+
+        Ensures that a list of objects or non-dict items is converted into a
+        uniform list of dictionaries suitable for Polars DataFrame creation.
+        """
         if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
             try:
                 return pl.DataFrame(data).to_dicts()
-            except:
+            except TypeError:
                 raise Exception('Value must be able to be converted to dictionary')
+            except Exception as e:
+                raise Exception(f'Value must be able to be converted to dictionary: {e}')

         if not isinstance(data[0], dict):
             data = [row.__dict__ for row in data]

-        return
+        return ensure_similarity_dicts(data)
+
+    def to_cloud_storage_obj(self, settings: cloud_storage_schemas.CloudStorageWriteSettingsInternal):
+        """Writes the DataFrame to an object in cloud storage.
+
+        This method supports writing to various cloud storage providers like AWS S3,
+        Azure Data Lake Storage, and Google Cloud Storage.
+
+        Args:
+            settings: A `CloudStorageWriteSettingsInternal` object containing connection
+                details, file format, and write options.
+
+        Raises:
+            ValueError: If the specified file format is not supported for writing.
+            NotImplementedError: If the 'append' write mode is used with an unsupported format.
+            Exception: If the write operation to cloud storage fails for any reason.
+        """
+        connection = settings.connection
+        write_settings = settings.write_settings
+
+        logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")
+
+        if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+            raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
+        storage_options = CloudStorageReader.get_storage_options(connection)
+        credential_provider = CloudStorageReader.get_credential_provider(connection)
+        # Dispatch to the correct writer based on file format
+        if write_settings.file_format == "parquet":
+            self._write_parquet_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "delta":
+            self._write_delta_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "csv":
+            self._write_csv_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "json":
+            self._write_json_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        else:
+            raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")
+
+        logger.info(f"Successfully wrote data to {write_settings.resource_path}")
+
+    def _write_parquet_to_cloud(self,
+                                resource_path: str,
+                                storage_options: Dict[str, Any],
+                                credential_provider: Optional[Callable],
+                                write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """(Internal) Writes the DataFrame to a Parquet file in cloud storage.
+
+        Uses `sink_parquet` for efficient streaming writes. Falls back to a
+        collect-then-write pattern if sinking fails.
+        """
+        try:
+            sink_kwargs = {
+                "path": resource_path,
+                "compression": write_settings.parquet_compression,
+            }
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+            try:
+                self.data_frame.sink_parquet(**sink_kwargs)
+            except Exception as e:
+                logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
+                pl_df = self.collect()
+                sink_kwargs['file'] = sink_kwargs.pop("path")
+                pl_df.write_parquet(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
+
+    def _write_delta_to_cloud(self,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.
+
+        This operation requires collecting the data first, as `write_delta` operates
+        on an eager DataFrame.
+        """
+        sink_kwargs = {
+            "target": resource_path,
+            "mode": write_settings.write_mode,
+        }
+        if storage_options:
+            sink_kwargs["storage_options"] = storage_options
+        if credential_provider:
+            sink_kwargs["credential_provider"] = credential_provider
+        self.collect().write_delta(**sink_kwargs)
+
+    def _write_csv_to_cloud(self,
+                            resource_path: str,
+                            storage_options: Dict[str, Any],
+                            credential_provider: Optional[Callable],
+                            write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """(Internal) Writes the DataFrame to a CSV file in cloud storage.
+
+        Uses `sink_csv` for efficient, streaming writes of the data.
+        """
+        try:
+            sink_kwargs = {
+                "path": resource_path,
+                "separator": write_settings.csv_delimiter,
+            }
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+
+            # sink_csv executes the lazy query and writes the result
+            self.data_frame.sink_csv(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
+
+    def _write_json_to_cloud(self,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.
+
+        Uses `sink_ndjson` for efficient, streaming writes.
+        """
+        try:
+            sink_kwargs = {"path": resource_path}
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+            self.data_frame.sink_ndjson(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write JSON to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
+
+    @classmethod
+    def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
+        """Creates a FlowDataEngine from an object in cloud storage.
+
+        This method supports reading from various cloud storage providers like AWS S3,
+        Azure Data Lake Storage, and Google Cloud Storage, with support for
+        various authentication methods.
+
+        Args:
+            settings: A `CloudStorageReadSettingsInternal` object containing connection
+                details, file format, and read options.
+
+        Returns:
+            A new `FlowDataEngine` instance containing the data from cloud storage.
+
+        Raises:
+            ValueError: If the storage type or file format is not supported.
+            NotImplementedError: If a requested file format like "delta" or "iceberg"
+                is not yet implemented.
+            Exception: If reading from cloud storage fails.
+        """
+        connection = settings.connection
+        read_settings = settings.read_settings
+
+        logger.info(f"Reading from {connection.storage_type} storage: {read_settings.resource_path}")
+        # Get storage options based on connection type
+        storage_options = CloudStorageReader.get_storage_options(connection)
+        # Get credential provider if needed
+        credential_provider = CloudStorageReader.get_credential_provider(connection)
+        if read_settings.file_format == "parquet":
+            return cls._read_parquet_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings.scan_mode == "directory",
+            )
+        elif read_settings.file_format == "delta":
+            return cls._read_delta_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+        elif read_settings.file_format == "csv":
+            return cls._read_csv_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+        elif read_settings.file_format == "json":
+            return cls._read_json_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings.scan_mode == "directory"
+            )
+        elif read_settings.file_format == "iceberg":
+            return cls._read_iceberg_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+
+        elif read_settings.file_format in ["delta", "iceberg"]:
+            # These would require additional libraries
+            raise NotImplementedError(f"File format {read_settings.file_format} not yet implemented")
+        else:
+            raise ValueError(f"Unsupported file format: {read_settings.file_format}")
+
+    @staticmethod
+    def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
+                                           file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
+        """Infers the schema by scanning the first file in a cloud directory."""
+        try:
+            scan_func = getattr(pl, "scan_" + file_format)
+            first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
+            return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
+                scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
+        except Exception as e:
+            logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
+
+
+    @classmethod
+    def _read_iceberg_from_cloud(cls,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Reads Iceberg table(s) from cloud storage."""
+        raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+
+    @classmethod
+    def _read_parquet_from_cloud(cls,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 is_directory: bool) -> "FlowDataEngine":
+        """Reads Parquet file(s) from cloud storage."""
+        try:
+            # Use scan_parquet for lazy evaluation
+            if is_directory:
+                resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="parquet")
+            scan_kwargs = {"source": resource_path}
+
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+            if storage_options and is_directory:
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "parquet")
+            else:
+                schema = None
+            lf = pl.scan_parquet(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read Parquet from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_delta_from_cloud(cls,
+                               resource_path: str,
+                               storage_options: Dict[str, Any],
+                               credential_provider: Optional[Callable],
+                               read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Reads a Delta Lake table from cloud storage."""
+        try:
+            logger.info("Reading Delta file from cloud storage...")
+            logger.info(f"read_settings: {read_settings}")
+            scan_kwargs = {"source": resource_path}
+            if read_settings.delta_version:
+                scan_kwargs['version'] = read_settings.delta_version
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+            lf = pl.scan_delta(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
+                optimize_memory=True,
+                streamable=True
+            )
+        except Exception as e:
+            logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_csv_from_cloud(cls,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Reads CSV file(s) from cloud storage."""
+        try:
+            scan_kwargs = {
+                "source": resource_path,
+                "has_header": read_settings.csv_has_header,
+                "separator": read_settings.csv_delimiter,
+                "encoding": read_settings.csv_encoding,
+            }
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+
+            if read_settings.scan_mode == "directory":
+                resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="csv")
+                scan_kwargs["source"] = resource_path
+            if storage_options and read_settings.scan_mode == "directory":
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "csv")
+            else:
+                schema = None
+
+            lf = pl.scan_csv(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Will be calculated lazily
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read CSV from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_json_from_cloud(cls,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              is_directory: bool) -> "FlowDataEngine":
+        """Reads JSON file(s) from cloud storage."""
+        try:
+            if is_directory:
+                resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
+            scan_kwargs = {"source": resource_path}
+
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+
+            lf = pl.scan_ndjson(**scan_kwargs)  # Using NDJSON for line-delimited JSON
+
+            return cls(
+                lf,
+                number_of_records=-1,
+                optimize_memory=True,
+                streamable=True,
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read JSON from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read JSON from cloud storage: {str(e)}")

     def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
-        """
+        """Handles file path reference input."""
         try:
             pf = ParquetFile(path_ref)
         except Exception as e:
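Note: the reader and writer methods added above delegate to Polars' cloud-aware scan/sink functions. A minimal sketch of that underlying pattern, independent of Flowfile's settings objects (the bucket path and credential values are placeholders):

    import polars as pl

    storage_options = {
        "aws_access_key_id": "...",        # placeholder credentials
        "aws_secret_access_key": "...",
        "aws_region": "eu-west-1",
    }

    # Lazy read, as in _read_parquet_from_cloud
    lf = pl.scan_parquet("s3://example-bucket/data/*.parquet", storage_options=storage_options)

    # Streaming write, as in _write_parquet_to_cloud
    lf.sink_parquet("s3://example-bucket/out/result.parquet", storage_options=storage_options)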
@@ -251,21 +753,32 @@ class FlowDataEngine:

     def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
                                  calculate_schema_stats: bool):
-        """
+        """Finalizes initialization by setting remaining attributes."""
         _ = calculate_schema_stats
         self.name = name
         self._optimize_memory = optimize_memory
-
-
-
+        if assert_if_flowfile_schema(schema):
+            self._schema = schema
+            self.columns = [c.column_name for c in self._schema]
+        else:
+            pl_schema = self.data_frame.collect_schema()
+            self._schema = self._handle_schema(schema, pl_schema)
+            self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()

     def __getitem__(self, item):
-        """
+        """Accesses a specific column or item from the DataFrame."""
         return self.data_frame.select([item])

     @property
-    def data_frame(self) -> pl.LazyFrame | pl.DataFrame:
-        """
+    def data_frame(self) -> pl.LazyFrame | pl.DataFrame | None:
+        """The underlying Polars DataFrame or LazyFrame.
+
+        This property provides access to the Polars object that backs the
+        FlowDataEngine. It handles lazy-loading from external sources if necessary.
+
+        Returns:
+            The active Polars `DataFrame` or `LazyFrame`.
+        """
         if self._data_frame is not None and not self.is_future:
             return self._data_frame
         elif self.is_future:
@@ -284,14 +797,32 @@ class FlowDataEngine:

     @data_frame.setter
     def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
-        """
+        """Sets the underlying Polars DataFrame or LazyFrame."""
         if self.lazy and isinstance(df, pl.DataFrame):
             raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
         self._data_frame = df

+    @staticmethod
+    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+        """Converts a Polars Schema into a list of schema statistics dictionaries."""
+        return [
+            dict(column_name=k, pl_datatype=v, col_index=i)
+            for i, (k, v) in enumerate(pl_schema.items())
+        ]
+
+    def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+        """Populates the schema from a list of schema statistics dictionaries."""
+        self._schema = convert_stats_to_column_info(schema_stats)
+
     @property
     def schema(self) -> List[FlowfileColumn]:
-        """
+        """The schema of the DataFrame as a list of `FlowfileColumn` objects.
+
+        This property lazily calculates the schema if it hasn't been determined yet.
+
+        Returns:
+            A list of `FlowfileColumn` objects describing the schema.
+        """
         if self.number_of_fields == 0:
             return []
         if self._schema is None or (self._calculate_schema_stats and not self.ind_schema_calculated):
@@ -299,26 +830,34 @@ class FlowDataEngine:
             schema_stats = self._calculate_schema()
             self.ind_schema_calculated = True
         else:
-            schema_stats =
-
-                for i, (k, v) in enumerate(self.data_frame.collect_schema().items())
-            ]
-        self._schema = convert_stats_to_column_info(schema_stats)
+            schema_stats = self._create_schema_stats_from_pl_schema(self.data_frame.collect_schema())
+        self._add_schema_from_schema_stats(schema_stats)
         return self._schema

     @property
     def number_of_fields(self) -> int:
-        """
+        """The number of columns (fields) in the DataFrame.
+
+        Returns:
+            The integer count of columns.
+        """
         if self.__number_of_fields is None:
             self.__number_of_fields = len(self.columns)
         return self.__number_of_fields

-    # Data Collection and Sampling Methods
-
     def collect(self, n_records: int = None) -> pl.DataFrame:
-        """
-
-
+        """Collects the data and returns it as a Polars DataFrame.
+
+        This method triggers the execution of the lazy query plan (if applicable)
+        and returns the result. It supports streaming to optimize memory usage
+        for large datasets.
+
+        Args:
+            n_records: The maximum number of records to collect. If None, all
+                records are collected.
+
+        Returns:
+            A Polars `DataFrame` containing the collected data.
         """
         if n_records is None:
             logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
@@ -336,8 +875,9 @@ class FlowDataEngine:
             return self._handle_collection_error(n_records)

     def _collect_data(self, n_records: int = None) -> pl.DataFrame:
-        """Internal method to handle data collection."""
+        """Internal method to handle data collection logic."""
         if n_records is None:
+
             self.collect_external()
         if self._streamable:
             try:
@@ -353,11 +893,11 @@ class FlowDataEngine:
             return self._collect_from_external_source(n_records)

         if self._streamable:
-            return self.data_frame.head(n_records).collect(engine="streaming"
+            return self.data_frame.head(n_records).collect(engine="streaming")
         return self.data_frame.head(n_records).collect()

     def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
-        """
+        """Handles collection from an external source."""
         if self.external_source.get_pl_df() is not None:
             all_data = self.external_source.get_pl_df().head(n_records)
             self.data_frame = all_data
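Note: the collection path above uses Polars' streaming engine with an eager fallback; stripped of Flowfile's bookkeeping, the pattern is roughly as follows (assumes a recent Polars that accepts `engine="streaming"`):

    import polars as pl

    lf = pl.LazyFrame({"x": list(range(10))})
    try:
        df = lf.collect(engine="streaming")  # stream when the plan supports it
    except Exception:
        df = lf.collect()                    # fall back to the default engine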
@@ -367,7 +907,7 @@ class FlowDataEngine:
             return self.data_frame

     def _handle_collection_error(self, n_records: int) -> pl.DataFrame:
-        """
+        """Handles errors during collection by attempting partial collection."""
         n_records = 100000000 if n_records is None else n_records
         ok_cols, error_cols = self._identify_valid_columns(n_records)

@@ -376,7 +916,7 @@ class FlowDataEngine:
             return self._create_empty_dataframe(n_records)

     def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
-        """
+        """Identifies which columns can be collected successfully."""
         ok_cols = []
         error_cols = []
         for c in self.columns:
@@ -389,7 +929,7 @@ class FlowDataEngine:

     def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
                                   n_records: int) -> pl.DataFrame:
-        """
+        """Creates a DataFrame with partial data for columns that could be collected."""
         df = self.data_frame.select(ok_cols)
         df = df.with_columns([
             pl.lit(None).alias(column_name).cast(data_type)
@@ -398,7 +938,7 @@ class FlowDataEngine:
         return df.select(self.columns).head(n_records).collect()

     def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
-        """
+        """Creates an empty DataFrame with the correct schema."""
         if self.number_of_records > 0:
             return pl.DataFrame({
                 column_name: pl.Series(
@@ -409,11 +949,19 @@ class FlowDataEngine:
             })
         return pl.DataFrame(schema=self.data_frame.schema)

-    # Data Transformation Methods
-
     def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
                     calculate_schema_stats: bool = True) -> "FlowDataEngine":
-        """
+        """Performs a group-by operation on the DataFrame.
+
+        Args:
+            group_by_input: A `GroupByInput` object defining the grouping columns
+                and aggregations.
+            calculate_schema_stats: If True, calculates schema statistics for the
+                resulting DataFrame.
+
+        Returns:
+            A new `FlowDataEngine` instance with the grouped and aggregated data.
+        """
         aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
         group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']

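Note: `do_group_by` maps a `GroupByInput` onto a standard Polars group_by/agg; a plain-Polars sketch of the equivalent call (column and alias names are illustrative):

    import polars as pl

    df = pl.DataFrame({"city": ["a", "a", "b"], "sales": [10, 20, 5]})
    out = df.group_by("city").agg(pl.col("sales").sum().alias("sales_sum"))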
@@ -435,7 +983,15 @@ class FlowDataEngine:
         )

     def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
-        """
+        """Sorts the DataFrame by one or more columns.
+
+        Args:
+            sorts: A list of `SortByInput` objects, each specifying a column
+                and sort direction ('asc' or 'desc').
+
+        Returns:
+            A new `FlowDataEngine` instance with the sorted data.
+        """
         if not sorts:
             return self

@@ -445,7 +1001,16 @@ class FlowDataEngine:

     def change_column_types(self, transforms: List[transform_schemas.SelectInput],
                             calculate_schema: bool = False) -> "FlowDataEngine":
-        """
+        """Changes the data type of one or more columns.
+
+        Args:
+            transforms: A list of `SelectInput` objects, where each object specifies
+                the column and its new `polars_type`.
+            calculate_schema: If True, recalculates the schema after the type change.
+
+        Returns:
+            A new `FlowDataEngine` instance with the updated column types.
+        """
         dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
         idx_mapping = list(
             (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
@@ -466,26 +1031,79 @@ class FlowDataEngine:
|
|
|
466
1031
|
streamable=self._streamable
|
|
467
1032
|
)
|
|
468
1033
|
|
|
469
|
-
# Data Export and Conversion Methods
|
|
470
|
-
|
|
471
1034
|
def save(self, path: str, data_type: str = 'parquet') -> Future:
|
|
472
|
-
"""
|
|
1035
|
+
"""Saves the DataFrame to a file in a separate thread.
|
|
1036
|
+
|
|
1037
|
+
Args:
|
|
1038
|
+
path: The file path to save to.
|
|
1039
|
+
data_type: The format to save in (e.g., 'parquet', 'csv').
|
|
1040
|
+
|
|
1041
|
+
Returns:
|
|
1042
|
+
A `loky.Future` object representing the asynchronous save operation.
|
|
1043
|
+
"""
|
|
473
1044
|
estimated_size = deepcopy(self.get_estimated_file_size() * 4)
|
|
474
1045
|
df = deepcopy(self.data_frame)
|
|
475
1046
|
return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
|
|
476
1047
|
|
|
477
1048
|
def to_pylist(self) -> List[Dict]:
|
|
478
|
-
"""
|
|
1049
|
+
"""Converts the DataFrame to a list of Python dictionaries.
|
|
1050
|
+
|
|
1051
|
+
Returns:
|
|
1052
|
+
A list where each item is a dictionary representing a row.
|
|
1053
|
+
"""
|
|
479
1054
|
if self.lazy:
|
|
480
1055
|
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
|
|
481
1056
|
return self.data_frame.to_dicts()
|
|
482
1057
|
|
|
1058
|
+
def to_arrow(self) -> PaTable:
|
|
1059
|
+
"""Converts the DataFrame to a PyArrow Table.
|
|
1060
|
+
|
|
1061
|
+
This method triggers a `.collect()` call if the data is lazy,
|
|
1062
|
+
then converts the resulting eager DataFrame into a `pyarrow.Table`.
|
|
1063
|
+
|
|
1064
|
+
Returns:
|
|
1065
|
+
A `pyarrow.Table` instance representing the data.
|
|
1066
|
+
"""
|
|
1067
|
+
if self.lazy:
|
|
1068
|
+
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_arrow()
|
|
1069
|
+
else:
|
|
1070
|
+
return self.data_frame.to_arrow()
|
|
1071
|
+
|
|
1072
|
+
def to_raw_data(self) -> input_schema.RawData:
|
|
1073
|
+
"""Converts the DataFrame to a `RawData` schema object.
|
|
1074
|
+
|
|
1075
|
+
Returns:
|
|
1076
|
+
An `input_schema.RawData` object containing the schema and data.
|
|
1077
|
+
"""
|
|
1078
|
+
columns = [c.get_minimal_field_info() for c in self.schema]
|
|
1079
|
+
data = list(self.to_dict().values())
|
|
1080
|
+
return input_schema.RawData(columns=columns, data=data)
|
|
1081
|
+
|
|
483
1082
|
def to_dict(self) -> Dict[str, List]:
|
|
484
|
-
|
|
1083
|
+
"""Converts the DataFrame to a Python dictionary of columns.
|
|
1084
|
+
|
|
1085
|
+
Each key in the dictionary is a column name, and the corresponding value
|
|
1086
|
+
is a list of the data in that column.
|
|
1087
|
+
|
|
1088
|
+
Returns:
|
|
1089
|
+
A dictionary mapping column names to lists of their values.
|
|
1090
|
+
"""
|
|
1091
|
+
if self.lazy:
|
|
1092
|
+
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
|
|
1093
|
+
else:
|
|
1094
|
+
return self.data_frame.to_dict(as_series=False)
|
|
485
1095
|
|
|
486
1096
|
@classmethod
|
|
487
1097
|
def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
|
|
488
|
-
"""
|
|
1098
|
+
"""Creates a FlowDataEngine from an external data source.
|
|
1099
|
+
|
|
1100
|
+
Args:
|
|
1101
|
+
external_source: An object that conforms to the `ExternalDataSource`
|
|
1102
|
+
interface.
|
|
1103
|
+
|
|
1104
|
+
Returns:
|
|
1105
|
+
A new `FlowDataEngine` instance.
|
|
1106
|
+
"""
|
|
489
1107
|
if external_source.schema is not None:
|
|
490
1108
|
ff = cls.create_from_schema(external_source.schema)
|
|
491
1109
|
elif external_source.initial_data_getter is not None:
|
|
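The new conversion helpers above (`to_pylist`, `to_arrow`, `to_raw_data`, `to_dict`) all follow the same collect-then-convert pattern for lazy frames. A minimal standalone sketch of that pattern, using only Polars (the frame and column names here are illustrative, not part of Flowfile):

```python
import polars as pl

# Stand-in for self.data_frame; any LazyFrame works.
lf = pl.LazyFrame({"city": ["NY", "LA"], "sales": [10, 20]})

# Lazy frames are collected first; the streaming engine is opt-in, mirroring
# the `engine="streaming" if self._streamable else "auto"` expression above.
eager = lf.collect(engine="streaming")

rows = eager.to_dicts()                    # list of dicts, as in to_pylist()
columns = eager.to_dict(as_series=False)   # dict of lists, as in to_dict()
arrow_table = eager.to_arrow()             # pyarrow.Table, as in to_arrow()
print(rows, columns, arrow_table.num_rows)
```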
@@ -497,12 +1115,27 @@ class FlowDataEngine:

     @classmethod
     def create_from_sql(cls, sql: str, conn: Any) -> "FlowDataEngine":
-        """
+        """Creates a FlowDataEngine by executing a SQL query.
+
+        Args:
+            sql: The SQL query string to execute.
+            conn: A database connection object or connection URI string.
+
+        Returns:
+            A new `FlowDataEngine` instance with the query result.
+        """
         return cls(pl.read_sql(sql, conn))

     @classmethod
     def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
-        """
+        """Creates an empty FlowDataEngine from a schema definition.
+
+        Args:
+            schema: A list of `FlowfileColumn` objects defining the schema.
+
+        Returns:
+            A new, empty `FlowDataEngine` instance with the specified schema.
+        """
         pl_schema = []
         for i, flow_file_column in enumerate(schema):
             pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
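Note that `pl.read_sql` used by `create_from_sql` comes from older Polars releases; on current Polars the equivalent entry point is `pl.read_database`. A hedged sketch of the same idea against an in-memory SQLite database (the table and query are made up, and the acceptability of a DB-API connection here is an assumption):

```python
import sqlite3

import polars as pl

conn = sqlite3.connect(":memory:")  # throwaway example database
conn.execute("CREATE TABLE t (id INTEGER, name TEXT)")
conn.executemany("INSERT INTO t VALUES (?, ?)", [(1, "a"), (2, "b")])

# Roughly what create_from_sql does, expressed against pl.read_database.
df = pl.read_database("SELECT id, name FROM t", connection=conn)
print(df)
```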
@@ -512,9 +1145,18 @@ class FlowDataEngine:

     @classmethod
     def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
-        """
-
+        """Creates a FlowDataEngine from a local file path.
+
+        Supports various file types like CSV, Parquet, and Excel.
+
+        Args:
+            received_table: A `ReceivedTableBase` object containing the file path
+                and format details.

+        Returns:
+            A new `FlowDataEngine` instance with data from the file.
+        """
+        received_table.set_absolute_filepath()
         file_type_handlers = {
             'csv': create_funcs.create_from_path_csv,
             'parquet': create_funcs.create_from_path_parquet,
@@ -531,38 +1173,56 @@ class FlowDataEngine:

     @classmethod
     def create_random(cls, number_of_records: int = 1000) -> "FlowDataEngine":
-        """
+        """Creates a FlowDataEngine with randomly generated data.
+
+        Useful for testing and examples.
+
+        Args:
+            number_of_records: The number of random records to generate.
+
+        Returns:
+            A new `FlowDataEngine` instance with fake data.
+        """
         return cls(create_fake_data(number_of_records))

     @classmethod
     def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
-        """
+        """Generates a FlowDataEngine with a single column containing a sequence of integers.
+
+        Args:
+            length: The number of integers to generate in the sequence.
+            output_name: The name of the output column.
+
+        Returns:
+            A new `FlowDataEngine` instance.
+        """
         if length > 10_000_000:
             length = 10_000_000
         return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))

-
-
-    def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema,
+    def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
                        pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
-        """
-        if schema is None:
+        """Handles schema processing and validation during initialization."""
+        if schema is None and pl_schema is not None:
+            return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
+        elif schema is None and pl_schema is None:
             return None
-
-
-
-
-
-
-
-
-
-
-
-
+        elif assert_if_flowfile_schema(schema) and pl_schema is None:
+            return schema
+        elif pl_schema is not None and schema is not None:
+            if schema.__len__() != pl_schema.__len__():
+                raise Exception(
+                    f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
+            if isinstance(schema, pl.Schema):
+                return self._handle_polars_schema(schema, pl_schema)
+            elif isinstance(schema, list) and len(schema) == 0:
+                return []
+            elif isinstance(schema[0], str):
+                return self._handle_string_schema(schema, pl_schema)
+        return schema

     def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
-        """
+        """Handles Polars schema conversion."""
         flow_file_columns = [
             FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
             for col_name, dtype in zip(schema.names(), schema.dtypes())
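The enumerator construction in `generate_enumerator` is plain Polars and can be run on its own; a small sketch (column name and length are arbitrary):

```python
import polars as pl

length, output_name = 5, "output_column"

# A lazy single-column frame holding 0..length-1 as UInt32, the same
# expression generate_enumerator builds above.
lf = pl.LazyFrame().select(pl.int_range(0, length, dtype=pl.UInt32).alias(output_name))
print(lf.collect())  # 5 rows, one UInt32 column
```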
@@ -577,7 +1237,7 @@ class FlowDataEngine:
         return flow_file_columns

     def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
-        """
+        """Handles string-based schema conversion."""
         flow_file_columns = [
             FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
             for col_name, dtype in zip(schema, pl_schema.dtypes())
@@ -589,10 +1249,19 @@ class FlowDataEngine:

         return flow_file_columns

-    # Data Manipulation Methods
-
     def split(self, split_input: transform_schemas.TextToRowsInput) -> "FlowDataEngine":
-        """
+        """Splits a column's text values into multiple rows based on a delimiter.
+
+        This operation is often referred to as "exploding" the DataFrame, as it
+        increases the number of rows.
+
+        Args:
+            split_input: A `TextToRowsInput` object specifying the column to split,
+                the delimiter, and the output column name.
+
+        Returns:
+            A new `FlowDataEngine` instance with the exploded rows.
+        """
         output_column_name = (
             split_input.output_column_name
             if split_input.output_column_name
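The text-to-rows behaviour described in `split` maps onto Polars' `str.split` plus `explode`; a minimal sketch with made-up column names (the real node is driven by a `TextToRowsInput`):

```python
import polars as pl

lf = pl.LazyFrame({"id": [1, 2], "tags": ["a,b,c", "x"]})

# Split the delimited string into a list column, then explode it so each
# element becomes its own row - the "exploding" the docstring refers to.
exploded = (
    lf.with_columns(pl.col("tags").str.split(",").alias("tag"))
      .explode("tag")
)
print(exploded.collect())  # 4 rows: (1, a), (1, b), (1, c), (2, x)
```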
@@ -617,7 +1286,18 @@ class FlowDataEngine:
         return FlowDataEngine(df)

     def unpivot(self, unpivot_input: transform_schemas.UnpivotInput) -> "FlowDataEngine":
-        """
+        """Converts the DataFrame from a wide to a long format.
+
+        This is the inverse of a pivot operation, taking columns and transforming
+        them into `variable` and `value` rows.
+
+        Args:
+            unpivot_input: An `UnpivotInput` object specifying which columns to
+                unpivot and which to keep as index columns.
+
+        Returns:
+            A new, unpivoted `FlowDataEngine` instance.
+        """
         lf = self.data_frame

         if unpivot_input.data_type_selector_expr is not None:
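A wide-to-long reshape of the kind `unpivot` documents can be sketched directly with Polars' own `unpivot` (formerly `melt`); the column names are illustrative:

```python
import polars as pl

wide = pl.LazyFrame({"id": [1, 2], "q1": [10, 30], "q2": [20, 40]})

# Keep `id` as the index and turn the remaining columns into
# variable/value rows, which is the long format the node produces.
long = wide.unpivot(["q1", "q2"], index="id")
print(long.collect())  # columns: id, variable, value
```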
@@ -636,7 +1316,17 @@ class FlowDataEngine:
         return FlowDataEngine(result)

     def do_pivot(self, pivot_input: transform_schemas.PivotInput, node_logger: NodeLogger = None) -> "FlowDataEngine":
-        """
+        """Converts the DataFrame from a long to a wide format, aggregating values.
+
+        Args:
+            pivot_input: A `PivotInput` object defining the index, pivot, and value
+                columns, along with the aggregation logic.
+            node_logger: An optional logger for reporting warnings, e.g., if the
+                pivot column has too many unique values.
+
+        Returns:
+            A new, pivoted `FlowDataEngine` instance.
+        """
         # Get unique values for pivot columns
         max_unique_vals = 200
         new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
@@ -696,7 +1386,16 @@ class FlowDataEngine:
         return FlowDataEngine(df, calculate_schema_stats=False)

     def do_filter(self, predicate: str) -> "FlowDataEngine":
-        """
+        """Filters rows based on a predicate expression.
+
+        Args:
+            predicate: A string containing a Polars expression that evaluates to
+                a boolean value.
+
+        Returns:
+            A new `FlowDataEngine` instance containing only the rows that match
+            the predicate.
+        """
         try:
             f = to_expr(predicate)
         except Exception as e:
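Internally the predicate string is parsed into a Polars expression (`to_expr`) and handed to `filter`; a reduced sketch using an expression directly (the string-to-expression parsing itself is Flowfile-specific):

```python
import polars as pl

lf = pl.LazyFrame({"amount": [5, 50, 500], "country": ["NL", "US", "NL"]})

# The predicate must evaluate to a boolean column; malformed expressions are
# exactly what the try/except around to_expr in do_filter guards against.
predicate = (pl.col("amount") > 10) & (pl.col("country") == "NL")
print(lf.filter(predicate).collect())  # single row: amount=500, country=NL
```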
@@ -706,13 +1405,24 @@ class FlowDataEngine:
         return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)

     def add_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
-        """
+        """Adds a record ID (row number) column to the DataFrame.
+
+        Can generate a simple sequential ID or a grouped ID that resets for
+        each group.
+
+        Args:
+            record_id_settings: A `RecordIdInput` object specifying the output
+                column name, offset, and optional grouping columns.
+
+        Returns:
+            A new `FlowDataEngine` instance with the added record ID column.
+        """
         if record_id_settings.group_by and len(record_id_settings.group_by_columns) > 0:
             return self._add_grouped_record_id(record_id_settings)
         return self._add_simple_record_id(record_id_settings)

     def _add_grouped_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
-        """
+        """Adds a record ID column with grouping."""
         select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]

         df = (
@@ -732,7 +1442,7 @@ class FlowDataEngine:
         return FlowDataEngine(df, schema=output_schema)

     def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
-        """
+        """Adds a simple sequential record ID column."""
         df = self.data_frame.with_row_index(
             record_id_settings.output_column_name,
             record_id_settings.offset
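Both record-id variants rest on standard Polars primitives: `with_row_index` for the simple sequential case and a windowed cumulative count for the grouped case. A sketch under that assumption (column names are invented; the actual grouped implementation may differ in detail):

```python
import polars as pl

df = pl.DataFrame({"group": ["a", "a", "b"], "value": [10, 20, 30]})

# Simple sequential id, starting at 1 (the `offset` argument above).
simple = df.with_row_index("record_id", offset=1)

# A grouped id that restarts per group - one way to get the behaviour
# _add_grouped_record_id describes.
grouped = df.with_columns(pl.col("value").cum_count().over("group").alias("record_id"))
print(simple)
print(grouped)
```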
@@ -743,38 +1453,52 @@ class FlowDataEngine:

         return FlowDataEngine(df, schema=output_schema)

-    # Utility Methods
-
     def get_schema_column(self, col_name: str) -> FlowfileColumn:
-        """
+        """Retrieves the schema information for a single column by its name.
+
+        Args:
+            col_name: The name of the column to retrieve.
+
+        Returns:
+            A `FlowfileColumn` object for the specified column, or `None` if not found.
+        """
         for s in self.schema:
             if s.name == col_name:
                 return s

     def get_estimated_file_size(self) -> int:
-        """
+        """Estimates the file size in bytes if the data originated from a local file.
+
+        This relies on the original path being tracked during file ingestion.
+
+        Returns:
+            The file size in bytes, or 0 if the original path is unknown.
+        """
         if self._org_path is not None:
             return os.path.getsize(self._org_path)
         return 0

     def __repr__(self) -> str:
-        """
-        return f'
+        """Returns a string representation of the FlowDataEngine."""
+        return f'flow data engine\n{self.data_frame.__repr__()}'

     def __call__(self) -> "FlowDataEngine":
-        """
+        """Makes the class instance callable, returning itself."""
         return self

     def __len__(self) -> int:
-        """
+        """Returns the number of records in the table."""
         return self.number_of_records if self.number_of_records >= 0 else self.get_number_of_records()

     def cache(self) -> "FlowDataEngine":
-        """
-
+        """Caches the current DataFrame to disk and updates the internal reference.
+
+        This triggers a background process to write the current LazyFrame's result
+        to a temporary file. Subsequent operations on this `FlowDataEngine` instance
+        will read from the cached file, which can speed up downstream computations.

         Returns:
-            FlowDataEngine
+            The same `FlowDataEngine` instance, now backed by the cached data.
         """
         edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
                                 flow_id=-1,
@@ -789,7 +1513,13 @@ class FlowDataEngine:
         return self

     def collect_external(self):
-        """
+        """Materializes data from a tracked external source.
+
+        If the `FlowDataEngine` was created from an `ExternalDataSource`, this
+        method will trigger the data retrieval, update the internal `_data_frame`
+        to a `LazyFrame` of the collected data, and reset the schema to be
+        re-evaluated.
+        """
         if self._external_source is not None:
             logger.info('Collecting external source')
             if self.external_source.get_pl_df() is not None:
@@ -798,16 +1528,16 @@ class FlowDataEngine:
                 self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
             self._schema = None  # enforce reset schema

-    # Data Access Methods
     def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
-        """
-
+        """Gets a sample of the data as a list of dictionaries.
+
+        This is typically used to display a preview of the data in a UI.

         Args:
-            n_rows:
+            n_rows: The number of rows to sample.

         Returns:
-
+            A list of dictionaries, where each dictionary represents a row.
         """
         if self.number_of_records > n_rows or self.number_of_records < 0:
             df = self.collect(n_rows)
@@ -816,6 +1546,7 @@ class FlowDataEngine:
         return df.to_dicts()

     def __get_sample__(self, n_rows: int = 100, streamable: bool = True) -> "FlowDataEngine":
+        """Internal method to get a sample of the data."""
         if not self.lazy:
             df = self.data_frame.lazy()
         else:
@@ -833,19 +1564,18 @@ class FlowDataEngine:

     def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
                    seed: int = None) -> "FlowDataEngine":
-        """
-        Get a sample of rows from the DataFrame.
+        """Gets a sample of rows from the DataFrame.

         Args:
-            n_rows:
-            random:
-            shuffle:
-            seed:
+            n_rows: The number of rows to sample.
+            random: If True, performs random sampling. If False, takes the first n_rows.
+            shuffle: If True (and `random` is True), shuffles the data before sampling.
+            seed: A random seed for reproducibility.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance containing the sampled data.
         """
-        n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=
+        n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
         logging.info(f'Getting sample of {n_rows} rows')

         if random:
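The `random`/`shuffle` combinations documented for `get_sample` correspond to `DataFrame.sample` in Polars, which is an eager operation; a small sketch that collects first (row counts are arbitrary):

```python
import polars as pl

df = pl.LazyFrame({"x": list(range(100))}).collect()

head_sample = df.head(10)                                   # random=False path
random_sample = df.sample(n=10, seed=42)                    # random=True
shuffled = df.sample(fraction=1.0, shuffle=True, seed=42)   # shuffle the full frame
print(len(head_sample), len(random_sample), len(shuffled))  # 10 10 100
```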
@@ -869,31 +1599,30 @@ class FlowDataEngine:
         return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)

     def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
-        """
-        Get a subset of rows from the DataFrame.
+        """Gets the first `n_rows` from the DataFrame.

         Args:
-            n_rows:
+            n_rows: The number of rows to include in the subset.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance containing the subset of data.
         """
         if not self.lazy:
             return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
         else:
             return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)

-
-
-        """
-        Iterate over the DataFrame in batches.
+    def iter_batches(self, batch_size: int = 1000,
+                     columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
+        """Iterates over the DataFrame in batches.

         Args:
-            batch_size:
-            columns:
+            batch_size: The size of each batch.
+            columns: A list of column names to include in the batches. If None,
+                all columns are included.

         Yields:
-            FlowDataEngine
+            A `FlowDataEngine` instance for each batch.
         """
         if columns:
             self.data_frame = self.data_frame.select(columns)
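Batch iteration over a LazyFrame reduces to repeated `slice` + `collect`; a minimal generator sketch under that assumption (batch size and data are arbitrary, and this is not the exact Flowfile implementation):

```python
import polars as pl

def iter_lazy_batches(lf: pl.LazyFrame, batch_size: int = 1000):
    """Yield eager DataFrames of at most batch_size rows from a LazyFrame."""
    offset = 0
    while True:
        batch = lf.slice(offset, batch_size).collect()
        if batch.height == 0:
            break
        yield batch
        offset += batch_size

lf = pl.LazyFrame({"x": list(range(25))})
for batch in iter_lazy_batches(lf, batch_size=10):
    print(batch.height)  # 10, 10, 5
```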
@@ -905,17 +1634,21 @@ class FlowDataEngine:
     def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
                          other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
                          node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
-        """
-
+        """Starts a fuzzy join operation in a background process.
+
+        This method prepares the data and initiates the fuzzy matching in a
+        separate process, returning a tracker object immediately.

         Args:
-            fuzzy_match_input:
-            other:
-            file_ref:
-            flow_id:
-            node_id:
+            fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
+            other: The right `FlowDataEngine` to join with.
+            file_ref: A reference string for temporary files.
+            flow_id: The flow ID for tracking.
+            node_id: The node ID for tracking.
+
         Returns:
-
+            An `ExternalFuzzyMatchFetcher` object that can be used to track the
+            progress and retrieve the result of the fuzzy join.
         """
         left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
                                                     fuzzy_match_input=fuzzy_match_input)
@@ -929,17 +1662,19 @@ class FlowDataEngine:
     def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
                       other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
                       node_id: int | str = -1) -> "FlowDataEngine":
-        """
-
+        """Performs a fuzzy join with another DataFrame.
+
+        This method blocks until the fuzzy join operation is complete.

         Args:
-            fuzzy_match_input:
-            other:
-            file_ref:
-            flow_id:
-            node_id:
+            fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
+            other: The right `FlowDataEngine` to join with.
+            file_ref: A reference string for temporary files.
+            flow_id: The flow ID for tracking.
+            node_id: The node ID for tracking.
+
         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the result of the fuzzy join.
         """
         left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
                                                     fuzzy_match_input=fuzzy_match_input)
@@ -953,18 +1688,19 @@ class FlowDataEngine:

     def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
                     fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
-        """
-
+        """Performs a simple fuzzy match between two DataFrames on a single column pair.
+
+        This is a convenience method for a common fuzzy join scenario.

         Args:
-            right:
-            left_on:
-            right_on:
-            fuzzy_method:
-            threshold:
+            right: The right `FlowDataEngine` to match against.
+            left_on: The column name from the left DataFrame to match on.
+            right_on: The column name from the right DataFrame to match on.
+            fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
+            threshold: The similarity score threshold (0.0 to 1.0) for a match.

         Returns:
-
+            A new `FlowDataEngine` with the matched data.
         """
         fuzzy_match_input = transform_schemas.FuzzyMatchInput(
             [transform_schemas.FuzzyMap(
@@ -980,29 +1716,28 @@ class FlowDataEngine:
     def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
                       auto_generate_selection: bool, verify_integrity: bool,
                       other: "FlowDataEngine") -> "FlowDataEngine":
-        """
-
+        """Performs a cross join with another DataFrame.
+
+        A cross join produces the Cartesian product of the two DataFrames.

         Args:
-            cross_join_input:
-            auto_generate_selection:
-            verify_integrity:
-            other:
+            cross_join_input: A `CrossJoinInput` object specifying column selections.
+            auto_generate_selection: If True, automatically renames columns to avoid conflicts.
+            verify_integrity: If True, checks if the resulting join would be too large.
+            other: The right `FlowDataEngine` to join with.

         Returns:
-
+            A new `FlowDataEngine` with the result of the cross join.

         Raises:
-            Exception: If join would result in
+            Exception: If `verify_integrity` is True and the join would result in
+                an excessively large number of records.
         """
         self.lazy = True
         other.lazy = True

         verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)

-        # if auto_generate_selection:
-        #     cross_join_input.auto_rename()
-
         right_select = [v.old_name for v in cross_join_input.right_select.renames
                         if (v.keep or v.join_key) and v.is_available]
         left_select = [v.old_name for v in cross_join_input.left_select.renames
@@ -1034,37 +1769,32 @@ class FlowDataEngine:

     def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
              verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
-        """
-
+        """Performs a standard SQL-style join with another DataFrame.
+
+        Supports various join types like 'inner', 'left', 'right', 'outer', 'semi', and 'anti'.

         Args:
-            join_input:
-
-
-
+            join_input: A `JoinInput` object defining the join keys, join type,
+                and column selections.
+            auto_generate_selection: If True, automatically handles column renaming.
+            verify_integrity: If True, performs checks to prevent excessively large joins.
+            other: The right `FlowDataEngine` to join with.

         Returns:
-
+            A new `FlowDataEngine` with the joined data.

         Raises:
-            Exception: If join
+            Exception: If the join configuration is invalid or if `verify_integrity`
+                is True and the join is predicted to be too large.
         """
-
-        # other.lazy = False if join_input.how == 'right' else True
-
+        ensure_right_unselect_for_semi_and_anti_joins(join_input)
         verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
         if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
             raise Exception('Join is not valid by the data fields')
         if auto_generate_selection:
             join_input.auto_rename()
-
-
-            if (v.keep or v.join_key) and v.is_available]
-        left_select = [v.old_name for v in join_input.left_select.renames
-            if (v.keep or v.join_key) and v.is_available]
-        left = self.data_frame.select(left_select).rename(join_input.left_select.rename_table)
-        right = other.data_frame.select(right_select).rename(join_input.right_select.rename_table)
-
+        left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
+        right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
         if verify_integrity and join_input.how != 'right':
             n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
                                        right_on_keys=join_input.right_join_keys, how=join_input.how)
@@ -1072,37 +1802,55 @@ class FlowDataEngine:
                 raise Exception("Join will result in too many records, ending process")
         else:
             n_records = -1
+        left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
+        left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
         if join_input.how == 'right':
-
-
-
+            joined_df = right.join(
+                other=left,
+                left_on=join_input.right_join_keys,
+                right_on=join_input.left_join_keys,
+                how="left",
+                suffix="").rename(reverse_join_key_mapping)
         else:
-            joined_df = left.join(
-
-
-
-
-
-
-
+            joined_df = left.join(
+                other=right,
+                left_on=join_input.left_join_keys,
+                right_on=join_input.right_join_keys,
+                how=join_input.how,
+                suffix="").rename(reverse_join_key_mapping)
+        left_cols_to_delete_after = [get_col_name_to_delete(col, 'left') for col in join_input.left_select.renames
+                                     if not col.keep
+                                     and col.is_available and col.join_key
+                                     ]
+        right_cols_to_delete_after = [get_col_name_to_delete(col, 'right') for col in join_input.right_select.renames
+                                      if not col.keep
+                                      and col.is_available and col.join_key
+                                      and join_input.how in ("left", "right", "inner", "cross", "outer")
+                                      ]
+        if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
+            joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
+        undo_join_key_remapping = get_undo_rename_mapping_join(join_input)
+        joined_df = joined_df.rename(undo_join_key_remapping)
+
         if verify_integrity:
             return FlowDataEngine(joined_df, calculate_schema_stats=True,
-
+                                  number_of_records=n_records, streamable=False)
         else:
             fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
-
+                                number_of_records=0, streamable=False)
             return fl

-    # Graph Operations
     def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
-        """
-
+        """Solves a graph problem represented by 'from' and 'to' columns.
+
+        This is used for operations like finding connected components in a graph.

         Args:
-            graph_solver_input:
+            graph_solver_input: A `GraphSolverInput` object defining the source,
+                destination, and output column names.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the solved graph data.
         """
         lf = self.data_frame.with_columns(
             graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
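The reworked `join` above handles `how='right'` by swapping the two sides and issuing a left join, then renaming the keys back. The swap itself is plain Polars and can be shown in isolation (the frames are made up; `_handle_duplication_join_keys` and the rename helpers are Flowfile-internal):

```python
import polars as pl

left = pl.LazyFrame({"key": [1, 2], "l_val": ["a", "b"]})
right = pl.LazyFrame({"key": [2, 3], "r_val": ["x", "y"]})

# A right join of `left` with `right` keeps every row of `right`; the same
# rows come out of a left join with the operands swapped, which is the trick
# the how == 'right' branch relies on (column order aside).
emulated_right_join = right.join(left, on="key", how="left")
print(emulated_right_join.collect())  # keys 2 and 3
```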
@@ -1110,48 +1858,48 @@ class FlowDataEngine:
         )
         return FlowDataEngine(lf)

-    # Data Modification Methods
     def add_new_values(self, values: Iterable, col_name: str = None) -> "FlowDataEngine":
-        """
-        Add a new column with specified values.
+        """Adds a new column with the provided values.

         Args:
-            values:
-            col_name:
+            values: An iterable (e.g., list, tuple) of values to add as a new column.
+            col_name: The name for the new column. Defaults to 'new_values'.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the added column.
         """
         if col_name is None:
             col_name = 'new_values'
         return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))

     def get_record_count(self) -> "FlowDataEngine":
-        """
-
+        """Returns a new FlowDataEngine with a single column 'number_of_records'
+        containing the total number of records.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance.
         """
         return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))

     def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
-        """
-
+        """Asserts that this DataFrame is equal to another.
+
+        Useful for testing.

         Args:
-            other:
-            ordered:
-            strict_schema:
+            other: The other `FlowDataEngine` to compare with.
+            ordered: If True, the row order must be identical.
+            strict_schema: If True, the data types of the schemas must be identical.

         Raises:
-            Exception: If DataFrames are not equal
+            Exception: If the DataFrames are not equal based on the specified criteria.
         """
         org_laziness = self.lazy, other.lazy
         self.lazy = False
         other.lazy = False
         self.number_of_records = -1
         other.number_of_records = -1
+        other = other.select_columns(self.columns)

         if self.get_number_of_records() != other.get_number_of_records():
             raise Exception('Number of records is not equal')
@@ -1172,14 +1920,14 @@ class FlowDataEngine:
         self.lazy, other.lazy = org_laziness
         assert self_lf.equals(other_lf), 'Data is not equal'

-    # Initialization Methods
     def initialize_empty_fl(self):
-        """
+        """Initializes an empty LazyFrame."""
         self.data_frame = pl.LazyFrame()
         self.number_of_records = 0
         self._lazy = True

     def _calculate_number_of_records_in_worker(self) -> int:
+        """Calculates the number of records in a worker process."""
         number_of_records = ExternalDfFetcher(
             lf=self.data_frame,
             operation_type="calculate_number_of_records",
@@ -1191,18 +1939,20 @@ class FlowDataEngine:

     def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
                               calculate_in_worker_process: bool = False) -> int:
-        """
-
+        """Gets the total number of records in the DataFrame.
+
+        For lazy frames, this may trigger a full data scan, which can be expensive.

         Args:
-            warn:
-            force_calculate:
-            calculate_in_worker_process:
+            warn: If True, logs a warning if a potentially expensive calculation is triggered.
+            force_calculate: If True, forces recalculation even if a value is cached.
+            calculate_in_worker_process: If True, offloads the calculation to a worker process.
+
         Returns:
-
+            The total number of records.

         Raises:
-
+            ValueError: If the number of records could not be determined.
         """
         if self.is_future and not self.is_collected:
             return -1
@@ -1213,37 +1963,39 @@ class FlowDataEngine:

         if self.lazy:
             if calculate_in_worker_process:
-                self.number_of_records = self._calculate_number_of_records_in_worker()
-            else:
-                if warn:
-                    logger.warning('Calculating the number of records this can be expensive on a lazy frame')
                 try:
-                    self.number_of_records = self.
-
-                except Exception:
-
+                    self.number_of_records = self._calculate_number_of_records_in_worker()
+                    return self.number_of_records
+                except Exception as e:
+                    logger.error(f"Error: {e}")
+            if warn:
+                logger.warning('Calculating the number of records this can be expensive on a lazy frame')
+            try:
+                self.number_of_records = self.data_frame.select(pl.len()).collect(
+                    engine="streaming" if self._streamable else "auto")[0, 0]
+            except Exception:
+                raise ValueError('Could not get number of records')
         else:
             self.number_of_records = self.data_frame.__len__()
         return self.number_of_records

-    # Properties
     @property
     def has_errors(self) -> bool:
-        """
+        """Checks if there are any errors."""
         return len(self.errors) > 0

     @property
     def lazy(self) -> bool:
-        """
+        """Indicates if the DataFrame is in lazy mode."""
         return self._lazy

     @lazy.setter
     def lazy(self, exec_lazy: bool = False):
-        """
-        Set the laziness of the DataFrame.
+        """Sets the laziness of the DataFrame.

         Args:
-            exec_lazy:
+            exec_lazy: If True, converts the DataFrame to a LazyFrame. If False,
+                collects the data and converts it to an eager DataFrame.
         """
         if exec_lazy != self._lazy:
             if exec_lazy:
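The lazy fallback in `get_number_of_records` is the standard Polars counting idiom; as a standalone sketch:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3, 4]})

# Count rows without materialising the full frame: select pl.len(), collect
# the one-cell result, and index it out - the same shape as the fallback above.
n = lf.select(pl.len()).collect()[0, 0]
print(n)  # 4
```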
@@ -1259,42 +2011,40 @@ class FlowDataEngine:

     @property
     def external_source(self) -> ExternalDataSource:
-        """
+        """The external data source, if any."""
         return self._external_source

     @property
     def cols_idx(self) -> Dict[str, int]:
-        """
+        """A dictionary mapping column names to their integer index."""
         if self._col_idx is None:
             self._col_idx = {c: i for i, c in enumerate(self.columns)}
         return self._col_idx

     @property
     def __name__(self) -> str:
-        """
+        """The name of the table."""
         return self.name

-    # Schema and Column Operations
     def get_select_inputs(self) -> transform_schemas.SelectInputs:
-        """
-        Get select inputs for all columns.
+        """Gets `SelectInput` specifications for all columns in the current schema.

         Returns:
-            SelectInputs
+            A `SelectInputs` object that can be used to configure selection or
+            transformation operations.
         """
         return transform_schemas.SelectInputs(
             [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
         )

     def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
-        """
-        Select specific columns from the DataFrame.
+        """Selects a subset of columns from the DataFrame.

         Args:
-            list_select:
+            list_select: A list, tuple, or single string of column names to select.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance containing only the selected columns.
         """
         if isinstance(list_select, str):
             list_select = [list_select]
@@ -1311,14 +2061,13 @@ class FlowDataEngine:
         )

     def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
-        """
-        Drop specified columns from the DataFrame.
+        """Drops specified columns from the DataFrame.

         Args:
-            columns:
+            columns: A list of column names to drop.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance without the dropped columns.
         """
         cols_for_select = tuple(set(self.columns) - set(columns))
         idx_to_keep = [self.cols_idx.get(c) for c in cols_for_select]
@@ -1331,14 +2080,13 @@ class FlowDataEngine:
         )

     def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
-        """
-        Reorganize columns in specified order.
+        """Reorganizes columns into a specified order.

         Args:
-            column_order:
+            column_order: A list of column names in the desired order.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the columns reordered.
         """
         df = self.data_frame.select(column_order)
         schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
@@ -1346,16 +2094,15 @@ class FlowDataEngine:

     def apply_flowfile_formula(self, func: str, col_name: str,
                                output_data_type: pl.DataType = None) -> "FlowDataEngine":
-        """
-        Apply a formula to create a new column.
+        """Applies a formula to create a new column or transform an existing one.

         Args:
-            func:
-            col_name:
-            output_data_type:
+            func: A string containing a Polars expression formula.
+            col_name: The name of the new or transformed column.
+            output_data_type: The desired Polars data type for the output column.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the applied formula.
         """
         parsed_func = to_expr(func)
         if output_data_type is not None:
@@ -1367,16 +2114,15 @@ class FlowDataEngine:

     def apply_sql_formula(self, func: str, col_name: str,
                           output_data_type: pl.DataType = None) -> "FlowDataEngine":
-        """
-        Apply an SQL-style formula to create a new column.
+        """Applies an SQL-style formula using `pl.sql_expr`.

         Args:
-            func:
-            col_name:
-            output_data_type:
+            func: A string containing an SQL expression.
+            col_name: The name of the new or transformed column.
+            output_data_type: The desired Polars data type for the output column.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the applied formula.
         """
         expr = to_expr(func)
         if output_data_type not in (None, "Auto"):
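Since `apply_sql_formula` is documented as going through `pl.sql_expr`, the core of it can be exercised on its own; column and alias names here are invented:

```python
import polars as pl

lf = pl.LazyFrame({"price": [10.0, 20.0], "qty": [3, 4]})

# Parse an SQL fragment into a Polars expression and use it like any other
# expression; the cast mirrors the optional output_data_type handling above.
expr = pl.sql_expr("price * qty").alias("revenue").cast(pl.Float64)
print(lf.with_columns(expr).collect())
```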
@@ -1388,16 +2134,18 @@ class FlowDataEngine:

     def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
                execute_remote: bool = True) -> "FlowDataEngine":
-        """
-
+        """Writes the DataFrame to an output file.
+
+        Can execute the write operation locally or in a remote worker process.

         Args:
-            output_fs:
-            flow_id:
-            node_id:
-            execute_remote: If
+            output_fs: An `OutputSettings` object with details about the output file.
+            flow_id: The flow ID for tracking.
+            node_id: The node ID for tracking.
+            execute_remote: If True, executes the write in a worker process.
+
         Returns:
-            FlowDataEngine
+            The same `FlowDataEngine` instance for chaining.
         """
         logger.info('Starting to write output')
         if execute_remote:
@@ -1429,30 +2177,28 @@ class FlowDataEngine:
         logger.info("Finished writing output")
         return self

-    # Data Operations
     def make_unique(self, unique_input: transform_schemas.UniqueInput = None) -> "FlowDataEngine":
-        """
-        Get unique rows based on specified columns.
+        """Gets the unique rows from the DataFrame.

         Args:
-            unique_input:
+            unique_input: A `UniqueInput` object specifying a subset of columns
+                to consider for uniqueness and a strategy for keeping rows.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with unique rows.
         """
         if unique_input is None or unique_input.columns is None:
             return FlowDataEngine(self.data_frame.unique())
         return FlowDataEngine(self.data_frame.unique(unique_input.columns, keep=unique_input.strategy))

     def concat(self, other: Iterable["FlowDataEngine"] | "FlowDataEngine") -> "FlowDataEngine":
-        """
-        Concatenate with other DataFrames.
+        """Concatenates this DataFrame with one or more other DataFrames.

         Args:
-            other:
+            other: A single `FlowDataEngine` or an iterable of them.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` containing the concatenated data.
         """
         if isinstance(other, FlowDataEngine):
             other = [other]
@@ -1462,15 +2208,15 @@ class FlowDataEngine:

     def do_select(self, select_inputs: transform_schemas.SelectInputs,
                   keep_missing: bool = True) -> "FlowDataEngine":
-        """
-        Perform complex column selection and transformation.
+        """Performs a complex column selection, renaming, and reordering operation.

         Args:
-            select_inputs:
-            keep_missing:
+            select_inputs: A `SelectInputs` object defining the desired transformations.
+            keep_missing: If True, columns not specified in `select_inputs` are kept.
+                If False, they are dropped.

         Returns:
-
+            A new `FlowDataEngine` with the transformed selection.
         """
         new_schema = deepcopy(self.schema)
         renames = [r for r in select_inputs.renames if r.is_available]
@@ -1506,29 +2252,29 @@ class FlowDataEngine:
         output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
         return output_file.reorganize_order(sorted_cols)

-    # Utility Methods
     def set_streamable(self, streamable: bool = False):
-        """
+        """Sets whether DataFrame operations should be streamable."""
         self._streamable = streamable

     def _calculate_schema(self) -> List[Dict]:
-        """
+        """Calculates schema statistics."""
         if self.external_source is not None:
             self.collect_external()
         v = utils.calculate_schema(self.data_frame)
         return v

     def calculate_schema(self):
-        """
+        """Calculates and returns the schema."""
         self._calculate_schema_stats = True
         return self.schema

     def count(self) -> int:
-        """
+        """Gets the total number of records."""
         return self.get_number_of_records()

     @classmethod
     def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
+        """Creates a FlowDataEngine from a path in a worker process."""
         received_table.set_absolute_filepath()
         external_fetcher = ExternalCreateFetcher(received_table=received_table,
                                                  file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
@@ -1536,14 +2282,19 @@ class FlowDataEngine:


 def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
-    """
-
+    """Executes arbitrary Polars code on one or more FlowDataEngine objects.
+
+    This function takes a string of Python code that uses Polars and executes it.
+    Input `FlowDataEngine` objects are made available in the code's scope as
+    `input_df` (for a single input) or `input_df_1`, `input_df_2`, etc.

     Args:
-
+        *flowfile_tables: A variable number of `FlowDataEngine` objects to be
+            used as input to the code.
+        code: A string containing the Polars code to execute.

     Returns:
-        FlowDataEngine
+        A new `FlowDataEngine` instance containing the result of the executed code.
     """
     polars_executable = polars_code_parser.get_executable(code, num_inputs=len(flowfile_tables))
     if len(flowfile_tables) == 0:
@@ -1555,4 +2306,4 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
     df = polars_executable(**kwargs)
     if isinstance(df, pl.DataFrame):
         logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
-    return FlowDataEngine(df)
+    return FlowDataEngine(df)
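The contract described in the `execute_polars_code` docstring (inputs visible as `input_df`, or `input_df_1`, `input_df_2`, ...) can be illustrated with a plain exec-based evaluator. This is an illustrative stand-in, not the actual `polars_code_parser`; in particular the `output_df` result variable is an assumption made for the sketch:

```python
import polars as pl

def run_polars_snippet(code: str, *frames: pl.LazyFrame) -> pl.LazyFrame:
    """Evaluate `code` with inputs bound the way the docstring describes."""
    scope: dict = {"pl": pl}
    if len(frames) == 1:
        scope["input_df"] = frames[0]
    else:
        for i, frame in enumerate(frames, start=1):
            scope[f"input_df_{i}"] = frame
    exec(code, scope)          # the snippet leaves its result in output_df
    return scope["output_df"]  # assumed convention for this sketch

lf = pl.LazyFrame({"x": [1, 2, 3]})
result = run_polars_snippet("output_df = input_df.with_columns(pl.col('x') * 2)", lf)
print(result.collect())
```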