Flowfile 0.3.1.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +2 -1
- flowfile/api.py +5 -3
- flowfile/web/__init__.py +3 -0
- flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
- flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
- flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
- flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
- flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
- flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
- flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
- flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
- flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
- flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
- flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
- flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
- flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
- flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
- flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
- flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
- flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
- flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
- flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
- flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
- flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
- flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
- flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
- flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
- flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
- flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
- flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
- flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
- flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
- flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
- flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
- flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
- flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
- flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
- flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
- flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -3
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +97 -88
- flowfile_core/configs/__init__.py +15 -4
- flowfile_core/configs/node_store/nodes.py +2 -4
- flowfile_core/configs/settings.py +5 -3
- flowfile_core/configs/utils.py +18 -0
- flowfile_core/flowfile/FlowfileFlow.py +84 -29
- flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +55 -18
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +34 -2
- flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
- flowfile_core/flowfile/flow_graph_utils.py +320 -0
- flowfile_core/flowfile/flow_node/flow_node.py +2 -1
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
- flowfile_core/flowfile/utils.py +34 -3
- flowfile_core/main.py +2 -3
- flowfile_core/routes/secrets.py +1 -1
- flowfile_core/schemas/input_schema.py +12 -14
- flowfile_core/schemas/transform_schema.py +25 -47
- flowfile_frame/__init__.py +11 -4
- flowfile_frame/adding_expr.py +280 -0
- flowfile_frame/config.py +9 -0
- flowfile_frame/expr.py +301 -83
- flowfile_frame/expr.pyi +2174 -0
- flowfile_frame/expr_name.py +258 -0
- flowfile_frame/flow_frame.py +616 -627
- flowfile_frame/flow_frame.pyi +336 -0
- flowfile_frame/flow_frame_methods.py +617 -0
- flowfile_frame/group_frame.py +89 -42
- flowfile_frame/join.py +1 -2
- flowfile_frame/lazy.py +704 -0
- flowfile_frame/lazy_methods.py +201 -0
- flowfile_frame/list_name_space.py +324 -0
- flowfile_frame/selectors.py +3 -0
- flowfile_frame/series.py +70 -0
- flowfile_frame/utils.py +80 -4
- flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
- flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
- /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
- /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
from functools import wraps
|
|
3
|
+
from typing import Optional, TypeVar, Type, Callable
|
|
4
|
+
from flowfile_frame.utils import _get_function_source
|
|
5
|
+
from flowfile_frame.config import logger
|
|
6
|
+
|
|
7
|
+
T = TypeVar('T')
|
|
8
|
+
FlowFrameT = TypeVar('FlowFrameT', bound='FlowFrame')
|
|
9
|
+
|
|
10
|
+
PASSTHROUGH_METHODS = {
|
|
11
|
+
'collect', 'collect_async', 'profile', 'describe', 'explain',
|
|
12
|
+
'show_graph', 'fetch', 'collect_schema', 'columns', 'dtypes',
|
|
13
|
+
'schema', 'width', 'estimated_size', 'n_chunks', 'is_empty',
|
|
14
|
+
'chunk_lengths', 'get_meta'
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_lazyframe_method_wrapper(method_name: str, original_method: Callable) -> Callable:
    """
    Creates a wrapper for a LazyFrame method that properly integrates with FlowFrame.

    The wrapper renders the call as an ``input_df.<method_name>(...)`` code
    snippet, registers that snippet through ``self._add_polars_code`` and then
    either returns a child frame (methods treated as LazyFrame-returning) or
    the direct result of calling the method on ``self.data``.

    Parameters
    ----------
    method_name : str
        Name of the LazyFrame method.
    original_method : Callable
        The original LazyFrame method; only used for ``functools.wraps`` metadata.

    Returns
    -------
    Callable
        A wrapper method appropriate for FlowFrame. It accepts an extra
        keyword-only ``description`` argument used as the node description.
    """
    # Determine if the original method returns a LazyFrame based on known method names
    lazyframe_returning_methods = {
        "drop", "select", "with_columns", "sort", "filter", "join", "head", "tail",
        "limit", "drop_nulls", "fill_null", "with_row_index", "group_by", "explode",
        "unique", "slice", "shift", "reverse", "max", "min", "sum", "mean", "median",
        "std", "var", "drop_nans", "fill_nan", "interpolate", "null_count", "quantile",
        "unpivot", "melt", "first", "last"
    }

    non_lazyframe_methods = {
        "collect", "collect_schema", "fetch", "columns", "dtypes", "schema", "width",
        "describe", "explain", "profile", "show_graph"
    }

    # Unknown public methods are assumed to return a LazyFrame.
    returns_lazyframe = (
        method_name in lazyframe_returning_methods or
        (method_name not in non_lazyframe_methods and not method_name.startswith("_"))
    )

    def _represent_argument(value, function_sources: list) -> str:
        """Return the code text for one argument, recording the source of named functions."""
        if not callable(value) or isinstance(value, type):
            return repr(value)
        try:
            source, _is_module_level = _get_function_source(value)
        except Exception:
            # Best effort: fall back to repr when the source cannot be retrieved.
            return repr(value)
        if source and hasattr(value, '__name__') and value.__name__ != '<lambda>':
            function_sources.append(source)
            # Use the function name in the representation
            return value.__name__
        return repr(value)

    @wraps(original_method)
    def wrapper(self, *args, description: Optional[str] = None, **kwargs):
        # Import here to avoid circular imports
        from flowfile_frame.flow_frame import generate_node_id
        new_node_id = generate_node_id()

        if not all(getattr(arg, "convertable_to_code", True) for arg in args):
            logger.debug("Warning, could not create a good node")
            # Unwrap wrapped expressions but leave plain values (ints, strings, ...) as they
            # are, and forward every argument individually together with the keyword arguments.
            plain_args = [getattr(arg, "expr", arg) for arg in args]
            plain_kwargs = {key: getattr(value, "expr", value) for key, value in kwargs.items()}
            return self.__class__(
                getattr(self.data, method_name)(*plain_args, **plain_kwargs),
                flow_graph=self.flow_graph,
            )

        # Collect function sources and build representations
        function_sources = []
        args_representations = [_represent_argument(arg, function_sources) for arg in args]
        kwargs_representations = [
            f"{key}={_represent_argument(value, function_sources)}"
            for key, value in kwargs.items()
        ]

        # Build parameter string; empty parts are left out so no stray separator appears
        args_str = ", ".join(args_representations)
        kwargs_str = ", ".join(kwargs_representations)
        params_str = ", ".join(part for part in (args_str, kwargs_str) if part)

        # Build the code
        operation_code = f"input_df.{method_name}({params_str})"

        if function_sources:
            # De-duplicate while keeping first-seen order
            unique_sources = list(dict.fromkeys(function_sources))
            functions_section = "# Function definitions\n" + "\n\n".join(unique_sources)
            code = functions_section + "\n#─────SPLIT─────\n\noutput_df = " + operation_code
        else:
            code = "output_df = " + operation_code

        # Use provided description or generate a default one
        if description is None:
            description = f"{method_name.replace('_', ' ').title()} operation"

        self._add_polars_code(new_node_id, code, description)

        if returns_lazyframe:
            # Return a new FlowFrame with the result
            return self._create_child_frame(new_node_id)
        # For methods that don't return a LazyFrame, return the result directly
        return getattr(self.data, method_name)(*args, **kwargs)

    return wrapper
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def add_lazyframe_methods(cls):
    """
    Class decorator that adds all public LazyFrame methods to a class.

    The methods are attached with ``setattr`` when the decorator runs.
    Names already present on the class (including inherited ones), private
    names and LazyFrame properties are left alone. Names listed in
    ``PASSTHROUGH_METHODS`` simply forward to ``self.data``; every other
    callable is wrapped with ``create_lazyframe_method_wrapper``.

    Parameters
    ----------
    cls : Type
        The class to which the methods will be added.

    Returns
    -------
    Type
        The modified class (the same object that was passed in).
    """

    def _make_passthrough(method_name, method_attr):
        """Build a method that forwards the call unchanged to ``self.data``."""

        @wraps(method_attr)
        def passthrough_method(self, *args, **kwargs):
            return getattr(self.data, method_name)(*args, **kwargs)

        return passthrough_method

    # Snapshot of what the class already offers (including inherited methods),
    # taken before anything is added so the overlap report stays accurate.
    existing_methods = set(dir(cls))
    overlap = set()

    for name in dir(pl.LazyFrame):
        # Skip private methods
        if name.startswith('_'):
            continue
        attr = getattr(pl.LazyFrame, name)
        if name in existing_methods:
            # Never overwrite; remember callable clashes for the debug log below
            if callable(attr):
                overlap.add(name)
            continue
        # Skip properties
        if isinstance(attr, property):
            continue
        if name in PASSTHROUGH_METHODS:
            setattr(cls, name, _make_passthrough(name, attr))
        elif callable(attr):
            setattr(cls, name, create_lazyframe_method_wrapper(name, attr))

    if overlap:
        logger.debug(f"Preserved existing methods in {cls.__name__}: {', '.join(sorted(overlap))}")
    return cls
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Union, TYPE_CHECKING, List, TypeVar, Callable, Sequence, Literal
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# --- TYPE CHECKING IMPORTS ---
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from flowfile_frame.expr import Expr, _get_expr_and_repr, col, lit
|
|
11
|
+
from polars._typing import IntoExprColumn, NullBehavior, ListToStructWidthStrategy
|
|
12
|
+
from datetime import date, datetime, time
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ExprListNameSpace:
    """Namespace for list related expressions.

    Every method builds the matching ``pl.Expr.list`` expression (when the
    parent carries a polars expression) and returns a new ``Expr`` whose
    ``repr_str`` is the parent's repr extended with ``.list.<method>(...)``.
    """

    def __init__(self, parent_expr: 'Expr', parent_repr_str: str):
        self.parent = parent_expr
        # None when the parent has no underlying polars expression
        self.expr = parent_expr.expr.list if parent_expr.expr is not None else None
        self.parent_repr_str = parent_repr_str

    @staticmethod
    def _unwrap(value: Any) -> Any:
        """Return ``value.expr`` for objects exposing an ``expr`` attribute, else ``value``."""
        return value.expr if hasattr(value, 'expr') else value

    def _create_next_expr(self, *args, method_name: str, result_expr: Optional[pl.Expr], is_complex: bool = True, **kwargs) -> 'Expr':
        """Creates a new Expr instance, appending method call to repr string."""
        # Imported here because the module-level import is TYPE_CHECKING-only.
        from flowfile_frame.expr import Expr

        args_repr = ", ".join(repr(a) for a in args)
        kwargs_repr = ", ".join(f"{k}={repr(v)}" for k, v in kwargs.items())
        # Join only the non-empty parts so no dangling separator is produced
        args_str = ", ".join(part for part in (args_repr, kwargs_repr) if part)

        new_repr = f"{self.parent_repr_str}.list.{method_name}({args_str})"

        # Create new instance, inheriting current agg_func status by default
        new_expr_instance = Expr(
            result_expr,
            self.parent.column_name,
            repr_str=new_repr,
            initial_column_name=self.parent._initial_column_name,
            selector=None,
            agg_func=self.parent.agg_func,
            is_complex=is_complex,
            convertable_to_code=self.parent.convertable_to_code
        )
        return new_expr_instance

    def all(self) -> Expr:
        """Wrap ``list.all()``."""
        res_expr = self.expr.all() if self.expr is not None else None
        return self._create_next_expr(method_name="all", result_expr=res_expr)

    def any(self) -> Expr:
        """Wrap ``list.any()``."""
        res_expr = self.expr.any() if self.expr is not None else None
        return self._create_next_expr(method_name="any", result_expr=res_expr)

    def len(self) -> Expr:
        """Wrap ``list.len()``."""
        res_expr = self.expr.len() if self.expr is not None else None
        return self._create_next_expr(method_name="len", result_expr=res_expr)

    def drop_nulls(self) -> Expr:
        """Wrap ``list.drop_nulls()``."""
        res_expr = self.expr.drop_nulls() if self.expr is not None else None
        return self._create_next_expr(method_name="drop_nulls", result_expr=res_expr)

    def sample(
        self,
        n: int | IntoExprColumn | None = None,
        *,
        fraction: float | IntoExprColumn | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> Expr:
        """Wrap ``list.sample()``.

        Raises
        ------
        ValueError
            If both ``n`` and ``fraction`` are given.
        """
        if n is not None and fraction is not None:
            raise ValueError("cannot specify both `n` and `fraction`")

        res_expr = None
        if self.expr is not None:
            try:
                if fraction is not None:
                    expr_fraction = self._unwrap(fraction)
                    res_expr = self.expr.sample(n=None, fraction=expr_fraction,
                                                with_replacement=with_replacement,
                                                shuffle=shuffle, seed=seed)
                else:
                    # Without n or fraction a single element is sampled
                    expr_n = n.expr if hasattr(n, 'expr') else (1 if n is None else n)
                    res_expr = self.expr.sample(n=expr_n, fraction=None,
                                                with_replacement=with_replacement,
                                                shuffle=shuffle, seed=seed)
            except Exception as e:
                # Best effort: the repr-only Expr is still returned below
                print(f"Warning: Could not create polars expression for list.sample(): {e}")

        return self._create_next_expr(
            n,
            method_name="sample",
            result_expr=res_expr,
            fraction=fraction,
            with_replacement=with_replacement,
            shuffle=shuffle,
            seed=seed
        )

    def sum(self) -> Expr:
        """Wrap ``list.sum()``."""
        res_expr = self.expr.sum() if self.expr is not None else None
        return self._create_next_expr(method_name="sum", result_expr=res_expr)

    def max(self) -> Expr:
        """Wrap ``list.max()``."""
        res_expr = self.expr.max() if self.expr is not None else None
        return self._create_next_expr(method_name="max", result_expr=res_expr)

    def min(self) -> Expr:
        """Wrap ``list.min()``."""
        res_expr = self.expr.min() if self.expr is not None else None
        return self._create_next_expr(method_name="min", result_expr=res_expr)

    def mean(self) -> Expr:
        """Wrap ``list.mean()``."""
        res_expr = self.expr.mean() if self.expr is not None else None
        return self._create_next_expr(method_name="mean", result_expr=res_expr)

    def median(self) -> Expr:
        """Wrap ``list.median()``."""
        res_expr = self.expr.median() if self.expr is not None else None
        return self._create_next_expr(method_name="median", result_expr=res_expr)

    def std(self, ddof: int = 1) -> Expr:
        """Wrap ``list.std(ddof=...)``."""
        res_expr = self.expr.std(ddof=ddof) if self.expr is not None else None
        return self._create_next_expr(method_name="std", result_expr=res_expr, ddof=ddof)

    def var(self, ddof: int = 1) -> Expr:
        """Wrap ``list.var(ddof=...)``."""
        res_expr = self.expr.var(ddof=ddof) if self.expr is not None else None
        return self._create_next_expr(method_name="var", result_expr=res_expr, ddof=ddof)

    def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr:
        """Wrap ``list.sort(descending=..., nulls_last=...)``."""
        res_expr = self.expr.sort(descending=descending, nulls_last=nulls_last) if self.expr is not None else None
        return self._create_next_expr(method_name="sort", result_expr=res_expr, descending=descending, nulls_last=nulls_last)

    def reverse(self) -> Expr:
        """Wrap ``list.reverse()``."""
        res_expr = self.expr.reverse() if self.expr is not None else None
        return self._create_next_expr(method_name="reverse", result_expr=res_expr)

    def unique(self, *, maintain_order: bool = False) -> Expr:
        """Wrap ``list.unique(maintain_order=...)``."""
        res_expr = self.expr.unique(maintain_order=maintain_order) if self.expr is not None else None
        return self._create_next_expr(method_name="unique", result_expr=res_expr, maintain_order=maintain_order)

    def n_unique(self) -> Expr:
        """Wrap ``list.n_unique()``."""
        res_expr = self.expr.n_unique() if self.expr is not None else None
        return self._create_next_expr(method_name="n_unique", result_expr=res_expr)

    def concat(self, other: list[Expr | str] | Expr | str | pl.Series | list[Any]) -> Expr:
        """Wrap ``list.concat(other)``.

        ``other`` may be an ``Expr``, a column name, a ``pl.Series``, a list of
        those, or a list of plain values (turned into a literal).
        """
        # Runtime import: the isinstance checks below need the real class, and the
        # module-level import only happens under TYPE_CHECKING.
        from flowfile_frame.expr import Expr

        res_expr = None
        other_expr = None

        # Handle different types of 'other'
        if isinstance(other, Expr):
            other_expr = other.expr
        elif isinstance(other, str):
            other_expr = pl.col(other)
        elif isinstance(other, pl.Series):
            other_expr = pl.lit(other)
        elif isinstance(other, list):
            if len(other) > 0 and isinstance(other[0], (Expr, str, pl.Series)):
                # List of expressions
                other_expr = [o.expr if hasattr(o, 'expr') else (pl.col(o) if isinstance(o, str) else o) for o in other]
            else:
                # List of values
                other_expr = pl.lit(other)

        # Create the polars expression if possible
        if self.expr is not None and other_expr is not None:
            try:
                if isinstance(other_expr, list):
                    # Insert the parent expression at the beginning
                    all_exprs = [self.parent.expr] + other_expr
                    res_expr = pl.concat_list(all_exprs)
                else:
                    res_expr = self.expr.concat(other_expr)
            except Exception as e:
                # Best effort: the repr-only Expr is still returned below
                print(f"Warning: Could not create polars expression for list.concat(): {e}")

        return self._create_next_expr(other, method_name="concat", result_expr=res_expr)

    def get(self, index: int | Expr | str, *, null_on_oob: bool = False) -> Expr:
        """Wrap ``list.get(index, null_on_oob=...)``."""
        index_expr = self._unwrap(index)
        res_expr = self.expr.get(index_expr, null_on_oob=null_on_oob) if self.expr is not None else None
        return self._create_next_expr(index, method_name="get", result_expr=res_expr, null_on_oob=null_on_oob)

    def gather(self, indices: Expr | pl.Series | list[int] | list[list[int]], *, null_on_oob: bool = False) -> Expr:
        """Wrap ``list.gather(indices, null_on_oob=...)``; plain lists become a ``pl.Series``."""
        indices_expr = indices
        if isinstance(indices, list):
            indices_expr = pl.Series(indices)
        elif hasattr(indices, 'expr'):
            indices_expr = indices.expr

        res_expr = self.expr.gather(indices_expr, null_on_oob=null_on_oob) if self.expr is not None else None
        return self._create_next_expr(indices, method_name="gather", result_expr=res_expr, null_on_oob=null_on_oob)

    def gather_every(self, n: int | IntoExprColumn, offset: int | IntoExprColumn = 0) -> Expr:
        """Wrap ``list.gather_every(n, offset)``."""
        n_expr = self._unwrap(n)
        offset_expr = self._unwrap(offset)

        res_expr = self.expr.gather_every(n_expr, offset_expr) if self.expr is not None else None
        return self._create_next_expr(n, method_name="gather_every", result_expr=res_expr, offset=offset)

    def first(self) -> Expr:
        """Wrap ``list.first()``."""
        res_expr = self.expr.first() if self.expr is not None else None
        return self._create_next_expr(method_name="first", result_expr=res_expr)

    def last(self) -> Expr:
        """Wrap ``list.last()``."""
        res_expr = self.expr.last() if self.expr is not None else None
        return self._create_next_expr(method_name="last", result_expr=res_expr)

    def contains(self, item: float | str | bool | int | date | datetime | time | IntoExprColumn) -> Expr:
        """Wrap ``list.contains(item)``."""
        item_expr = self._unwrap(item)
        res_expr = self.expr.contains(item_expr) if self.expr is not None else None
        return self._create_next_expr(item, method_name="contains", result_expr=res_expr)

    def join(self, separator: IntoExprColumn, *, ignore_nulls: bool = True) -> Expr:
        """Wrap ``list.join(separator, ignore_nulls=...)``."""
        separator_expr = self._unwrap(separator)
        res_expr = self.expr.join(separator_expr, ignore_nulls=ignore_nulls) if self.expr is not None else None
        return self._create_next_expr(separator, method_name="join", result_expr=res_expr, ignore_nulls=ignore_nulls)

    def arg_min(self) -> Expr:
        """Wrap ``list.arg_min()``."""
        res_expr = self.expr.arg_min() if self.expr is not None else None
        return self._create_next_expr(method_name="arg_min", result_expr=res_expr)

    def arg_max(self) -> Expr:
        """Wrap ``list.arg_max()``."""
        res_expr = self.expr.arg_max() if self.expr is not None else None
        return self._create_next_expr(method_name="arg_max", result_expr=res_expr)

    def diff(self, n: int = 1, null_behavior: NullBehavior = "ignore") -> Expr:
        """Wrap ``list.diff(n, null_behavior)``."""
        res_expr = self.expr.diff(n, null_behavior) if self.expr is not None else None
        return self._create_next_expr(method_name="diff", result_expr=res_expr, n=n, null_behavior=null_behavior)

    def shift(self, n: int | IntoExprColumn = 1) -> Expr:
        """Wrap ``list.shift(n)``."""
        n_expr = self._unwrap(n)
        res_expr = self.expr.shift(n_expr) if self.expr is not None else None
        return self._create_next_expr(n, method_name="shift", result_expr=res_expr)

    def slice(self, offset: int | str | Expr, length: int | str | Expr | None = None) -> Expr:
        """Wrap ``list.slice(offset, length)``."""
        offset_expr = self._unwrap(offset)
        length_expr = self._unwrap(length)

        res_expr = self.expr.slice(offset_expr, length_expr) if self.expr is not None else None
        return self._create_next_expr(offset, length, method_name="slice", result_expr=res_expr)

    def head(self, n: int | str | Expr = 5) -> Expr:
        """Wrap ``list.head(n)``."""
        n_expr = self._unwrap(n)
        res_expr = self.expr.head(n_expr) if self.expr is not None else None
        return self._create_next_expr(n, method_name="head", result_expr=res_expr)

    def tail(self, n: int | str | Expr = 5) -> Expr:
        """Wrap ``list.tail(n)``."""
        n_expr = self._unwrap(n)
        res_expr = self.expr.tail(n_expr) if self.expr is not None else None
        return self._create_next_expr(n, method_name="tail", result_expr=res_expr)

    def explode(self) -> Expr:
        """Wrap ``list.explode()``."""
        res_expr = self.expr.explode() if self.expr is not None else None
        return self._create_next_expr(method_name="explode", result_expr=res_expr)

    def count_matches(self, element: Any) -> Expr:
        """Wrap ``list.count_matches(element)``."""
        element_expr = self._unwrap(element)
        res_expr = self.expr.count_matches(element_expr) if self.expr is not None else None
        return self._create_next_expr(element, method_name="count_matches", result_expr=res_expr)

    def to_array(self, width: int) -> Expr:
        """Wrap ``list.to_array(width)``."""
        res_expr = self.expr.to_array(width) if self.expr is not None else None
        return self._create_next_expr(width, method_name="to_array", result_expr=res_expr)

    def to_struct(
        self,
        n_field_strategy: ListToStructWidthStrategy = "first_non_null",
        fields: Sequence[str] | Callable[[int], str] | None = None,
        upper_bound: int = 0,
    ) -> Expr:
        """Wrap ``list.to_struct()``.

        When ``fields`` is a sequence only ``fields`` is forwarded to polars;
        otherwise all three arguments are forwarded.
        """
        res_expr = None

        if self.expr is not None:
            try:
                if isinstance(fields, Sequence):
                    res_expr = self.expr.to_struct(fields=fields)
                else:
                    res_expr = self.expr.to_struct(
                        n_field_strategy=n_field_strategy,
                        fields=fields,
                        upper_bound=upper_bound
                    )
            except Exception as e:
                # Best effort: the repr-only Expr is still returned below
                print(f"Warning: Could not create polars expression for list.to_struct(): {e}")

        return self._create_next_expr(
            method_name="to_struct",
            result_expr=res_expr,
            n_field_strategy=n_field_strategy,
            fields=fields,
            upper_bound=upper_bound,
        )

    def eval(self, expr: Expr, *, parallel: bool = False) -> Expr:
        """Wrap ``list.eval(expr, parallel=...)``."""
        expr_inner = self._unwrap(expr)
        res_expr = self.expr.eval(expr_inner, parallel=parallel) if self.expr is not None else None
        return self._create_next_expr(expr, method_name="eval", result_expr=res_expr, parallel=parallel)

    def set_union(self, other: Any) -> Expr:
        """Wrap ``list.set_union(other)``."""
        other_expr = self._unwrap(other)
        res_expr = self.expr.set_union(other_expr) if self.expr is not None else None
        return self._create_next_expr(other, method_name="set_union", result_expr=res_expr)

    def set_difference(self, other: Any) -> Expr:
        """Wrap ``list.set_difference(other)``."""
        other_expr = self._unwrap(other)
        res_expr = self.expr.set_difference(other_expr) if self.expr is not None else None
        return self._create_next_expr(other, method_name="set_difference", result_expr=res_expr)

    def set_intersection(self, other: Any) -> Expr:
        """Wrap ``list.set_intersection(other)``."""
        other_expr = self._unwrap(other)
        res_expr = self.expr.set_intersection(other_expr) if self.expr is not None else None
        return self._create_next_expr(other, method_name="set_intersection", result_expr=res_expr)

    def set_symmetric_difference(self, other: Any) -> Expr:
        """Wrap ``list.set_symmetric_difference(other)``."""
        other_expr = self._unwrap(other)
        res_expr = self.expr.set_symmetric_difference(other_expr) if self.expr is not None else None
        return self._create_next_expr(other, method_name="set_symmetric_difference", result_expr=res_expr)
|
flowfile_frame/selectors.py
CHANGED
|
@@ -51,6 +51,9 @@ class Selector:
|
|
|
51
51
|
# Expr init will handle creating the 'pl.sum(selector)' repr
|
|
52
52
|
return Expr(expr=None, selector=self, agg_func="sum")
|
|
53
53
|
|
|
54
|
+
def expr(self):
    """Evaluate ``self.repr_str`` as a Python expression and return the result."""
    # NOTE(review): eval() executes whatever repr_str contains. Presumably repr_str is
    # always generated internally by the selector helpers — confirm it can never carry
    # user-supplied text, and that the names it references are in scope in this module.
    return eval(self.repr_str)
|
|
56
|
+
|
|
54
57
|
def mean(self) -> 'Expr':
|
|
55
58
|
"""Create an expression to average columns selected by this selector."""
|
|
56
59
|
return Expr(expr=None, selector=self, agg_func="mean")
|
flowfile_frame/series.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import polars as pl
|
|
3
|
+
from typing import Any, Optional, Union, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Series:
    """
    A wrapper around polars.Series that represents itself as the code to create it.

    The wrapped polars object is kept in ``_s``; ``_name``, ``_values`` and
    ``_dtype`` hold the pieces used to render the code-like representation.
    """

    def __init__(
        self,
        name: str | list | pl.Series | None = None,
        values: list | None = None,
        dtype: Any = None,
        **kwargs  # Ignored parameters
    ):
        """
        Initialize a FlowSeries with the same API as pl.Series.

        Parameters
        ----------
        name : str | list | pl.Series | None
            Series name, or (mirroring ``pl.Series``) the values themselves /
            an existing ``pl.Series`` when passed as the first argument.
        values : list | None
            The values of the series.
        dtype : Any
            Optional dtype forwarded to ``pl.Series``.
        **kwargs
            Accepted for signature compatibility and ignored.
        """
        # Store the original arguments for proper representation
        self._name = name
        self._values = values
        self._dtype = dtype

        # Handle the different initialization forms
        if isinstance(name, pl.Series):
            self._s = name
            # Update our attributes to match the series
            self._name = name.name
            self._values = name.to_list()
            self._dtype = name.dtype
        elif isinstance(name, (list, tuple)) and values is None:
            # Values were passed as the first positional argument
            self._s = pl.Series(values=name, dtype=dtype)
            self._name = ""  # Default name is empty string
            self._values = name
        else:
            self._s = pl.Series(name=name, values=values, dtype=dtype)

    def __repr__(self) -> str:
        """
        Return a string that looks like the code to create this Series.
        Example: pl.Series("c", [1, 2, 3])
        """
        import json

        # Format name; string names are escaped so quotes or backslashes inside
        # the name cannot break the generated string literal.
        if not self._name:
            name_str = '""'
        elif isinstance(self._name, str):
            name_str = json.dumps(self._name, ensure_ascii=False)
        else:
            name_str = f'"{self._name}"'

        # Format values
        if self._values is None:
            values_str = "[]"
        elif len(self._values) <= 10:
            values_str = str(self._values)
        else:
            # Show first few elements for long lists
            # NOTE(review): the truncated form is not code that recreates the data and
            # string elements lose their quotes here — confirm this is display-only.
            sample = self._values[:3]
            values_str = f"[{', '.join(map(str, sample))}, ...]"

        # Format dtype if provided
        # NOTE(review): str(dtype) is emitted without a ``pl.`` prefix — confirm consumers
        # of this repr resolve the dtype name correctly.
        dtype_str = ""
        if self._dtype is not None:
            dtype_str = f", dtype={self._dtype}"

        return f"pl.Series({name_str}, {values_str}{dtype_str})"

    def __str__(self) -> str:
        """Same as __repr__."""
        return self.__repr__()
|