flowfile-0.3.6-py3-none-any.whl → flowfile-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +27 -6
- flowfile/api.py +5 -2
- flowfile/web/__init__.py +4 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/RECORD +100 -98
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/configs/utils.py +5 -0
- flowfile_core/database/connection.py +1 -3
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -2
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +598 -310
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +620 -192
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +510 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_core/utils/arrow_reader.py +8 -3
- flowfile_core/utils/validate_setup.py +0 -2
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +42 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +233 -111
- flowfile_frame/flow_frame.pyi +137 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/entry_points.txt +0 -0
flowfile_frame/flow_frame.pyi
CHANGED
```diff
@@ -5,16 +5,22 @@ import os
 import sys
 import typing
 from io import IOBase
-from typing import List, Optional, ForwardRef
+from typing import List, Optional, ForwardRef, TypeVar, Any, Iterable, Sequence, Mapping, Collection, Callable, Literal, IO, Union
+from datetime import timedelta
+from pathlib import Path
 from collections.abc import Awaitable

 # Third-party imports
 import polars as pl
-from polars._typing import *
+from polars._typing import *
+from polars._typing import ParquetMetadata, PlanStage
 from polars._utils.async_ import _GeventDataFrameResult
 from polars.dependencies import polars_cloud as pc
 from polars.io.cloud import CredentialProviderFunction
 from polars.lazyframe.frame import LazyGroupBy
+from polars import LazyFrame, DataFrame, QueryOptFlags
+from polars.io.parquet import ParquetFieldOverwrites
+from polars.lazyframe.opt_flags import DEFAULT_QUERY_OPT_FLAGS
 from polars.type_aliases import (Schema, IntoExpr, ClosedInterval, Label, StartBy, RollingInterpolationMethod, IpcCompression, CompatLevel, SyncOnCloseMethod, ExplainFormat, EngineType, SerializationFormat, AsofJoinStrategy)

 # Local application/library specific imports
@@ -23,10 +29,11 @@ from flowfile_core.flowfile.flow_graph import FlowGraph
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
 from flowfile_frame import group_frame
 from flowfile_frame.expr import Expr
+from flowfile_core.schemas import transform_schema

 # Conditional imports
 if sys.version_info >= (3, 10):
-
+    from typing import Concatenate
 else:
     from typing_extensions import Concatenate

@@ -64,8 +71,8 @@ class FlowFrame:

     def __gt__(self, other: Any) -> typing.NoReturn: ...

-    #
-    def __init__(self,
+    # The __init__ method is intentionally left empty.
+    def __init__(self, *args, **kwargs) -> None: ...

     def __le__(self, other: Any) -> typing.NoReturn: ...

@@ -73,17 +80,26 @@ class FlowFrame:

     def __ne__(self, other: object) -> typing.NoReturn: ...

-    #
-    def __new__(cls, data: typing.Union[LazyFrame, collections.abc.Mapping[str, typing.Union[collections.abc.Sequence[object], collections.abc.Mapping[str, collections.abc.Sequence[object]], ForwardRef('Series')]], collections.abc.Sequence[typing.Any], ForwardRef('np.ndarray[Any, Any]'), ForwardRef('pa.Table'), ForwardRef('pd.DataFrame'), ForwardRef('ArrowArrayExportable'), ForwardRef('ArrowStreamExportable')]=None, schema: typing.Union[collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]], collections.abc.Sequence[typing.Union[str, tuple[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]]]], NoneType]=None, schema_overrides: collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]] | None=None, strict: bool=True, orient: typing.Optional[typing.Literal['col', 'row']]=None, infer_schema_length: int | None=100, nan_to_null: bool=False, flow_graph=None, node_id=None, parent_node_id=None) -> Self: ...
+    # Unified constructor for FlowFrame.
+    def __new__(cls, data: typing.Union[LazyFrame, collections.abc.Mapping[str, typing.Union[collections.abc.Sequence[object], collections.abc.Mapping[str, collections.abc.Sequence[object]], ForwardRef('Series')]], collections.abc.Sequence[typing.Any], ForwardRef('np.ndarray[Any, Any]'), ForwardRef('pa.Table'), ForwardRef('pd.DataFrame'), ForwardRef('ArrowArrayExportable'), ForwardRef('ArrowStreamExportable'), ForwardRef('torch.Tensor')] = None, schema: typing.Union[collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]], collections.abc.Sequence[typing.Union[str, tuple[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]]]], NoneType] = None, schema_overrides: collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]] | None = None, strict: bool = True, orient: typing.Optional[typing.Literal['col', 'row']] = None, infer_schema_length: int | None = 100, nan_to_null: bool = False, flow_graph: typing.Optional[flowfile_core.flowfile.flow_graph.FlowGraph] = None, node_id: typing.Optional[int] = None, parent_node_id: typing.Optional[int] = None, **kwargs) -> Self: ...

     def __repr__(self) -> Any: ...

     # Helper method to add a connection between nodes
-    def _add_connection(self, from_id, to_id, input_type: typing.Literal['main', 'left', 'right']='main') -> Any: ...
+    def _add_connection(self, from_id, to_id, input_type: typing.Literal['main', 'left', 'right'] = 'main') -> Any: ...

-
+    # Add a cross join node to the graph.
+    def _add_cross_join_node(self, new_node_id: int, join_input: transform_schema.CrossJoinInput, description: str, other: FlowFrame) -> None: ...

-    def
+    def _add_number_of_records(self, new_node_id: int, description: str = None) -> 'FlowFrame': ...
+
+    def _add_polars_code(self, new_node_id: int, code: str, depending_on_ids: typing.Optional[typing.List[str]] = None, convertable_to_code: bool = True, method_name: str = None, polars_expr: typing.Union[flowfile_frame.expr.Expr, typing.List[flowfile_frame.expr.Expr], NoneType] = None, group_expr: typing.Union[flowfile_frame.expr.Expr, typing.List[flowfile_frame.expr.Expr], NoneType] = None, kwargs_expr: typing.Optional[typing.Dict] = None, group_kwargs: typing.Optional[typing.Dict] = None, description: str = None) -> Any: ...
+
+    # Add a regular join node to the graph.
+    def _add_regular_join_node(self, new_node_id: int, join_input: transform_schema.JoinInput, description: str, other: FlowFrame) -> None: ...
+
+    # Build kwargs dictionary for Polars join code.
+    def _build_polars_join_kwargs(self, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_columns: typing.Optional[typing.List[str]], right_columns: typing.Optional[typing.List[str]], how: str, suffix: str, validate: str, nulls_equal: bool, coalesce: bool, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left']) -> dict: ...

     def _comparison_error(self, operator: str) -> typing.NoReturn: ...

@@ -91,35 +107,50 @@ class FlowFrame:
     def _create_child_frame(self, new_node_id) -> 'FlowFrame': ...

     # Detect if the expression is a cum_count operation and use record_id if possible.
-    def _detect_cum_count_record_id(self, expr: Any, new_node_id: int, description: typing.Optional[str]=None) -> 'FlowFrame': ...
+    def _detect_cum_count_record_id(self, expr: Any, new_node_id: int, description: typing.Optional[str] = None) -> 'FlowFrame': ...
+
+    # Ensure both FlowFrames are in the same graph, combining if necessary.
+    def _ensure_same_graph(self, other: FlowFrame) -> None: ...
+
+    # Execute join using native FlowFile join nodes.
+    def _execute_native_join(self, other: FlowFrame, new_node_id: int, join_mappings: typing.Optional[typing.List], how: str, description: str) -> 'FlowFrame': ...
+
+    # Execute join using Polars code approach.
+    def _execute_polars_code_join(self, other: FlowFrame, new_node_id: int, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_columns: typing.Optional[typing.List[str]], right_columns: typing.Optional[typing.List[str]], how: str, suffix: str, validate: str, nulls_equal: bool, coalesce: bool, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left'], description: str) -> 'FlowFrame': ...

     # Generates the `input_df.sort(...)` Polars code string using pure expression strings.
     def _generate_sort_polars_code(self, pure_sort_expr_strs: typing.List[str], descending_values: typing.List[bool], nulls_last_values: typing.List[bool], multithreaded: bool, maintain_order: bool) -> str: ...

-
+    # Parse and validate join column specifications.
+    def _parse_join_columns(self, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], how: str) -> tuple[typing.Optional[typing.List[str]], typing.Optional[typing.List[str]]]: ...
+
+    # Determine if we should use Polars code instead of native join.
+    def _should_use_polars_code_for_join(self, maintain_order, coalesce, nulls_equal, validate, suffix) -> bool: ...
+
+    def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> 'FlowFrame': ...

     # Approximate count of unique values.
-    def approx_n_unique(self, description: Optional[str] = None) ->
+    def approx_n_unique(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Return the `k` smallest rows.
-    def bottom_k(self, k: int, by: IntoExpr | Iterable[IntoExpr], reverse: bool | Sequence[bool]=False, description: Optional[str] = None) ->
+    def bottom_k(self, k: int, by: IntoExpr | Iterable[IntoExpr], reverse: bool | Sequence[bool] = False, description: Optional[str] = None) -> 'FlowFrame': ...

     def cache(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Cast LazyFrame column(s) to the specified dtype(s).
-    def cast(self, dtypes: Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType | PythonDataType] | PolarsDataType, strict: bool=True, description: Optional[str] = None) ->
+    def cast(self, dtypes: Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType | PythonDataType] | PolarsDataType | pl.DataTypeExpr, strict: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...

     # Create an empty copy of the current LazyFrame, with zero to 'n' rows.
-    def clear(self, n: int=0, description: Optional[str] = None) ->
+    def clear(self, n: int = 0, description: Optional[str] = None) -> 'FlowFrame': ...

     # Create a copy of this LazyFrame.
-    def clone(self, description: Optional[str] = None) ->
+    def clone(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Collect lazy data into memory.
-    def collect(self, *args, **kwargs) ->
+    def collect(self, *args, **kwargs) -> DataFrame: ...

     # Collect DataFrame asynchronously in thread pool.
-    def collect_async(self, gevent: bool
+    def collect_async(self, gevent: bool = False, engine: EngineType = 'auto', optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: ...

     # Resolve the schema of this LazyFrame.
     def collect_schema(self) -> Schema: ...
@@ -129,56 +160,56 @@ class FlowFrame:
     def columns(self) -> typing.List[str]: ...

     # Combine multiple FlowFrames into a single FlowFrame.
-    def concat(self, other: typing.Union[ForwardRef('FlowFrame'), typing.List[ForwardRef('FlowFrame')]], how: str='vertical', rechunk: bool=False, parallel: bool=True, description: str=None) -> 'FlowFrame': ...
+    def concat(self, other: typing.Union[ForwardRef('FlowFrame'), typing.List[ForwardRef('FlowFrame')]], how: str = 'vertical', rechunk: bool = False, parallel: bool = True, description: str = None) -> 'FlowFrame': ...

     # Return the number of non-null elements for each column.
-    def count(self, description: Optional[str] = None) ->
+    def count(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Simple naive implementation of creating the frame from any type. It converts the data to a polars frame,
-    def create_from_any_type(self, data: typing.Union[collections.abc.Mapping[str, typing.Union[collections.abc.Sequence[object], collections.abc.Mapping[str, collections.abc.Sequence[object]], ForwardRef('Series')]], collections.abc.Sequence[typing.Any], ForwardRef('np.ndarray[Any, Any]'), ForwardRef('pa.Table'), ForwardRef('pd.DataFrame'), ForwardRef('ArrowArrayExportable'), ForwardRef('ArrowStreamExportable')]=None, schema: typing.Union[collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]], collections.abc.Sequence[typing.Union[str, tuple[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]]]], NoneType]=None, schema_overrides: collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]] | None=None, strict: bool=True, orient: typing.Optional[typing.Literal['col', 'row']]=None, infer_schema_length: int | None=100, nan_to_null: bool=False, flow_graph=None, node_id=None, parent_node_id=None, description: Optional[str] = None) -> Any: ...
+    def create_from_any_type(self, data: typing.Union[collections.abc.Mapping[str, typing.Union[collections.abc.Sequence[object], collections.abc.Mapping[str, collections.abc.Sequence[object]], ForwardRef('Series')]], collections.abc.Sequence[typing.Any], ForwardRef('np.ndarray[Any, Any]'), ForwardRef('pa.Table'), ForwardRef('pd.DataFrame'), ForwardRef('ArrowArrayExportable'), ForwardRef('ArrowStreamExportable'), ForwardRef('torch.Tensor')] = None, schema: typing.Union[collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]], collections.abc.Sequence[typing.Union[str, tuple[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]]]], NoneType] = None, schema_overrides: collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]] | None = None, strict: bool = True, orient: typing.Optional[typing.Literal['col', 'row']] = None, infer_schema_length: int | None = 100, nan_to_null: bool = False, flow_graph = None, node_id = None, parent_node_id = None, description: Optional[str] = None) -> Any: ...

     # Creates a summary of statistics for a LazyFrame, returning a DataFrame.
-    def describe(self, percentiles: Sequence[float] | float | None=
+    def describe(self, percentiles: Sequence[float] | float | None = ..., interpolation: QuantileMethod = 'nearest') -> DataFrame: ...

     # Read a logical plan from a file to construct a LazyFrame.
-    def deserialize(self, source: str | Path | IOBase, format: SerializationFormat='binary', description: Optional[str] = None) ->
+    def deserialize(self, source: str | Path | IOBase, format: SerializationFormat = 'binary', description: Optional[str] = None) -> 'FlowFrame': ...

     # Remove columns from the DataFrame.
-    def drop(self, *columns, strict: bool=True, description: Optional[str] = None) ->
+    def drop(self, *columns, strict: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...

     # Drop all rows that contain one or more NaN values.
-    def drop_nans(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None=None, description: Optional[str] = None) ->
+    def drop_nans(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None, description: Optional[str] = None) -> 'FlowFrame': ...

     # Drop all rows that contain one or more null values.
-    def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None=None, description: Optional[str] = None) ->
+    def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None, description: Optional[str] = None) -> 'FlowFrame': ...

     # Get the column data types.
     @property
     def dtypes(self) -> typing.List[pl.classes.DataType]: ...

     # Create a string representation of the query plan.
-    def explain(self, format: ExplainFormat='plain', optimized: bool=True, type_coercion: bool=
+    def explain(self, format: ExplainFormat = 'plain', optimized: bool = True, type_coercion: bool = True, predicate_pushdown: bool = True, projection_pushdown: bool = True, simplify_expression: bool = True, slice_pushdown: bool = True, comm_subplan_elim: bool = True, comm_subexpr_elim: bool = True, cluster_with_columns: bool = True, collapse_joins: bool = True, streaming: bool = False, engine: EngineType = 'auto', tree_format: bool | None = None, optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS) -> str: ...

     # Explode the dataframe to long format by exploding the given columns.
-    def explode(self, columns: typing.Union[str, flowfile_frame.expr.Column, typing.Iterable[str | flowfile_frame.expr.Column]], *more_columns, description: str=None) -> 'FlowFrame': ...
+    def explode(self, columns: typing.Union[str, flowfile_frame.expr.Column, typing.Iterable[str | flowfile_frame.expr.Column]], *more_columns, description: str = None) -> 'FlowFrame': ...

     # Collect a small number of rows for debugging purposes.
-    def fetch(self, n_rows: int=500, type_coercion: bool=True, _type_check: bool=True, predicate_pushdown: bool=True, projection_pushdown: bool=True, simplify_expression: bool=True, no_optimization: bool=False, slice_pushdown: bool=True, comm_subplan_elim: bool=True, comm_subexpr_elim: bool=True, cluster_with_columns: bool=True, collapse_joins: bool=True) -> DataFrame: ...
+    def fetch(self, n_rows: int = 500, type_coercion: bool = True, _type_check: bool = True, predicate_pushdown: bool = True, projection_pushdown: bool = True, simplify_expression: bool = True, no_optimization: bool = False, slice_pushdown: bool = True, comm_subplan_elim: bool = True, comm_subexpr_elim: bool = True, cluster_with_columns: bool = True, collapse_joins: bool = True) -> DataFrame: ...

     # Fill floating point NaN values.
-    def fill_nan(self, value: int | float | Expr | None, description: Optional[str] = None) ->
+    def fill_nan(self, value: int | float | Expr | None, description: Optional[str] = None) -> 'FlowFrame': ...

     # Fill null values using the specified value or strategy.
-    def fill_null(self, value: Any | Expr | None=None, strategy: FillNullStrategy | None=None, limit: int | None=None, matches_supertype: bool=True, description: Optional[str] = None) ->
+    def fill_null(self, value: Any | Expr | None = None, strategy: FillNullStrategy | None = None, limit: int | None = None, matches_supertype: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...

     # Filter rows based on a predicate.
-    def filter(self, *predicates, flowfile_formula: typing.Optional[str]=None, description: typing.Optional[str]=None, **constraints) -> 'FlowFrame': ...
+    def filter(self, *predicates, flowfile_formula: typing.Optional[str] = None, description: typing.Optional[str] = None, **constraints) -> 'FlowFrame': ...

     # Get the first row of the DataFrame.
-    def first(self, description: Optional[str] = None) ->
+    def first(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Take every nth row in the LazyFrame and return as a new LazyFrame.
-    def gather_every(self, n: int, offset: int=0, description: Optional[str] = None) ->
+    def gather_every(self, n: int, offset: int = 0, description: Optional[str] = None) -> 'FlowFrame': ...

     def get_node_settings(self, description: Optional[str] = None) -> FlowNode: ...

@@ -186,162 +217,165 @@ class FlowFrame:
     def group_by(self, *by, description: Optional[str] = None, maintain_order: bool = False, **named_by) -> group_frame.GroupByFrame: ...

     # Group based on a time value (or index value of type Int32, Int64).
-    def group_by_dynamic(self, index_column: IntoExpr, every: str | timedelta, period: str | timedelta | None=None, offset: str | timedelta | None=None, include_boundaries: bool=False, closed: ClosedInterval='left', label: Label='left', group_by: IntoExpr | Iterable[IntoExpr] | None=None, start_by: StartBy='window', description: Optional[str] = None) -> LazyGroupBy: ...
+    def group_by_dynamic(self, index_column: IntoExpr, every: str | timedelta, period: str | timedelta | None = None, offset: str | timedelta | None = None, include_boundaries: bool = False, closed: ClosedInterval = 'left', label: Label = 'left', group_by: IntoExpr | Iterable[IntoExpr] | None = None, start_by: StartBy = 'window', description: Optional[str] = None) -> LazyGroupBy: ...

-    def head(self, n: int, description: str=None) -> Any: ...
+    def head(self, n: int, description: str = None) -> Any: ...

     # Inspect a node in the computation graph.
-    def inspect(self, fmt: str='{}', description: Optional[str] = None) ->
+    def inspect(self, fmt: str = '{}', description: Optional[str] = None) -> 'FlowFrame': ...

     # Interpolate intermediate values. The interpolation method is linear.
-    def interpolate(self, description: Optional[str] = None) ->
+    def interpolate(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Add a join operation to the Logical Plan.
-    def join(self, other, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column]=None, how: str='inner', left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column]=None, right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column]=None, suffix: str='_right', validate: str=None, nulls_equal: bool=False, coalesce: bool=None, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left']=None, description: str=None) ->
+    def join(self, other, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, how: str = 'inner', left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, suffix: str = '_right', validate: str = None, nulls_equal: bool = False, coalesce: bool = None, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left'] = None, description: str = None) -> 'FlowFrame': ...

     # Perform an asof join.
-    def join_asof(self, other:
+    def join_asof(self, other: FlowFrame, left_on: str | None | Expr = None, right_on: str | None | Expr = None, on: str | None | Expr = None, by_left: str | Sequence[str] | None = None, by_right: str | Sequence[str] | None = None, by: str | Sequence[str] | None = None, strategy: AsofJoinStrategy = 'backward', suffix: str = '_right', tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, coalesce: bool = True, allow_exact_matches: bool = True, check_sortedness: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...

     # Perform a join based on one or multiple (in)equality predicates.
-    def join_where(self, other:
+    def join_where(self, other: FlowFrame, *predicates, suffix: str = '_right', description: Optional[str] = None) -> 'FlowFrame': ...

     # Get the last row of the DataFrame.
-    def last(self, description: Optional[str] = None) ->
+    def last(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Return lazy representation, i.e. itself.
-    def lazy(self, description: Optional[str] = None) ->
+    def lazy(self, description: Optional[str] = None) -> 'FlowFrame': ...

-    def limit(self, n: int, description: str=None) -> Any: ...
+    def limit(self, n: int, description: str = None) -> Any: ...

     # Apply a custom function.
-    def map_batches(self, function: Callable[[DataFrame], DataFrame], predicate_pushdown: bool=True, projection_pushdown: bool=True, slice_pushdown: bool=True, no_optimizations: bool=False, schema: None | SchemaDict=None, validate_output_schema: bool=True, streamable: bool=False, description: Optional[str] = None) ->
+    def map_batches(self, function: Callable[[DataFrame], DataFrame], predicate_pushdown: bool = True, projection_pushdown: bool = True, slice_pushdown: bool = True, no_optimizations: bool = False, schema: None | SchemaDict = None, validate_output_schema: bool = True, streamable: bool = False, description: Optional[str] = None) -> 'FlowFrame': ...
+
+    # Match or evolve the schema of a LazyFrame into a specific schema.
+    def match_to_schema(self, schema: SchemaDict | Schema, missing_columns: Literal['insert', 'raise'] | Mapping[str, Literal['insert', 'raise'] | Expr] = 'raise', missing_struct_fields: Literal['insert', 'raise'] | Mapping[str, Literal['insert', 'raise']] = 'raise', extra_columns: Literal['ignore', 'raise'] = 'raise', extra_struct_fields: Literal['ignore', 'raise'] | Mapping[str, Literal['ignore', 'raise']] = 'raise', integer_cast: Literal['upcast', 'forbid'] | Mapping[str, Literal['upcast', 'forbid']] = 'forbid', float_cast: Literal['upcast', 'forbid'] | Mapping[str, Literal['upcast', 'forbid']] = 'forbid', description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their maximum value.
-    def max(self, description: Optional[str] = None) ->
+    def max(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their mean value.
-    def mean(self, description: Optional[str] = None) ->
+    def mean(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their median value.
-    def median(self, description: Optional[str] = None) ->
+    def median(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Unpivot a DataFrame from wide to long format.
-    def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None=None, value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None=None, variable_name: str | None=None, value_name: str | None=None, streamable: bool=True, description: Optional[str] = None) ->
+    def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None, value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None, variable_name: str | None = None, value_name: str | None = None, streamable: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...

     # Take two sorted DataFrames and merge them by the sorted key.
-    def merge_sorted(self, other:
+    def merge_sorted(self, other: FlowFrame, key: str, description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their minimum value.
-    def min(self, description: Optional[str] = None) ->
+    def min(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame as the sum of their null value count.
-    def null_count(self, description: Optional[str] = None) ->
+    def null_count(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Offers a structured way to apply a sequence of user-defined functions (UDFs).
     def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args, description: Optional[str] = None, **kwargs) -> T: ...

     # Pivot a DataFrame from long to wide format.
-    def pivot(self, on: str | list[str], index: str | list[str] | None=None, values: str | list[str] | None=None, aggregate_function: str | None='first', maintain_order: bool=True, sort_columns: bool=False, separator: str='_', description: str=None) -> 'FlowFrame': ...
+    def pivot(self, on: str | list[str], index: str | list[str] | None = None, values: str | list[str] | None = None, aggregate_function: str | None = 'first', maintain_order: bool = True, sort_columns: bool = False, separator: str = '_', description: str = None) -> 'FlowFrame': ...

     # Profile a LazyFrame.
-    def profile(self, type_coercion: bool=
+    def profile(self, type_coercion: bool = True, predicate_pushdown: bool = True, projection_pushdown: bool = True, simplify_expression: bool = True, no_optimization: bool = False, slice_pushdown: bool = True, comm_subplan_elim: bool = True, comm_subexpr_elim: bool = True, cluster_with_columns: bool = True, collapse_joins: bool = True, show_plot: bool = False, truncate_nodes: int = 0, figsize: tuple[int, int] = ..., engine: EngineType = 'auto', optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS, **_kwargs) -> tuple[DataFrame, DataFrame]: ...

     # Aggregate the columns in the LazyFrame to their quantile value.
-    def quantile(self, quantile: float | Expr, interpolation:
+    def quantile(self, quantile: float | Expr, interpolation: QuantileMethod = 'nearest', description: Optional[str] = None) -> 'FlowFrame': ...

     # Run a query remotely on Polars Cloud.
-    def remote(self, context: pc.ComputeContext | None=None, plan_type: pc._typing.PlanTypePreference='dot', description: Optional[str] = None) -> pc.LazyFrameExt: ...
+    def remote(self, context: pc.ComputeContext | None = None, plan_type: pc._typing.PlanTypePreference = 'dot', description: Optional[str] = None) -> pc.LazyFrameExt: ...

     # Remove rows, dropping those that match the given predicate expression(s).
-    def remove(self, *predicates, description: Optional[str] = None, **constraints) ->
+    def remove(self, *predicates, description: Optional[str] = None, **constraints) -> 'FlowFrame': ...

     # Rename column names.
-    def rename(self, mapping:
+    def rename(self, mapping: Mapping[str, str] | Callable[[str], str], strict: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...

     # Reverse the DataFrame.
-    def reverse(self, description: Optional[str] = None) ->
+    def reverse(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Create rolling groups based on a temporal or integer column.
-    def rolling(self, index_column: IntoExpr, period: str | timedelta, offset: str | timedelta | None=None, closed: ClosedInterval='right', group_by: IntoExpr | Iterable[IntoExpr] | None=None, description: Optional[str] = None) -> LazyGroupBy: ...
+    def rolling(self, index_column: IntoExpr, period: str | timedelta, offset: str | timedelta | None = None, closed: ClosedInterval = 'right', group_by: IntoExpr | Iterable[IntoExpr] | None = None, description: Optional[str] = None) -> LazyGroupBy: ...

     # Save the graph
-    def save_graph(self, file_path: str, auto_arrange: bool=True, description: Optional[str] = None) -> Any: ...
+    def save_graph(self, file_path: str, auto_arrange: bool = True, description: Optional[str] = None) -> Any: ...

     # Get an ordered mapping of column names to their data type.
     @property
     def schema(self) -> pl.Schema: ...

     # Select columns from the frame.
-    def select(self, *columns, description: typing.Optional[str]=None) -> 'FlowFrame': ...
+    def select(self, *columns, description: typing.Optional[str] = None) -> 'FlowFrame': ...

     # Select columns from this LazyFrame.
-    def select_seq(self, *exprs, description: Optional[str] = None, **named_exprs) ->
+    def select_seq(self, *exprs, description: Optional[str] = None, **named_exprs) -> 'FlowFrame': ...

     # Serialize the logical plan of this LazyFrame to a file or string in JSON format.
-    def serialize(self, file: IOBase | str | Path | None=None, format: SerializationFormat='binary', description: Optional[str] = None) -> bytes | str | None: ...
+    def serialize(self, file: IOBase | str | Path | None = None, format: SerializationFormat = 'binary', description: Optional[str] = None) -> bytes | str | None: ...

     # Flag a column as sorted.
-    def set_sorted(self, column: str, descending: bool=False, description: Optional[str] = None) ->
+    def set_sorted(self, column: str, descending: bool = False, description: Optional[str] = None) -> 'FlowFrame': ...

     # Shift values by the given number of indices.
-    def shift(self, n: int | IntoExprColumn=1, fill_value: IntoExpr | None=None, description: Optional[str] = None) ->
+    def shift(self, n: int | IntoExprColumn = 1, fill_value: IntoExpr | None = None, description: Optional[str] = None) -> 'FlowFrame': ...

     # Show a plot of the query plan.
-    def show_graph(self, optimized: bool=True, show: bool=True, output_path: str | Path | None=None, raw_output: bool=False, figsize: tuple[float, float]=
+    def show_graph(self, optimized: bool = True, show: bool = True, output_path: str | Path | None = None, raw_output: bool = False, figsize: tuple[float, float] = ..., type_coercion: bool = True, _type_check: bool = True, predicate_pushdown: bool = True, projection_pushdown: bool = True, simplify_expression: bool = True, slice_pushdown: bool = True, comm_subplan_elim: bool = True, comm_subexpr_elim: bool = True, cluster_with_columns: bool = True, collapse_joins: bool = True, engine: EngineType = 'auto', plan_stage: PlanStage = 'ir', _check_order: bool = True, optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS) -> str | None: ...

     # Write the data to a CSV file.
-    def sink_csv(self, file: str, *args, separator: str=',', encoding: str='utf-8', description: str=None) -> 'FlowFrame': ...
+    def sink_csv(self, file: str, *args, separator: str = ',', encoding: str = 'utf-8', description: str = None) -> 'FlowFrame': ...

     # Evaluate the query in streaming mode and write to an IPC file.
-    def sink_ipc(self, path: str | Path, compression: IpcCompression | None='
+    def sink_ipc(self, path: str | Path | IO[bytes] | PartitioningScheme, compression: IpcCompression | None = 'uncompressed', compat_level: CompatLevel | None = None, maintain_order: bool = True, storage_options: dict[str, Any] | None = None, credential_provider: CredentialProviderFunction | Literal['auto'] | None = 'auto', retries: int = 2, sync_on_close: SyncOnCloseMethod | None = None, mkdir: bool = False, lazy: bool = False, engine: EngineType = 'auto', optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS, description: Optional[str] = None) -> LazyFrame | None: ...

     # Evaluate the query in streaming mode and write to an NDJSON file.
-    def sink_ndjson(self, path: str | Path
+    def sink_ndjson(self, path: str | Path | IO[bytes] | IO[str] | PartitioningScheme, maintain_order: bool = True, storage_options: dict[str, Any] | None = None, credential_provider: CredentialProviderFunction | Literal['auto'] | None = 'auto', retries: int = 2, sync_on_close: SyncOnCloseMethod | None = None, mkdir: bool = False, lazy: bool = False, engine: EngineType = 'auto', optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS, description: Optional[str] = None) -> LazyFrame | None: ...

     # Evaluate the query in streaming mode and write to a Parquet file.
-    def sink_parquet(self, path: str | Path, compression: str='zstd', compression_level: int | None=None, statistics: bool | str | dict[str, bool]=True, row_group_size: int | None=None, data_page_size: int | None=None, maintain_order: bool
+    def sink_parquet(self, path: str | Path | IO[bytes] | PartitioningScheme, compression: str = 'zstd', compression_level: int | None = None, statistics: bool | str | dict[str, bool] = True, row_group_size: int | None = None, data_page_size: int | None = None, maintain_order: bool = True, storage_options: dict[str, Any] | None = None, credential_provider: CredentialProviderFunction | Literal['auto'] | None = 'auto', retries: int = 2, sync_on_close: SyncOnCloseMethod | None = None, metadata: ParquetMetadata | None = None, mkdir: bool = False, lazy: bool = False, field_overwrites: ParquetFieldOverwrites | Sequence[ParquetFieldOverwrites] | Mapping[str, ParquetFieldOverwrites] | None = None, engine: EngineType = 'auto', optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS, description: Optional[str] = None) -> LazyFrame | None: ...

     # Get a slice of this DataFrame.
-    def slice(self, offset: int, length: int | None=None, description: Optional[str] = None) ->
+    def slice(self, offset: int, length: int | None = None, description: Optional[str] = None) -> 'FlowFrame': ...

     # Sort the dataframe by the given columns.
-    def sort(self, by: typing.Union[typing.List[typing.Union[flowfile_frame.expr.Expr, str]], flowfile_frame.expr.Expr, str], *more_by, descending: typing.Union[bool, typing.List[bool]]=False, nulls_last: typing.Union[bool, typing.List[bool]]=False, multithreaded: bool=True, maintain_order: bool=False, description: typing.Optional[str]=None) -> 'FlowFrame': ...
+    def sort(self, by: typing.Union[typing.List[typing.Union[flowfile_frame.expr.Expr, str]], flowfile_frame.expr.Expr, str], *more_by, descending: typing.Union[bool, typing.List[bool]] = False, nulls_last: typing.Union[bool, typing.List[bool]] = False, multithreaded: bool = True, maintain_order: bool = False, description: typing.Optional[str] = None) -> 'FlowFrame': ...

     # Execute a SQL query against the LazyFrame.
-    def sql(self, query: str, table_name: str='self', description: Optional[str] = None) ->
+    def sql(self, query: str, table_name: str = 'self', description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their standard deviation value.
-    def std(self, ddof: int=1, description: Optional[str] = None) ->
+    def std(self, ddof: int = 1, description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their sum value.
-    def sum(self, description: Optional[str] = None) ->
+    def sum(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Get the last `n` rows.
-    def tail(self, n: int=5, description: Optional[str] = None) ->
+    def tail(self, n: int = 5, description: Optional[str] = None) -> 'FlowFrame': ...

     # Split text in a column into multiple rows.
-    def text_to_rows(self, column: str | flowfile_frame.expr.Column, output_column: str=None, delimiter: str=None, split_by_column: str=None, description: str=None) -> 'FlowFrame': ...
+    def text_to_rows(self, column: str | flowfile_frame.expr.Column, output_column: str = None, delimiter: str = None, split_by_column: str = None, description: str = None) -> 'FlowFrame': ...

     # Get the underlying ETL graph.
     def to_graph(self, description: Optional[str] = None) -> Any: ...

     # Return the `k` largest rows.
-    def top_k(self, k: int, by: IntoExpr | Iterable[IntoExpr], reverse: bool | Sequence[bool]=False, description: Optional[str] = None) ->
+    def top_k(self, k: int, by: IntoExpr | Iterable[IntoExpr], reverse: bool | Sequence[bool] = False, description: Optional[str] = None) -> 'FlowFrame': ...

     # Drop duplicate rows from this dataframe.
-    def unique(self, subset: typing.Union[str, ForwardRef('Expr'), typing.List[typing.Union[ForwardRef('Expr'), str]]]=None, keep: typing.Literal['first', 'last', 'any', 'none']='any', maintain_order: bool=False, description: str=None) -> 'FlowFrame': ...
+    def unique(self, subset: typing.Union[str, ForwardRef('Expr'), typing.List[typing.Union[ForwardRef('Expr'), str]]] = None, keep: typing.Literal['first', 'last', 'any', 'none'] = 'any', maintain_order: bool = False, description: str = None) -> 'FlowFrame': ...

     # Decompose struct columns into separate columns for each of their fields.
-    def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns, description: Optional[str] = None) ->
+    def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns, description: Optional[str] = None) -> 'FlowFrame': ...

     # Unpivot a DataFrame from wide to long format.
-    def unpivot(self, on: list[str | flowfile_frame.selectors.Selector] | str | None | flowfile_frame.selectors.Selector=None, index: list[str] | str | None=None, variable_name: str='variable', value_name: str='value', description: str=None) -> 'FlowFrame': ...
+    def unpivot(self, on: list[str | flowfile_frame.selectors.Selector] | str | None | flowfile_frame.selectors.Selector = None, index: list[str] | str | None = None, variable_name: str = 'variable', value_name: str = 'value', description: str = None) -> 'FlowFrame': ...

     # Update the values in this `LazyFrame` with the values in `other`.
-    def update(self, other:
+    def update(self, other: FlowFrame, on: str | Sequence[str] | None = None, how: Literal['left', 'inner', 'full'] = 'left', left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, include_nulls: bool = False, maintain_order: MaintainOrderJoin | None = 'left', description: Optional[str] = None) -> 'FlowFrame': ...

     # Aggregate the columns in the LazyFrame to their variance value.
-    def var(self, ddof: int=1, description: Optional[str] = None) ->
+    def var(self, ddof: int = 1, description: Optional[str] = None) -> 'FlowFrame': ...

     # Get the number of columns.
     @property
@@ -351,18 +385,30 @@ class FlowFrame:
     def with_columns(self, *exprs: Union[Expr, Iterable[Expr], Any], flowfile_formulas: Optional[List[str]] = None, output_column_names: Optional[List[str]] = None, description: Optional[str] = None, **named_exprs: Union[Expr, Any]) -> 'FlowFrame': ...

     # Add columns to this LazyFrame.
-    def with_columns_seq(self, *exprs, description: Optional[str] = None, **named_exprs) ->
+    def with_columns_seq(self, *exprs, description: Optional[str] = None, **named_exprs) -> 'FlowFrame': ...

     # Add an external context to the computation graph.
-    def with_context(self, other: Self | list[Self], description: Optional[str] = None) ->
+    def with_context(self, other: Self | list[Self], description: Optional[str] = None) -> 'FlowFrame': ...

     # Add a column at index 0 that counts the rows.
-    def with_row_count(self, name: str='row_nr', offset: int=0, description: Optional[str] = None) ->
+    def with_row_count(self, name: str = 'row_nr', offset: int = 0, description: Optional[str] = None) -> 'FlowFrame': ...

     # Add a row index as the first column in the DataFrame.
-    def with_row_index(self, name: str='index', offset: int=0, description: str=None) -> 'FlowFrame': ...
+    def with_row_index(self, name: str = 'index', offset: int = 0, description: str = None) -> 'FlowFrame': ...
+
+    def write_csv(self, file: str | os.PathLike, separator: str = ',', encoding: str = 'utf-8', convert_to_absolute_path: bool = True, description: str = None, **kwargs) -> 'FlowFrame': ...

-
+    # Write the data frame to cloud storage in CSV format.
+    def write_csv_to_cloud_storage(self, path: str, connection_name: typing.Optional[str] = None, delimiter: str = ';', encoding: typing.Literal['utf8', 'utf8-lossy'] = 'utf8', description: typing.Optional[str] = None) -> 'FlowFrame': ...
+
+    # Write the data frame to cloud storage in Delta Lake format.
+    def write_delta(self, path: str, connection_name: typing.Optional[str] = None, write_mode: typing.Literal['overwrite', 'append'] = 'overwrite', description: typing.Optional[str] = None) -> 'FlowFrame': ...
+
+    # Write the data frame to cloud storage in JSON format.
+    def write_json_to_cloud_storage(self, path: str, connection_name: typing.Optional[str] = None, description: typing.Optional[str] = None) -> 'FlowFrame': ...

     # Write the data to a Parquet file. Creates a standard Output node if only
-    def write_parquet(self, path: str | os.PathLike, convert_to_absolute_path: bool=True, description: str=None, **kwargs) -> 'FlowFrame': ...
+    def write_parquet(self, path: str | os.PathLike, convert_to_absolute_path: bool = True, description: str = None, **kwargs) -> 'FlowFrame': ...
+
+    # Write the data frame to cloud storage in Parquet format.
+    def write_parquet_to_cloud_storage(self, path: str, connection_name: typing.Optional[str] = None, compression: typing.Literal['snappy', 'gzip', 'brotli', 'lz4', 'zstd'] = 'snappy', description: typing.Optional[str] = None) -> 'FlowFrame': ...
```