Flowfile 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +619 -191
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +500 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +232 -110
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
|
@@ -8,6 +8,7 @@ from typing import NamedTuple
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def get_func_type_mapping(func: str):
|
|
11
|
+
"""Infers the output data type of common aggregation functions."""
|
|
11
12
|
if func in ["mean", "avg", "median", "std", "var"]:
|
|
12
13
|
return "Float64"
|
|
13
14
|
elif func in ['min', 'max', 'first', 'last', "cumsum", "sum"]:
|
|
@@ -19,6 +20,7 @@ def get_func_type_mapping(func: str):
|
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
def string_concat(*column: str):
|
|
23
|
+
"""A simple wrapper to concatenate string columns in Polars."""
|
|
22
24
|
return pl.col(column).cast(pl.Utf8).str.concat(delimiter=',')
|
|
23
25
|
|
|
24
26
|
|
|
@@ -28,27 +30,35 @@ FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'da
|
|
|
28
30
|
|
|
29
31
|
|
|
30
32
|
def construct_join_key_name(side: SideLit, column_name: str) -> str:
|
|
33
|
+
"""Creates a temporary, unique name for a join key column."""
|
|
31
34
|
return "_FLOWFILE_JOIN_KEY_" + side.upper() + "_" + column_name
|
|
32
35
|
|
|
33
36
|
|
|
34
37
|
class JoinKeyRename(NamedTuple):
|
|
38
|
+
"""Represents the renaming of a join key from its original to a temporary name."""
|
|
35
39
|
original_name: str
|
|
36
40
|
temp_name: str
|
|
37
41
|
|
|
38
42
|
|
|
39
43
|
class JoinKeyRenameResponse(NamedTuple):
|
|
44
|
+
"""Contains a list of join key renames for one side of a join."""
|
|
40
45
|
side: SideLit
|
|
41
46
|
join_key_renames: List[JoinKeyRename]
|
|
42
47
|
|
|
43
48
|
|
|
44
49
|
class FullJoinKeyResponse(NamedTuple):
|
|
50
|
+
"""Holds the join key rename responses for both sides of a join."""
|
|
45
51
|
left: JoinKeyRenameResponse
|
|
46
52
|
right: JoinKeyRenameResponse
|
|
47
53
|
|
|
48
54
|
|
|
49
55
|
@dataclass
|
|
50
56
|
class SelectInput:
|
|
51
|
-
|
|
57
|
+
"""Defines how a single column should be selected, renamed, or type-cast.
|
|
58
|
+
|
|
59
|
+
This is a core building block for any operation that involves column manipulation.
|
|
60
|
+
It holds all the configuration for a single field in a selection operation.
|
|
61
|
+
"""
|
|
52
62
|
old_name: str
|
|
53
63
|
original_position: Optional[int] = None
|
|
54
64
|
new_name: Optional[str] = None
|
|
@@ -80,6 +90,7 @@ class SelectInput:
|
|
|
80
90
|
|
|
81
91
|
@property
|
|
82
92
|
def polars_type(self) -> str:
|
|
93
|
+
"""Translates a user-friendly type name to a Polars data type string."""
|
|
83
94
|
if self.data_type.lower() == 'string':
|
|
84
95
|
return 'Utf8'
|
|
85
96
|
elif self.data_type.lower() == 'integer':
|
|
@@ -91,7 +102,7 @@ class SelectInput:
|
|
|
91
102
|
|
|
92
103
|
@dataclass
|
|
93
104
|
class FieldInput:
|
|
94
|
-
|
|
105
|
+
"""Represents a single field with its name and data type, typically for defining an output column."""
|
|
95
106
|
name: str
|
|
96
107
|
data_type: Optional[str] = None
|
|
97
108
|
|
|
@@ -102,19 +113,22 @@ class FieldInput:
|
|
|
102
113
|
|
|
103
114
|
@dataclass
|
|
104
115
|
class FunctionInput:
|
|
116
|
+
"""Defines a formula to be applied, including the output field information."""
|
|
105
117
|
field: FieldInput
|
|
106
118
|
function: str
|
|
107
119
|
|
|
108
120
|
|
|
109
121
|
@dataclass
|
|
110
122
|
class BasicFilter:
|
|
123
|
+
"""Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value')."""
|
|
111
124
|
field: str = ''
|
|
112
|
-
filter_type: str = ''
|
|
125
|
+
filter_type: str = ''
|
|
113
126
|
filter_value: str = ''
|
|
114
127
|
|
|
115
128
|
|
|
116
129
|
@dataclass
|
|
117
130
|
class FilterInput:
|
|
131
|
+
"""Defines the settings for a filter operation, supporting basic or advanced (expression-based) modes."""
|
|
118
132
|
advanced_filter: str = ''
|
|
119
133
|
basic_filter: BasicFilter = None
|
|
120
134
|
filter_type: str = 'basic'
|
|
@@ -122,43 +136,54 @@ class FilterInput:
|
|
|
122
136
|
|
|
123
137
|
@dataclass
|
|
124
138
|
class SelectInputs:
|
|
139
|
+
"""A container for a list of `SelectInput` objects, providing helper methods for managing selections."""
|
|
125
140
|
renames: List[SelectInput]
|
|
126
141
|
|
|
127
142
|
@property
|
|
128
143
|
def old_cols(self) -> Set:
|
|
144
|
+
"""Returns a set of original column names to be kept in the selection."""
|
|
129
145
|
return set(v.old_name for v in self.renames if v.keep)
|
|
130
146
|
|
|
131
147
|
@property
|
|
132
148
|
def new_cols(self) -> Set:
|
|
149
|
+
"""Returns a set of new (renamed) column names to be kept in the selection."""
|
|
133
150
|
return set(v.new_name for v in self.renames if v.keep)
|
|
134
151
|
|
|
135
152
|
@property
|
|
136
153
|
def rename_table(self):
|
|
137
|
-
|
|
154
|
+
"""Generates a dictionary for use in Polars' `.rename()` method."""
|
|
155
|
+
return {v.old_name: v.new_name for v in self.renames if v.is_available and (v.keep or v.join_key)}
|
|
138
156
|
|
|
139
157
|
def get_select_cols(self, include_join_key: bool = True):
|
|
158
|
+
"""Gets a list of original column names to select from the source DataFrame."""
|
|
140
159
|
return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
|
|
141
160
|
|
|
142
|
-
def __add__(self, other: SelectInput):
|
|
161
|
+
def __add__(self, other: "SelectInput"):
|
|
162
|
+
"""Allows adding a SelectInput using the '+' operator."""
|
|
143
163
|
self.renames.append(other)
|
|
144
164
|
|
|
145
|
-
def append(self, other: SelectInput):
|
|
165
|
+
def append(self, other: "SelectInput"):
|
|
166
|
+
"""Appends a new SelectInput to the list of renames."""
|
|
146
167
|
self.renames.append(other)
|
|
147
168
|
|
|
148
169
|
def remove_select_input(self, old_key: str):
|
|
170
|
+
"""Removes a SelectInput from the list based on its original name."""
|
|
149
171
|
self.renames = [rename for rename in self.renames if rename.old_name != old_key]
|
|
150
172
|
|
|
151
173
|
def unselect_field(self, old_key: str):
|
|
174
|
+
"""Marks a field to be dropped from the final selection by setting `keep` to False."""
|
|
152
175
|
for rename in self.renames:
|
|
153
176
|
if old_key == rename.old_name:
|
|
154
177
|
rename.keep = False
|
|
155
178
|
|
|
156
179
|
@classmethod
|
|
157
|
-
def create_from_list(cls, col_list: str):
|
|
180
|
+
def create_from_list(cls, col_list: List[str]):
|
|
181
|
+
"""Creates a SelectInputs object from a simple list of column names."""
|
|
158
182
|
return cls([SelectInput(c) for c in col_list])
|
|
159
183
|
|
|
160
184
|
@classmethod
|
|
161
185
|
def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame):
|
|
186
|
+
"""Creates a SelectInputs object from a Polars DataFrame's columns."""
|
|
162
187
|
return cls([SelectInput(c) for c in df.columns])
|
|
163
188
|
|
|
164
189
|
def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
|
|
@@ -169,15 +194,18 @@ class SelectInputs:
|
|
|
169
194
|
|
|
170
195
|
|
|
171
196
|
class JoinInputs(SelectInputs):
|
|
197
|
+
"""Extends `SelectInputs` with functionality specific to join operations, like handling join keys."""
|
|
172
198
|
|
|
173
199
|
def __init__(self, renames: List[SelectInput]):
|
|
174
200
|
self.renames = renames
|
|
175
201
|
|
|
176
202
|
@property
|
|
177
203
|
def join_key_selects(self) -> List[SelectInput]:
|
|
204
|
+
"""Returns only the `SelectInput` objects that are marked as join keys."""
|
|
178
205
|
return [v for v in self.renames if v.join_key]
|
|
179
206
|
|
|
180
207
|
def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
|
|
208
|
+
"""Gets the temporary rename mapping for all join keys on one side of a join."""
|
|
181
209
|
return JoinKeyRenameResponse(
|
|
182
210
|
side,
|
|
183
211
|
[JoinKeyRename(jk.new_name,
|
|
@@ -186,19 +214,20 @@ class JoinInputs(SelectInputs):
|
|
|
186
214
|
)
|
|
187
215
|
|
|
188
216
|
def get_join_key_rename_mapping(self, side: SideLit) -> Dict[str, str]:
|
|
217
|
+
"""Returns a dictionary mapping original join key names to their temporary names."""
|
|
189
218
|
return {jkr[0]: jkr[1] for jkr in self.get_join_key_renames(side)[1]}
|
|
190
219
|
|
|
191
220
|
|
|
192
221
|
@dataclass
|
|
193
222
|
class JoinMap:
|
|
194
|
-
|
|
223
|
+
"""Defines a single mapping between a left and right column for a join key."""
|
|
195
224
|
left_col: str
|
|
196
225
|
right_col: str
|
|
197
226
|
|
|
198
227
|
|
|
199
|
-
|
|
200
228
|
@dataclass
|
|
201
229
|
class FuzzyMap(JoinMap):
|
|
230
|
+
"""Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
|
|
202
231
|
threshold_score: Optional[float] = 80.0
|
|
203
232
|
fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
|
|
204
233
|
perc_unique: Optional[float] = 0.0
|
|
@@ -223,12 +252,13 @@ class FuzzyMap(JoinMap):
|
|
|
223
252
|
|
|
224
253
|
|
|
225
254
|
class JoinSelectMixin:
|
|
226
|
-
"""
|
|
255
|
+
"""A mixin providing common methods for join-like operations that involve left and right inputs."""
|
|
227
256
|
left_select: JoinInputs = None
|
|
228
257
|
right_select: JoinInputs = None
|
|
229
258
|
|
|
230
259
|
@staticmethod
|
|
231
260
|
def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> JoinInputs | None:
|
|
261
|
+
"""Parses various input formats into a standardized `JoinInputs` object."""
|
|
232
262
|
if all(isinstance(c, SelectInput) for c in select):
|
|
233
263
|
return JoinInputs(select)
|
|
234
264
|
elif all(isinstance(c, dict) for c in select):
|
|
@@ -241,6 +271,7 @@ class JoinSelectMixin:
|
|
|
241
271
|
return JoinInputs([SelectInput(s, s) for s in select])
|
|
242
272
|
|
|
243
273
|
def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
|
|
274
|
+
"""Generates a new, non-conflicting column name by adding a suffix if necessary."""
|
|
244
275
|
current_names = self.left_select.new_cols & self.right_select.new_cols
|
|
245
276
|
if old_col_name not in current_names:
|
|
246
277
|
return old_col_name
|
|
@@ -250,6 +281,7 @@ class JoinSelectMixin:
|
|
|
250
281
|
old_col_name = f'{side}_{old_col_name}'
|
|
251
282
|
|
|
252
283
|
def add_new_select_column(self, select_input: SelectInput, side: str):
|
|
284
|
+
"""Adds a new column to the selection for either the left or right side."""
|
|
253
285
|
selects = self.right_select if side == 'right' else self.left_select
|
|
254
286
|
select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
|
|
255
287
|
selects.__add__(select_input)
|
|
@@ -257,19 +289,23 @@ class JoinSelectMixin:
|
|
|
257
289
|
|
|
258
290
|
@dataclass
|
|
259
291
|
class CrossJoinInput(JoinSelectMixin):
|
|
292
|
+
"""Defines the settings for a cross join operation, including column selections for both inputs."""
|
|
260
293
|
left_select: SelectInputs = None
|
|
261
294
|
right_select: SelectInputs = None
|
|
262
295
|
|
|
263
296
|
def __init__(self, left_select: List[SelectInput] | List[str],
|
|
264
297
|
right_select: List[SelectInput] | List[str]):
|
|
298
|
+
"""Initializes the CrossJoinInput with selections for left and right tables."""
|
|
265
299
|
self.left_select = self.parse_select(left_select)
|
|
266
300
|
self.right_select = self.parse_select(right_select)
|
|
267
301
|
|
|
268
302
|
@property
|
|
269
303
|
def overlapping_records(self):
|
|
304
|
+
"""Finds column names that would conflict after the join."""
|
|
270
305
|
return self.left_select.new_cols & self.right_select.new_cols
|
|
271
306
|
|
|
272
307
|
def auto_rename(self):
|
|
308
|
+
"""Automatically renames columns on the right side to prevent naming conflicts."""
|
|
273
309
|
overlapping_records = self.overlapping_records
|
|
274
310
|
while len(overlapping_records) > 0:
|
|
275
311
|
for right_col in self.right_select.renames:
|
|
@@ -280,13 +316,15 @@ class CrossJoinInput(JoinSelectMixin):
|
|
|
280
316
|
|
|
281
317
|
@dataclass
|
|
282
318
|
class JoinInput(JoinSelectMixin):
|
|
319
|
+
"""Defines the settings for a standard SQL-style join, including keys, strategy, and selections."""
|
|
283
320
|
join_mapping: List[JoinMap]
|
|
284
321
|
left_select: JoinInputs = None
|
|
285
322
|
right_select: JoinInputs = None
|
|
286
323
|
how: JoinStrategy = 'inner'
|
|
287
324
|
|
|
288
325
|
@staticmethod
|
|
289
|
-
def parse_join_mapping(join_mapping:
|
|
326
|
+
def parse_join_mapping(join_mapping: any) -> List[JoinMap]:
|
|
327
|
+
"""Parses various input formats for join keys into a standardized list of `JoinMap` objects."""
|
|
290
328
|
if isinstance(join_mapping, (tuple, list)):
|
|
291
329
|
assert len(join_mapping) > 0
|
|
292
330
|
if all(isinstance(jm, dict) for jm in join_mapping):
|
|
@@ -309,6 +347,7 @@ class JoinInput(JoinSelectMixin):
|
|
|
309
347
|
left_select: List[SelectInput] | List[str],
|
|
310
348
|
right_select: List[SelectInput] | List[str],
|
|
311
349
|
how: JoinStrategy = 'inner'):
|
|
350
|
+
"""Initializes the JoinInput with keys, selections, and join strategy."""
|
|
312
351
|
self.join_mapping = self.parse_join_mapping(join_mapping)
|
|
313
352
|
self.left_select = self.parse_select(left_select)
|
|
314
353
|
self.right_select = self.parse_select(right_select)
|
|
@@ -316,10 +355,12 @@ class JoinInput(JoinSelectMixin):
|
|
|
316
355
|
self.how = how
|
|
317
356
|
|
|
318
357
|
def set_join_keys(self):
|
|
358
|
+
"""Marks the `SelectInput` objects corresponding to join keys."""
|
|
319
359
|
[setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
|
|
320
360
|
[setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
|
|
321
361
|
|
|
322
362
|
def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
|
|
363
|
+
"""Gets the temporary rename mappings for the join keys on both sides."""
|
|
323
364
|
return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
|
|
324
365
|
self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
|
|
325
366
|
|
|
@@ -335,18 +376,22 @@ class JoinInput(JoinSelectMixin):
|
|
|
335
376
|
|
|
336
377
|
@property
|
|
337
378
|
def _left_join_keys(self) -> Set:
|
|
379
|
+
"""Returns a set of the left-side join key column names."""
|
|
338
380
|
return set(jm.left_col for jm in self.join_mapping)
|
|
339
381
|
|
|
340
382
|
@property
|
|
341
383
|
def _right_join_keys(self) -> Set:
|
|
384
|
+
"""Returns a set of the right-side join key column names."""
|
|
342
385
|
return set(jm.right_col for jm in self.join_mapping)
|
|
343
386
|
|
|
344
387
|
@property
|
|
345
|
-
def left_join_keys(self) -> List:
|
|
388
|
+
def left_join_keys(self) -> List[str]:
|
|
389
|
+
"""Returns an ordered list of the left-side join key column names to be used in the join."""
|
|
346
390
|
return [jm.left_col for jm in self.used_join_mapping]
|
|
347
391
|
|
|
348
392
|
@property
|
|
349
|
-
def right_join_keys(self) -> List:
|
|
393
|
+
def right_join_keys(self) -> List[str]:
|
|
394
|
+
"""Returns an ordered list of the right-side join key column names to be used in the join."""
|
|
350
395
|
return [jm.right_col for jm in self.used_join_mapping]
|
|
351
396
|
|
|
352
397
|
@property
|
|
@@ -357,6 +402,7 @@ class JoinInput(JoinSelectMixin):
|
|
|
357
402
|
return self.left_select.new_cols & self.right_select.new_cols
|
|
358
403
|
|
|
359
404
|
def auto_rename(self):
|
|
405
|
+
"""Automatically renames columns on the right side to prevent naming conflicts."""
|
|
360
406
|
self.set_join_keys()
|
|
361
407
|
overlapping_records = self.overlapping_records
|
|
362
408
|
while len(overlapping_records) > 0:
|
|
@@ -366,7 +412,8 @@ class JoinInput(JoinSelectMixin):
|
|
|
366
412
|
overlapping_records = self.overlapping_records
|
|
367
413
|
|
|
368
414
|
@property
|
|
369
|
-
def used_join_mapping(self):
|
|
415
|
+
def used_join_mapping(self) -> List[JoinMap]:
|
|
416
|
+
"""Returns the final join mapping after applying all renames and transformations."""
|
|
370
417
|
new_mappings: List[JoinMap] = []
|
|
371
418
|
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
372
419
|
left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
|
|
@@ -382,6 +429,7 @@ class JoinInput(JoinSelectMixin):
|
|
|
382
429
|
|
|
383
430
|
@dataclass
|
|
384
431
|
class FuzzyMatchInput(JoinInput):
|
|
432
|
+
"""Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
|
|
385
433
|
join_mapping: List[FuzzyMap]
|
|
386
434
|
aggregate_output: bool = False
|
|
387
435
|
|
|
@@ -429,6 +477,7 @@ class FuzzyMatchInput(JoinInput):
|
|
|
429
477
|
|
|
430
478
|
@property
|
|
431
479
|
def fuzzy_maps(self) -> List[FuzzyMap]:
|
|
480
|
+
"""Returns the final fuzzy mappings after applying all column renames."""
|
|
432
481
|
new_mappings = []
|
|
433
482
|
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
434
483
|
for org_fuzzy_map in self.join_mapping:
|
|
@@ -480,6 +529,7 @@ class AggColl:
|
|
|
480
529
|
output_type: Optional[str] = None
|
|
481
530
|
|
|
482
531
|
def __init__(self, old_name: str, agg: str, new_name: str = None, output_type: str = None):
|
|
532
|
+
"""Initializes an aggregation column with its source, function, and new name."""
|
|
483
533
|
self.old_name = str(old_name)
|
|
484
534
|
if agg != 'groupby':
|
|
485
535
|
self.new_name = new_name if new_name is not None else self.old_name + "_" + agg
|
|
@@ -490,6 +540,7 @@ class AggColl:
|
|
|
490
540
|
|
|
491
541
|
@property
|
|
492
542
|
def agg_func(self):
|
|
543
|
+
"""Returns the corresponding Polars aggregation function from the `agg` string."""
|
|
493
544
|
if self.agg == 'groupby':
|
|
494
545
|
return self.agg
|
|
495
546
|
elif self.agg == 'concat':
|
|
@@ -524,6 +575,7 @@ class GroupByInput:
|
|
|
524
575
|
|
|
525
576
|
@dataclass
|
|
526
577
|
class PivotInput:
|
|
578
|
+
"""Defines the settings for a pivot (long-to-wide) operation."""
|
|
527
579
|
index_columns: List[str]
|
|
528
580
|
pivot_column: str
|
|
529
581
|
value_col: str
|
|
@@ -531,9 +583,11 @@ class PivotInput:
|
|
|
531
583
|
|
|
532
584
|
@property
|
|
533
585
|
def grouped_columns(self) -> List[str]:
|
|
586
|
+
"""Returns the list of columns to be used for the initial grouping stage of the pivot."""
|
|
534
587
|
return self.index_columns + [self.pivot_column]
|
|
535
588
|
|
|
536
589
|
def get_group_by_input(self) -> GroupByInput:
|
|
590
|
+
"""Constructs the `GroupByInput` needed for the pre-aggregation step of the pivot."""
|
|
537
591
|
group_by_cols = [AggColl(c, 'groupby') for c in self.grouped_columns]
|
|
538
592
|
agg_cols = [AggColl(self.value_col, agg=aggregation, new_name=aggregation) for aggregation in self.aggregations]
|
|
539
593
|
return GroupByInput(group_by_cols+agg_cols)
|
|
@@ -541,21 +595,25 @@ class PivotInput:
|
|
|
541
595
|
def get_index_columns(self) -> List[pl.col]:
|
|
542
596
|
return [pl.col(c) for c in self.index_columns]
|
|
543
597
|
|
|
544
|
-
def get_pivot_column(self) -> pl.
|
|
598
|
+
def get_pivot_column(self) -> pl.Expr:
|
|
599
|
+
"""Returns the pivot column as a Polars column expression."""
|
|
545
600
|
return pl.col(self.pivot_column)
|
|
546
601
|
|
|
547
602
|
def get_values_expr(self) -> pl.Expr:
|
|
603
|
+
"""Creates the struct expression used to gather the values for pivoting."""
|
|
548
604
|
return pl.struct([pl.col(c) for c in self.aggregations]).alias('vals')
|
|
549
605
|
|
|
550
606
|
|
|
551
607
|
@dataclass
|
|
552
608
|
class SortByInput:
|
|
609
|
+
"""Defines a single sort condition on a column, including the direction."""
|
|
553
610
|
column: str
|
|
554
611
|
how: str = 'asc'
|
|
555
612
|
|
|
556
613
|
|
|
557
614
|
@dataclass
|
|
558
615
|
class RecordIdInput:
|
|
616
|
+
"""Defines settings for adding a record ID (row number) column to the data."""
|
|
559
617
|
output_column_name: str = 'record_id'
|
|
560
618
|
offset: int = 1
|
|
561
619
|
group_by: Optional[bool] = False
|
|
@@ -564,6 +622,7 @@ class RecordIdInput:
|
|
|
564
622
|
|
|
565
623
|
@dataclass
|
|
566
624
|
class TextToRowsInput:
|
|
625
|
+
"""Defines settings for splitting a text column into multiple rows based on a delimiter."""
|
|
567
626
|
column_to_split: str
|
|
568
627
|
output_column_name: Optional[str] = None
|
|
569
628
|
split_by_fixed_value: Optional[bool] = True
|
|
@@ -573,12 +632,14 @@ class TextToRowsInput:
|
|
|
573
632
|
|
|
574
633
|
@dataclass
|
|
575
634
|
class UnpivotInput:
|
|
635
|
+
"""Defines settings for an unpivot (wide-to-long) operation."""
|
|
576
636
|
index_columns: Optional[List[str]] = field(default_factory=list)
|
|
577
637
|
value_columns: Optional[List[str]] = field(default_factory=list)
|
|
578
638
|
data_type_selector: Optional[Literal['float', 'all', 'date', 'numeric', 'string']] = None
|
|
579
639
|
data_type_selector_mode: Optional[Literal['data_type', 'column']] = 'column'
|
|
580
640
|
|
|
581
641
|
def __post_init__(self):
|
|
642
|
+
"""Ensures that list attributes are initialized correctly if they are None."""
|
|
582
643
|
if self.index_columns is None:
|
|
583
644
|
self.index_columns = []
|
|
584
645
|
if self.value_columns is None:
|
|
@@ -587,7 +648,8 @@ class UnpivotInput:
|
|
|
587
648
|
self.data_type_selector_mode = 'column'
|
|
588
649
|
|
|
589
650
|
@property
|
|
590
|
-
def data_type_selector_expr(self) -> Callable:
|
|
651
|
+
def data_type_selector_expr(self) -> Optional[Callable]:
|
|
652
|
+
"""Returns a Polars selector function based on the `data_type_selector` string."""
|
|
591
653
|
if self.data_type_selector_mode == 'data_type':
|
|
592
654
|
if self.data_type_selector is not None:
|
|
593
655
|
try:
|
|
@@ -600,17 +662,20 @@ class UnpivotInput:
|
|
|
600
662
|
|
|
601
663
|
@dataclass
|
|
602
664
|
class UnionInput:
|
|
665
|
+
"""Defines settings for a union (concatenation) operation."""
|
|
603
666
|
mode: Literal['selective', 'relaxed'] = 'relaxed'
|
|
604
667
|
|
|
605
668
|
|
|
606
669
|
@dataclass
|
|
607
670
|
class UniqueInput:
|
|
671
|
+
"""Defines settings for a uniqueness operation, specifying columns and which row to keep."""
|
|
608
672
|
columns: Optional[List[str]] = None
|
|
609
673
|
strategy: Literal["first", "last", "any", "none"] = "any"
|
|
610
674
|
|
|
611
675
|
|
|
612
676
|
@dataclass
|
|
613
677
|
class GraphSolverInput:
|
|
678
|
+
"""Defines settings for a graph-solving operation (e.g., finding connected components)."""
|
|
614
679
|
col_from: str
|
|
615
680
|
col_to: str
|
|
616
681
|
output_column_name: Optional[str] = 'graph_group'
|
|
@@ -618,5 +683,5 @@ class GraphSolverInput:
|
|
|
618
683
|
|
|
619
684
|
@dataclass
|
|
620
685
|
class PolarsCodeInput:
|
|
686
|
+
"""A simple container for a string of user-provided Polars code to be executed."""
|
|
621
687
|
polars_code: str
|
|
622
|
-
|
flowfile_frame/__init__.py
CHANGED
|
@@ -31,7 +31,15 @@ from flowfile_frame.series import Series
|
|
|
31
31
|
|
|
32
32
|
# File I/O
|
|
33
33
|
from flowfile_frame.flow_frame_methods import ( # noqa: F401
|
|
34
|
-
read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet
|
|
34
|
+
read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet, scan_json_from_cloud_storage,
|
|
35
|
+
scan_parquet_from_cloud_storage,
|
|
36
|
+
scan_csv_from_cloud_storage,
|
|
37
|
+
scan_delta)
|
|
38
|
+
|
|
39
|
+
from flowfile_frame.cloud_storage.secret_manager import (del_cloud_storage_connection,
|
|
40
|
+
create_cloud_storage_connection,
|
|
41
|
+
get_all_available_cloud_storage_connections,
|
|
42
|
+
create_cloud_storage_connection_if_not_exists)
|
|
35
43
|
|
|
36
44
|
from polars.datatypes import ( # noqa: F401
|
|
37
45
|
# Integer types
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Optional, Literal
|
|
2
|
+
|
|
3
|
+
from polars._typing import (CsvEncoding)
|
|
4
|
+
|
|
5
|
+
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
6
|
+
from flowfile_core.schemas import input_schema, cloud_storage_schemas
|
|
7
|
+
from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
|
|
8
|
+
from flowfile_frame.utils import generate_node_id
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def add_write_ff_to_cloud_storage(path: str,
|
|
12
|
+
flow_graph: Optional[FlowGraph],
|
|
13
|
+
depends_on_node_id: int,
|
|
14
|
+
*,
|
|
15
|
+
connection_name: Optional[str] = None,
|
|
16
|
+
write_mode: Literal["overwrite", "append"] = "overwrite",
|
|
17
|
+
file_format: Literal["csv", "parquet", "json", "delta"] = "parquet",
|
|
18
|
+
csv_delimiter: str = ";",
|
|
19
|
+
csv_encoding: CsvEncoding = "utf8",
|
|
20
|
+
parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
|
|
21
|
+
description: Optional[str] = None
|
|
22
|
+
) -> int:
|
|
23
|
+
node_id = generate_node_id()
|
|
24
|
+
flow_id = flow_graph.flow_id
|
|
25
|
+
settings = input_schema.NodeCloudStorageWriter(
|
|
26
|
+
flow_id=flow_id,
|
|
27
|
+
node_id=node_id,
|
|
28
|
+
cloud_storage_settings=cloud_storage_schemas.CloudStorageWriteSettings(resource_path=path,
|
|
29
|
+
connection_name=connection_name,
|
|
30
|
+
file_format=file_format,
|
|
31
|
+
write_mode=write_mode,
|
|
32
|
+
csv_delimiter=csv_delimiter,
|
|
33
|
+
csv_encoding=csv_encoding,
|
|
34
|
+
parquet_compression=parquet_compression),
|
|
35
|
+
user_id=get_current_user_id(),
|
|
36
|
+
depending_on_id=depends_on_node_id,
|
|
37
|
+
description=description)
|
|
38
|
+
flow_graph.add_cloud_storage_writer(settings)
|
|
39
|
+
return node_id
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from flowfile_core.schemas.cloud_storage_schemas import FullCloudStorageConnection, FullCloudStorageConnectionInterface
|
|
4
|
+
from flowfile_core.flowfile.database_connection_manager.db_connections import (store_cloud_connection,
|
|
5
|
+
get_all_cloud_connections_interface,
|
|
6
|
+
delete_cloud_connection)
|
|
7
|
+
from flowfile_core.database.connection import get_db_context
|
|
8
|
+
from flowfile_core.auth.jwt import get_current_user_sync, create_access_token
|
|
9
|
+
from asyncio import run
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_current_user_id() -> int | None:
    """Return the id of the built-in local user.

    Mints a short-lived access token for the "local_user" subject and
    resolves it to a user record via the synchronous auth helper.
    """
    token = create_access_token(data={"sub": "local_user"})
    with get_db_context() as db:
        # Read the id while the DB session is still open.
        return get_current_user_sync(token, db).id
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_cloud_storage_connection(connection: FullCloudStorageConnection) -> None:
    """
    Create a cloud storage connection using the provided connection details.

    Args:
        connection (FullCloudStorageConnection): The connection details for cloud storage.

    Returns:
        None
    """
    # Reuse the shared helper instead of duplicating the token-mint and
    # user-lookup logic inline (keeps this module's user resolution in one place).
    user_id = get_current_user_id()
    with get_db_context() as db:
        store_cloud_connection(db, connection, user_id)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def create_cloud_storage_connection_if_not_exists(connection: FullCloudStorageConnection) -> None:
    """
    Create a cloud storage connection if it does not already exist.

    Args:
        connection (FullCloudStorageConnection): The connection details for cloud storage.

    Returns:
        None
    """
    existing_names = {
        stored.connection_name
        for stored in get_all_available_cloud_storage_connections()
    }
    if connection.connection_name not in existing_names:
        create_cloud_storage_connection(connection)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_all_available_cloud_storage_connections() -> List[FullCloudStorageConnectionInterface]:
    """Return every cloud storage connection stored for the local user."""
    user_id = get_current_user_id()
    with get_db_context() as db:
        connections = get_all_cloud_connections_interface(db, user_id)
    return connections
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def del_cloud_storage_connection(connection_name: str) -> None:
    """Delete the stored cloud storage connection named *connection_name*
    belonging to the local user."""
    user_id = get_current_user_id()
    with get_db_context() as db:
        delete_cloud_connection(db, connection_name, user_id)
|
flowfile_frame/expr.py
CHANGED
|
@@ -20,6 +20,13 @@ if TYPE_CHECKING:
|
|
|
20
20
|
from flowfile_frame.selectors import Selector
|
|
21
21
|
ExprType = TypeVar('ExprType', bound='Expr')
|
|
22
22
|
ColumnType = "Column" # Use string literal instead of direct class reference
|
|
23
|
+
from polars._typing import (
|
|
24
|
+
Ambiguous,
|
|
25
|
+
IntoExpr,
|
|
26
|
+
IntoExprColumn,
|
|
27
|
+
PolarsDataType,
|
|
28
|
+
PolarsTemporalType,
|
|
29
|
+
TimeUnit)
|
|
23
30
|
|
|
24
31
|
ExprOrStr = Union['Expr', str]
|
|
25
32
|
ExprOrStrList = List[ExprOrStr]
|
|
@@ -110,6 +117,11 @@ class StringMethods:
|
|
|
110
117
|
res_expr = self.expr.to_uppercase() if self.expr is not None else None
|
|
111
118
|
return self._create_next_expr(method_name="to_uppercase", result_expr=res_expr, is_complex=True)
|
|
112
119
|
|
|
120
|
+
def slice(self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None) -> Expr:
    """Extract a substring starting at *offset*, optionally limited to *length*.

    Wraps ``polars`` string slicing and records the call via
    ``_create_next_expr`` so it can be replayed on the frame graph.

    Args:
        offset: Start index (or expression/column resolving to one).
        length: Number of characters to take; ``None`` takes to the end.

    Returns:
        Expr: The next expression in the chain.
    """
    # Guard against a missing underlying expression, consistent with the
    # sibling wrappers (to_uppercase/to_lowercase) that pass None through.
    res_expr = self.expr.slice(offset=offset, length=length) if self.expr is not None else None
    return self._create_next_expr(method_name="slice", result_expr=res_expr, is_complex=True,
                                  offset=offset, length=length)
|
|
124
|
+
|
|
113
125
|
def to_lowercase(self):
    """Lower-case the string expression, recording the call for replay."""
    if self.expr is None:
        result = None
    else:
        result = self.expr.to_lowercase()
    return self._create_next_expr(method_name="to_lowercase", result_expr=result, is_complex=True)
|
|
@@ -138,13 +150,28 @@ class StringMethods:
|
|
|
138
150
|
strict: bool = True,
|
|
139
151
|
exact: bool = True,
|
|
140
152
|
cache: bool = True,
|
|
141
|
-
ambiguous: Literal["earliest", "latest", "raise", "null"] | Expr = "raise",):
|
|
153
|
+
ambiguous: Literal["earliest", "latest", "raise", "null"] | Expr = "raise",) -> 'Expr':
|
|
142
154
|
res_expr = self.expr.to_datetime(format, time_unit=time_unit, time_zone=time_zone, strict=strict,
|
|
143
155
|
exact=exact, cache=cache, ambiguous=ambiguous)
|
|
144
156
|
return self._create_next_expr(method_name="to_datetime", result_expr=res_expr, is_complex=True,
|
|
145
157
|
format=format, time_unit=time_unit, time_zone=time_zone, strict=strict,
|
|
146
158
|
exact=exact, cache=cache, ambiguous=ambiguous)
|
|
147
159
|
|
|
160
|
+
def strptime(self,
             dtype: PolarsTemporalType,
             format: str | None = None,
             *,
             strict: bool = True,
             exact: bool = True,
             cache: bool = True,
             ambiguous: Literal["earliest", "latest", "raise", "null"] | Expr = "raise",) -> 'Expr':
    """Parse the string expression into a temporal *dtype*.

    Wraps ``polars`` ``str.strptime`` and records the call via
    ``_create_next_expr`` so it can be replayed on the frame graph.

    Args:
        dtype: Target temporal type (Date, Datetime, or Time).
        format: strftime-style format string; ``None`` lets polars infer it.
        strict: Raise on values that fail to parse instead of yielding null.
        exact: Require the whole string to match *format*.
        cache: Cache parsed values for repeated inputs.
        ambiguous: How to resolve ambiguous local datetimes.

    Returns:
        Expr: The next expression in the chain.
    """
    # Guard against a missing underlying expression, consistent with the
    # sibling wrappers (to_uppercase/to_lowercase) that pass None through.
    res_expr = (self.expr.strptime(dtype, format, strict=strict, exact=exact, cache=cache, ambiguous=ambiguous)
                if self.expr is not None else None)
    return self._create_next_expr(method_name="strptime", dtype=dtype, result_expr=res_expr, is_complex=True,
                                  format=format, strict=strict,
                                  exact=exact, cache=cache, ambiguous=ambiguous)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
148
175
|
def __getattr__(self, name):
|
|
149
176
|
if self.expr is None or not hasattr(self.expr, name):
|
|
150
177
|
if self.expr is None:
|