Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
|
@@ -4,8 +4,11 @@ import polars as pl
|
|
|
4
4
|
from polars import selectors
|
|
5
5
|
from copy import deepcopy
|
|
6
6
|
|
|
7
|
+
from typing import NamedTuple
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
def get_func_type_mapping(func: str):
|
|
11
|
+
"""Infers the output data type of common aggregation functions."""
|
|
9
12
|
if func in ["mean", "avg", "median", "std", "var"]:
|
|
10
13
|
return "Float64"
|
|
11
14
|
elif func in ['min', 'max', 'first', 'last', "cumsum", "sum"]:
|
|
@@ -17,16 +20,45 @@ def get_func_type_mapping(func: str):
|
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
def string_concat(*column: str):
|
|
23
|
+
"""A simple wrapper to concatenate string columns in Polars."""
|
|
20
24
|
return pl.col(column).cast(pl.Utf8).str.concat(delimiter=',')
|
|
21
25
|
|
|
22
26
|
|
|
23
|
-
|
|
27
|
+
SideLit = Literal["left", "right"]
|
|
28
|
+
JoinStrategy = Literal['inner', 'left', 'right', 'full', 'semi', 'anti', 'cross', 'outer']
|
|
24
29
|
FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'damerau_levenshtein', 'indel']
|
|
25
30
|
|
|
26
31
|
|
|
32
|
+
def construct_join_key_name(side: SideLit, column_name: str) -> str:
|
|
33
|
+
"""Creates a temporary, unique name for a join key column."""
|
|
34
|
+
return "_FLOWFILE_JOIN_KEY_" + side.upper() + "_" + column_name
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class JoinKeyRename(NamedTuple):
|
|
38
|
+
"""Represents the renaming of a join key from its original to a temporary name."""
|
|
39
|
+
original_name: str
|
|
40
|
+
temp_name: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class JoinKeyRenameResponse(NamedTuple):
|
|
44
|
+
"""Contains a list of join key renames for one side of a join."""
|
|
45
|
+
side: SideLit
|
|
46
|
+
join_key_renames: List[JoinKeyRename]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class FullJoinKeyResponse(NamedTuple):
|
|
50
|
+
"""Holds the join key rename responses for both sides of a join."""
|
|
51
|
+
left: JoinKeyRenameResponse
|
|
52
|
+
right: JoinKeyRenameResponse
|
|
53
|
+
|
|
54
|
+
|
|
27
55
|
@dataclass
|
|
28
56
|
class SelectInput:
|
|
29
|
-
|
|
57
|
+
"""Defines how a single column should be selected, renamed, or type-cast.
|
|
58
|
+
|
|
59
|
+
This is a core building block for any operation that involves column manipulation.
|
|
60
|
+
It holds all the configuration for a single field in a selection operation.
|
|
61
|
+
"""
|
|
30
62
|
old_name: str
|
|
31
63
|
original_position: Optional[int] = None
|
|
32
64
|
new_name: Optional[str] = None
|
|
@@ -58,6 +90,7 @@ class SelectInput:
|
|
|
58
90
|
|
|
59
91
|
@property
|
|
60
92
|
def polars_type(self) -> str:
|
|
93
|
+
"""Translates a user-friendly type name to a Polars data type string."""
|
|
61
94
|
if self.data_type.lower() == 'string':
|
|
62
95
|
return 'Utf8'
|
|
63
96
|
elif self.data_type.lower() == 'integer':
|
|
@@ -69,7 +102,7 @@ class SelectInput:
|
|
|
69
102
|
|
|
70
103
|
@dataclass
|
|
71
104
|
class FieldInput:
|
|
72
|
-
|
|
105
|
+
"""Represents a single field with its name and data type, typically for defining an output column."""
|
|
73
106
|
name: str
|
|
74
107
|
data_type: Optional[str] = None
|
|
75
108
|
|
|
@@ -80,19 +113,22 @@ class FieldInput:
|
|
|
80
113
|
|
|
81
114
|
@dataclass
|
|
82
115
|
class FunctionInput:
|
|
116
|
+
"""Defines a formula to be applied, including the output field information."""
|
|
83
117
|
field: FieldInput
|
|
84
118
|
function: str
|
|
85
119
|
|
|
86
120
|
|
|
87
121
|
@dataclass
|
|
88
122
|
class BasicFilter:
|
|
123
|
+
"""Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value')."""
|
|
89
124
|
field: str = ''
|
|
90
|
-
filter_type: str = ''
|
|
125
|
+
filter_type: str = ''
|
|
91
126
|
filter_value: str = ''
|
|
92
127
|
|
|
93
128
|
|
|
94
129
|
@dataclass
|
|
95
130
|
class FilterInput:
|
|
131
|
+
"""Defines the settings for a filter operation, supporting basic or advanced (expression-based) modes."""
|
|
96
132
|
advanced_filter: str = ''
|
|
97
133
|
basic_filter: BasicFilter = None
|
|
98
134
|
filter_type: str = 'basic'
|
|
@@ -100,49 +136,98 @@ class FilterInput:
|
|
|
100
136
|
|
|
101
137
|
@dataclass
|
|
102
138
|
class SelectInputs:
|
|
139
|
+
"""A container for a list of `SelectInput` objects, providing helper methods for managing selections."""
|
|
103
140
|
renames: List[SelectInput]
|
|
104
141
|
|
|
105
142
|
@property
|
|
106
143
|
def old_cols(self) -> Set:
|
|
144
|
+
"""Returns a set of original column names to be kept in the selection."""
|
|
107
145
|
return set(v.old_name for v in self.renames if v.keep)
|
|
108
146
|
|
|
109
147
|
@property
|
|
110
148
|
def new_cols(self) -> Set:
|
|
111
|
-
|
|
149
|
+
"""Returns a set of new (renamed) column names to be kept in the selection."""
|
|
150
|
+
return set(v.new_name for v in self.renames if v.keep)
|
|
112
151
|
|
|
113
152
|
@property
|
|
114
153
|
def rename_table(self):
|
|
115
|
-
|
|
154
|
+
"""Generates a dictionary for use in Polars' `.rename()` method."""
|
|
155
|
+
return {v.old_name: v.new_name for v in self.renames if v.is_available and (v.keep or v.join_key)}
|
|
116
156
|
|
|
117
157
|
def get_select_cols(self, include_join_key: bool = True):
|
|
158
|
+
"""Gets a list of original column names to select from the source DataFrame."""
|
|
118
159
|
return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
|
|
119
160
|
|
|
120
|
-
def __add__(self, other: SelectInput):
|
|
161
|
+
def __add__(self, other: "SelectInput"):
|
|
162
|
+
"""Allows adding a SelectInput using the '+' operator."""
|
|
121
163
|
self.renames.append(other)
|
|
122
164
|
|
|
123
|
-
def append(self, other: SelectInput):
|
|
165
|
+
def append(self, other: "SelectInput"):
|
|
166
|
+
"""Appends a new SelectInput to the list of renames."""
|
|
124
167
|
self.renames.append(other)
|
|
125
168
|
|
|
126
169
|
def remove_select_input(self, old_key: str):
|
|
170
|
+
"""Removes a SelectInput from the list based on its original name."""
|
|
127
171
|
self.renames = [rename for rename in self.renames if rename.old_name != old_key]
|
|
128
172
|
|
|
173
|
+
def unselect_field(self, old_key: str):
|
|
174
|
+
"""Marks a field to be dropped from the final selection by setting `keep` to False."""
|
|
175
|
+
for rename in self.renames:
|
|
176
|
+
if old_key == rename.old_name:
|
|
177
|
+
rename.keep = False
|
|
178
|
+
|
|
129
179
|
@classmethod
|
|
130
|
-
def create_from_list(cls, col_list: str):
|
|
180
|
+
def create_from_list(cls, col_list: List[str]):
|
|
181
|
+
"""Creates a SelectInputs object from a simple list of column names."""
|
|
131
182
|
return cls([SelectInput(c) for c in col_list])
|
|
132
183
|
|
|
133
184
|
@classmethod
|
|
134
185
|
def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame):
|
|
186
|
+
"""Creates a SelectInputs object from a Polars DataFrame's columns."""
|
|
135
187
|
return cls([SelectInput(c) for c in df.columns])
|
|
136
188
|
|
|
189
|
+
def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
|
|
190
|
+
return next((v for v in self.renames if v.old_name == old_name), None)
|
|
191
|
+
|
|
192
|
+
def get_select_input_on_new_name(self, old_name: str) -> SelectInput | None:
|
|
193
|
+
return next((v for v in self.renames if v.new_name == old_name), None)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class JoinInputs(SelectInputs):
|
|
197
|
+
"""Extends `SelectInputs` with functionality specific to join operations, like handling join keys."""
|
|
198
|
+
|
|
199
|
+
def __init__(self, renames: List[SelectInput]):
|
|
200
|
+
self.renames = renames
|
|
201
|
+
|
|
202
|
+
@property
|
|
203
|
+
def join_key_selects(self) -> List[SelectInput]:
|
|
204
|
+
"""Returns only the `SelectInput` objects that are marked as join keys."""
|
|
205
|
+
return [v for v in self.renames if v.join_key]
|
|
206
|
+
|
|
207
|
+
def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
|
|
208
|
+
"""Gets the temporary rename mapping for all join keys on one side of a join."""
|
|
209
|
+
return JoinKeyRenameResponse(
|
|
210
|
+
side,
|
|
211
|
+
[JoinKeyRename(jk.new_name,
|
|
212
|
+
construct_join_key_name(side, jk.new_name))
|
|
213
|
+
for jk in self.join_key_selects if jk.keep or not filter_drop]
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
def get_join_key_rename_mapping(self, side: SideLit) -> Dict[str, str]:
|
|
217
|
+
"""Returns a dictionary mapping original join key names to their temporary names."""
|
|
218
|
+
return {jkr[0]: jkr[1] for jkr in self.get_join_key_renames(side)[1]}
|
|
219
|
+
|
|
137
220
|
|
|
138
221
|
@dataclass
|
|
139
222
|
class JoinMap:
|
|
223
|
+
"""Defines a single mapping between a left and right column for a join key."""
|
|
140
224
|
left_col: str
|
|
141
225
|
right_col: str
|
|
142
226
|
|
|
143
227
|
|
|
144
228
|
@dataclass
|
|
145
229
|
class FuzzyMap(JoinMap):
|
|
230
|
+
"""Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
|
|
146
231
|
threshold_score: Optional[float] = 80.0
|
|
147
232
|
fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
|
|
148
233
|
perc_unique: Optional[float] = 0.0
|
|
@@ -167,22 +252,26 @@ class FuzzyMap(JoinMap):
|
|
|
167
252
|
|
|
168
253
|
|
|
169
254
|
class JoinSelectMixin:
|
|
170
|
-
"""
|
|
255
|
+
"""A mixin providing common methods for join-like operations that involve left and right inputs."""
|
|
256
|
+
left_select: JoinInputs = None
|
|
257
|
+
right_select: JoinInputs = None
|
|
171
258
|
|
|
172
259
|
@staticmethod
|
|
173
|
-
def parse_select(select: List[SelectInput] | List[str] | List[Dict]) ->
|
|
260
|
+
def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> JoinInputs | None:
|
|
261
|
+
"""Parses various input formats into a standardized `JoinInputs` object."""
|
|
174
262
|
if all(isinstance(c, SelectInput) for c in select):
|
|
175
|
-
return
|
|
263
|
+
return JoinInputs(select)
|
|
176
264
|
elif all(isinstance(c, dict) for c in select):
|
|
177
|
-
return
|
|
265
|
+
return JoinInputs([SelectInput(**c.__dict__) for c in select])
|
|
178
266
|
elif isinstance(select, dict):
|
|
179
267
|
renames = select.get('renames')
|
|
180
268
|
if renames:
|
|
181
|
-
return
|
|
269
|
+
return JoinInputs([SelectInput(**c) for c in renames])
|
|
182
270
|
elif all(isinstance(c, str) for c in select):
|
|
183
|
-
return
|
|
271
|
+
return JoinInputs([SelectInput(s, s) for s in select])
|
|
184
272
|
|
|
185
273
|
def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
|
|
274
|
+
"""Generates a new, non-conflicting column name by adding a suffix if necessary."""
|
|
186
275
|
current_names = self.left_select.new_cols & self.right_select.new_cols
|
|
187
276
|
if old_col_name not in current_names:
|
|
188
277
|
return old_col_name
|
|
@@ -192,6 +281,7 @@ class JoinSelectMixin:
|
|
|
192
281
|
old_col_name = f'{side}_{old_col_name}'
|
|
193
282
|
|
|
194
283
|
def add_new_select_column(self, select_input: SelectInput, side: str):
|
|
284
|
+
"""Adds a new column to the selection for either the left or right side."""
|
|
195
285
|
selects = self.right_select if side == 'right' else self.left_select
|
|
196
286
|
select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
|
|
197
287
|
selects.__add__(select_input)
|
|
@@ -199,19 +289,23 @@ class JoinSelectMixin:
|
|
|
199
289
|
|
|
200
290
|
@dataclass
|
|
201
291
|
class CrossJoinInput(JoinSelectMixin):
|
|
292
|
+
"""Defines the settings for a cross join operation, including column selections for both inputs."""
|
|
202
293
|
left_select: SelectInputs = None
|
|
203
294
|
right_select: SelectInputs = None
|
|
204
295
|
|
|
205
296
|
def __init__(self, left_select: List[SelectInput] | List[str],
|
|
206
297
|
right_select: List[SelectInput] | List[str]):
|
|
298
|
+
"""Initializes the CrossJoinInput with selections for left and right tables."""
|
|
207
299
|
self.left_select = self.parse_select(left_select)
|
|
208
300
|
self.right_select = self.parse_select(right_select)
|
|
209
301
|
|
|
210
302
|
@property
|
|
211
303
|
def overlapping_records(self):
|
|
304
|
+
"""Finds column names that would conflict after the join."""
|
|
212
305
|
return self.left_select.new_cols & self.right_select.new_cols
|
|
213
306
|
|
|
214
307
|
def auto_rename(self):
|
|
308
|
+
"""Automatically renames columns on the right side to prevent naming conflicts."""
|
|
215
309
|
overlapping_records = self.overlapping_records
|
|
216
310
|
while len(overlapping_records) > 0:
|
|
217
311
|
for right_col in self.right_select.renames:
|
|
@@ -222,13 +316,15 @@ class CrossJoinInput(JoinSelectMixin):
|
|
|
222
316
|
|
|
223
317
|
@dataclass
|
|
224
318
|
class JoinInput(JoinSelectMixin):
|
|
319
|
+
"""Defines the settings for a standard SQL-style join, including keys, strategy, and selections."""
|
|
225
320
|
join_mapping: List[JoinMap]
|
|
226
|
-
left_select:
|
|
227
|
-
right_select:
|
|
321
|
+
left_select: JoinInputs = None
|
|
322
|
+
right_select: JoinInputs = None
|
|
228
323
|
how: JoinStrategy = 'inner'
|
|
229
324
|
|
|
230
325
|
@staticmethod
|
|
231
|
-
def parse_join_mapping(join_mapping:
|
|
326
|
+
def parse_join_mapping(join_mapping: any) -> List[JoinMap]:
|
|
327
|
+
"""Parses various input formats for join keys into a standardized list of `JoinMap` objects."""
|
|
232
328
|
if isinstance(join_mapping, (tuple, list)):
|
|
233
329
|
assert len(join_mapping) > 0
|
|
234
330
|
if all(isinstance(jm, dict) for jm in join_mapping):
|
|
@@ -251,39 +347,63 @@ class JoinInput(JoinSelectMixin):
|
|
|
251
347
|
left_select: List[SelectInput] | List[str],
|
|
252
348
|
right_select: List[SelectInput] | List[str],
|
|
253
349
|
how: JoinStrategy = 'inner'):
|
|
350
|
+
"""Initializes the JoinInput with keys, selections, and join strategy."""
|
|
254
351
|
self.join_mapping = self.parse_join_mapping(join_mapping)
|
|
255
352
|
self.left_select = self.parse_select(left_select)
|
|
256
353
|
self.right_select = self.parse_select(right_select)
|
|
354
|
+
self.set_join_keys()
|
|
355
|
+
self.how = how
|
|
356
|
+
|
|
357
|
+
def set_join_keys(self):
|
|
358
|
+
"""Marks the `SelectInput` objects corresponding to join keys."""
|
|
257
359
|
[setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
|
|
258
360
|
[setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
|
|
259
|
-
|
|
361
|
+
|
|
362
|
+
def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
|
|
363
|
+
"""Gets the temporary rename mappings for the join keys on both sides."""
|
|
364
|
+
return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
|
|
365
|
+
self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
|
|
366
|
+
|
|
367
|
+
def get_names_for_table_rename(self) -> List[JoinMap]:
|
|
368
|
+
new_mappings: List[JoinMap] = []
|
|
369
|
+
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
370
|
+
for join_map in self.join_mapping:
|
|
371
|
+
new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col, join_map.left_col),
|
|
372
|
+
right_rename_table.get(join_map.right_col, join_map.right_col)
|
|
373
|
+
)
|
|
374
|
+
)
|
|
375
|
+
return new_mappings
|
|
260
376
|
|
|
261
377
|
@property
|
|
262
378
|
def _left_join_keys(self) -> Set:
|
|
379
|
+
"""Returns a set of the left-side join key column names."""
|
|
263
380
|
return set(jm.left_col for jm in self.join_mapping)
|
|
264
381
|
|
|
265
382
|
@property
|
|
266
383
|
def _right_join_keys(self) -> Set:
|
|
384
|
+
"""Returns a set of the right-side join key column names."""
|
|
267
385
|
return set(jm.right_col for jm in self.join_mapping)
|
|
268
386
|
|
|
269
387
|
@property
|
|
270
|
-
def left_join_keys(self) -> List:
|
|
271
|
-
|
|
388
|
+
def left_join_keys(self) -> List[str]:
|
|
389
|
+
"""Returns an ordered list of the left-side join key column names to be used in the join."""
|
|
390
|
+
return [jm.left_col for jm in self.used_join_mapping]
|
|
272
391
|
|
|
273
392
|
@property
|
|
274
|
-
def right_join_keys(self) -> List:
|
|
275
|
-
|
|
393
|
+
def right_join_keys(self) -> List[str]:
|
|
394
|
+
"""Returns an ordered list of the right-side join key column names to be used in the join."""
|
|
395
|
+
return [jm.right_col for jm in self.used_join_mapping]
|
|
276
396
|
|
|
277
397
|
@property
|
|
278
398
|
def overlapping_records(self):
|
|
279
399
|
if self.how in ('left', 'right', 'inner'):
|
|
280
|
-
|
|
281
|
-
return ((self.left_select.new_cols & self.right_select.new_cols) -
|
|
282
|
-
(set(self.left_join_keys) & set(self.right_join_keys)))
|
|
400
|
+
return self.left_select.new_cols & self.right_select.new_cols
|
|
283
401
|
else:
|
|
284
402
|
return self.left_select.new_cols & self.right_select.new_cols
|
|
285
403
|
|
|
286
404
|
def auto_rename(self):
|
|
405
|
+
"""Automatically renames columns on the right side to prevent naming conflicts."""
|
|
406
|
+
self.set_join_keys()
|
|
287
407
|
overlapping_records = self.overlapping_records
|
|
288
408
|
while len(overlapping_records) > 0:
|
|
289
409
|
for right_col in self.right_select.renames:
|
|
@@ -292,13 +412,16 @@ class JoinInput(JoinSelectMixin):
|
|
|
292
412
|
overlapping_records = self.overlapping_records
|
|
293
413
|
|
|
294
414
|
@property
|
|
295
|
-
def
|
|
296
|
-
|
|
415
|
+
def used_join_mapping(self) -> List[JoinMap]:
|
|
416
|
+
"""Returns the final join mapping after applying all renames and transformations."""
|
|
417
|
+
new_mappings: List[JoinMap] = []
|
|
297
418
|
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
419
|
+
left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
|
|
420
|
+
right_join_rename_mapping: Dict[str, str] = self.right_select.get_join_key_rename_mapping("right")
|
|
298
421
|
for join_map in self.join_mapping:
|
|
299
422
|
# del self.right_select.rename_table, self.left_select.rename_table
|
|
300
|
-
new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col),
|
|
301
|
-
right_rename_table.get(join_map.right_col)
|
|
423
|
+
new_mappings.append(JoinMap(left_join_rename_mapping.get(left_rename_table.get(join_map.left_col, join_map.left_col)),
|
|
424
|
+
right_join_rename_mapping.get(right_rename_table.get(join_map.right_col, join_map.right_col))
|
|
302
425
|
)
|
|
303
426
|
)
|
|
304
427
|
return new_mappings
|
|
@@ -306,6 +429,7 @@ class JoinInput(JoinSelectMixin):
|
|
|
306
429
|
|
|
307
430
|
@dataclass
|
|
308
431
|
class FuzzyMatchInput(JoinInput):
|
|
432
|
+
"""Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
|
|
309
433
|
join_mapping: List[FuzzyMap]
|
|
310
434
|
aggregate_output: bool = False
|
|
311
435
|
|
|
@@ -332,7 +456,7 @@ class FuzzyMatchInput(JoinInput):
|
|
|
332
456
|
return fuzz_mapping
|
|
333
457
|
|
|
334
458
|
def __init__(self, join_mapping: List[FuzzyMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
|
|
335
|
-
right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how:
|
|
459
|
+
right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
|
|
336
460
|
self.join_mapping = self.parse_fuzz_mapping(join_mapping)
|
|
337
461
|
self.left_select = self.parse_select(left_select)
|
|
338
462
|
self.right_select = self.parse_select(right_select)
|
|
@@ -353,6 +477,7 @@ class FuzzyMatchInput(JoinInput):
|
|
|
353
477
|
|
|
354
478
|
@property
|
|
355
479
|
def fuzzy_maps(self) -> List[FuzzyMap]:
|
|
480
|
+
"""Returns the final fuzzy mappings after applying all column renames."""
|
|
356
481
|
new_mappings = []
|
|
357
482
|
left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
|
|
358
483
|
for org_fuzzy_map in self.join_mapping:
|
|
@@ -404,6 +529,7 @@ class AggColl:
|
|
|
404
529
|
output_type: Optional[str] = None
|
|
405
530
|
|
|
406
531
|
def __init__(self, old_name: str, agg: str, new_name: str = None, output_type: str = None):
|
|
532
|
+
"""Initializes an aggregation column with its source, function, and new name."""
|
|
407
533
|
self.old_name = str(old_name)
|
|
408
534
|
if agg != 'groupby':
|
|
409
535
|
self.new_name = new_name if new_name is not None else self.old_name + "_" + agg
|
|
@@ -414,6 +540,7 @@ class AggColl:
|
|
|
414
540
|
|
|
415
541
|
@property
|
|
416
542
|
def agg_func(self):
|
|
543
|
+
"""Returns the corresponding Polars aggregation function from the `agg` string."""
|
|
417
544
|
if self.agg == 'groupby':
|
|
418
545
|
return self.agg
|
|
419
546
|
elif self.agg == 'concat':
|
|
@@ -448,6 +575,7 @@ class GroupByInput:
|
|
|
448
575
|
|
|
449
576
|
@dataclass
|
|
450
577
|
class PivotInput:
|
|
578
|
+
"""Defines the settings for a pivot (long-to-wide) operation."""
|
|
451
579
|
index_columns: List[str]
|
|
452
580
|
pivot_column: str
|
|
453
581
|
value_col: str
|
|
@@ -455,9 +583,11 @@ class PivotInput:
|
|
|
455
583
|
|
|
456
584
|
@property
|
|
457
585
|
def grouped_columns(self) -> List[str]:
|
|
586
|
+
"""Returns the list of columns to be used for the initial grouping stage of the pivot."""
|
|
458
587
|
return self.index_columns + [self.pivot_column]
|
|
459
588
|
|
|
460
589
|
def get_group_by_input(self) -> GroupByInput:
|
|
590
|
+
"""Constructs the `GroupByInput` needed for the pre-aggregation step of the pivot."""
|
|
461
591
|
group_by_cols = [AggColl(c, 'groupby') for c in self.grouped_columns]
|
|
462
592
|
agg_cols = [AggColl(self.value_col, agg=aggregation, new_name=aggregation) for aggregation in self.aggregations]
|
|
463
593
|
return GroupByInput(group_by_cols+agg_cols)
|
|
@@ -465,22 +595,25 @@ class PivotInput:
|
|
|
465
595
|
def get_index_columns(self) -> List[pl.col]:
|
|
466
596
|
return [pl.col(c) for c in self.index_columns]
|
|
467
597
|
|
|
468
|
-
def get_pivot_column(self) -> pl.
|
|
598
|
+
def get_pivot_column(self) -> pl.Expr:
    """Wrap the configured pivot column name in a Polars column expression."""
    pivot_expr = pl.col(self.pivot_column)
    return pivot_expr
|
|
470
601
|
|
|
471
602
|
def get_values_expr(self) -> pl.Expr:
    """Pack one column per configured aggregation into a struct aliased 'vals'."""
    aggregation_columns = [pl.col(agg_name) for agg_name in self.aggregations]
    values_struct = pl.struct(aggregation_columns)
    return values_struct.alias('vals')
|
|
473
605
|
|
|
474
606
|
|
|
475
|
-
|
|
476
607
|
@dataclass
class SortByInput:
    """Defines a single sort condition on a column, including the direction."""
    # Name of the column this sort condition applies to.
    column: str
    # Sort direction; defaults to 'asc'.
    # NOTE(review): the other accepted values are not visible here (presumably 'desc') — confirm against the consumer.
    how: str = 'asc'
|
|
480
612
|
|
|
481
613
|
|
|
482
614
|
@dataclass
|
|
483
615
|
class RecordIdInput:
|
|
616
|
+
"""Defines settings for adding a record ID (row number) column to the data."""
|
|
484
617
|
output_column_name: str = 'record_id'
|
|
485
618
|
offset: int = 1
|
|
486
619
|
group_by: Optional[bool] = False
|
|
@@ -489,6 +622,7 @@ class RecordIdInput:
|
|
|
489
622
|
|
|
490
623
|
@dataclass
|
|
491
624
|
class TextToRowsInput:
|
|
625
|
+
"""Defines settings for splitting a text column into multiple rows based on a delimiter."""
|
|
492
626
|
column_to_split: str
|
|
493
627
|
output_column_name: Optional[str] = None
|
|
494
628
|
split_by_fixed_value: Optional[bool] = True
|
|
@@ -498,12 +632,14 @@ class TextToRowsInput:
|
|
|
498
632
|
|
|
499
633
|
@dataclass
|
|
500
634
|
class UnpivotInput:
|
|
635
|
+
"""Defines settings for an unpivot (wide-to-long) operation."""
|
|
501
636
|
index_columns: Optional[List[str]] = field(default_factory=list)
|
|
502
637
|
value_columns: Optional[List[str]] = field(default_factory=list)
|
|
503
638
|
data_type_selector: Optional[Literal['float', 'all', 'date', 'numeric', 'string']] = None
|
|
504
639
|
data_type_selector_mode: Optional[Literal['data_type', 'column']] = 'column'
|
|
505
640
|
|
|
506
641
|
def __post_init__(self):
|
|
642
|
+
"""Ensures that list attributes are initialized correctly if they are None."""
|
|
507
643
|
if self.index_columns is None:
|
|
508
644
|
self.index_columns = []
|
|
509
645
|
if self.value_columns is None:
|
|
@@ -512,7 +648,8 @@ class UnpivotInput:
|
|
|
512
648
|
self.data_type_selector_mode = 'column'
|
|
513
649
|
|
|
514
650
|
@property
|
|
515
|
-
def data_type_selector_expr(self) -> Callable:
|
|
651
|
+
def data_type_selector_expr(self) -> Optional[Callable]:
|
|
652
|
+
"""Returns a Polars selector function based on the `data_type_selector` string."""
|
|
516
653
|
if self.data_type_selector_mode == 'data_type':
|
|
517
654
|
if self.data_type_selector is not None:
|
|
518
655
|
try:
|
|
@@ -525,17 +662,20 @@ class UnpivotInput:
|
|
|
525
662
|
|
|
526
663
|
@dataclass
class UnionInput:
    """Defines settings for a union (concatenation) operation."""
    # Union mode: either 'selective' or 'relaxed'; 'relaxed' is the default.
    # NOTE(review): how each mode treats mismatched columns is decided by the code consuming this setting — confirm there.
    mode: Literal['selective', 'relaxed'] = 'relaxed'
|
|
529
667
|
|
|
530
668
|
|
|
531
669
|
@dataclass
class UniqueInput:
    """Defines settings for a uniqueness operation, specifying columns and which row to keep."""
    # Columns used for the uniqueness operation; defaults to None.
    # NOTE(review): None presumably means "all columns" — confirm against the consumer.
    columns: Optional[List[str]] = None
    # Which row to keep among duplicates: "first", "last", "any" (the default) or "none".
    strategy: Literal["first", "last", "any", "none"] = "any"
|
|
535
674
|
|
|
536
675
|
|
|
537
676
|
@dataclass
|
|
538
677
|
class GraphSolverInput:
|
|
678
|
+
"""Defines settings for a graph-solving operation (e.g., finding connected components)."""
|
|
539
679
|
col_from: str
|
|
540
680
|
col_to: str
|
|
541
681
|
output_column_name: Optional[str] = 'graph_group'
|
|
@@ -543,5 +683,5 @@ class GraphSolverInput:
|
|
|
543
683
|
|
|
544
684
|
@dataclass
class PolarsCodeInput:
    """A simple container for a string of user-provided Polars code to be executed."""
    # The raw code text supplied by the user; this class only stores it.
    # NOTE(review): the text is executed elsewhere — confirm how the executing side restricts what it may do.
    polars_code: str
|
|
547
|
-
|
flowfile_core/utils/utils.py
CHANGED
|
@@ -1,8 +1,47 @@
|
|
|
1
1
|
import re
|
|
2
|
-
|
|
2
|
+
from itertools import chain
|
|
3
|
+
from typing import List, Dict
|
|
3
4
|
|
|
4
5
|
def camel_case_to_snake_case(text: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    An underscore is inserted in front of every ASCII capital letter except one at
    the very start of the string, and the result is lower-cased. Runs of capitals
    are split letter by letter ("HTTPServer" becomes "h_t_t_p_server").
    """
    boundary_before_capital = re.compile(r'(?<!^)(?=[A-Z])')
    with_underscores = boundary_before_capital.sub('_', text)
    return with_underscores.lower()
|
|
8
9
|
|
|
10
|
+
|
|
11
|
+
def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True) -> List[Dict]:
    """Give every record the same set of keys, filling the gaps with ``None``.

    Args:
        datas: Records to align. The input dicts are not modified.
        respect_order: When true (the default), keys appear in the order in which
            they are first seen across ``datas``. When false, the keys are gathered
            into a set, so their order in the output records is arbitrary.

    Returns:
        A new list holding one new dict per input record. Each dict contains every
        key found anywhere in ``datas``; keys a record did not have map to ``None``.
    """
    all_keys = chain.from_iterable(datas)
    if respect_order:
        # dict.fromkeys drops duplicates while keeping first-seen order.
        unique_cols = list(dict.fromkeys(all_keys))
    else:
        unique_cols = set(all_keys)
    return [{col: data.get(col) for col in unique_cols} for data in datas]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def convert_to_string(v):
    """Return ``str(v)``, or ``None`` when the conversion raises.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` raised during the conversion still propagate.
    """
    try:
        return str(v)
    except Exception:
        return None


def standardize_col_dtype(vals):
    """Make the values of one column type-consistent.

    Args:
        vals: The values of a single column.

    Returns:
        ``vals`` itself when every value has the same type, or when the values are
        purely a mix of ``int`` and ``float``. Otherwise a new list in which every
        value has been passed through ``convert_to_string``.
    """
    types = {type(val) for val in vals}
    # int/float may only be left untouched when nothing else is mixed in;
    # [1, 2.0, "a"] must be stringified like any other heterogeneous column.
    if len(types) == 1 or types == {int, float}:
        return vals
    # NOTE(review): None counts as its own type here, so [1, None] becomes
    # ["1", "None"] — confirm that stringifying missing values is intended.
    return [convert_to_string(v) for v in vals]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""This script runs on run time and checks if all the nodes that are created have a function in the flow_graph as well
|
|
2
|
+
as have a component in flowfile_frontend"""
|
|
3
|
+
|
|
4
|
+
from flowfile_core.schemas import input_schema
|
|
5
|
+
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
6
|
+
from flowfile_core.configs.node_store.nodes import nodes_list, NodeTemplate
|
|
7
|
+
import inspect
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check_if_node_has_add_function_in_flow_graph(node: NodeTemplate):
    """Verify that `FlowGraph` exposes an ``add_<node.item>`` attribute for this node.

    Args:
        node: Node template to check. Its ``item`` is used to build the expected
            attribute name; its ``name`` only appears in the error message.

    Raises:
        ValueError: If `FlowGraph` has no attribute named ``add_<node.item>``.
    """
    func_name = "add_" + node.item
    if not hasattr(FlowGraph, func_name):
        # The two literals are concatenated, so the break between the sentences has to be explicit.
        raise ValueError(
            f"Node {node.name} ({node.item}) does not have a corresponding function in FlowGraph: {func_name}. "
            "Check if the function is implemented in flow_graph.py or if the node item is correct."
        )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def check_if_node_has_input_schema_definition(node: NodeTemplate):
    """Verify that `input_schema` defines a name matching this node.

    The expected name is ``"node"`` followed by ``node.item`` with underscores
    removed. It is compared against the lower-cased names in the `input_schema`
    module namespace; ``node.item`` itself is not lower-cased.

    Args:
        node: Node template to check.

    Raises:
        ValueError: If no name in `input_schema` matches the expected name.
    """
    expected_name = "node" + node.item.replace("_", "")
    defined_names = {name.lower() for name in vars(inspect.getmodule(input_schema))}
    if expected_name not in defined_names:
        # The two literals are concatenated, so the space between the sentences has to be explicit.
        raise ValueError(
            f"Node {node.name} ({node.item}) does not have a corresponding input schema definition in input_schema.py. "
            "Check if the schema is implemented or if the node item is correct."
        )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def validate_setup():
    """
    Run both consistency checks for every node in `nodes_list`.

    For each node, in list order, the FlowGraph `add_` function check runs first and
    the input schema check second. Either check raises ValueError on a mismatch, which
    stops the validation; the confirmation line is printed only when every node passes.
    """
    node_checks = (
        check_if_node_has_add_function_in_flow_graph,
        check_if_node_has_input_schema_definition,
    )
    for node_template in nodes_list:
        for run_check in node_checks:
            run_check(node_template)

    print("All nodes have corresponding functions in FlowGraph and input schema definitions.")


if __name__ == "__main__":
    validate_setup()
|
flowfile_frame/__init__.py
CHANGED
|
@@ -31,7 +31,15 @@ from flowfile_frame.series import Series
|
|
|
31
31
|
|
|
32
32
|
# File I/O
|
|
33
33
|
from flowfile_frame.flow_frame_methods import ( # noqa: F401
|
|
34
|
-
read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet
|
|
34
|
+
read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet, scan_json_from_cloud_storage,
|
|
35
|
+
scan_parquet_from_cloud_storage,
|
|
36
|
+
scan_csv_from_cloud_storage,
|
|
37
|
+
scan_delta)
|
|
38
|
+
|
|
39
|
+
from flowfile_frame.cloud_storage.secret_manager import (del_cloud_storage_connection,
|
|
40
|
+
create_cloud_storage_connection,
|
|
41
|
+
get_all_available_cloud_storage_connections,
|
|
42
|
+
create_cloud_storage_connection_if_not_exists)
|
|
35
43
|
|
|
36
44
|
from polars.datatypes import ( # noqa: F401
|
|
37
45
|
# Integer types
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Optional, Literal
|
|
2
|
+
|
|
3
|
+
from polars._typing import (CsvEncoding)
|
|
4
|
+
|
|
5
|
+
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
6
|
+
from flowfile_core.schemas import input_schema, cloud_storage_schemas
|
|
7
|
+
from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
|
|
8
|
+
from flowfile_frame.utils import generate_node_id
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def add_write_ff_to_cloud_storage(path: str,
                                  flow_graph: Optional[FlowGraph],
                                  depends_on_node_id: int,
                                  *,
                                  connection_name: Optional[str] = None,
                                  write_mode: Literal["overwrite", "append"] = "overwrite",
                                  file_format: Literal["csv", "parquet", "json", "delta"] = "parquet",
                                  csv_delimiter: str = ";",
                                  csv_encoding: CsvEncoding = "utf8",
                                  parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
                                  description: Optional[str] = None
                                  ) -> int:
    """Add a cloud storage writer node to ``flow_graph`` and return the new node's id.

    Args:
        path: Passed to the write settings as ``resource_path``.
        flow_graph: Graph that receives the writer node. Must not be None.
        depends_on_node_id: Passed to the node settings as ``depending_on_id``.
        connection_name: Name of the cloud storage connection, forwarded to the write settings.
        write_mode: "overwrite" (default) or "append", forwarded to the write settings.
        file_format: "csv", "parquet" (default), "json" or "delta", forwarded to the write settings.
        csv_delimiter: Forwarded to the write settings; defaults to ";".
        csv_encoding: Forwarded to the write settings; defaults to "utf8".
        parquet_compression: Forwarded to the write settings; defaults to "snappy".
        description: Optional description stored on the node settings.

    Returns:
        The id generated for the newly added writer node.

    Raises:
        ValueError: If ``flow_graph`` is None.
    """
    if flow_graph is None:
        # The annotation allows None, but the graph is dereferenced below; fail
        # with a clear message before a node id is generated.
        raise ValueError("flow_graph is required to add a cloud storage writer node")

    node_id = generate_node_id()
    flow_id = flow_graph.flow_id
    write_settings = cloud_storage_schemas.CloudStorageWriteSettings(
        resource_path=path,
        connection_name=connection_name,
        file_format=file_format,
        write_mode=write_mode,
        csv_delimiter=csv_delimiter,
        csv_encoding=csv_encoding,
        parquet_compression=parquet_compression,
    )
    settings = input_schema.NodeCloudStorageWriter(
        flow_id=flow_id,
        node_id=node_id,
        cloud_storage_settings=write_settings,
        user_id=get_current_user_id(),
        depending_on_id=depends_on_node_id,
        description=description,
    )
    flow_graph.add_cloud_storage_writer(settings)
    return node_id
|