Flowfile: flowfile-0.3.6-py3-none-any.whl → flowfile-0.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +5 -2
  3. flowfile/web/__init__.py +4 -2
  4. flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
  5. flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
  13. flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
  14. flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
  15. flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
  19. flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
  21. flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
  24. flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
  27. flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
  29. flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
  31. flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
  34. flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
  35. flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
  37. flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
  38. flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
  39. flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
  40. flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
  44. flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
  45. flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/METADATA +2 -2
  52. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/RECORD +100 -98
  53. flowfile_core/__init__.py +1 -0
  54. flowfile_core/auth/jwt.py +39 -0
  55. flowfile_core/configs/node_store/nodes.py +1 -0
  56. flowfile_core/configs/settings.py +6 -5
  57. flowfile_core/configs/utils.py +5 -0
  58. flowfile_core/database/connection.py +1 -3
  59. flowfile_core/flowfile/code_generator/code_generator.py +71 -0
  60. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -2
  61. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +598 -310
  62. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  63. flowfile_core/flowfile/flow_graph.py +620 -192
  64. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  65. flowfile_core/flowfile/flow_node/flow_node.py +510 -89
  66. flowfile_core/flowfile/flow_node/models.py +125 -20
  67. flowfile_core/flowfile/handler.py +2 -33
  68. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  69. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  70. flowfile_core/flowfile/utils.py +36 -5
  71. flowfile_core/main.py +32 -13
  72. flowfile_core/routes/cloud_connections.py +7 -11
  73. flowfile_core/routes/logs.py +2 -6
  74. flowfile_core/routes/public.py +1 -0
  75. flowfile_core/routes/routes.py +127 -51
  76. flowfile_core/routes/secrets.py +72 -14
  77. flowfile_core/schemas/__init__.py +8 -0
  78. flowfile_core/schemas/input_schema.py +92 -64
  79. flowfile_core/schemas/output_model.py +19 -3
  80. flowfile_core/schemas/schemas.py +144 -11
  81. flowfile_core/schemas/transform_schema.py +82 -17
  82. flowfile_core/utils/arrow_reader.py +8 -3
  83. flowfile_core/utils/validate_setup.py +0 -2
  84. flowfile_frame/__init__.py +9 -1
  85. flowfile_frame/cloud_storage/__init__.py +0 -0
  86. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  87. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  88. flowfile_frame/expr.py +42 -1
  89. flowfile_frame/expr.pyi +76 -61
  90. flowfile_frame/flow_frame.py +233 -111
  91. flowfile_frame/flow_frame.pyi +137 -91
  92. flowfile_frame/flow_frame_methods.py +150 -12
  93. flowfile_frame/group_frame.py +3 -0
  94. flowfile_frame/utils.py +25 -3
  95. test_utils/s3/data_generator.py +1 -0
  96. test_utils/s3/demo_data_generator.py +186 -0
  97. test_utils/s3/fixtures.py +6 -1
  98. flowfile_core/schemas/defaults.py +0 -9
  99. flowfile_core/schemas/models.py +0 -193
  100. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/LICENSE +0 -0
  101. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/WHEEL +0 -0
  102. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/entry_points.txt +0 -0
@@ -8,6 +8,7 @@ from typing import NamedTuple
8
8
 
9
9
 
10
10
  def get_func_type_mapping(func: str):
11
+ """Infers the output data type of common aggregation functions."""
11
12
  if func in ["mean", "avg", "median", "std", "var"]:
12
13
  return "Float64"
13
14
  elif func in ['min', 'max', 'first', 'last', "cumsum", "sum"]:
@@ -19,6 +20,7 @@ def get_func_type_mapping(func: str):
19
20
 
20
21
 
21
22
  def string_concat(*column: str):
23
+ """A simple wrapper to concatenate string columns in Polars."""
22
24
  return pl.col(column).cast(pl.Utf8).str.concat(delimiter=',')
23
25
 
24
26
 
@@ -28,27 +30,35 @@ FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'da
28
30
 
29
31
 
30
32
  def construct_join_key_name(side: SideLit, column_name: str) -> str:
33
+ """Creates a temporary, unique name for a join key column."""
31
34
  return "_FLOWFILE_JOIN_KEY_" + side.upper() + "_" + column_name
32
35
 
33
36
 
34
37
  class JoinKeyRename(NamedTuple):
38
+ """Represents the renaming of a join key from its original to a temporary name."""
35
39
  original_name: str
36
40
  temp_name: str
37
41
 
38
42
 
39
43
  class JoinKeyRenameResponse(NamedTuple):
44
+ """Contains a list of join key renames for one side of a join."""
40
45
  side: SideLit
41
46
  join_key_renames: List[JoinKeyRename]
42
47
 
43
48
 
44
49
  class FullJoinKeyResponse(NamedTuple):
50
+ """Holds the join key rename responses for both sides of a join."""
45
51
  left: JoinKeyRenameResponse
46
52
  right: JoinKeyRenameResponse
47
53
 
48
54
 
49
55
  @dataclass
50
56
  class SelectInput:
51
- # __slots__ = ['old_name', 'new_name', 'keep', 'data_type', 'data_type_change', 'join_key']
57
+ """Defines how a single column should be selected, renamed, or type-cast.
58
+
59
+ This is a core building block for any operation that involves column manipulation.
60
+ It holds all the configuration for a single field in a selection operation.
61
+ """
52
62
  old_name: str
53
63
  original_position: Optional[int] = None
54
64
  new_name: Optional[str] = None
@@ -80,6 +90,7 @@ class SelectInput:
80
90
 
81
91
  @property
82
92
  def polars_type(self) -> str:
93
+ """Translates a user-friendly type name to a Polars data type string."""
83
94
  if self.data_type.lower() == 'string':
84
95
  return 'Utf8'
85
96
  elif self.data_type.lower() == 'integer':
@@ -91,7 +102,7 @@ class SelectInput:
91
102
 
92
103
  @dataclass
93
104
  class FieldInput:
94
- # __slots__ = ['old_name', 'new_name', 'keep', 'data_type', 'data_type_change', 'join_key']
105
+ """Represents a single field with its name and data type, typically for defining an output column."""
95
106
  name: str
96
107
  data_type: Optional[str] = None
97
108
 
@@ -102,19 +113,22 @@ class FieldInput:
102
113
 
103
114
  @dataclass
104
115
  class FunctionInput:
116
+ """Defines a formula to be applied, including the output field information."""
105
117
  field: FieldInput
106
118
  function: str
107
119
 
108
120
 
109
121
  @dataclass
110
122
  class BasicFilter:
123
+ """Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value')."""
111
124
  field: str = ''
112
- filter_type: str = '' # equals, in, not in, smaller, larger
125
+ filter_type: str = ''
113
126
  filter_value: str = ''
114
127
 
115
128
 
116
129
  @dataclass
117
130
  class FilterInput:
131
+ """Defines the settings for a filter operation, supporting basic or advanced (expression-based) modes."""
118
132
  advanced_filter: str = ''
119
133
  basic_filter: BasicFilter = None
120
134
  filter_type: str = 'basic'
@@ -122,43 +136,54 @@ class FilterInput:
122
136
 
123
137
  @dataclass
124
138
  class SelectInputs:
139
+ """A container for a list of `SelectInput` objects, providing helper methods for managing selections."""
125
140
  renames: List[SelectInput]
126
141
 
127
142
  @property
128
143
  def old_cols(self) -> Set:
144
+ """Returns a set of original column names to be kept in the selection."""
129
145
  return set(v.old_name for v in self.renames if v.keep)
130
146
 
131
147
  @property
132
148
  def new_cols(self) -> Set:
149
+ """Returns a set of new (renamed) column names to be kept in the selection."""
133
150
  return set(v.new_name for v in self.renames if v.keep)
134
151
 
135
152
  @property
136
153
  def rename_table(self):
137
- return {v.old_name: v.new_name for v in self.renames if v.is_available}
154
+ """Generates a dictionary for use in Polars' `.rename()` method."""
155
+ return {v.old_name: v.new_name for v in self.renames if v.is_available and (v.keep or v.join_key)}
138
156
 
139
157
  def get_select_cols(self, include_join_key: bool = True):
158
+ """Gets a list of original column names to select from the source DataFrame."""
140
159
  return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
141
160
 
142
- def __add__(self, other: SelectInput):
161
+ def __add__(self, other: "SelectInput"):
162
+ """Allows adding a SelectInput using the '+' operator."""
143
163
  self.renames.append(other)
144
164
 
145
- def append(self, other: SelectInput):
165
+ def append(self, other: "SelectInput"):
166
+ """Appends a new SelectInput to the list of renames."""
146
167
  self.renames.append(other)
147
168
 
148
169
  def remove_select_input(self, old_key: str):
170
+ """Removes a SelectInput from the list based on its original name."""
149
171
  self.renames = [rename for rename in self.renames if rename.old_name != old_key]
150
172
 
151
173
  def unselect_field(self, old_key: str):
174
+ """Marks a field to be dropped from the final selection by setting `keep` to False."""
152
175
  for rename in self.renames:
153
176
  if old_key == rename.old_name:
154
177
  rename.keep = False
155
178
 
156
179
  @classmethod
157
- def create_from_list(cls, col_list: str):
180
+ def create_from_list(cls, col_list: List[str]):
181
+ """Creates a SelectInputs object from a simple list of column names."""
158
182
  return cls([SelectInput(c) for c in col_list])
159
183
 
160
184
  @classmethod
161
185
  def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame):
186
+ """Creates a SelectInputs object from a Polars DataFrame's columns."""
162
187
  return cls([SelectInput(c) for c in df.columns])
163
188
 
164
189
  def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
@@ -169,15 +194,18 @@ class SelectInputs:
169
194
 
170
195
 
171
196
  class JoinInputs(SelectInputs):
197
+ """Extends `SelectInputs` with functionality specific to join operations, like handling join keys."""
172
198
 
173
199
  def __init__(self, renames: List[SelectInput]):
174
200
  self.renames = renames
175
201
 
176
202
  @property
177
203
  def join_key_selects(self) -> List[SelectInput]:
204
+ """Returns only the `SelectInput` objects that are marked as join keys."""
178
205
  return [v for v in self.renames if v.join_key]
179
206
 
180
207
  def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
208
+ """Gets the temporary rename mapping for all join keys on one side of a join."""
181
209
  return JoinKeyRenameResponse(
182
210
  side,
183
211
  [JoinKeyRename(jk.new_name,
@@ -186,19 +214,20 @@ class JoinInputs(SelectInputs):
186
214
  )
187
215
 
188
216
  def get_join_key_rename_mapping(self, side: SideLit) -> Dict[str, str]:
217
+ """Returns a dictionary mapping original join key names to their temporary names."""
189
218
  return {jkr[0]: jkr[1] for jkr in self.get_join_key_renames(side)[1]}
190
219
 
191
220
 
192
221
  @dataclass
193
222
  class JoinMap:
194
- # __slots__ = "left_col", "right_col"
223
+ """Defines a single mapping between a left and right column for a join key."""
195
224
  left_col: str
196
225
  right_col: str
197
226
 
198
227
 
199
-
200
228
  @dataclass
201
229
  class FuzzyMap(JoinMap):
230
+ """Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
202
231
  threshold_score: Optional[float] = 80.0
203
232
  fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
204
233
  perc_unique: Optional[float] = 0.0
@@ -223,12 +252,13 @@ class FuzzyMap(JoinMap):
223
252
 
224
253
 
225
254
  class JoinSelectMixin:
226
- """Mixin for common join selection functionality"""
255
+ """A mixin providing common methods for join-like operations that involve left and right inputs."""
227
256
  left_select: JoinInputs = None
228
257
  right_select: JoinInputs = None
229
258
 
230
259
  @staticmethod
231
260
  def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> JoinInputs | None:
261
+ """Parses various input formats into a standardized `JoinInputs` object."""
232
262
  if all(isinstance(c, SelectInput) for c in select):
233
263
  return JoinInputs(select)
234
264
  elif all(isinstance(c, dict) for c in select):
@@ -241,6 +271,7 @@ class JoinSelectMixin:
241
271
  return JoinInputs([SelectInput(s, s) for s in select])
242
272
 
243
273
  def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
274
+ """Generates a new, non-conflicting column name by adding a suffix if necessary."""
244
275
  current_names = self.left_select.new_cols & self.right_select.new_cols
245
276
  if old_col_name not in current_names:
246
277
  return old_col_name
@@ -250,6 +281,7 @@ class JoinSelectMixin:
250
281
  old_col_name = f'{side}_{old_col_name}'
251
282
 
252
283
  def add_new_select_column(self, select_input: SelectInput, side: str):
284
+ """Adds a new column to the selection for either the left or right side."""
253
285
  selects = self.right_select if side == 'right' else self.left_select
254
286
  select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
255
287
  selects.__add__(select_input)
@@ -257,19 +289,23 @@ class JoinSelectMixin:
257
289
 
258
290
  @dataclass
259
291
  class CrossJoinInput(JoinSelectMixin):
292
+ """Defines the settings for a cross join operation, including column selections for both inputs."""
260
293
  left_select: SelectInputs = None
261
294
  right_select: SelectInputs = None
262
295
 
263
296
  def __init__(self, left_select: List[SelectInput] | List[str],
264
297
  right_select: List[SelectInput] | List[str]):
298
+ """Initializes the CrossJoinInput with selections for left and right tables."""
265
299
  self.left_select = self.parse_select(left_select)
266
300
  self.right_select = self.parse_select(right_select)
267
301
 
268
302
  @property
269
303
  def overlapping_records(self):
304
+ """Finds column names that would conflict after the join."""
270
305
  return self.left_select.new_cols & self.right_select.new_cols
271
306
 
272
307
  def auto_rename(self):
308
+ """Automatically renames columns on the right side to prevent naming conflicts."""
273
309
  overlapping_records = self.overlapping_records
274
310
  while len(overlapping_records) > 0:
275
311
  for right_col in self.right_select.renames:
@@ -280,13 +316,15 @@ class CrossJoinInput(JoinSelectMixin):
280
316
 
281
317
  @dataclass
282
318
  class JoinInput(JoinSelectMixin):
319
+ """Defines the settings for a standard SQL-style join, including keys, strategy, and selections."""
283
320
  join_mapping: List[JoinMap]
284
321
  left_select: JoinInputs = None
285
322
  right_select: JoinInputs = None
286
323
  how: JoinStrategy = 'inner'
287
324
 
288
325
  @staticmethod
289
- def parse_join_mapping(join_mapping: List[JoinMap] | Tuple[str, str] | str) -> List[JoinMap]:
326
+ def parse_join_mapping(join_mapping: any) -> List[JoinMap]:
327
+ """Parses various input formats for join keys into a standardized list of `JoinMap` objects."""
290
328
  if isinstance(join_mapping, (tuple, list)):
291
329
  assert len(join_mapping) > 0
292
330
  if all(isinstance(jm, dict) for jm in join_mapping):
@@ -309,6 +347,7 @@ class JoinInput(JoinSelectMixin):
309
347
  left_select: List[SelectInput] | List[str],
310
348
  right_select: List[SelectInput] | List[str],
311
349
  how: JoinStrategy = 'inner'):
350
+ """Initializes the JoinInput with keys, selections, and join strategy."""
312
351
  self.join_mapping = self.parse_join_mapping(join_mapping)
313
352
  self.left_select = self.parse_select(left_select)
314
353
  self.right_select = self.parse_select(right_select)
@@ -316,10 +355,12 @@ class JoinInput(JoinSelectMixin):
316
355
  self.how = how
317
356
 
318
357
  def set_join_keys(self):
358
+ """Marks the `SelectInput` objects corresponding to join keys."""
319
359
  [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
320
360
  [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
321
361
 
322
362
  def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
363
+ """Gets the temporary rename mappings for the join keys on both sides."""
323
364
  return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
324
365
  self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
325
366
 
@@ -335,18 +376,22 @@ class JoinInput(JoinSelectMixin):
335
376
 
336
377
  @property
337
378
  def _left_join_keys(self) -> Set:
379
+ """Returns a set of the left-side join key column names."""
338
380
  return set(jm.left_col for jm in self.join_mapping)
339
381
 
340
382
  @property
341
383
  def _right_join_keys(self) -> Set:
384
+ """Returns a set of the right-side join key column names."""
342
385
  return set(jm.right_col for jm in self.join_mapping)
343
386
 
344
387
  @property
345
- def left_join_keys(self) -> List:
388
+ def left_join_keys(self) -> List[str]:
389
+ """Returns an ordered list of the left-side join key column names to be used in the join."""
346
390
  return [jm.left_col for jm in self.used_join_mapping]
347
391
 
348
392
  @property
349
- def right_join_keys(self) -> List:
393
+ def right_join_keys(self) -> List[str]:
394
+ """Returns an ordered list of the right-side join key column names to be used in the join."""
350
395
  return [jm.right_col for jm in self.used_join_mapping]
351
396
 
352
397
  @property
@@ -357,6 +402,7 @@ class JoinInput(JoinSelectMixin):
357
402
  return self.left_select.new_cols & self.right_select.new_cols
358
403
 
359
404
  def auto_rename(self):
405
+ """Automatically renames columns on the right side to prevent naming conflicts."""
360
406
  self.set_join_keys()
361
407
  overlapping_records = self.overlapping_records
362
408
  while len(overlapping_records) > 0:
@@ -366,7 +412,8 @@ class JoinInput(JoinSelectMixin):
366
412
  overlapping_records = self.overlapping_records
367
413
 
368
414
  @property
369
- def used_join_mapping(self):
415
+ def used_join_mapping(self) -> List[JoinMap]:
416
+ """Returns the final join mapping after applying all renames and transformations."""
370
417
  new_mappings: List[JoinMap] = []
371
418
  left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
372
419
  left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
@@ -382,6 +429,7 @@ class JoinInput(JoinSelectMixin):
382
429
 
383
430
  @dataclass
384
431
  class FuzzyMatchInput(JoinInput):
432
+ """Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
385
433
  join_mapping: List[FuzzyMap]
386
434
  aggregate_output: bool = False
387
435
 
@@ -429,6 +477,7 @@ class FuzzyMatchInput(JoinInput):
429
477
 
430
478
  @property
431
479
  def fuzzy_maps(self) -> List[FuzzyMap]:
480
+ """Returns the final fuzzy mappings after applying all column renames."""
432
481
  new_mappings = []
433
482
  left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
434
483
  for org_fuzzy_map in self.join_mapping:
@@ -480,6 +529,7 @@ class AggColl:
480
529
  output_type: Optional[str] = None
481
530
 
482
531
  def __init__(self, old_name: str, agg: str, new_name: str = None, output_type: str = None):
532
+ """Initializes an aggregation column with its source, function, and new name."""
483
533
  self.old_name = str(old_name)
484
534
  if agg != 'groupby':
485
535
  self.new_name = new_name if new_name is not None else self.old_name + "_" + agg
@@ -490,6 +540,7 @@ class AggColl:
490
540
 
491
541
  @property
492
542
  def agg_func(self):
543
+ """Returns the corresponding Polars aggregation function from the `agg` string."""
493
544
  if self.agg == 'groupby':
494
545
  return self.agg
495
546
  elif self.agg == 'concat':
@@ -524,6 +575,7 @@ class GroupByInput:
524
575
 
525
576
  @dataclass
526
577
  class PivotInput:
578
+ """Defines the settings for a pivot (long-to-wide) operation."""
527
579
  index_columns: List[str]
528
580
  pivot_column: str
529
581
  value_col: str
@@ -531,9 +583,11 @@ class PivotInput:
531
583
 
532
584
  @property
533
585
  def grouped_columns(self) -> List[str]:
586
+ """Returns the list of columns to be used for the initial grouping stage of the pivot."""
534
587
  return self.index_columns + [self.pivot_column]
535
588
 
536
589
  def get_group_by_input(self) -> GroupByInput:
590
+ """Constructs the `GroupByInput` needed for the pre-aggregation step of the pivot."""
537
591
  group_by_cols = [AggColl(c, 'groupby') for c in self.grouped_columns]
538
592
  agg_cols = [AggColl(self.value_col, agg=aggregation, new_name=aggregation) for aggregation in self.aggregations]
539
593
  return GroupByInput(group_by_cols+agg_cols)
@@ -541,21 +595,25 @@ class PivotInput:
541
595
  def get_index_columns(self) -> List[pl.col]:
542
596
  return [pl.col(c) for c in self.index_columns]
543
597
 
544
- def get_pivot_column(self) -> pl.col:
598
+ def get_pivot_column(self) -> pl.Expr:
599
+ """Returns the pivot column as a Polars column expression."""
545
600
  return pl.col(self.pivot_column)
546
601
 
547
602
  def get_values_expr(self) -> pl.Expr:
603
+ """Creates the struct expression used to gather the values for pivoting."""
548
604
  return pl.struct([pl.col(c) for c in self.aggregations]).alias('vals')
549
605
 
550
606
 
551
607
  @dataclass
552
608
  class SortByInput:
609
+ """Defines a single sort condition on a column, including the direction."""
553
610
  column: str
554
611
  how: str = 'asc'
555
612
 
556
613
 
557
614
  @dataclass
558
615
  class RecordIdInput:
616
+ """Defines settings for adding a record ID (row number) column to the data."""
559
617
  output_column_name: str = 'record_id'
560
618
  offset: int = 1
561
619
  group_by: Optional[bool] = False
@@ -564,6 +622,7 @@ class RecordIdInput:
564
622
 
565
623
  @dataclass
566
624
  class TextToRowsInput:
625
+ """Defines settings for splitting a text column into multiple rows based on a delimiter."""
567
626
  column_to_split: str
568
627
  output_column_name: Optional[str] = None
569
628
  split_by_fixed_value: Optional[bool] = True
@@ -573,12 +632,14 @@ class TextToRowsInput:
573
632
 
574
633
  @dataclass
575
634
  class UnpivotInput:
635
+ """Defines settings for an unpivot (wide-to-long) operation."""
576
636
  index_columns: Optional[List[str]] = field(default_factory=list)
577
637
  value_columns: Optional[List[str]] = field(default_factory=list)
578
638
  data_type_selector: Optional[Literal['float', 'all', 'date', 'numeric', 'string']] = None
579
639
  data_type_selector_mode: Optional[Literal['data_type', 'column']] = 'column'
580
640
 
581
641
  def __post_init__(self):
642
+ """Ensures that list attributes are initialized correctly if they are None."""
582
643
  if self.index_columns is None:
583
644
  self.index_columns = []
584
645
  if self.value_columns is None:
@@ -587,7 +648,8 @@ class UnpivotInput:
587
648
  self.data_type_selector_mode = 'column'
588
649
 
589
650
  @property
590
- def data_type_selector_expr(self) -> Callable:
651
+ def data_type_selector_expr(self) -> Optional[Callable]:
652
+ """Returns a Polars selector function based on the `data_type_selector` string."""
591
653
  if self.data_type_selector_mode == 'data_type':
592
654
  if self.data_type_selector is not None:
593
655
  try:
@@ -600,17 +662,20 @@ class UnpivotInput:
600
662
 
601
663
  @dataclass
602
664
  class UnionInput:
665
+ """Defines settings for a union (concatenation) operation."""
603
666
  mode: Literal['selective', 'relaxed'] = 'relaxed'
604
667
 
605
668
 
606
669
  @dataclass
607
670
  class UniqueInput:
671
+ """Defines settings for a uniqueness operation, specifying columns and which row to keep."""
608
672
  columns: Optional[List[str]] = None
609
673
  strategy: Literal["first", "last", "any", "none"] = "any"
610
674
 
611
675
 
612
676
  @dataclass
613
677
  class GraphSolverInput:
678
+ """Defines settings for a graph-solving operation (e.g., finding connected components)."""
614
679
  col_from: str
615
680
  col_to: str
616
681
  output_column_name: Optional[str] = 'graph_group'
@@ -618,5 +683,5 @@ class GraphSolverInput:
618
683
 
619
684
  @dataclass
620
685
  class PolarsCodeInput:
686
+ """A simple container for a string of user-provided Polars code to be executed."""
621
687
  polars_code: str
622
-
@@ -138,11 +138,16 @@ def collect_batches(reader: pa.ipc.RecordBatchFileReader, n: int) -> Tuple[List[
138
138
  rows_collected = 0
139
139
 
140
140
  for batch in iter_batches(reader, n, rows_collected):
141
- batches.append(batch)
141
+
142
142
  rows_collected += batch.num_rows
143
143
  logger.debug(f"Collected batch: total rows now {rows_collected}")
144
144
  if rows_collected >= n:
145
+ if rows_collected > n:
146
+ batches.append(batch.slice(0, n - (rows_collected - batch.num_rows)))
147
+ else:
148
+ batches.append(batch)
145
149
  break
150
+ batches.append(batch)
146
151
 
147
152
  logger.info(f"Finished collecting {len(batches)} batches with {rows_collected} total rows")
148
153
  return batches, rows_collected
@@ -217,7 +222,7 @@ def read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> pa.Table:
217
222
 
218
223
  table = pa.Table.from_batches(batches) # type: ignore
219
224
  logger.info(f"Successfully read {rows_collected} rows from {file_path}")
220
- return table
225
+ return table
221
226
 
222
227
 
223
228
  def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Callable[[], pa.Table]:
@@ -244,4 +249,4 @@ def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Calla
244
249
  >>> table = reader_func()
245
250
  """
246
251
  logger.info(f"Creating reader function for {file_path} with n={n}, strict={strict}")
247
- return lambda: read_top_n(file_path, n, strict)
252
+ return lambda: read_top_n(file_path, n, strict)
@@ -34,8 +34,6 @@ def validate_setup():
34
34
  check_if_node_has_add_function_in_flow_graph(node)
35
35
  check_if_node_has_input_schema_definition(node)
36
36
 
37
- print("All nodes have corresponding functions in FlowGraph and input schema definitions.")
38
-
39
37
 
40
38
  if __name__ == "__main__":
41
39
  validate_setup()
@@ -31,7 +31,15 @@ from flowfile_frame.series import Series
31
31
 
32
32
  # File I/O
33
33
  from flowfile_frame.flow_frame_methods import ( # noqa: F401
34
- read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet)
34
+ read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet, scan_json_from_cloud_storage,
35
+ scan_parquet_from_cloud_storage,
36
+ scan_csv_from_cloud_storage,
37
+ scan_delta)
38
+
39
+ from flowfile_frame.cloud_storage.secret_manager import (del_cloud_storage_connection,
40
+ create_cloud_storage_connection,
41
+ get_all_available_cloud_storage_connections,
42
+ create_cloud_storage_connection_if_not_exists)
35
43
 
36
44
  from polars.datatypes import ( # noqa: F401
37
45
  # Integer types
File without changes
@@ -0,0 +1,39 @@
1
+ from typing import Optional, Literal
2
+
3
+ from polars._typing import (CsvEncoding)
4
+
5
+ from flowfile_core.flowfile.flow_graph import FlowGraph
6
+ from flowfile_core.schemas import input_schema, cloud_storage_schemas
7
+ from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
8
+ from flowfile_frame.utils import generate_node_id
9
+
10
+
11
+ def add_write_ff_to_cloud_storage(path: str,
12
+ flow_graph: Optional[FlowGraph],
13
+ depends_on_node_id: int,
14
+ *,
15
+ connection_name: Optional[str] = None,
16
+ write_mode: Literal["overwrite", "append"] = "overwrite",
17
+ file_format: Literal["csv", "parquet", "json", "delta"] = "parquet",
18
+ csv_delimiter: str = ";",
19
+ csv_encoding: CsvEncoding = "utf8",
20
+ parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
21
+ description: Optional[str] = None
22
+ ) -> int:
23
+ node_id = generate_node_id()
24
+ flow_id = flow_graph.flow_id
25
+ settings = input_schema.NodeCloudStorageWriter(
26
+ flow_id=flow_id,
27
+ node_id=node_id,
28
+ cloud_storage_settings=cloud_storage_schemas.CloudStorageWriteSettings(resource_path=path,
29
+ connection_name=connection_name,
30
+ file_format=file_format,
31
+ write_mode=write_mode,
32
+ csv_delimiter=csv_delimiter,
33
+ csv_encoding=csv_encoding,
34
+ parquet_compression=parquet_compression),
35
+ user_id=get_current_user_id(),
36
+ depending_on_id=depends_on_node_id,
37
+ description=description)
38
+ flow_graph.add_cloud_storage_writer(settings)
39
+ return node_id
@@ -0,0 +1,73 @@
1
+ from typing import List
2
+
3
+ from flowfile_core.schemas.cloud_storage_schemas import FullCloudStorageConnection, FullCloudStorageConnectionInterface
4
+ from flowfile_core.flowfile.database_connection_manager.db_connections import (store_cloud_connection,
5
+ get_all_cloud_connections_interface,
6
+ delete_cloud_connection)
7
+ from flowfile_core.database.connection import get_db_context
8
+ from flowfile_core.auth.jwt import get_current_user_sync, create_access_token
9
+ from asyncio import run
10
+
11
+
12
+ def get_current_user_id() -> int | None:
13
+ access_token = create_access_token(data={"sub": "local_user"})
14
+ with get_db_context() as db:
15
+ current_user_id = get_current_user_sync(
16
+ access_token,
17
+ db
18
+ ).id
19
+ return current_user_id
20
+
21
+
22
+ def create_cloud_storage_connection(connection: FullCloudStorageConnection) -> None:
23
+ """
24
+ Create a cloud storage connection using the provided connection details.
25
+
26
+ Args:
27
+ connection (FullCloudStorageConnection): The connection details for cloud storage.
28
+
29
+ Returns:
30
+ None
31
+ """
32
+ access_token = create_access_token(data={"sub": "local_user"})
33
+
34
+ with get_db_context() as db:
35
+ current_user_id = get_current_user_sync(
36
+ access_token,
37
+ db
38
+ ).id
39
+ store_cloud_connection(
40
+ db,
41
+ connection,
42
+ current_user_id
43
+ )
44
+
45
+
46
+ def create_cloud_storage_connection_if_not_exists(connection: FullCloudStorageConnection) -> None:
47
+ """
48
+ Create a cloud storage connection if it does not already exist.
49
+
50
+ Args:
51
+ connection (FullCloudStorageConnection): The connection details for cloud storage.
52
+
53
+ Returns:
54
+ None
55
+ """
56
+ all_connections = get_all_available_cloud_storage_connections()
57
+ if not any(conn.connection_name == connection.connection_name for conn in all_connections):
58
+ create_cloud_storage_connection(connection)
59
+
60
+
61
+ def get_all_available_cloud_storage_connections() -> List[FullCloudStorageConnectionInterface]:
62
+ with get_db_context() as db:
63
+ all_connections = get_all_cloud_connections_interface(
64
+ db,
65
+ get_current_user_id()
66
+ )
67
+ return all_connections
68
+
69
+
70
+ def del_cloud_storage_connection(connection_name: str) -> None:
71
+ with get_db_context() as db:
72
+ user_id = get_current_user_id()
73
+ delete_cloud_connection(db, connection_name, user_id)