Flowfile 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile has been flagged as potentially problematic in the source registry.

Files changed (145)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  5. flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
  6. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  7. flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
  8. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  9. flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
  10. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
  11. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
  14. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
  15. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
  16. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
  17. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
  20. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
  21. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
  22. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
  23. flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
  24. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  25. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
  26. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  27. flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
  28. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
  29. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
  30. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
  31. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
  32. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
  33. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
  34. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
  35. flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
  36. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
  37. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
  38. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
  39. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
  40. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
  41. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
  42. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
  43. flowfile/web/static/assets/api-6ef0dcef.js +80 -0
  44. flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
  45. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  46. flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
  47. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
  48. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
  49. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
  50. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
  51. flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
  52. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
  53. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
  54. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
  55. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
  56. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
  57. flowfile/web/static/index.html +1 -1
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
  59. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
  60. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
  61. flowfile_core/__init__.py +3 -0
  62. flowfile_core/auth/jwt.py +39 -0
  63. flowfile_core/configs/node_store/nodes.py +9 -6
  64. flowfile_core/configs/settings.py +6 -5
  65. flowfile_core/database/connection.py +63 -15
  66. flowfile_core/database/init_db.py +0 -1
  67. flowfile_core/database/models.py +49 -2
  68. flowfile_core/flowfile/code_generator/code_generator.py +472 -17
  69. flowfile_core/flowfile/connection_manager/models.py +1 -1
  70. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  71. flowfile_core/flowfile/extensions.py +1 -1
  72. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  73. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  74. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
  75. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  76. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  77. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  78. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  79. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  80. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  81. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  82. flowfile_core/flowfile/flow_graph.py +718 -253
  83. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  84. flowfile_core/flowfile/flow_node/flow_node.py +563 -117
  85. flowfile_core/flowfile/flow_node/models.py +154 -20
  86. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  87. flowfile_core/flowfile/handler.py +2 -33
  88. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  89. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  90. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  91. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  92. flowfile_core/flowfile/utils.py +35 -26
  93. flowfile_core/main.py +35 -15
  94. flowfile_core/routes/cloud_connections.py +77 -0
  95. flowfile_core/routes/logs.py +2 -7
  96. flowfile_core/routes/public.py +1 -0
  97. flowfile_core/routes/routes.py +130 -90
  98. flowfile_core/routes/secrets.py +72 -14
  99. flowfile_core/schemas/__init__.py +8 -0
  100. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  101. flowfile_core/schemas/input_schema.py +121 -71
  102. flowfile_core/schemas/output_model.py +19 -3
  103. flowfile_core/schemas/schemas.py +150 -12
  104. flowfile_core/schemas/transform_schema.py +175 -35
  105. flowfile_core/utils/utils.py +40 -1
  106. flowfile_core/utils/validate_setup.py +41 -0
  107. flowfile_frame/__init__.py +9 -1
  108. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  109. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  110. flowfile_frame/expr.py +28 -1
  111. flowfile_frame/expr.pyi +76 -61
  112. flowfile_frame/flow_frame.py +481 -208
  113. flowfile_frame/flow_frame.pyi +140 -91
  114. flowfile_frame/flow_frame_methods.py +160 -22
  115. flowfile_frame/group_frame.py +3 -0
  116. flowfile_frame/utils.py +25 -3
  117. flowfile_worker/external_sources/s3_source/main.py +216 -0
  118. flowfile_worker/external_sources/s3_source/models.py +142 -0
  119. flowfile_worker/funcs.py +51 -6
  120. flowfile_worker/models.py +22 -2
  121. flowfile_worker/routes.py +40 -38
  122. flowfile_worker/utils.py +1 -1
  123. test_utils/s3/commands.py +46 -0
  124. test_utils/s3/data_generator.py +292 -0
  125. test_utils/s3/demo_data_generator.py +186 -0
  126. test_utils/s3/fixtures.py +214 -0
  127. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  128. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  129. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  130. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  131. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  132. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  133. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  134. flowfile_core/schemas/defaults.py +0 -9
  135. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  136. flowfile_core/schemas/models.py +0 -193
  137. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  138. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  139. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  140. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  141. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  142. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  143. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
  144. {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  145. {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
@@ -4,8 +4,11 @@ import polars as pl
 from polars import selectors
 from copy import deepcopy
 
+from typing import NamedTuple
+
 
 def get_func_type_mapping(func: str):
+    """Infers the output data type of common aggregation functions."""
     if func in ["mean", "avg", "median", "std", "var"]:
         return "Float64"
     elif func in ['min', 'max', 'first', 'last', "cumsum", "sum"]:
@@ -17,16 +20,45 @@ def get_func_type_mapping(func: str):
 
 
 def string_concat(*column: str):
+    """A simple wrapper to concatenate string columns in Polars."""
     return pl.col(column).cast(pl.Utf8).str.concat(delimiter=',')
 
 
-JoinStrategy = Literal['inner', 'left', 'right', 'full', 'semi', 'anti', 'cross']
+SideLit = Literal["left", "right"]
+JoinStrategy = Literal['inner', 'left', 'right', 'full', 'semi', 'anti', 'cross', 'outer']
 FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'damerau_levenshtein', 'indel']
 
 
+def construct_join_key_name(side: SideLit, column_name: str) -> str:
+    """Creates a temporary, unique name for a join key column."""
+    return "_FLOWFILE_JOIN_KEY_" + side.upper() + "_" + column_name
+
+
+class JoinKeyRename(NamedTuple):
+    """Represents the renaming of a join key from its original to a temporary name."""
+    original_name: str
+    temp_name: str
+
+
+class JoinKeyRenameResponse(NamedTuple):
+    """Contains a list of join key renames for one side of a join."""
+    side: SideLit
+    join_key_renames: List[JoinKeyRename]
+
+
+class FullJoinKeyResponse(NamedTuple):
+    """Holds the join key rename responses for both sides of a join."""
+    left: JoinKeyRenameResponse
+    right: JoinKeyRenameResponse
+
+
 @dataclass
 class SelectInput:
-    # __slots__ = ['old_name', 'new_name', 'keep', 'data_type', 'data_type_change', 'join_key']
+    """Defines how a single column should be selected, renamed, or type-cast.
+
+    This is a core building block for any operation that involves column manipulation.
+    It holds all the configuration for a single field in a selection operation.
+    """
     old_name: str
     original_position: Optional[int] = None
     new_name: Optional[str] = None
@@ -58,6 +90,7 @@ class SelectInput:
 
     @property
     def polars_type(self) -> str:
+        """Translates a user-friendly type name to a Polars data type string."""
         if self.data_type.lower() == 'string':
             return 'Utf8'
         elif self.data_type.lower() == 'integer':
@@ -69,7 +102,7 @@
 
 @dataclass
 class FieldInput:
-    # __slots__ = ['old_name', 'new_name', 'keep', 'data_type', 'data_type_change', 'join_key']
+    """Represents a single field with its name and data type, typically for defining an output column."""
     name: str
     data_type: Optional[str] = None
 
@@ -80,19 +113,22 @@
 
 @dataclass
 class FunctionInput:
+    """Defines a formula to be applied, including the output field information."""
     field: FieldInput
     function: str
 
 
 @dataclass
 class BasicFilter:
+    """Defines a simple, single-condition filter (e.g., 'column' 'equals' 'value')."""
     field: str = ''
-    filter_type: str = '' # equals, in, not in, smaller, larger
+    filter_type: str = ''
    filter_value: str = ''
 
 
 @dataclass
 class FilterInput:
+    """Defines the settings for a filter operation, supporting basic or advanced (expression-based) modes."""
     advanced_filter: str = ''
     basic_filter: BasicFilter = None
     filter_type: str = 'basic'
@@ -100,49 +136,98 @@ FilterInput:
 
 @dataclass
 class SelectInputs:
+    """A container for a list of `SelectInput` objects, providing helper methods for managing selections."""
     renames: List[SelectInput]
 
     @property
     def old_cols(self) -> Set:
+        """Returns a set of original column names to be kept in the selection."""
         return set(v.old_name for v in self.renames if v.keep)
 
     @property
     def new_cols(self) -> Set:
-        return set(v.new_name for v in self.renames if v.keep or v.join_key)
+        """Returns a set of new (renamed) column names to be kept in the selection."""
+        return set(v.new_name for v in self.renames if v.keep)
 
     @property
     def rename_table(self):
-        return {v.old_name: v.new_name for v in self.renames if (v.keep or v.join_key) and v.is_available}
+        """Generates a dictionary for use in Polars' `.rename()` method."""
+        return {v.old_name: v.new_name for v in self.renames if v.is_available and (v.keep or v.join_key)}
 
     def get_select_cols(self, include_join_key: bool = True):
+        """Gets a list of original column names to select from the source DataFrame."""
         return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
 
-    def __add__(self, other: SelectInput):
+    def __add__(self, other: "SelectInput"):
+        """Allows adding a SelectInput using the '+' operator."""
         self.renames.append(other)
 
-    def append(self, other: SelectInput):
+    def append(self, other: "SelectInput"):
+        """Appends a new SelectInput to the list of renames."""
         self.renames.append(other)
 
     def remove_select_input(self, old_key: str):
+        """Removes a SelectInput from the list based on its original name."""
         self.renames = [rename for rename in self.renames if rename.old_name != old_key]
 
+    def unselect_field(self, old_key: str):
+        """Marks a field to be dropped from the final selection by setting `keep` to False."""
+        for rename in self.renames:
+            if old_key == rename.old_name:
+                rename.keep = False
+
     @classmethod
-    def create_from_list(cls, col_list: str):
+    def create_from_list(cls, col_list: List[str]):
+        """Creates a SelectInputs object from a simple list of column names."""
         return cls([SelectInput(c) for c in col_list])
 
     @classmethod
     def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame):
+        """Creates a SelectInputs object from a Polars DataFrame's columns."""
         return cls([SelectInput(c) for c in df.columns])
 
+    def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
+        return next((v for v in self.renames if v.old_name == old_name), None)
+
+    def get_select_input_on_new_name(self, old_name: str) -> SelectInput | None:
+        return next((v for v in self.renames if v.new_name == old_name), None)
+
+
+class JoinInputs(SelectInputs):
+    """Extends `SelectInputs` with functionality specific to join operations, like handling join keys."""
+
+    def __init__(self, renames: List[SelectInput]):
+        self.renames = renames
+
+    @property
+    def join_key_selects(self) -> List[SelectInput]:
+        """Returns only the `SelectInput` objects that are marked as join keys."""
+        return [v for v in self.renames if v.join_key]
+
+    def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
+        """Gets the temporary rename mapping for all join keys on one side of a join."""
+        return JoinKeyRenameResponse(
+            side,
+            [JoinKeyRename(jk.new_name,
+                           construct_join_key_name(side, jk.new_name))
+             for jk in self.join_key_selects if jk.keep or not filter_drop]
+        )
+
+    def get_join_key_rename_mapping(self, side: SideLit) -> Dict[str, str]:
+        """Returns a dictionary mapping original join key names to their temporary names."""
+        return {jkr[0]: jkr[1] for jkr in self.get_join_key_renames(side)[1]}
+
 
 @dataclass
 class JoinMap:
+    """Defines a single mapping between a left and right column for a join key."""
     left_col: str
     right_col: str
 
 
 @dataclass
 class FuzzyMap(JoinMap):
+    """Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
     threshold_score: Optional[float] = 80.0
     fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
     perc_unique: Optional[float] = 0.0
@@ -167,22 +252,26 @@ class FuzzyMap(JoinMap):
 
 
 class JoinSelectMixin:
-    """Mixin for common join selection functionality"""
+    """A mixin providing common methods for join-like operations that involve left and right inputs."""
+    left_select: JoinInputs = None
+    right_select: JoinInputs = None
 
     @staticmethod
-    def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
+    def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> JoinInputs | None:
+        """Parses various input formats into a standardized `JoinInputs` object."""
         if all(isinstance(c, SelectInput) for c in select):
-            return SelectInputs(select)
+            return JoinInputs(select)
         elif all(isinstance(c, dict) for c in select):
-            return SelectInputs([SelectInput(**c) for c in select])
+            return JoinInputs([SelectInput(**c.__dict__) for c in select])
         elif isinstance(select, dict):
             renames = select.get('renames')
             if renames:
-                return SelectInputs([SelectInput(**c) for c in renames])
+                return JoinInputs([SelectInput(**c) for c in renames])
         elif all(isinstance(c, str) for c in select):
-            return SelectInputs([SelectInput(s, s) for s in select])
+            return JoinInputs([SelectInput(s, s) for s in select])
 
     def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
+        """Generates a new, non-conflicting column name by adding a suffix if necessary."""
         current_names = self.left_select.new_cols & self.right_select.new_cols
         if old_col_name not in current_names:
             return old_col_name
@@ -192,6 +281,7 @@ class JoinSelectMixin:
             old_col_name = f'{side}_{old_col_name}'
 
     def add_new_select_column(self, select_input: SelectInput, side: str):
+        """Adds a new column to the selection for either the left or right side."""
         selects = self.right_select if side == 'right' else self.left_select
         select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
         selects.__add__(select_input)
@@ -199,19 +289,23 @@
 
 @dataclass
 class CrossJoinInput(JoinSelectMixin):
+    """Defines the settings for a cross join operation, including column selections for both inputs."""
     left_select: SelectInputs = None
     right_select: SelectInputs = None
 
     def __init__(self, left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str]):
+        """Initializes the CrossJoinInput with selections for left and right tables."""
         self.left_select = self.parse_select(left_select)
         self.right_select = self.parse_select(right_select)
 
     @property
     def overlapping_records(self):
+        """Finds column names that would conflict after the join."""
         return self.left_select.new_cols & self.right_select.new_cols
 
     def auto_rename(self):
+        """Automatically renames columns on the right side to prevent naming conflicts."""
         overlapping_records = self.overlapping_records
         while len(overlapping_records) > 0:
             for right_col in self.right_select.renames:
@@ -222,13 +316,15 @@ class CrossJoinInput(JoinSelectMixin):
 
 @dataclass
 class JoinInput(JoinSelectMixin):
+    """Defines the settings for a standard SQL-style join, including keys, strategy, and selections."""
     join_mapping: List[JoinMap]
-    left_select: SelectInputs = None
-    right_select: SelectInputs = None
+    left_select: JoinInputs = None
+    right_select: JoinInputs = None
     how: JoinStrategy = 'inner'
 
     @staticmethod
-    def parse_join_mapping(join_mapping: List[JoinMap] | Tuple[str, str] | str) -> List[JoinMap]:
+    def parse_join_mapping(join_mapping: any) -> List[JoinMap]:
+        """Parses various input formats for join keys into a standardized list of `JoinMap` objects."""
         if isinstance(join_mapping, (tuple, list)):
             assert len(join_mapping) > 0
             if all(isinstance(jm, dict) for jm in join_mapping):
@@ -251,39 +347,63 @@ class JoinInput(JoinSelectMixin):
                 left_select: List[SelectInput] | List[str],
                 right_select: List[SelectInput] | List[str],
                 how: JoinStrategy = 'inner'):
+        """Initializes the JoinInput with keys, selections, and join strategy."""
         self.join_mapping = self.parse_join_mapping(join_mapping)
         self.left_select = self.parse_select(left_select)
         self.right_select = self.parse_select(right_select)
+        self.set_join_keys()
+        self.how = how
+
+    def set_join_keys(self):
+        """Marks the `SelectInput` objects corresponding to join keys."""
         [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
         [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
-        self.how = how
+
+    def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
+        """Gets the temporary rename mappings for the join keys on both sides."""
+        return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
+                                   self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
+
+    def get_names_for_table_rename(self) -> List[JoinMap]:
+        new_mappings: List[JoinMap] = []
+        left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
+        for join_map in self.join_mapping:
+            new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col, join_map.left_col),
+                                        right_rename_table.get(join_map.right_col, join_map.right_col)
                                        )
                                )
+        return new_mappings
 
     @property
     def _left_join_keys(self) -> Set:
+        """Returns a set of the left-side join key column names."""
         return set(jm.left_col for jm in self.join_mapping)
 
     @property
     def _right_join_keys(self) -> Set:
+        """Returns a set of the right-side join key column names."""
         return set(jm.right_col for jm in self.join_mapping)
 
     @property
-    def left_join_keys(self) -> List:
-        return [self.left_select.rename_table.get(jm.left_col) for jm in self.join_mapping]
+    def left_join_keys(self) -> List[str]:
+        """Returns an ordered list of the left-side join key column names to be used in the join."""
+        return [jm.left_col for jm in self.used_join_mapping]
 
     @property
-    def right_join_keys(self) -> List:
-        return [self.right_select.rename_table.get(jm.right_col, jm.right_col) for jm in self.join_mapping]
+    def right_join_keys(self) -> List[str]:
+        """Returns an ordered list of the right-side join key column names to be used in the join."""
+        return [jm.right_col for jm in self.used_join_mapping]
 
     @property
     def overlapping_records(self):
         if self.how in ('left', 'right', 'inner'):
-            # Never consider join keys as overlapping records since they will be dropped after the join
-            return ((self.left_select.new_cols & self.right_select.new_cols) -
-                    (set(self.left_join_keys) & set(self.right_join_keys)))
+            return self.left_select.new_cols & self.right_select.new_cols
         else:
             return self.left_select.new_cols & self.right_select.new_cols
 
     def auto_rename(self):
+        """Automatically renames columns on the right side to prevent naming conflicts."""
+        self.set_join_keys()
         overlapping_records = self.overlapping_records
         while len(overlapping_records) > 0:
             for right_col in self.right_select.renames:
@@ -292,13 +412,16 @@
             overlapping_records = self.overlapping_records
 
     @property
-    def join_mappings(self):
-        new_mappings = []
+    def used_join_mapping(self) -> List[JoinMap]:
+        """Returns the final join mapping after applying all renames and transformations."""
+        new_mappings: List[JoinMap] = []
         left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
+        left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
+        right_join_rename_mapping: Dict[str, str] = self.right_select.get_join_key_rename_mapping("right")
        for join_map in self.join_mapping:
             # del self.right_select.rename_table, self.left_select.rename_table
-            new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col),
-                                        right_rename_table.get(join_map.right_col)
+            new_mappings.append(JoinMap(left_join_rename_mapping.get(left_rename_table.get(join_map.left_col, join_map.left_col)),
+                                        right_join_rename_mapping.get(right_rename_table.get(join_map.right_col, join_map.right_col))
                                        )
                                )
         return new_mappings
@@ -306,6 +429,7 @@ class JoinInput(JoinSelectMixin):
 
 @dataclass
 class FuzzyMatchInput(JoinInput):
+    """Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
     join_mapping: List[FuzzyMap]
     aggregate_output: bool = False
 
@@ -332,7 +456,7 @@ class FuzzyMatchInput(JoinInput):
         return fuzz_mapping
 
     def __init__(self, join_mapping: List[FuzzyMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
-                 right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: str = 'inner'):
+                 right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
         self.join_mapping = self.parse_fuzz_mapping(join_mapping)
         self.left_select = self.parse_select(left_select)
         self.right_select = self.parse_select(right_select)
@@ -353,6 +477,7 @@ class FuzzyMatchInput(JoinInput):
 
     @property
     def fuzzy_maps(self) -> List[FuzzyMap]:
+        """Returns the final fuzzy mappings after applying all column renames."""
         new_mappings = []
         left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
         for org_fuzzy_map in self.join_mapping:
@@ -404,6 +529,7 @@ class AggColl:
     output_type: Optional[str] = None
 
     def __init__(self, old_name: str, agg: str, new_name: str = None, output_type: str = None):
+        """Initializes an aggregation column with its source, function, and new name."""
         self.old_name = str(old_name)
         if agg != 'groupby':
             self.new_name = new_name if new_name is not None else self.old_name + "_" + agg
@@ -414,6 +540,7 @@ class AggColl:
 
     @property
     def agg_func(self):
+        """Returns the corresponding Polars aggregation function from the `agg` string."""
         if self.agg == 'groupby':
             return self.agg
         elif self.agg == 'concat':
@@ -448,6 +575,7 @@ class GroupByInput:
 
 @dataclass
 class PivotInput:
+    """Defines the settings for a pivot (long-to-wide) operation."""
     index_columns: List[str]
     pivot_column: str
     value_col: str
@@ -455,9 +583,11 @@ class PivotInput:
 
     @property
     def grouped_columns(self) -> List[str]:
+        """Returns the list of columns to be used for the initial grouping stage of the pivot."""
         return self.index_columns + [self.pivot_column]
 
     def get_group_by_input(self) -> GroupByInput:
+        """Constructs the `GroupByInput` needed for the pre-aggregation step of the pivot."""
         group_by_cols = [AggColl(c, 'groupby') for c in self.grouped_columns]
         agg_cols = [AggColl(self.value_col, agg=aggregation, new_name=aggregation) for aggregation in self.aggregations]
         return GroupByInput(group_by_cols+agg_cols)
@@ -465,22 +595,25 @@ class PivotInput:
     def get_index_columns(self) -> List[pl.col]:
         return [pl.col(c) for c in self.index_columns]
 
-    def get_pivot_column(self) -> pl.col:
+    def get_pivot_column(self) -> pl.Expr:
+        """Returns the pivot column as a Polars column expression."""
         return pl.col(self.pivot_column)
 
     def get_values_expr(self) -> pl.Expr:
+        """Creates the struct expression used to gather the values for pivoting."""
         return pl.struct([pl.col(c) for c in self.aggregations]).alias('vals')
 
 
-
 @dataclass
 class SortByInput:
+    """Defines a single sort condition on a column, including the direction."""
     column: str
     how: str = 'asc'
 
 
 @dataclass
 class RecordIdInput:
+    """Defines settings for adding a record ID (row number) column to the data."""
     output_column_name: str = 'record_id'
     offset: int = 1
     group_by: Optional[bool] = False
@@ -489,6 +622,7 @@ class RecordIdInput:
 
 @dataclass
 class TextToRowsInput:
+    """Defines settings for splitting a text column into multiple rows based on a delimiter."""
     column_to_split: str
     output_column_name: Optional[str] = None
     split_by_fixed_value: Optional[bool] = True
@@ -498,12 +632,14 @@ class TextToRowsInput:
 
 @dataclass
 class UnpivotInput:
+    """Defines settings for an unpivot (wide-to-long) operation."""
     index_columns: Optional[List[str]] = field(default_factory=list)
     value_columns: Optional[List[str]] = field(default_factory=list)
     data_type_selector: Optional[Literal['float', 'all', 'date', 'numeric', 'string']] = None
     data_type_selector_mode: Optional[Literal['data_type', 'column']] = 'column'
 
     def __post_init__(self):
+        """Ensures that list attributes are initialized correctly if they are None."""
         if self.index_columns is None:
             self.index_columns = []
         if self.value_columns is None:
@@ -512,7 +648,8 @@ class UnpivotInput:
             self.data_type_selector_mode = 'column'
 
     @property
-    def data_type_selector_expr(self) -> Callable:
+    def data_type_selector_expr(self) -> Optional[Callable]:
+        """Returns a Polars selector function based on the `data_type_selector` string."""
         if self.data_type_selector_mode == 'data_type':
             if self.data_type_selector is not None:
                 try:
@@ -525,17 +662,20 @@ class UnpivotInput:
 
 @dataclass
 class UnionInput:
+    """Defines settings for a union (concatenation) operation."""
     mode: Literal['selective', 'relaxed'] = 'relaxed'
 
 
 @dataclass
 class UniqueInput:
+    """Defines settings for a uniqueness operation, specifying columns and which row to keep."""
     columns: Optional[List[str]] = None
     strategy: Literal["first", "last", "any", "none"] = "any"
 
 
 @dataclass
 class GraphSolverInput:
+    """Defines settings for a graph-solving operation (e.g., finding connected components)."""
     col_from: str
     col_to: str
     output_column_name: Optional[str] = 'graph_group'
@@ -543,5 +683,5 @@ class GraphSolverInput:
 
 @dataclass
 class PolarsCodeInput:
+    """A simple container for a string of user-provided Polars code to be executed."""
     polars_code: str
-
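
The hunks above appear to belong to flowfile_core/schemas/transform_schema.py (entry 104 in the file list). As an illustration only, not part of the diff, the new join-key helpers work roughly as follows; the import path and the column name are assumptions:

from flowfile_core.schemas.transform_schema import (  # assumed module path
    construct_join_key_name, JoinKeyRename, JoinKeyRenameResponse)

# Join keys are routed through temporary, collision-proof column names during a join.
temp_name = construct_join_key_name("left", "customer_id")
# temp_name == "_FLOWFILE_JOIN_KEY_LEFT_customer_id"

# JoinInputs.get_join_key_renames() bundles one such rename per join key and side, which
# appears to be what lets used_join_mapping resolve user renames before the join runs.
response = JoinKeyRenameResponse(
    side="left",
    join_key_renames=[JoinKeyRename(original_name="customer_id", temp_name=temp_name)],
)
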
@@ -1,8 +1,47 @@
 import re
-
+from itertools import chain
+from typing import List, Dict
 
 def camel_case_to_snake_case(text: str) -> str:
     # Use a regular expression to find capital letters and replace them with _ followed by the lowercase letter
     transformed_text = re.sub(r'(?<!^)(?=[A-Z])', '_', text).lower()
     return transformed_text
 
+
+def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True):
+    all_cols = (data.keys() for data in datas)
+    if not respect_order:
+        unique_cols = set(chain(*all_cols))
+    else:
+        col_store = set()
+        unique_cols = list()
+        for row in all_cols:
+            for col in row:
+                if col not in col_store:
+                    unique_cols.append(col)
+                    col_store.update((col,))
+    output = []
+    for data in datas:
+        new_record = dict()
+        for col in unique_cols:
+            val = data.get(col)
+            new_record[col] = val
+        output.append(new_record)
+    return output
+
+
+def convert_to_string(v):
+    try:
+        return str(v)
+    except:
+        return None
+
+
+def standardize_col_dtype(vals):
+    types = set(type(val) for val in vals)
+    if len(types) == 1:
+        return vals
+    elif int in types and float in types:
+        return vals
+    else:
+        return [convert_to_string(v) for v in vals]
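
This hunk matches flowfile_core/utils/utils.py (entry 105, +40 -1). A short illustration of the new helpers, not part of the diff; the import path is inferred from the file list:

from flowfile_core.utils.utils import ensure_similarity_dicts, standardize_col_dtype

# Pads ragged records so every dict exposes the same keys, keeping first-seen key order.
ensure_similarity_dicts([{"a": 1, "b": 2}, {"b": 3, "c": 4}])
# -> [{'a': 1, 'b': 2, 'c': None}, {'a': None, 'b': 3, 'c': 4}]

# A pure int/float mix is left untouched; any other mix of types is coerced to strings.
standardize_col_dtype([1, 2.5])     # -> [1, 2.5]
standardize_col_dtype([1, "two"])   # -> ['1', 'two']
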
@@ -0,0 +1,41 @@
+"""This script runs on run time and checks if all the nodes that are created have a function in the flow_graph as well
+as have a component in flowfile_frontend"""
+
+from flowfile_core.schemas import input_schema
+from flowfile_core.flowfile.flow_graph import FlowGraph
+from flowfile_core.configs.node_store.nodes import nodes_list, NodeTemplate
+import inspect
+
+
+def check_if_node_has_add_function_in_flow_graph(node: NodeTemplate):
+    func_name = "add_" + node.item
+    if not hasattr(FlowGraph, func_name):
+        raise ValueError(
+            f"Node {node.name} ({node.item}) does not have a corresponding function in FlowGraph: {func_name}"
+            "Check if the function is implemented in flow_graph.py or if the node item is correct."
+        )
+
+
+def check_if_node_has_input_schema_definition(node: NodeTemplate):
+    if "node"+node.item.replace("_","") not in {k.lower() for k in inspect.getmodule(input_schema).__dict__.keys()}:
+        raise ValueError(
+            f"Node {node.name} ({node.item}) does not have a corresponding input schema definition in input_schema.py."
+            "Check if the schema is implemented or if the node item is correct."
+        )
+
+
+def validate_setup():
+    """
+    Validates the setup by checking if all nodes in the nodes_list have a corresponding function in FlowGraph
+    and a corresponding input schema definition in input_schema.py.
+    Raises ValueError if any node is missing either.
+    """
+    for node in nodes_list:
+        check_if_node_has_add_function_in_flow_graph(node)
+        check_if_node_has_input_schema_definition(node)
+
+    print("All nodes have corresponding functions in FlowGraph and input schema definitions.")
+
+
+if __name__ == "__main__":
+    validate_setup()
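
This new module matches flowfile_core/utils/validate_setup.py (entry 106). Illustration only, not part of the diff: the validator can also be invoked programmatically at start-up to fail fast when a registered node is incomplete:

from flowfile_core.utils.validate_setup import validate_setup

validate_setup()  # raises ValueError if any NodeTemplate lacks an add_<item> method or input schema
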
@@ -31,7 +31,15 @@ from flowfile_frame.series import Series
 
 # File I/O
 from flowfile_frame.flow_frame_methods import ( # noqa: F401
-    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet)
+    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet, scan_json_from_cloud_storage,
+    scan_parquet_from_cloud_storage,
+    scan_csv_from_cloud_storage,
+    scan_delta)
+
+from flowfile_frame.cloud_storage.secret_manager import (del_cloud_storage_connection,
+                                                         create_cloud_storage_connection,
+                                                         get_all_available_cloud_storage_connections,
+                                                         create_cloud_storage_connection_if_not_exists)
 
 from polars.datatypes import ( # noqa: F401
     # Integer types
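
The hunk above extends flowfile_frame/__init__.py (entry 107). Illustration only, not part of the diff: after this change the new cloud-storage entry points are importable straight from the package root. Their signatures are not shown in this diff, so only the re-exported names are demonstrated:

from flowfile_frame import (
    scan_csv_from_cloud_storage,
    scan_parquet_from_cloud_storage,
    scan_json_from_cloud_storage,
    scan_delta,
    create_cloud_storage_connection,
    create_cloud_storage_connection_if_not_exists,
    get_all_available_cloud_storage_connections,
    del_cloud_storage_connection,
)
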
@@ -0,0 +1,39 @@
+from typing import Optional, Literal
+
+from polars._typing import (CsvEncoding)
+
+from flowfile_core.flowfile.flow_graph import FlowGraph
+from flowfile_core.schemas import input_schema, cloud_storage_schemas
+from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
+from flowfile_frame.utils import generate_node_id
+
+
+def add_write_ff_to_cloud_storage(path: str,
+                                  flow_graph: Optional[FlowGraph],
+                                  depends_on_node_id: int,
+                                  *,
+                                  connection_name: Optional[str] = None,
+                                  write_mode: Literal["overwrite", "append"] = "overwrite",
+                                  file_format: Literal["csv", "parquet", "json", "delta"] = "parquet",
+                                  csv_delimiter: str = ";",
+                                  csv_encoding: CsvEncoding = "utf8",
+                                  parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
+                                  description: Optional[str] = None
+                                  ) -> int:
+    node_id = generate_node_id()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageWriter(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageWriteSettings(resource_path=path,
+                                                                               connection_name=connection_name,
+                                                                               file_format=file_format,
+                                                                               write_mode=write_mode,
+                                                                               csv_delimiter=csv_delimiter,
+                                                                               csv_encoding=csv_encoding,
+                                                                               parquet_compression=parquet_compression),
+        user_id=get_current_user_id(),
+        depending_on_id=depends_on_node_id,
+        description=description)
+    flow_graph.add_cloud_storage_writer(settings)
+    return node_id
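
This new helper is flowfile_frame/cloud_storage/frame_helpers.py (entry 108). A hypothetical call, not part of the diff, assuming an existing FlowGraph named graph, an upstream node id, and a previously created cloud-storage connection:

# All concrete values below (bucket path, connection name, graph variables) are made up for illustration.
writer_node_id = add_write_ff_to_cloud_storage(
    "s3://my-bucket/output/result.parquet",
    flow_graph=graph,
    depends_on_node_id=upstream_node_id,
    connection_name="my-s3-connection",
    file_format="parquet",
    write_mode="overwrite",
)
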