Flowfile 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic; see the registry's advisory page for more details.

Files changed (121) hide show
  1. flowfile/__init__.py +3 -3
  2. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  3. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  4. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  5. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  6. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  7. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  8. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  9. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  12. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  13. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  14. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  15. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  16. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  17. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  18. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  19. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  20. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  21. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  22. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  23. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  24. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  25. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  26. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  27. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  28. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  29. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  30. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  31. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  32. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  33. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  34. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  35. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  36. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  37. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  38. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  39. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  40. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  41. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  42. flowfile/web/static/assets/api-fb67319c.js +80 -0
  43. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  44. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  45. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  46. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  47. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  48. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  49. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  50. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  51. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  52. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  53. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  54. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  55. flowfile/web/static/index.html +1 -1
  56. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  57. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  59. flowfile_core/__init__.py +2 -0
  60. flowfile_core/configs/node_store/nodes.py +8 -6
  61. flowfile_core/database/connection.py +63 -15
  62. flowfile_core/database/init_db.py +0 -1
  63. flowfile_core/database/models.py +49 -2
  64. flowfile_core/flowfile/code_generator/code_generator.py +401 -17
  65. flowfile_core/flowfile/connection_manager/models.py +1 -1
  66. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  67. flowfile_core/flowfile/extensions.py +1 -1
  68. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  69. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  70. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  71. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  72. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  73. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  74. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  75. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  76. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  77. flowfile_core/flowfile/flow_graph.py +119 -82
  78. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  79. flowfile_core/flowfile/flow_node/models.py +32 -3
  80. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  81. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  82. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  83. flowfile_core/flowfile/utils.py +1 -23
  84. flowfile_core/main.py +3 -2
  85. flowfile_core/routes/cloud_connections.py +81 -0
  86. flowfile_core/routes/logs.py +0 -1
  87. flowfile_core/routes/routes.py +3 -39
  88. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  89. flowfile_core/schemas/input_schema.py +37 -15
  90. flowfile_core/schemas/schemas.py +7 -2
  91. flowfile_core/schemas/transform_schema.py +97 -22
  92. flowfile_core/utils/utils.py +40 -1
  93. flowfile_core/utils/validate_setup.py +41 -0
  94. flowfile_frame/flow_frame.py +253 -102
  95. flowfile_frame/flow_frame_methods.py +13 -13
  96. flowfile_worker/external_sources/s3_source/main.py +216 -0
  97. flowfile_worker/external_sources/s3_source/models.py +142 -0
  98. flowfile_worker/funcs.py +51 -6
  99. flowfile_worker/models.py +22 -2
  100. flowfile_worker/routes.py +40 -38
  101. flowfile_worker/utils.py +1 -1
  102. test_utils/s3/commands.py +46 -0
  103. test_utils/s3/data_generator.py +291 -0
  104. test_utils/s3/fixtures.py +209 -0
  105. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  106. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  107. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  108. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  109. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  112. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  113. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  114. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  115. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  116. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  117. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  118. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  119. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  120. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  121. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
"""Cloud storage connection schemas for S3, ADLS, and other cloud providers."""

import base64
from typing import Literal, Optional

import polars as pl
from pydantic import BaseModel, Field, SecretStr, field_validator

from flowfile_core.secret_manager.secret_manager import encrypt_secret

# Supported cloud storage backends.
CloudStorageType = Literal["s3", "adls", "gcs"]
# Every authentication method accepted across storage types.
# "auto" and "reference" are included because CloudStorageSettings uses
# "auto" as its default auth_mode and validates against "reference";
# without them, those values would fail Literal validation.
AuthMethod = Literal["access_key", "iam_role", "service_principal", "managed_identity",
                     "sas_token", "aws-cli", "env_vars", "auto", "reference"]
15
def encrypt_for_worker(secret_value: Optional[SecretStr]) -> Optional[str]:
    """Encrypt a secret so it can be passed safely to a worker process.

    Delegates to the core secret manager's ``encrypt_secret``; this performs
    real encryption (the previous docstring wrongly described it as a
    placeholder).

    Args:
        secret_value: The secret to encrypt, or ``None``.

    Returns:
        The encrypted secret string, or ``None`` when no secret was given.
    """
    if secret_value is None:
        return None
    return encrypt_secret(secret_value.get_secret_value())
25
class AuthSettingsInput(BaseModel):
    """User-supplied details describing how to connect to a cloud provider."""

    storage_type: CloudStorageType
    auth_method: AuthMethod
    # Reference to a stored connection holding the actual credentials.
    # Defaults to None (no stored connection); the previous default was the
    # literal string "None", which is truthy and would wrongly read as a
    # real connection name in downstream checks.
    connection_name: Optional[str] = None
35
class FullCloudStorageConnectionWorkerInterface(AuthSettingsInput):
    """Connection payload sent to the worker process.

    Secret-bearing fields are plain strings here because they carry values
    already encrypted via ``encrypt_for_worker`` (see
    ``FullCloudStorageConnection.get_worker_interface``) — not plaintext.
    """

    # AWS S3
    aws_region: Optional[str] = None
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None  # encrypted value
    aws_role_arn: Optional[str] = None
    aws_allow_unsafe_html: Optional[bool] = None
    aws_session_token: Optional[str] = None  # encrypted value

    # Azure ADLS
    azure_account_name: Optional[str] = None
    azure_account_key: Optional[str] = None  # encrypted value
    azure_tenant_id: Optional[str] = None
    azure_client_id: Optional[str] = None
    azure_client_secret: Optional[str] = None  # encrypted value

    # Common
    endpoint_url: Optional[str] = None  # custom endpoint, e.g. for S3-compatible stores
    verify_ssl: bool = True
58
class FullCloudStorageConnection(AuthSettingsInput):
    """Internal model holding the full connection, with secrets as SecretStr."""

    # AWS S3
    aws_region: Optional[str] = None
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[SecretStr] = None
    aws_role_arn: Optional[str] = None
    aws_allow_unsafe_html: Optional[bool] = None
    aws_session_token: Optional[SecretStr] = None

    # Azure ADLS
    azure_account_name: Optional[str] = None
    azure_account_key: Optional[SecretStr] = None
    azure_tenant_id: Optional[str] = None
    azure_client_id: Optional[str] = None
    azure_client_secret: Optional[SecretStr] = None

    # Common
    endpoint_url: Optional[str] = None
    verify_ssl: bool = True

    def get_worker_interface(self) -> "FullCloudStorageConnectionWorkerInterface":
        """Convert to the worker payload, encrypting every secret field.

        Non-secret fields are copied as-is; each SecretStr field is run
        through ``encrypt_for_worker`` so the worker never receives
        plaintext secrets.
        """
        return FullCloudStorageConnectionWorkerInterface(
            storage_type=self.storage_type,
            auth_method=self.auth_method,
            connection_name=self.connection_name,
            aws_allow_unsafe_html=self.aws_allow_unsafe_html,
            aws_secret_access_key=encrypt_for_worker(self.aws_secret_access_key),
            aws_region=self.aws_region,
            aws_access_key_id=self.aws_access_key_id,
            aws_role_arn=self.aws_role_arn,
            aws_session_token=encrypt_for_worker(self.aws_session_token),
            azure_account_name=self.azure_account_name,
            azure_tenant_id=self.azure_tenant_id,
            azure_account_key=encrypt_for_worker(self.azure_account_key),
            azure_client_id=self.azure_client_id,
            azure_client_secret=encrypt_for_worker(self.azure_client_secret),
            endpoint_url=self.endpoint_url,
            verify_ssl=self.verify_ssl
        )
104
class FullCloudStorageConnectionInterface(AuthSettingsInput):
    """API response model — exposes only non-secret connection fields."""

    # Public fields only; no keys, tokens, or client secrets.
    aws_allow_unsafe_html: Optional[bool] = None
    aws_region: Optional[str] = None
    aws_access_key_id: Optional[str] = None
    aws_role_arn: Optional[str] = None
    azure_account_name: Optional[str] = None
    azure_tenant_id: Optional[str] = None
    azure_client_id: Optional[str] = None
    endpoint_url: Optional[str] = None
    verify_ssl: bool = True
119
class CloudStorageSettings(BaseModel):
    """Settings for cloud storage nodes in the visual designer."""

    # NOTE(review): "auto" must be a member of AuthMethod for this default to
    # pass Literal validation — confirm the alias includes it.
    auth_mode: AuthMethod = "auto"
    connection_name: Optional[str] = None  # Required only for 'reference' mode
    resource_path: str  # e.g. s3://bucket/path/to/file.csv

    @field_validator("auth_mode", mode="after")
    def validate_auth_requirements(cls, v, values):
        """Require a connection_name whenever auth_mode is 'reference'."""
        # NOTE(review): "reference" is likewise expected to be an AuthMethod
        # member — verify; otherwise this branch can never trigger.
        data = values.data
        if v == "reference" and not data.get("connection_name"):
            raise ValueError("connection_name required when using reference mode")
        return v
134
class CloudStorageReadSettings(CloudStorageSettings):
    """Settings for reading from cloud storage."""

    # Read one object or scan a whole directory/prefix.
    scan_mode: Literal["single_file", "directory"] = "single_file"
    file_format: Literal["csv", "parquet", "json", "delta", "iceberg"] = "parquet"
    # CSV specific options
    csv_has_header: Optional[bool] = True
    csv_delimiter: Optional[str] = ","
    csv_encoding: Optional[str] = "utf8"
    # Deltalake specific settings
    delta_version: Optional[int] = None  # presumably None loads the latest version — confirm
147
class CloudStorageReadSettingsInternal(BaseModel):
    """Read settings paired with the resolved full connection (secrets included)."""
    read_settings: CloudStorageReadSettings
    connection: FullCloudStorageConnection
152
class WriteSettingsWorkerInterface(BaseModel):
    """Write options shared with the worker — carries no credentials."""

    resource_path: str  # e.g. s3://bucket/path/to/file.csv

    write_mode: Literal["overwrite", "append"] = "overwrite"
    file_format: Literal["csv", "parquet", "json", "delta"] = "parquet"

    # Parquet-only option
    parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy"

    # CSV-only options
    csv_delimiter: str = ","
    csv_encoding: str = "utf8"
165
class CloudStorageWriteSettings(CloudStorageSettings, WriteSettingsWorkerInterface):
    """Settings for writing to cloud storage from the visual designer.

    Combines designer-level settings (auth mode, connection reference) with
    the write options shared with the worker.
    """
    # (removed a dead `pass` statement — the class has a real body below)

    def get_write_setting_worker_interface(self) -> WriteSettingsWorkerInterface:
        """Return only the write options the worker needs.

        Strips designer-only fields (auth mode, connection reference) so no
        credential-related data travels with the write options.
        """
        return WriteSettingsWorkerInterface(
            resource_path=self.resource_path,
            write_mode=self.write_mode,
            file_format=self.file_format,
            parquet_compression=self.parquet_compression,
            csv_delimiter=self.csv_delimiter,
            csv_encoding=self.csv_encoding
        )
183
class CloudStorageWriteSettingsInternal(BaseModel):
    """Write settings paired with the resolved full connection (secrets included)."""
    write_settings: CloudStorageWriteSettings
    connection: FullCloudStorageConnection
188
class CloudStorageWriteSettingsWorkerInterface(BaseModel):
    """Payload posted to the worker to execute a cloud-storage write."""
    operation: str  # base64-encoded serialized Polars LazyFrame plan
    write_settings: WriteSettingsWorkerInterface
    connection: FullCloudStorageConnectionWorkerInterface
    flowfile_flow_id: int = 1
    flowfile_node_id: int | str = -1
197
def get_cloud_storage_write_settings_worker_interface(
    write_settings: CloudStorageWriteSettings,
    connection: FullCloudStorageConnection,
    lf: pl.LazyFrame,
    flowfile_flow_id: int = 1,
    flowfile_node_id: int | str = -1,
) -> CloudStorageWriteSettingsWorkerInterface:
    """Build the payload sent to the worker for a cloud-storage write.

    The LazyFrame's query plan is serialized and base64-encoded so it can be
    posted as a string; connection secrets are encrypted (not hashed) during
    the worker-interface conversion.

    Args:
        write_settings: Designer-level write settings.
        connection: Full connection including decrypted secrets.
        lf: Polars LazyFrame whose plan the worker will execute.
        flowfile_flow_id: Identifier of the owning flow.
        flowfile_node_id: Identifier of the node issuing the write.

    Returns:
        A CloudStorageWriteSettingsWorkerInterface ready to send to the worker.
    """
    # Serialize the lazy query plan; base64 keeps the bytes transport-safe.
    operation = base64.b64encode(lf.serialize()).decode()

    return CloudStorageWriteSettingsWorkerInterface(
        operation=operation,
        write_settings=write_settings.get_write_setting_worker_interface(),
        connection=connection.get_worker_interface(),
        flowfile_flow_id=flowfile_flow_id,
        flowfile_node_id=flowfile_node_id
    )
@@ -3,8 +3,11 @@ from flowfile_core.schemas import transform_schema
3
3
  from pathlib import Path
4
4
  import os
5
5
  from flowfile_core.schemas.analysis_schemas import graphic_walker_schemas as gs_schemas
6
- from flowfile_core.schemas.external_sources.airbyte_schemas import AirbyteConfig
6
+ from flowfile_core.schemas.cloud_storage_schemas import CloudStorageReadSettings, CloudStorageWriteSettings
7
+ from flowfile_core.schemas.schemas import SecretRef
8
+ from flowfile_core.utils.utils import ensure_similarity_dicts, standardize_col_dtype
7
9
  from pydantic import BaseModel, Field, model_validator, SecretStr, ConfigDict
10
+ import polars as pl
8
11
 
9
12
 
10
13
  OutputConnectionClass = Literal['output-0', 'output-1', 'output-2', 'output-3', 'output-4',
@@ -33,7 +36,7 @@ class RemoveItemsInput(BaseModel):
33
36
 
34
37
  class MinimalFieldInfo(BaseModel):
35
38
  name: str
36
- data_type: str
39
+ data_type: str = "String"
37
40
 
38
41
 
39
42
  class ReceivedTableBase(BaseModel):
@@ -250,11 +253,29 @@ class NodeDatasource(NodeBase):
250
253
 
251
254
class RawData(BaseModel):
    """Column-oriented table payload.

    ``data`` is a list of columns (each inner list is one column), which is
    more compact than row-oriented storage.
    """
    columns: Optional[List[MinimalFieldInfo]] = None  # None allowed as an explicit "unset"
    data: List[List]

    @classmethod
    def from_columns(cls, columns: List[str], data: List[List]):
        """Build from column names plus column-oriented data; dtypes default."""
        return cls(columns=[MinimalFieldInfo(name=column) for column in columns], data=data)

    @classmethod
    def from_pylist(cls, pylist: List[dict]):
        """Build from a row-oriented list of dicts, inferring column dtypes.

        Dtype inference looks only at each column's first value, so a leading
        None yields NoneType's dtype.
        """
        if len(pylist) == 0:
            return cls(columns=[], data=[])
        pylist = ensure_similarity_dicts(pylist)
        # Transpose rows into columns and normalize each column's value types.
        values = [standardize_col_dtype([vv for vv in c]) for c in
                  zip(*(r.values() for r in pylist))]
        column_types = [pl.DataType.from_python(type(next(iter(column_values), None)))
                        for column_values in values]
        columns = [MinimalFieldInfo(name=name, data_type=str(dtype))
                   for name, dtype in zip(pylist[0].keys(), column_types)]
        return cls(columns=columns, data=values)

    def to_pylist(self):
        """Convert back to a row-oriented list of dicts.

        Returns [] for an empty table (as produced by ``from_pylist([])``)
        instead of raising IndexError on ``self.data[0]``.
        """
        if not self.data:
            return []
        return [{c.name: self.data[ci][ri] for ci, c in enumerate(self.columns)}
                for ri in range(len(self.data[0]))]
254
276
 
255
277
 
256
278
  class NodeManualInput(NodeBase):
257
- raw_data: Optional[List] = None
258
279
  raw_data_format: Optional[RawData] = None
259
280
 
260
281
 
@@ -265,7 +286,7 @@ class NodeRead(NodeBase):
265
286
  class DatabaseConnection(BaseModel):
266
287
  database_type: str = "postgresql" # Database type (postgresql, mysql, etc.)
267
288
  username: Optional[str] = None
268
- password_ref: Optional[str] = None
289
+ password_ref: Optional[SecretRef] = None
269
290
  host: Optional[str] = None
270
291
  port: Optional[int] = None
271
292
  database: Optional[str] = None
@@ -338,6 +359,17 @@ class NodeDatabaseWriter(NodeSingleInput):
338
359
  database_write_settings: DatabaseWriteSettings
339
360
 
340
361
 
362
class NodeCloudStorageReader(NodeBase):
    """Cloud storage source node."""
    cloud_storage_settings: CloudStorageReadSettings
    # Schema of the source data when known; None when not yet determined.
    fields: Optional[List[MinimalFieldInfo]] = None
366
+
367
+
368
class NodeCloudStorageWriter(NodeSingleInput):
    """Cloud storage destination node."""
    cloud_storage_settings: CloudStorageWriteSettings
371
+
372
+
341
373
  class ExternalSource(BaseModel):
342
374
  orientation: str = 'row'
343
375
  fields: Optional[List[MinimalFieldInfo]] = None
@@ -349,11 +381,6 @@ class SampleUsers(ExternalSource):
349
381
  size: int = 100
350
382
 
351
383
 
352
- class AirbyteReader(AirbyteConfig):
353
- class_name: Optional[str] = "airbyte_reader"
354
- fields: Optional[List[MinimalFieldInfo]] = None
355
-
356
-
357
384
  class AccessToken(BaseModel):
358
385
  user_id: str
359
386
  access_token: SecretStr = None
@@ -364,11 +391,6 @@ class NodeExternalSource(NodeBase):
364
391
  source_settings: SampleUsers
365
392
 
366
393
 
367
- class NodeAirbyteReader(NodeExternalSource):
368
- identifier: str = 'airbyte'
369
- source_settings: AirbyteReader
370
-
371
-
372
394
  class NodeFormula(NodeSingleInput):
373
395
  function: transform_schema.FunctionInput = None
374
396
 
@@ -1,6 +1,6 @@
1
- from pydantic import BaseModel, field_validator, ConfigDict
2
- from typing import List, Dict, Tuple, Iterable, Optional, Any, Literal
1
+ from typing import List, Dict, Tuple, Optional, Any, Literal, Annotated
3
2
 
3
+ from pydantic import BaseModel, field_validator, ConfigDict, Field, StringConstraints
4
4
 
5
5
  ExecutionModeLiteral = Literal['Development', 'Performance']
6
6
  ExecutionLocationsLiteral = Literal['auto', 'local', 'remote']
@@ -104,3 +104,8 @@ class NodeDefault(BaseModel):
104
104
  node_type: NodeTypeLiteral
105
105
  transform_type: TransformTypeLiteral
106
106
  has_default_settings: Optional[Any] = None
107
+
108
+
109
# Reusable constrained-string alias for referencing encrypted secrets.
# TODO: move to a shared/common module if other schema files need it too.
SecretRef = Annotated[str, StringConstraints(min_length=1, max_length=100),
                      Field(description="An ID referencing an encrypted secret.")]
@@ -4,6 +4,8 @@ import polars as pl
4
4
  from polars import selectors
5
5
  from copy import deepcopy
6
6
 
7
+ from typing import NamedTuple
8
+
7
9
 
8
10
  def get_func_type_mapping(func: str):
9
11
  if func in ["mean", "avg", "median", "std", "var"]:
@@ -20,10 +22,30 @@ def string_concat(*column: str):
20
22
  return pl.col(column).cast(pl.Utf8).str.concat(delimiter=',')
21
23
 
22
24
 
23
- JoinStrategy = Literal['inner', 'left', 'right', 'full', 'semi', 'anti', 'cross']
25
+ SideLit = Literal["left", "right"]
26
+ JoinStrategy = Literal['inner', 'left', 'right', 'full', 'semi', 'anti', 'cross', 'outer']
24
27
  FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'damerau_levenshtein', 'indel']
25
28
 
26
29
 
30
def construct_join_key_name(side: SideLit, column_name: str) -> str:
    """Build the temporary, collision-proof column name for a join key on
    the given side ("left" or "right")."""
    return f"_FLOWFILE_JOIN_KEY_{side.upper()}_{column_name}"
32
+
33
+
34
class JoinKeyRename(NamedTuple):
    """Pairs a join-key column's current name with its temporary join name."""
    original_name: str  # name before the temporary rename
    temp_name: str  # collision-proof name used during the join
37
+
38
+
39
class JoinKeyRenameResponse(NamedTuple):
    """All join-key renames for one side ("left" or "right") of a join."""
    side: SideLit
    join_key_renames: List[JoinKeyRename]
42
+
43
+
44
class FullJoinKeyResponse(NamedTuple):
    """Join-key renames for both sides of a join."""
    left: JoinKeyRenameResponse
    right: JoinKeyRenameResponse
47
+
48
+
27
49
  @dataclass
28
50
  class SelectInput:
29
51
  # __slots__ = ['old_name', 'new_name', 'keep', 'data_type', 'data_type_change', 'join_key']
@@ -108,11 +130,11 @@ class SelectInputs:
108
130
 
109
131
  @property
110
132
  def new_cols(self) -> Set:
111
- return set(v.new_name for v in self.renames if v.keep or v.join_key)
133
+ return set(v.new_name for v in self.renames if v.keep)
112
134
 
113
135
  @property
114
136
  def rename_table(self):
115
- return {v.old_name: v.new_name for v in self.renames if (v.keep or v.join_key) and v.is_available}
137
+ return {v.old_name: v.new_name for v in self.renames if v.is_available}
116
138
 
117
139
  def get_select_cols(self, include_join_key: bool = True):
118
140
  return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
@@ -126,6 +148,11 @@ class SelectInputs:
126
148
  def remove_select_input(self, old_key: str):
127
149
  self.renames = [rename for rename in self.renames if rename.old_name != old_key]
128
150
 
151
+ def unselect_field(self, old_key: str):
152
+ for rename in self.renames:
153
+ if old_key == rename.old_name:
154
+ rename.keep = False
155
+
129
156
  @classmethod
130
157
  def create_from_list(cls, col_list: str):
131
158
  return cls([SelectInput(c) for c in col_list])
@@ -134,13 +161,42 @@ class SelectInputs:
134
161
  def create_from_pl_df(cls, df: pl.DataFrame | pl.LazyFrame):
135
162
  return cls([SelectInput(c) for c in df.columns])
136
163
 
164
+ def get_select_input_on_old_name(self, old_name: str) -> SelectInput | None:
165
+ return next((v for v in self.renames if v.old_name == old_name), None)
166
+
167
+ def get_select_input_on_new_name(self, old_name: str) -> SelectInput | None:
168
+ return next((v for v in self.renames if v.new_name == old_name), None)
169
+
170
+
171
class JoinInputs(SelectInputs):
    """SelectInputs specialised for joins, adding join-key rename helpers."""

    def __init__(self, renames: List[SelectInput]):
        # NOTE(review): does not call super().__init__ — confirm SelectInputs
        # has no other state that needs initialising.
        self.renames = renames

    @property
    def join_key_selects(self) -> List[SelectInput]:
        """All select inputs flagged as join keys."""
        return [v for v in self.renames if v.join_key]

    def get_join_key_renames(self, side: SideLit, filter_drop: bool = False) -> JoinKeyRenameResponse:
        """Map each join key's (new) name to its temporary join-key name.

        When filter_drop is True, keys marked keep=False are excluded.
        """
        return JoinKeyRenameResponse(
            side,
            [JoinKeyRename(jk.new_name,
                           construct_join_key_name(side, jk.new_name))
             for jk in self.join_key_selects if jk.keep or not filter_drop]
        )

    def get_join_key_rename_mapping(self, side: SideLit) -> Dict[str, str]:
        """Dict of {current name: temporary join-key name} for this side."""
        return {jkr[0]: jkr[1] for jkr in self.get_join_key_renames(side)[1]}
190
+
137
191
 
138
192
  @dataclass
139
193
  class JoinMap:
194
+ # __slots__ = "left_col", "right_col"
140
195
  left_col: str
141
196
  right_col: str
142
197
 
143
198
 
199
+
144
200
  @dataclass
145
201
  class FuzzyMap(JoinMap):
146
202
  threshold_score: Optional[float] = 80.0
@@ -168,19 +224,21 @@ class FuzzyMap(JoinMap):
168
224
 
169
225
  class JoinSelectMixin:
170
226
  """Mixin for common join selection functionality"""
227
+ left_select: JoinInputs = None
228
+ right_select: JoinInputs = None
171
229
 
172
230
  @staticmethod
173
- def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
231
+ def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> JoinInputs | None:
174
232
  if all(isinstance(c, SelectInput) for c in select):
175
- return SelectInputs(select)
233
+ return JoinInputs(select)
176
234
  elif all(isinstance(c, dict) for c in select):
177
- return SelectInputs([SelectInput(**c) for c in select])
235
+ return JoinInputs([SelectInput(**c.__dict__) for c in select])
178
236
  elif isinstance(select, dict):
179
237
  renames = select.get('renames')
180
238
  if renames:
181
- return SelectInputs([SelectInput(**c) for c in renames])
239
+ return JoinInputs([SelectInput(**c) for c in renames])
182
240
  elif all(isinstance(c, str) for c in select):
183
- return SelectInputs([SelectInput(s, s) for s in select])
241
+ return JoinInputs([SelectInput(s, s) for s in select])
184
242
 
185
243
  def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
186
244
  current_names = self.left_select.new_cols & self.right_select.new_cols
@@ -223,8 +281,8 @@ class CrossJoinInput(JoinSelectMixin):
223
281
  @dataclass
224
282
  class JoinInput(JoinSelectMixin):
225
283
  join_mapping: List[JoinMap]
226
- left_select: SelectInputs = None
227
- right_select: SelectInputs = None
284
+ left_select: JoinInputs = None
285
+ right_select: JoinInputs = None
228
286
  how: JoinStrategy = 'inner'
229
287
 
230
288
  @staticmethod
@@ -254,9 +312,26 @@ class JoinInput(JoinSelectMixin):
254
312
  self.join_mapping = self.parse_join_mapping(join_mapping)
255
313
  self.left_select = self.parse_select(left_select)
256
314
  self.right_select = self.parse_select(right_select)
315
+ self.set_join_keys()
316
+ self.how = how
317
+
318
+ def set_join_keys(self):
257
319
  [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
258
320
  [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
259
- self.how = how
321
+
322
+ def get_join_key_renames(self, filter_drop: bool = False) -> FullJoinKeyResponse:
323
+ return FullJoinKeyResponse(self.left_select.get_join_key_renames(side="left", filter_drop=filter_drop),
324
+ self.right_select.get_join_key_renames(side="right", filter_drop=filter_drop))
325
+
326
+ def get_names_for_table_rename(self) -> List[JoinMap]:
327
+ new_mappings: List[JoinMap] = []
328
+ left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
329
+ for join_map in self.join_mapping:
330
+ new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col, join_map.left_col),
331
+ right_rename_table.get(join_map.right_col, join_map.right_col)
332
+ )
333
+ )
334
+ return new_mappings
260
335
 
261
336
  @property
262
337
  def _left_join_keys(self) -> Set:
@@ -268,22 +343,21 @@ class JoinInput(JoinSelectMixin):
268
343
 
269
344
  @property
270
345
  def left_join_keys(self) -> List:
271
- return [self.left_select.rename_table.get(jm.left_col) for jm in self.join_mapping]
346
+ return [jm.left_col for jm in self.used_join_mapping]
272
347
 
273
348
  @property
274
349
  def right_join_keys(self) -> List:
275
- return [self.right_select.rename_table.get(jm.right_col, jm.right_col) for jm in self.join_mapping]
350
+ return [jm.right_col for jm in self.used_join_mapping]
276
351
 
277
352
  @property
278
353
  def overlapping_records(self):
279
354
  if self.how in ('left', 'right', 'inner'):
280
- # Never consider join keys as overlapping records since they will be dropped after the join
281
- return ((self.left_select.new_cols & self.right_select.new_cols) -
282
- (set(self.left_join_keys) & set(self.right_join_keys)))
355
+ return self.left_select.new_cols & self.right_select.new_cols
283
356
  else:
284
357
  return self.left_select.new_cols & self.right_select.new_cols
285
358
 
286
359
  def auto_rename(self):
360
+ self.set_join_keys()
287
361
  overlapping_records = self.overlapping_records
288
362
  while len(overlapping_records) > 0:
289
363
  for right_col in self.right_select.renames:
@@ -292,13 +366,15 @@ class JoinInput(JoinSelectMixin):
292
366
  overlapping_records = self.overlapping_records
293
367
 
294
368
  @property
295
- def join_mappings(self):
296
- new_mappings = []
369
+ def used_join_mapping(self):
370
+ new_mappings: List[JoinMap] = []
297
371
  left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
372
+ left_join_rename_mapping: Dict[str, str] = self.left_select.get_join_key_rename_mapping("left")
373
+ right_join_rename_mapping: Dict[str, str] = self.right_select.get_join_key_rename_mapping("right")
298
374
  for join_map in self.join_mapping:
299
375
  # del self.right_select.rename_table, self.left_select.rename_table
300
- new_mappings.append(JoinMap(left_rename_table.get(join_map.left_col),
301
- right_rename_table.get(join_map.right_col)
376
+ new_mappings.append(JoinMap(left_join_rename_mapping.get(left_rename_table.get(join_map.left_col, join_map.left_col)),
377
+ right_join_rename_mapping.get(right_rename_table.get(join_map.right_col, join_map.right_col))
302
378
  )
303
379
  )
304
380
  return new_mappings
@@ -332,7 +408,7 @@ class FuzzyMatchInput(JoinInput):
332
408
  return fuzz_mapping
333
409
 
334
410
  def __init__(self, join_mapping: List[FuzzyMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
335
- right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: str = 'inner'):
411
+ right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
336
412
  self.join_mapping = self.parse_fuzz_mapping(join_mapping)
337
413
  self.left_select = self.parse_select(left_select)
338
414
  self.right_select = self.parse_select(right_select)
@@ -472,7 +548,6 @@ class PivotInput:
472
548
  return pl.struct([pl.col(c) for c in self.aggregations]).alias('vals')
473
549
 
474
550
 
475
-
476
551
  @dataclass
477
552
  class SortByInput:
478
553
  column: str
@@ -1,8 +1,47 @@
1
1
  import re
2
-
2
+ from itertools import chain
3
+ from typing import List, Dict
3
4
 
4
5
def camel_case_to_snake_case(text: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    An underscore is inserted before every capital letter that is not at the
    start of the string, and the whole result is lowercased.
    """
    return re.sub(r'(?<!^)(?=[A-Z])', '_', text).lower()
8
9
 
10
+
11
def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True):
    """Normalize a list of dicts so every record carries the same keys.

    Missing keys are filled with None. When *respect_order* is True, the
    combined key order is the first-seen order across all records; otherwise
    the key set is collected without a guaranteed order.
    """
    if respect_order:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_cols = list(dict.fromkeys(chain.from_iterable(datas)))
    else:
        unique_cols = set(chain.from_iterable(data.keys() for data in datas))
    return [{col: record.get(col) for col in unique_cols} for record in datas]
31
+
32
+
33
def convert_to_string(v):
    """Return str(v), or None when the value cannot be stringified.

    Catches Exception instead of the previous bare ``except:`` so that
    KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        return str(v)
    except Exception:
        return None
38
+
39
+
40
def standardize_col_dtype(vals):
    """Coerce a column's values to a common type.

    Values are returned unchanged when they already share a single type, or
    when the mix contains both int and float (numerically compatible).
    Any other mix is converted to strings via convert_to_string
    (unconvertible values become None).
    """
    observed = {type(v) for v in vals}
    if len(observed) == 1 or (int in observed and float in observed):
        return vals
    return [convert_to_string(v) for v in vals]
@@ -0,0 +1,41 @@
1
+ """This script runs on run time and checks if all the nodes that are created have a function in the flow_graph as well
2
+ as have a component in flowfile_frontend"""
3
+
4
+ from flowfile_core.schemas import input_schema
5
+ from flowfile_core.flowfile.flow_graph import FlowGraph
6
+ from flowfile_core.configs.node_store.nodes import nodes_list, NodeTemplate
7
+ import inspect
8
+
9
+
10
def check_if_node_has_add_function_in_flow_graph(node: NodeTemplate):
    """Raise ValueError when FlowGraph lacks the add_<item> method for *node*.

    The expected method name is "add_" + node.item; existence is checked
    with hasattr on the FlowGraph class.
    """
    func_name = "add_" + node.item
    if not hasattr(FlowGraph, func_name):
        # The trailing ". " fixes the previous message, where the two string
        # literals were concatenated without any separator.
        raise ValueError(
            f"Node {node.name} ({node.item}) does not have a corresponding function in FlowGraph: {func_name}. "
            "Check if the function is implemented in flow_graph.py or if the node item is correct."
        )
17
+
18
+
19
def check_if_node_has_input_schema_definition(node: NodeTemplate):
    """Raise ValueError when input_schema has no definition matching *node*.

    The expected name is "node" + node.item with underscores removed,
    compared case-insensitively against the attribute names of the
    input_schema module.
    """
    expected_name = "node" + node.item.replace("_", "")
    if expected_name not in {k.lower() for k in inspect.getmodule(input_schema).__dict__.keys()}:
        # The trailing space fixes the previous message, where the two string
        # literals were concatenated without any separator.
        raise ValueError(
            f"Node {node.name} ({node.item}) does not have a corresponding input schema definition in input_schema.py. "
            "Check if the schema is implemented or if the node item is correct."
        )
25
+
26
+
27
def validate_setup():
    """Check every registered node for a FlowGraph add-function and an input schema.

    Iterates nodes_list and raises ValueError on the first node that is
    missing either its FlowGraph method or its input_schema definition.
    """
    checks = (
        check_if_node_has_add_function_in_flow_graph,
        check_if_node_has_input_schema_definition,
    )
    for node in nodes_list:
        for check in checks:
            check(node)

    print("All nodes have corresponding functions in FlowGraph and input schema definitions.")
38
+
39
+
40
if __name__ == "__main__":
    # Run the full node-registration validation when executed as a script.
    validate_setup()