flowfile-0.3.5-py3-none-any.whl → flowfile-0.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (121)
  1. flowfile/__init__.py +3 -3
  2. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  3. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  4. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  5. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  6. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  7. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  8. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  9. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  12. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  13. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  14. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  15. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  16. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  17. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  18. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  19. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  20. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  21. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  22. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  23. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  24. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  25. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  26. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  27. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  28. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  29. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  30. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  31. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  32. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  33. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  34. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  35. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  36. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  37. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  38. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  39. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  40. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  41. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  42. flowfile/web/static/assets/api-fb67319c.js +80 -0
  43. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  44. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  45. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  46. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  47. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  48. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  49. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  50. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  51. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  52. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  53. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  54. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  55. flowfile/web/static/index.html +1 -1
  56. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  57. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  59. flowfile_core/__init__.py +2 -0
  60. flowfile_core/configs/node_store/nodes.py +8 -6
  61. flowfile_core/database/connection.py +63 -15
  62. flowfile_core/database/init_db.py +0 -1
  63. flowfile_core/database/models.py +49 -2
  64. flowfile_core/flowfile/code_generator/code_generator.py +401 -17
  65. flowfile_core/flowfile/connection_manager/models.py +1 -1
  66. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  67. flowfile_core/flowfile/extensions.py +1 -1
  68. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  69. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  70. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  71. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  72. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  73. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  74. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  75. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  76. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  77. flowfile_core/flowfile/flow_graph.py +119 -82
  78. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  79. flowfile_core/flowfile/flow_node/models.py +32 -3
  80. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  81. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  82. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  83. flowfile_core/flowfile/utils.py +1 -23
  84. flowfile_core/main.py +3 -2
  85. flowfile_core/routes/cloud_connections.py +81 -0
  86. flowfile_core/routes/logs.py +0 -1
  87. flowfile_core/routes/routes.py +3 -39
  88. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  89. flowfile_core/schemas/input_schema.py +37 -15
  90. flowfile_core/schemas/schemas.py +7 -2
  91. flowfile_core/schemas/transform_schema.py +97 -22
  92. flowfile_core/utils/utils.py +40 -1
  93. flowfile_core/utils/validate_setup.py +41 -0
  94. flowfile_frame/flow_frame.py +253 -102
  95. flowfile_frame/flow_frame_methods.py +13 -13
  96. flowfile_worker/external_sources/s3_source/main.py +216 -0
  97. flowfile_worker/external_sources/s3_source/models.py +142 -0
  98. flowfile_worker/funcs.py +51 -6
  99. flowfile_worker/models.py +22 -2
  100. flowfile_worker/routes.py +40 -38
  101. flowfile_worker/utils.py +1 -1
  102. test_utils/s3/commands.py +46 -0
  103. test_utils/s3/data_generator.py +291 -0
  104. test_utils/s3/fixtures.py +209 -0
  105. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  106. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  107. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  108. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  109. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  112. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  113. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  114. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  115. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  116. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  117. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  118. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  119. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  120. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  121. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/flowfile/database_connection_manager/db_connections.py

@@ -1,6 +1,8 @@
 from flowfile_core.schemas.input_schema import FullDatabaseConnection, FullDatabaseConnectionInterface
+from flowfile_core.schemas.cloud_storage_schemas import FullCloudStorageConnection, FullCloudStorageConnectionInterface
 from sqlalchemy.orm import Session
-from flowfile_core.database.models import DatabaseConnection as DBConnectionModel, Secret
+from flowfile_core.database.models import (DatabaseConnection as DBConnectionModel, Secret,
+                                           CloudStorageConnection as DBCloudStorageConnection)
 from flowfile_core.secret_manager.secret_manager import store_secret, SecretInput, decrypt_secret
 from flowfile_core.database.connection import get_db_context

@@ -53,6 +55,18 @@ def get_database_connection(db: Session, connection_name: str, user_id: int) ->
     return db_connection


+def get_cloud_connection(db: Session, connection_name: str, user_id: int) -> DBCloudStorageConnection | None:
+    """
+    Get a cloud storage connection by its name and user ID.
+    """
+    db_connection = db.query(DBCloudStorageConnection).filter(
+        DBCloudStorageConnection.connection_name == connection_name,
+        DBCloudStorageConnection.user_id == user_id
+    ).first()
+
+    return db_connection
+
+
 def get_database_connection_schema(db: Session, connection_name: str, user_id: int) -> FullDatabaseConnection | None:
     """
     Get a database connection schema by its name and user ID.
@@ -84,6 +98,20 @@ def get_local_database_connection(connection_name: str, user_id: int) -> FullDat
     return get_database_connection_schema(db, connection_name, user_id)


+def get_local_cloud_connection(connection_name: str, user_id: int) -> FullCloudStorageConnection | None:
+    """
+    Get a cloud storage connection schema by its name and user ID.
+    Args:
+        connection_name (str): The name of the cloud storage connection.
+        user_id (int): The ID of the user who owns the connection.
+
+    Returns:
+        FullCloudStorageConnection | None: The cloud storage connection schema if found, otherwise None.
+    """
+    with get_db_context() as db:
+        return get_cloud_connection_schema(db, connection_name, user_id)
+
+
 def delete_database_connection(db: Session, connection_name: str, user_id: int) -> None:
     """
     Delete a database connection by its name and user ID.
@@ -102,7 +130,8 @@ def delete_database_connection(db: Session, connection_name: str, user_id: int)
     db.commit()


-def database_connection_interface_from_db_connection(db_connection: DBConnectionModel) -> FullDatabaseConnectionInterface:
+def database_connection_interface_from_db_connection(
+        db_connection: DBConnectionModel) -> FullDatabaseConnectionInterface:
     """
     Convert a database connection from the database model to the interface model.
     """
@@ -137,3 +166,188 @@ def get_all_database_connections_interface(db: Session, user_id: int) -> list[Fu
         raise TypeError(f"Expected a DBConnectionModel instance, got {type(db_connection)}")

     return result
+
+
+def store_cloud_connection(db: Session, connection: FullCloudStorageConnection, user_id: int) -> DBCloudStorageConnection:
+    """
+    Placeholder function to store a cloud database connection.
+    This function should be implemented based on specific cloud provider requirements.
+    """
+    existing_database_connection = get_cloud_connection(db, connection.connection_name, user_id)
+    if existing_database_connection:
+        raise ValueError(
+            f"Database connection with name '{connection.connection_name}' already exists for user {user_id}."
+            f" Please use a unique connection name or delete the existing connection first."
+        )
+    if connection.aws_secret_access_key is not None:
+        aws_secret_access_key_ref_id = store_secret(db,
+                                                    SecretInput(name=connection.connection_name + "_aws_secret_access_key",
+                                                                value=connection.aws_secret_access_key), user_id).id
+    else:
+        aws_secret_access_key_ref_id = None
+    if connection.azure_client_secret is not None:
+        azure_client_secret_ref_id = store_secret(db,
+                                                  SecretInput(name=connection.connection_name + "azure_client_secret",
+                                                              value=connection.azure_client_secret), user_id).id
+    else:
+        azure_client_secret_ref_id = None
+    if connection.azure_account_key is not None:
+        azure_account_key_ref_id = store_secret(db, SecretInput(name=connection.connection_name + "azure_account_key",
+                                                                value=connection.azure_account_key), user_id).id
+    else:
+        azure_account_key_ref_id = None
+
+    db_cloud_connection = DBCloudStorageConnection(
+        connection_name=connection.connection_name,
+        storage_type=connection.storage_type,
+        auth_method=connection.auth_method,
+        user_id=user_id,
+
+        # AWS S3 fields
+        aws_region=connection.aws_region,
+        aws_access_key_id=connection.aws_access_key_id,
+        aws_role_arn=connection.aws_role_arn,
+        aws_secret_access_key_id=aws_secret_access_key_ref_id,
+        aws_allow_unsafe_html=connection.aws_allow_unsafe_html,
+
+        # Azure ADLS fields
+        azure_account_name=connection.azure_account_name,
+        azure_tenant_id=connection.azure_tenant_id,
+        azure_client_id=connection.azure_client_id,
+        azure_account_key_id=azure_account_key_ref_id,
+        azure_client_secret_id=azure_client_secret_ref_id,
+
+        # Common fields
+        endpoint_url=connection.endpoint_url,
+        verify_ssl=connection.verify_ssl
+    )
+    db.add(db_cloud_connection)
+    db.commit()
+    db.refresh(db_cloud_connection)
+    return db_cloud_connection
+
+
+def get_full_cloud_storage_interface_from_db(
+        db_cloud_connection: DBCloudStorageConnection) -> FullCloudStorageConnectionInterface:
+    """
+    Convert a cloud storage connection from the database model to the interface model.
+    """
+    return FullCloudStorageConnectionInterface(
+        connection_name=db_cloud_connection.connection_name,
+        storage_type=db_cloud_connection.storage_type,
+        auth_method=db_cloud_connection.auth_method,
+        aws_allow_unsafe_html=db_cloud_connection.aws_allow_unsafe_html,
+        aws_region=db_cloud_connection.aws_region,
+        aws_access_key_id=db_cloud_connection.aws_access_key_id,
+        aws_role_arn=db_cloud_connection.aws_role_arn,
+        azure_account_name=db_cloud_connection.azure_account_name,
+        azure_tenant_id=db_cloud_connection.azure_tenant_id,
+        azure_client_id=db_cloud_connection.azure_client_id,
+        endpoint_url=db_cloud_connection.endpoint_url,
+        verify_ssl=db_cloud_connection.verify_ssl
+    )
+
+
+def get_cloud_connection_schema(db: Session, connection_name: str, user_id: int) -> FullCloudStorageConnection | None:
+    """
+    Retrieves a full cloud storage connection schema, including decrypted secrets, by its name and user ID.
+    """
+    db_connection = get_cloud_connection(db, connection_name, user_id)
+    if not db_connection:
+        return None
+
+    # Decrypt secrets associated with the connection
+    aws_secret_key = None
+    if db_connection.aws_secret_access_key_id:
+        secret_record = db.query(Secret).filter(Secret.id == db_connection.aws_secret_access_key_id).first()
+        if secret_record:
+            aws_secret_key = decrypt_secret(secret_record.encrypted_value)
+
+    azure_account_key = None
+    if db_connection.azure_account_key_id:
+        secret_record = db.query(Secret).filter(Secret.id == db_connection.azure_account_key_id).first()
+        if secret_record:
+            azure_account_key = decrypt_secret(secret_record.encrypted_value)
+
+    azure_client_secret = None
+    if db_connection.azure_client_secret_id:
+        secret_record = db.query(Secret).filter(Secret.id == db_connection.azure_client_secret_id).first()
+        if secret_record:
+            azure_client_secret = decrypt_secret(secret_record.encrypted_value)
+
+    # Construct the full Pydantic model
+    return FullCloudStorageConnection(
+        connection_name=db_connection.connection_name,
+        storage_type=db_connection.storage_type,
+        auth_method=db_connection.auth_method,
+        aws_allow_unsafe_html=db_connection.aws_allow_unsafe_html,
+        aws_region=db_connection.aws_region,
+        aws_access_key_id=db_connection.aws_access_key_id,
+        aws_secret_access_key=aws_secret_key,
+        aws_role_arn=db_connection.aws_role_arn,
+        azure_account_name=db_connection.azure_account_name,
+        azure_account_key=azure_account_key,
+        azure_tenant_id=db_connection.azure_tenant_id,
+        azure_client_id=db_connection.azure_client_id,
+        azure_client_secret=azure_client_secret,
+        endpoint_url=db_connection.endpoint_url,
+        verify_ssl=db_connection.verify_ssl
+    )
+
+
+def cloud_connection_interface_from_db_connection(
+        db_connection: DBCloudStorageConnection) -> FullCloudStorageConnectionInterface:
+    """
+    Converts a DBCloudStorageConnection model to a FullCloudStorageConnectionInterface model,
+    which safely exposes non-sensitive data.
+    """
+    return FullCloudStorageConnectionInterface(
+        connection_name=db_connection.connection_name,
+        storage_type=db_connection.storage_type,
+        auth_method=db_connection.auth_method,
+        aws_allow_unsafe_html=db_connection.aws_allow_unsafe_html,
+        aws_region=db_connection.aws_region,
+        aws_access_key_id=db_connection.aws_access_key_id,
+        aws_role_arn=db_connection.aws_role_arn,
+        azure_account_name=db_connection.azure_account_name,
+        azure_tenant_id=db_connection.azure_tenant_id,
+        azure_client_id=db_connection.azure_client_id,
+        endpoint_url=db_connection.endpoint_url,
+        verify_ssl=db_connection.verify_ssl
+    )
+
+
+def get_all_cloud_connections_interface(db: Session, user_id: int) -> list[FullCloudStorageConnectionInterface]:
+    """
+    Retrieves a list of all cloud storage connections for a user in a safe interface format (no secrets).
+    """
+    db_connections = db.query(DBCloudStorageConnection).filter(DBCloudStorageConnection.user_id == user_id).all()
+
+    return [cloud_connection_interface_from_db_connection(conn) for conn in db_connections]
+
+
+def delete_cloud_connection(db: Session, connection_name: str, user_id: int) -> None:
+    """
+    Deletes a cloud storage connection and all of its associated secrets from the database.
+    """
+    db_connection = get_cloud_connection(db, connection_name, user_id)
+
+    if db_connection:
+        # Collect all secret IDs associated with this connection
+        secret_ids_to_delete = [
+            db_connection.aws_secret_access_key_id,
+            db_connection.aws_session_token_id,
+            db_connection.azure_account_key_id,
+            db_connection.azure_client_secret_id,
+            db_connection.azure_sas_token_id
+        ]
+        # Filter out None values
+        secret_ids_to_delete = [id for id in secret_ids_to_delete if id is not None]

+        # Delete associated secrets if they exist
+        if secret_ids_to_delete:
+            db.query(Secret).filter(Secret.id.in_(secret_ids_to_delete)).delete(synchronize_session=False)
+
+        # Delete the connection record itself
+        db.delete(db_connection)
+        db.commit()
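
The new helpers above give cloud storage connections the same CRUD surface that database connections already had: secrets are encrypted via store_secret on write, decrypted only by get_cloud_connection_schema, and omitted entirely from the interface models. A minimal usage sketch; the connection name, user id, and field values are hypothetical, and FullCloudStorageConnection may require or default fields beyond those shown:

```python
from flowfile_core.database.connection import get_db_context
from flowfile_core.flowfile.database_connection_manager.db_connections import (
    store_cloud_connection, get_cloud_connection_schema,
    get_all_cloud_connections_interface, delete_cloud_connection,
)
from flowfile_core.schemas.cloud_storage_schemas import FullCloudStorageConnection

USER_ID = 1  # hypothetical user id

with get_db_context() as db:
    store_cloud_connection(db, FullCloudStorageConnection(
        connection_name="my-s3",            # illustrative values only
        storage_type="s3",
        auth_method="access_key",
        aws_region="eu-west-1",
        aws_access_key_id="AKIA...",
        aws_secret_access_key="...",        # stored encrypted via store_secret
    ), user_id=USER_ID)

    # The full schema comes back with secrets decrypted...
    full = get_cloud_connection_schema(db, "my-s3", USER_ID)

    # ...while the interface listing exposes only non-sensitive fields.
    for conn in get_all_cloud_connections_interface(db, USER_ID):
        print(conn.connection_name, conn.storage_type, conn.auth_method)

    delete_cloud_connection(db, "my-s3", USER_ID)
```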
flowfile_core/flowfile/extensions.py

@@ -17,7 +17,7 @@ def get_instant_func_results(node_step: FlowNode, func_string: str) -> InstantFu
         return InstantFuncResult(result='No input data connected, so cannot evaluate the result', success=None)
     node_input = node_step.main_input[0]
     try:
-        if node_input.node_stats.has_run and node_input.is_setup and node_input.results.example_data_path:
+        if node_input.node_stats.has_run_with_current_setup and node_input.is_setup and node_input.results.example_data_path:
            df = get_first_row(node_input.results.example_data_path)
         else:
            df = node_input.get_predicted_resulting_data().data_frame.collect()
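
This one-line change tightens the cache guard: a node that had run at some point could previously serve stale example data even after its settings changed. A hypothetical sketch of the distinction the new property implies (the actual NodeStats implementation is not shown in this hunk; flow_node/models.py also changed in this release and is a plausible home for it):

```python
from dataclasses import dataclass


@dataclass
class NodeStats:
    """Hypothetical sketch, not the shipped implementation."""
    has_run: bool = False                  # node executed at least once
    setup_changed_since_run: bool = False  # settings edited after that run

    @property
    def has_run_with_current_setup(self) -> bool:
        # Only trust cached example data if nothing changed since the last run.
        return self.has_run and not self.setup_changed_since_run
```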
flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py (new file)

@@ -0,0 +1,259 @@
+import boto3
+from botocore.exceptions import ClientError
+from typing import Optional, Dict, Any, Callable, Literal
+
+from flowfile_core.schemas.cloud_storage_schemas import FullCloudStorageConnection
+
+
+def create_storage_options_from_boto_credentials(profile_name: Optional[str],
+                                                 region_name: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Create a storage options dictionary from AWS credentials using a boto3 profile.
+    This is the most robust way to handle profile-based authentication as it
+    bypasses Polars' internal credential provider chain, avoiding conflicts.
+
+    Parameters
+    ----------
+    profile_name
+        The name of the AWS profile in ~/.aws/credentials.
+    region_name
+        The AWS region to use.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A storage options dictionary for Polars with explicit credentials.
+    """
+    session = boto3.Session(profile_name=profile_name, region_name=region_name)
+    credentials = session.get_credentials()
+    frozen_creds = credentials.get_frozen_credentials()
+
+    storage_options = {
+        "aws_access_key_id": frozen_creds.access_key,
+        "aws_secret_access_key": frozen_creds.secret_key,
+        "aws_session_token": frozen_creds.token,
+    }
+    # Use the session's region if one was resolved, otherwise use the provided one
+    if session.region_name:
+        storage_options["aws_region"] = session.region_name
+
+    print("Boto3: Successfully created storage options with explicit credentials.")
+    return storage_options
+
+
+class CloudStorageReader:
+    """Helper class to handle different cloud storage authentication methods and read operations."""
+
+    @staticmethod
+    def get_storage_options(connection: FullCloudStorageConnection) -> Dict[str, Any]:
+        """
+        Build storage options dict based on the connection type and auth method.
+
+        Args:
+            connection: Full connection details with decrypted secrets
+
+        Returns:
+            Dict containing appropriate storage options for the provider
+        """
+        if connection.storage_type == "s3":
+            return CloudStorageReader._get_s3_storage_options(connection)
+        elif connection.storage_type == "adls":
+            return CloudStorageReader._get_adls_storage_options(connection)
+        elif connection.storage_type == "gcs":
+            return CloudStorageReader._get_gcs_storage_options(connection)
+        else:
+            raise ValueError(f"Unsupported storage type: {connection.storage_type}")
+
+    @staticmethod
+    def _get_s3_storage_options(connection: 'FullCloudStorageConnection') -> Dict[str, Any]:
+        """Build S3-specific storage options."""
+        auth_method = connection.auth_method
+        print(f"Building S3 storage options for auth_method: '{auth_method}'")
+        if auth_method == "aws-cli":
+            return create_storage_options_from_boto_credentials(
+                profile_name=connection.connection_name,
+                region_name=connection.aws_region
+            )
+
+        storage_options = {}
+        if connection.aws_region:
+            storage_options["aws_region"] = connection.aws_region
+        if connection.endpoint_url:
+            storage_options["endpoint_url"] = connection.endpoint_url
+        if not connection.verify_ssl:
+            storage_options["verify"] = "False"
+        if connection.aws_allow_unsafe_html:  # Note: Polars uses aws_allow_http
+            storage_options["aws_allow_http"] = "true"
+
+        if auth_method == "access_key":
+            storage_options["aws_access_key_id"] = connection.aws_access_key_id
+            storage_options["aws_secret_access_key"] = connection.aws_secret_access_key.get_secret_value()
+            # Explicitly clear any session token from the environment
+            storage_options["aws_session_token"] = ""
+
+        elif auth_method == "iam_role":
+            # Correctly implement IAM role assumption using boto3 STS client.
+            sts_client = boto3.client('sts', region_name=connection.aws_region)
+            assumed_role_object = sts_client.assume_role(
+                RoleArn=connection.aws_role_arn,
+                RoleSessionName="PolarsCloudStorageReaderSession"  # A descriptive session name
+            )
+            credentials = assumed_role_object['Credentials']
+            storage_options["aws_access_key_id"] = credentials['AccessKeyId']
+            storage_options["aws_secret_access_key"] = credentials['SecretAccessKey']
+            storage_options["aws_session_token"] = credentials['SessionToken']
+
+        return storage_options
+
+    @staticmethod
+    def _get_adls_storage_options(connection: 'FullCloudStorageConnection') -> Dict[str, Any]:
+        """Build Azure ADLS-specific storage options."""
+        storage_options = {}
+
+        if connection.auth_method == "access_key":
+            # Account key authentication
+            if connection.azure_account_name:
+                storage_options["account_name"] = connection.azure_account_name
+            if connection.azure_account_key:
+                storage_options["account_key"] = connection.azure_account_key.get_secret_value()
+
+        elif connection.auth_method == "service_principal":
+            # Service principal authentication
+            if connection.azure_tenant_id:
+                storage_options["tenant_id"] = connection.azure_tenant_id
+            if connection.azure_client_id:
+                storage_options["client_id"] = connection.azure_client_id
+            if connection.azure_client_secret:
+                storage_options["client_secret"] = connection.azure_client_secret.get_secret_value()
+
+        elif connection.auth_method == "sas_token":
+            # SAS token authentication
+            if connection.azure_sas_token:
+                storage_options["sas_token"] = connection.azure_sas_token.get_secret_value()
+
+        return storage_options
+
+    @staticmethod
+    def _get_gcs_storage_options(connection: 'FullCloudStorageConnection') -> Dict[str, Any]:
+        """Build GCS-specific storage options."""
+        # GCS typically uses service account authentication
+        # Implementation would depend on how credentials are stored
+        return {}
+
+    @staticmethod
+    def get_credential_provider(connection: 'FullCloudStorageConnection') -> Optional[Callable]:
+        """
+        Get a credential provider function if needed for the authentication method.
+
+        Args:
+            connection: Full connection details
+
+        Returns:
+            Credential provider function or None
+        """
+        if connection.storage_type == "s3" and connection.auth_method == "iam_role":
+            # For IAM role, create a credential provider
+            def aws_credential_provider():
+                # This would typically use boto3 to assume the role
+                # For now, returning a placeholder
+                return {
+                    "aws_access_key_id": "...",
+                    "aws_secret_access_key": "...",
+                    "aws_session_token": "...",
+                }, None  # expiry
+
+            return aws_credential_provider
+        return None
+
+
+def get_first_file_from_s3_dir(source: str, storage_options: Dict[str, Any] = None) -> str:
+    """
+    Get the first parquet file from an S3 directory path.
+
+    Parameters
+    ----------
+    source : str
+        S3 path with wildcards (e.g., 's3://bucket/prefix/**/*/*.parquet')
+    storage_options : Dict[str, Any]
+        Optional storage options used to build the S3 client.
+
+    Returns
+    -------
+    str
+        S3 URI of the first parquet file found
+
+    Raises
+    ------
+    ValueError
+        If source path is invalid or no parquet files found
+    ClientError
+        If S3 access fails
+    """
+    if not source.startswith('s3://'):
+        raise ValueError("Source must be a valid S3 URI starting with 's3://'")
+    bucket_name, prefix = _parse_s3_path(source)
+    file_extension = _get_file_extension(source)
+    base_prefix = _remove_wildcards_from_prefix(prefix)
+    s3_client = _create_s3_client(storage_options)
+
+    # Get parquet files
+    first_file = _get_first_file(s3_client, bucket_name, base_prefix, file_extension)
+
+    # Return first file URI
+    return f"s3://{bucket_name}/{first_file['Key']}"
+
+
+def _get_file_extension(source: str) -> str:
+    parts = source.split(".")
+    if len(parts) == 1:
+        raise ValueError("Source path does not contain a file extension")
+    return parts[-1].lower()
+
+
+def _parse_s3_path(source: str) -> tuple[str, str]:
+    """Parse S3 URI into bucket name and prefix."""
+    path_parts = source[5:].split('/', 1)  # Remove 's3://'
+    bucket_name = path_parts[0]
+    prefix = path_parts[1] if len(path_parts) > 1 else ''
+    return bucket_name, prefix
+
+
+def _remove_wildcards_from_prefix(prefix: str) -> str:
+    """Remove wildcard patterns from S3 prefix."""
+    return prefix.split('*')[0]
+
+
+def _create_s3_client(storage_options: Optional[Dict[str, Any]]):
+    """Create boto3 S3 client with optional credentials."""
+    if storage_options is None:
+        return boto3.client('s3')
+
+    # Handle both 'aws_region' and 'region_name' keys
+    client_options = storage_options.copy()
+    if 'aws_region' in client_options:
+        client_options['region_name'] = client_options.pop('aws_region')
+
+    return boto3.client('s3', **client_options)
+
+
+def _get_first_file(s3_client, bucket_name: str, base_prefix: str, file_extension: str) -> Dict[Any, Any]:
+    """List all parquet files in S3 bucket with given prefix."""
+    try:
+        paginator = s3_client.get_paginator('list_objects_v2')
+        pages = paginator.paginate(Bucket=bucket_name, Prefix=base_prefix)
+        for page in pages:
+            if 'Contents' in page:
+                for obj in page['Contents']:
+                    if obj['Key'].endswith(f".{file_extension}"):
+                        return obj
+            else:
+                raise ValueError(f"No objects found in s3://{bucket_name}/{base_prefix}")
+        raise ValueError(f"No {file_extension} files found in s3://{bucket_name}/{base_prefix}")
+    except ClientError as e:
+        raise ValueError(f"Failed to list files in s3://{bucket_name}/{base_prefix}: {e}")
+
+
+def ensure_path_has_wildcard_pattern(resource_path: str, file_format: Literal["csv", "parquet", "json"]):
+    if not resource_path.endswith(f"*.{file_format}"):
+        resource_path = resource_path.rstrip("/") + f"/**/*.{file_format}"
+    return resource_path
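
The new module only assembles credentials; the actual scan happens in Polars, which accepts these dicts through its storage_options parameter. A hedged sketch of how the pieces might compose for an S3 parquet read (the connection name, user id, bucket, and paths are invented; collect_schema assumes a recent Polars):

```python
import polars as pl

from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_cloud_connection
from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (
    CloudStorageReader, ensure_path_has_wildcard_pattern, get_first_file_from_s3_dir,
)

connection = get_local_cloud_connection("my-s3", user_id=1)  # hypothetical name/user
assert connection is not None
storage_options = CloudStorageReader.get_storage_options(connection)

# Expand a bare prefix into the wildcard form the scanner expects:
# "s3://bucket/data" -> "s3://bucket/data/**/*.parquet"
source = ensure_path_has_wildcard_pattern("s3://bucket/data", "parquet")

# Peek at a single file's schema without scanning the whole directory.
first_file = get_first_file_from_s3_dir(source, storage_options)
print(pl.scan_parquet(first_file, storage_options=storage_options).collect_schema())

# Lazily scan the full dataset with the same credentials.
lf = pl.scan_parquet(source, storage_options=storage_options)
```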
flowfile_core/flowfile/flow_data_engine/create/funcs.py

@@ -1,9 +1,9 @@
 import polars as pl
 import os
-
 from flowfile_core.schemas import input_schema
 from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
 from flowfile_core.flowfile.flow_data_engine.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx
+from polars._typing import CsvEncoding


 def create_from_json(received_table: input_schema.ReceivedCsvTable):
@@ -49,11 +49,21 @@ def create_from_json(received_table: input_schema.ReceivedCsvTable):
     return data


-def create_from_path_csv(received_table: input_schema.ReceivedCsvTable) -> pl.DataFrame:
+def standardize_utf8_encoding(non_standardized_encoding: str) -> CsvEncoding:
+    if non_standardized_encoding.upper() in ('UTF-8', 'UTF8'):
+        return 'utf8'
+    elif non_standardized_encoding.upper() in ('UTF-8-LOSSY', 'UTF8-LOSSY'):
+        return 'utf8-lossy'
+    else:
+        raise ValueError(f"Encoding {non_standardized_encoding} is not supported.")
+
+
+def create_from_path_csv(received_table: input_schema.ReceivedCsvTable) -> pl.LazyFrame:
     f = received_table.abs_file_path
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if received_table.encoding.upper() == 'UTF8' or received_table.encoding.upper() == 'UTF-8':
+    if received_table.encoding.upper() in ("UTF-8", "UTF8", 'UTF8-LOSSY', 'UTF-8-LOSSY'):
+        encoding: CsvEncoding = standardize_utf8_encoding(received_table.encoding)
         try:
             data = pl.scan_csv(f,
                                low_memory=low_mem,
@@ -61,11 +71,12 @@ def create_from_path_csv(received_table: input_schema.ReceivedCsvTable) -> pl.Da
                                separator=received_table.delimiter,
                                has_header=received_table.has_headers,
                                skip_rows=received_table.starting_from_line,
-                               encoding='utf8',
+                               encoding=encoding,
                                infer_schema_length=received_table.infer_schema_length)
             data.head(1).collect()
             return data
         except:
+
             try:
                 data = pl.scan_csv(f, low_memory=low_mem,
                                    separator=received_table.delimiter,
@@ -75,11 +86,11 @@ def create_from_path_csv(received_table: input_schema.ReceivedCsvTable) -> pl.Da
                                    ignore_errors=True)
                 return data
             except:
-                data = pl.scan_csv(f, low_memory=low_mem,
+                data = pl.scan_csv(f, low_memory=False,
                                    separator=received_table.delimiter,
                                    has_header=received_table.has_headers,
                                    skip_rows=received_table.starting_from_line,
-                                   encoding='utf8',
+                                   encoding=encoding,
                                    ignore_errors=True)
                 return data
     else:
@@ -90,14 +101,14 @@ def create_from_path_csv(received_table: input_schema.ReceivedCsvTable) -> pl.Da
                                    skip_rows=received_table.starting_from_line,
                                    encoding=received_table.encoding,
                                    ignore_errors=True, batch_size=2).next_batches(1)
-        return data[0]
+        return data[0].lazy()


 def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
     return create_fake_data(number_of_records).lazy()


-def create_from_path_parquet(received_table: input_schema.ReceivedParquetTable):
+def create_from_path_parquet(received_table: input_schema.ReceivedParquetTable) -> pl.LazyFrame:
     low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
     return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)
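
For reference, standardize_utf8_encoding narrows the various spellings of UTF-8 into the two literals that Polars' scan_csv accepts, while other encodings now fall through to the read_csv_batched branch with the encoding string passed verbatim. A quick illustration of the behavior defined in the hunk above:

```python
from flowfile_core.flowfile.flow_data_engine.create.funcs import standardize_utf8_encoding

# Both spellings collapse onto Polars' canonical literals (comparison is case-insensitive).
assert standardize_utf8_encoding("UTF-8") == "utf8"
assert standardize_utf8_encoding("utf8-lossy") == "utf8-lossy"

# Non-UTF-8 encodings are rejected here; create_from_path_csv routes them
# to pl.read_csv_batched with the original encoding string instead.
try:
    standardize_utf8_encoding("latin-1")
except ValueError as exc:
    print(exc)  # Encoding latin-1 is not supported.
```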