Flowfile 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic.

Files changed (121)
  1. flowfile/__init__.py +3 -3
  2. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  3. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  4. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  5. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  6. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  7. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  8. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  9. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  12. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  13. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  14. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  15. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  16. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  17. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  18. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  19. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  20. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  21. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  22. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  23. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  24. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  25. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  26. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  27. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  28. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  29. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  30. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  31. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  32. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  33. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  34. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  35. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  36. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  37. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  38. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  39. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  40. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  41. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  42. flowfile/web/static/assets/api-fb67319c.js +80 -0
  43. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  44. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  45. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  46. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  47. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  48. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  49. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  50. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  51. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  52. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  53. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  54. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  55. flowfile/web/static/index.html +1 -1
  56. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  57. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  59. flowfile_core/__init__.py +2 -0
  60. flowfile_core/configs/node_store/nodes.py +8 -6
  61. flowfile_core/database/connection.py +63 -15
  62. flowfile_core/database/init_db.py +0 -1
  63. flowfile_core/database/models.py +49 -2
  64. flowfile_core/flowfile/code_generator/code_generator.py +401 -17
  65. flowfile_core/flowfile/connection_manager/models.py +1 -1
  66. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  67. flowfile_core/flowfile/extensions.py +1 -1
  68. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  69. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  70. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  71. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  72. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  73. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  74. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  75. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  76. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  77. flowfile_core/flowfile/flow_graph.py +119 -82
  78. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  79. flowfile_core/flowfile/flow_node/models.py +32 -3
  80. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  81. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  82. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  83. flowfile_core/flowfile/utils.py +1 -23
  84. flowfile_core/main.py +3 -2
  85. flowfile_core/routes/cloud_connections.py +81 -0
  86. flowfile_core/routes/logs.py +0 -1
  87. flowfile_core/routes/routes.py +3 -39
  88. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  89. flowfile_core/schemas/input_schema.py +37 -15
  90. flowfile_core/schemas/schemas.py +7 -2
  91. flowfile_core/schemas/transform_schema.py +97 -22
  92. flowfile_core/utils/utils.py +40 -1
  93. flowfile_core/utils/validate_setup.py +41 -0
  94. flowfile_frame/flow_frame.py +253 -102
  95. flowfile_frame/flow_frame_methods.py +13 -13
  96. flowfile_worker/external_sources/s3_source/main.py +216 -0
  97. flowfile_worker/external_sources/s3_source/models.py +142 -0
  98. flowfile_worker/funcs.py +51 -6
  99. flowfile_worker/models.py +22 -2
  100. flowfile_worker/routes.py +40 -38
  101. flowfile_worker/utils.py +1 -1
  102. test_utils/s3/commands.py +46 -0
  103. test_utils/s3/data_generator.py +291 -0
  104. test_utils/s3/fixtures.py +209 -0
  105. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  106. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  107. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  108. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  109. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  112. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  113. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  114. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  115. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  116. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  117. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  118. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  119. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  120. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  121. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
test_utils/s3/data_generator.py
@@ -0,0 +1,291 @@
+
+import logging
+import io
+import os
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from deltalake import write_deltalake
+from pyiceberg.catalog import load_catalog
+
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single CSV file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file CSV...")
+    csv_buffer = io.BytesIO()
+    df.write_csv(csv_buffer)
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-csv/data.csv',
+        Body=csv_buffer.getvalue()
+    )
+
+
+def _create_multi_file_csv(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple CSV files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} CSV files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        csv_buffer = io.BytesIO()
+        sub_df.write_csv(csv_buffer)
+        csv_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-csv/part_{i:02d}.csv',
+            Body=csv_buffer.getvalue()
+        )
+
+
+def _create_single_file_json(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single JSON file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file JSON...")
+    json_buffer = io.BytesIO()
+    df.write_ndjson(json_buffer)
+    json_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-json/data.json',
+        Body=json_buffer.getvalue()
+    )
+
+
+def _create_multi_file_json(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple JSON files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} JSON files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        json_buffer = io.BytesIO()
+        sub_df.write_ndjson(json_buffer)
+        json_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-json/part_{i:02d}.json',
+            Body=json_buffer.getvalue()
+        )
+
+
+def _create_single_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single Parquet file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file Parquet...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-parquet/data.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+
+
+def _create_multi_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple Parquet files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} Parquet files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        parquet_buffer = io.BytesIO()
+        sub_df.write_parquet(parquet_buffer)
+        parquet_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-parquet/part_{i:02d}.parquet',
+            Body=parquet_buffer.getvalue()
+        )
+
+
+def _create_delta_lake_table(arrow_table: pa.Table, bucket_name: str, storage_options: dict):
+    """Creates a Delta Lake table from a PyArrow table in S3."""
+    logger.info("Writing Delta Lake table...")
+    delta_table_path = f"s3://{bucket_name}/delta-lake-table"
+    write_deltalake(
+        delta_table_path,
+        arrow_table,
+        mode='overwrite',
+        storage_options=storage_options
+    )
+
+
+def _create_iceberg_table(df: pl.DataFrame, bucket_name: str, endpoint_url: str, access_key: str, secret_key: str,
+                          s3_client):
+    """Creates an Apache Iceberg table and FORCES sane metadata pointers."""
+    logger.info("Writing Apache Iceberg table with SANE metadata access...")
+    # Configure the catalog properties for S3 access
+    catalog_props = {
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3.endpoint": endpoint_url,
+        "s3.access-key-id": access_key,
+        "s3.secret-access-key": secret_key,
+    }
+    # Use the SQL catalog with an in-memory SQLite database for storing metadata pointers
+    catalog = load_catalog(
+        "default",
+        **{
+            "type": "sql",
+            "uri": "sqlite:///:memory:",  # Use an in-memory SQL DB for the catalog
+            "warehouse": f"s3a://{bucket_name}/iceberg_warehouse",
+            **catalog_props,
+        }
+    )
+    table_identifier = ("default_db", "iceberg_table")
+    # Create a namespace (like a schema or database) for the table
+    try:
+        catalog.drop_namespace("default_db")
+    except Exception:
+        pass  # Ignore if namespace doesn't exist
+    catalog.create_namespace("default_db")
+    try:
+        catalog.load_table(table_identifier)
+        catalog.drop_table(table_identifier)
+    except:
+        pass
+
+    # Create the table schema and object first
+    schema = df.to_arrow().schema
+    table = catalog.create_table(identifier=table_identifier, schema=schema)
+
+    # Use the simplified write_iceberg method from Polars
+    df.write_iceberg(table, mode='overwrite')
+
+    # NOW CREATE WHAT SHOULD EXIST BY DEFAULT - SANE METADATA POINTERS
+    # Get the current metadata location from the table
+    current_metadata = table.metadata_location
+    logger.info(f"Original metadata location: {current_metadata}")
+
+    # Extract just the path part
+    if current_metadata.startswith("s3a://"):
+        current_metadata_key = current_metadata.replace(f"s3a://{bucket_name}/", "")
+    else:
+        current_metadata_key = current_metadata.replace(f"s3://{bucket_name}/", "")
+
+    # Read the current metadata
+    response = s3_client.get_object(Bucket=bucket_name, Key=current_metadata_key)
+    metadata_content = response['Body'].read()
+
+    # Get the metadata directory
+    metadata_dir = "/".join(current_metadata_key.split("/")[:-1])
+
+    # Write it to standardized locations
+    # 1. metadata.json in the metadata folder (this is what pl.scan_iceberg expects)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/metadata.json",
+        Body=metadata_content
+    )
+    logger.info(f"Created stable metadata.json at: s3://{bucket_name}/{metadata_dir}/metadata.json")
+
+    # 2. current.json as an additional pointer
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/current.json",
+        Body=metadata_content
+    )
+
+    # 3. VERSION file that contains the current metadata filename
+    current_metadata_filename = current_metadata_key.split("/")[-1]
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/VERSION",
+        Body=current_metadata_filename.encode()
+    )
+
+    # 4. version-hint.text (some Iceberg readers look for this)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/version-hint.text",
+        Body=current_metadata_filename.encode()
+    )
+
+    table_base = "iceberg_warehouse/default_db.db/my_iceberg_table"
+    logger.info(f"""
+    ✅ Iceberg table created with SANE access patterns:
+    - Versioned metadata: s3://{bucket_name}/{current_metadata_key}
+    - Latest metadata: s3://{bucket_name}/{table_base}/metadata/metadata.json
+    - Current pointer: s3://{bucket_name}/{table_base}/metadata/current.json
+    - Version hint: s3://{bucket_name}/{table_base}/metadata/version-hint.text
+
+    Read with: pl.scan_iceberg('s3://{bucket_name}/{table_base}/metadata/metadata.json').collect()
+    """)
+
+
+def populate_test_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with a variety of large-scale test data formats.
+
+    Args:
+        endpoint_url (str): The S3 endpoint URL for the MinIO instance.
+        access_key (str): The access key for MinIO.
+        secret_key (str): The secret key for MinIO.
+        bucket_name (str): The name of the bucket to populate.
+    """
+    logger.info("🚀 Starting data population...")
+    # --- S3 Client and Storage Options ---
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+    storage_options = {
+        "AWS_ENDPOINT_URL": endpoint_url,
+        "AWS_ACCESS_KEY_ID": access_key,
+        "AWS_SECRET_ACCESS_KEY": secret_key,
+        "AWS_REGION": "us-east-1",
+        "AWS_ALLOW_HTTP": "true",
+        "AWS_S3_ALLOW_UNSAFE_RENAME": "true"
+    }
+
+    # --- Data Generation ---
+    data_size = 100_000
+    df = pl.DataFrame({
+        "id": range(1, data_size + 1),
+        "name": [f"user_{i}" for i in range(1, data_size + 1)],
+        "value": [i * 10.5 for i in range(1, data_size + 1)],
+        "category": ["A", "B", "C", "D", "E"] * (data_size // 5)
+    })
+    logger.info(f"Generated a Polars DataFrame with {data_size} rows.")
+    #
+    # # --- Execute Data Population Scenarios ---
+    _create_single_csv_file(s3_client, df, bucket_name)
+    _create_multi_file_csv(s3_client, df, bucket_name)
+    _create_single_file_json(s3_client, df, bucket_name)
+    _create_multi_file_json(s3_client, df, bucket_name)
+    _create_single_parquet_file(s3_client, df, bucket_name)
+    _create_multi_parquet_file(s3_client, df, bucket_name)
+
+    # Convert to PyArrow table once for Delta and Iceberg
+    arrow_table = df.to_arrow()
+
+    _create_delta_lake_table(arrow_table, bucket_name, storage_options)
+    _create_iceberg_table(df, bucket_name, endpoint_url, access_key, secret_key, s3_client)
+
+    logger.info("✅ All test data populated successfully.")


+if __name__ == '__main__':
+    populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
+                       access_key=MINIO_ACCESS_KEY,
+                       secret_key=MINIO_SECRET_KEY,
+                       bucket_name="test-bucket")
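As a rough illustration of how the data written above could be read back (this snippet is not part of the diff): the single Parquet object created by _create_single_parquet_file() can be scanned straight from the MinIO endpoint with Polars. The storage_options key names follow the object_store naming that Polars accepts, and the endpoint and credentials shown are the module's TEST_MINIO_* defaults; treat all of them as assumptions for a local setup.

# Hypothetical read-back check against the MinIO bucket populated above.
import polars as pl

storage_options = {
    "aws_access_key_id": "minioadmin",       # TEST_MINIO_ACCESS_KEY default
    "aws_secret_access_key": "minioadmin",   # TEST_MINIO_SECRET_KEY default
    "aws_endpoint_url": "http://localhost:9000",
    "aws_region": "us-east-1",
    "aws_allow_http": "true",                # MinIO runs over plain HTTP locally
}

# Lazily scan the object written to single-file-parquet/data.parquet
lf = pl.scan_parquet(
    "s3://test-bucket/single-file-parquet/data.parquet",
    storage_options=storage_options,
)
print(lf.select(pl.len()).collect())  # expected: 100_000 rows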
test_utils/s3/fixtures.py
@@ -0,0 +1,209 @@
+import os
+import time
+import subprocess
+import logging
+from contextlib import contextmanager
+from typing import Dict, Generator
+import shutil
+import boto3
+from botocore.client import Config
+from test_utils.s3.data_generator import populate_test_data
+
+logger = logging.getLogger("s3_fixture")
+
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+# Operating system detection
+IS_MACOS = os.uname().sysname == 'Darwin' if hasattr(os, 'uname') else False
+IS_WINDOWS = os.name == 'nt'
+
+def get_minio_client():
+    """Get boto3 client for MinIO"""
+    return boto3.client(
+        's3',
+        endpoint_url=MINIO_ENDPOINT_URL,
+        aws_access_key_id=MINIO_ACCESS_KEY,
+        aws_secret_access_key=MINIO_SECRET_KEY,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+
+
+def wait_for_minio(max_retries=30, interval=1):
+    """Wait for MinIO to be ready"""
+    for i in range(max_retries):
+        try:
+            client = get_minio_client()
+            client.list_buckets()
+            logger.info("MinIO is ready")
+            return True
+        except Exception:
+            if i < max_retries - 1:
+                time.sleep(interval)
+                continue
+    return False
+
+def is_container_running(container_name: str) -> bool:
+    """Check if MinIO container is already running"""
+    try:
+        result = subprocess.run(
+            ["docker", "ps", "--filter", f"name={container_name}", "--format", "{{.Names}}"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        return container_name in result.stdout.strip()
+    except subprocess.CalledProcessError:
+        return False
+
+
+def stop_minio_container() -> bool:
+    """Stop the MinIO container and remove its data volume for a clean shutdown."""
+    container_name = MINIO_CONTAINER_NAME
+    volume_name = f"{container_name}-data"
+
+    if not is_container_running(container_name):
+        logger.info(f"Container '{container_name}' is not running.")
+        # Attempt to remove the volume in case it was left orphaned
+        try:
+            subprocess.run(["docker", "volume", "rm", volume_name], check=False, capture_output=True)
+        except Exception:
+            pass  # Ignore errors if volume doesn't exist
+        return True
+
+    logger.info(f"Stopping and cleaning up container '{container_name}' and volume '{volume_name}'...")
+    try:
+        # Stop and remove the container
+        subprocess.run(["docker", "stop", container_name], check=True, capture_output=True)
+        subprocess.run(["docker", "rm", container_name], check=True, capture_output=True)
+
+        # Remove the associated volume to clear all data
+        subprocess.run(["docker", "volume", "rm", volume_name], check=True, capture_output=True)
+
+        logger.info("✅ MinIO container and data volume successfully removed.")
+        return True
+    except subprocess.CalledProcessError as e:
+        stderr = e.stderr.decode()
+        if "no such volume" in stderr:
+            logger.info("Volume was already removed or never created.")
+            return True
+        logger.error(f"❌ Failed to clean up MinIO resources: {stderr}")
+        return False
+
+
+def create_test_buckets():
+    """Create test buckets and populate with sample data"""
+    client = get_minio_client()
+
+    # Create test buckets
+    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket']
+    for bucket in buckets:
+        try:
+            client.create_bucket(Bucket=bucket)
+            logger.info(f"Created bucket: {bucket}")
+        except client.exceptions.BucketAlreadyExists:
+            logger.info(f"Bucket already exists: {bucket}")
+        except client.exceptions.BucketAlreadyOwnedByYou:
+            logger.info(f"Bucket already owned: {bucket}")
+
+
+def is_docker_available() -> bool:
+    """
+    Check if Docker is available on the system.
+
+    Returns:
+        bool: True if Docker is available and working, False otherwise
+    """
+    # Skip Docker on macOS and Windows in CI
+    if (IS_MACOS or IS_WINDOWS) and os.environ.get('CI', '').lower() in ('true', '1', 'yes'):
+        logger.info("Skipping Docker on macOS/Windows in CI environment")
+        return False
+
+    # If docker executable is not in PATH
+    if shutil.which("docker") is None:
+        logger.warning("Docker executable not found in PATH")
+        return False
+
+    # Try a simple docker command
+    try:
+        result = subprocess.run(
+            ["docker", "info"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=5,
+            check=False  # Don't raise exception on non-zero return code
+        )
+
+        if result.returncode != 0:
+            logger.warning("Docker is not operational")
+            return False
+
+        return True
+    except (subprocess.SubprocessError, OSError):
+        logger.warning("Error running Docker command")
+        return False
+
+
+def start_minio_container() -> bool:
+    """Start MinIO container with initialization"""
+    if is_container_running(MINIO_CONTAINER_NAME):
+        logger.info(f"Container {MINIO_CONTAINER_NAME} is already running")
+        return True
+
+    try:
+        # Start MinIO with volume for persistence
+        subprocess.run([
+            "docker", "run", "-d",
+            "--name", MINIO_CONTAINER_NAME,
+            "-p", f"{MINIO_PORT}:9000",
+            "-p", f"{MINIO_CONSOLE_PORT}:9001",
+            "-e", f"MINIO_ROOT_USER={MINIO_ACCESS_KEY}",
+            "-e", f"MINIO_ROOT_PASSWORD={MINIO_SECRET_KEY}",
+            "-v", f"{MINIO_CONTAINER_NAME}-data:/data",
+            "minio/minio", "server", "/data", "--console-address", ":9001"
+        ], check=True)
+
+        # Wait for MinIO to be ready
+        if wait_for_minio():
+            create_test_buckets()
+            populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
+                               access_key=MINIO_ACCESS_KEY,
+                               secret_key=MINIO_SECRET_KEY,
+                               bucket_name="test-bucket")
+            return True
+        return False
+
+    except Exception as e:
+        logger.error(f"Failed to start MinIO: {e}")
+        stop_minio_container()
+        return False
+
+
+@contextmanager
+def managed_minio() -> Generator[Dict[str, any], None, None]:
+    """Context manager for MinIO container with full connection info"""
+    if not start_minio_container():
+        yield {}
+        return
+
+    try:
+        connection_info = {
+            "endpoint_url": MINIO_ENDPOINT_URL,
+            "access_key": MINIO_ACCESS_KEY,
+            "secret_key": MINIO_SECRET_KEY,
+            "host": MINIO_HOST,
+            "port": MINIO_PORT,
+            "console_port": MINIO_CONSOLE_PORT,
+            "connection_string": f"s3://{MINIO_ACCESS_KEY}:{MINIO_SECRET_KEY}@{MINIO_HOST}:{MINIO_PORT}"
+        }
+        yield connection_info
+    finally:
+        # Optionally keep container running for debugging
+        if os.environ.get("KEEP_MINIO_RUNNING", "false").lower() != "true":
+            stop_minio_container()
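A minimal sketch of how these helpers might be wired into a test suite (not part of the package; the conftest.py placement, fixture name, and test body are assumptions): wrap managed_minio() in a session-scoped pytest fixture and skip cleanly when Docker is unavailable.

# conftest.py (hypothetical) - one MinIO container per test session
import pytest
from test_utils.s3.fixtures import is_docker_available, managed_minio


@pytest.fixture(scope="session")
def minio_s3():
    """Start MinIO once per session and hand connection details to tests."""
    if not is_docker_available():
        pytest.skip("Docker is not available; skipping S3 integration tests")
    with managed_minio() as connection_info:
        if not connection_info:
            pytest.skip("MinIO container failed to start")
        yield connection_info


def test_bucket_listing(minio_s3):
    # Build a boto3 client from the connection info yielded by the fixture
    import boto3
    from botocore.client import Config

    client = boto3.client(
        "s3",
        endpoint_url=minio_s3["endpoint_url"],
        aws_access_key_id=minio_s3["access_key"],
        aws_secret_access_key=minio_s3["secret_key"],
        config=Config(signature_version="s3v4"),
    )
    names = {b["Name"] for b in client.list_buckets()["Buckets"]}
    assert "test-bucket" in names

Exporting KEEP_MINIO_RUNNING=true leaves the container up after the session for debugging, since the finally block in managed_minio() checks that variable before calling stop_minio_container().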