Flowfile 0.3.5-py3-none-any.whl → 0.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile has been flagged as possibly problematic.
- flowfile/__init__.py +3 -3
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
- flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
- flowfile/web/static/assets/api-fb67319c.js +80 -0
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +2 -0
- flowfile_core/configs/node_store/nodes.py +8 -6
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +401 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +119 -82
- flowfile_core/flowfile/flow_node/flow_node.py +68 -33
- flowfile_core/flowfile/flow_node/models.py +32 -3
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/utils.py +1 -23
- flowfile_core/main.py +3 -2
- flowfile_core/routes/cloud_connections.py +81 -0
- flowfile_core/routes/logs.py +0 -1
- flowfile_core/routes/routes.py +3 -39
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +37 -15
- flowfile_core/schemas/schemas.py +7 -2
- flowfile_core/schemas/transform_schema.py +97 -22
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/flow_frame.py +253 -102
- flowfile_frame/flow_frame_methods.py +13 -13
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +291 -0
- test_utils/s3/fixtures.py +209 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
test_utils/s3/data_generator.py
@@ -0,0 +1,291 @@

import logging
import io
import os

# Third-party libraries
import boto3
from botocore.client import Config
import polars as pl
import pyarrow as pa
from deltalake import write_deltalake
from pyiceberg.catalog import load_catalog


# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"

def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
    """Creates a single CSV file from a DataFrame and uploads it to S3."""
    logger.info("Writing single-file CSV...")
    csv_buffer = io.BytesIO()
    df.write_csv(csv_buffer)
    csv_buffer.seek(0)
    s3_client.put_object(
        Bucket=bucket_name,
        Key='single-file-csv/data.csv',
        Body=csv_buffer.getvalue()
    )


def _create_multi_file_csv(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
    """Creates multiple CSV files from a DataFrame and uploads them to S3."""
    logger.info(f"Writing {num_files} CSV files...")
    data_size = len(df)
    rows_per_file = data_size // num_files
    for i in range(num_files):
        sub_df = df.slice(i * rows_per_file, rows_per_file)
        csv_buffer = io.BytesIO()
        sub_df.write_csv(csv_buffer)
        csv_buffer.seek(0)
        s3_client.put_object(
            Bucket=bucket_name,
            Key=f'multi-file-csv/part_{i:02d}.csv',
            Body=csv_buffer.getvalue()
        )


def _create_single_file_json(s3_client, df: pl.DataFrame, bucket_name: str):
    """Creates a single JSON file from a DataFrame and uploads it to S3."""
    logger.info("Writing single-file JSON...")
    json_buffer = io.BytesIO()
    df.write_ndjson(json_buffer)
    json_buffer.seek(0)
    s3_client.put_object(
        Bucket=bucket_name,
        Key='single-file-json/data.json',
        Body=json_buffer.getvalue()
    )


def _create_multi_file_json(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
    """Creates multiple JSON files from a DataFrame and uploads them to S3."""
    logger.info(f"Writing {num_files} JSON files...")
    data_size = len(df)
    rows_per_file = data_size // num_files
    for i in range(num_files):
        sub_df = df.slice(i * rows_per_file, rows_per_file)
        json_buffer = io.BytesIO()
        sub_df.write_ndjson(json_buffer)
        json_buffer.seek(0)
        s3_client.put_object(
            Bucket=bucket_name,
            Key=f'multi-file-json/part_{i:02d}.json',
            Body=json_buffer.getvalue()
        )


def _create_single_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str):
    """Creates a single Parquet file from a DataFrame and uploads it to S3."""
    logger.info("Writing single-file Parquet...")
    parquet_buffer = io.BytesIO()
    df.write_parquet(parquet_buffer)
    parquet_buffer.seek(0)
    s3_client.put_object(
        Bucket=bucket_name,
        Key='single-file-parquet/data.parquet',
        Body=parquet_buffer.getvalue()
    )


def _create_multi_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
    """Creates multiple Parquet files from a DataFrame and uploads them to S3."""
    logger.info(f"Writing {num_files} Parquet files...")
    data_size = len(df)
    rows_per_file = data_size // num_files
    for i in range(num_files):
        sub_df = df.slice(i * rows_per_file, rows_per_file)
        parquet_buffer = io.BytesIO()
        sub_df.write_parquet(parquet_buffer)
        parquet_buffer.seek(0)
        s3_client.put_object(
            Bucket=bucket_name,
            Key=f'multi-file-parquet/part_{i:02d}.parquet',
            Body=parquet_buffer.getvalue()
        )


def _create_delta_lake_table(arrow_table: pa.Table, bucket_name: str, storage_options: dict):
    """Creates a Delta Lake table from a PyArrow table in S3."""
    logger.info("Writing Delta Lake table...")
    delta_table_path = f"s3://{bucket_name}/delta-lake-table"
    write_deltalake(
        delta_table_path,
        arrow_table,
        mode='overwrite',
        storage_options=storage_options
    )


def _create_iceberg_table(df: pl.DataFrame, bucket_name: str, endpoint_url: str, access_key: str, secret_key: str,
                          s3_client):
    """Creates an Apache Iceberg table and FORCES sane metadata pointers."""
    logger.info("Writing Apache Iceberg table with SANE metadata access...")
    # Configure the catalog properties for S3 access
    catalog_props = {
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.endpoint": endpoint_url,
        "s3.access-key-id": access_key,
        "s3.secret-access-key": secret_key,
    }
    # Use the SQL catalog with an in-memory SQLite database for storing metadata pointers
    catalog = load_catalog(
        "default",
        **{
            "type": "sql",
            "uri": "sqlite:///:memory:",  # Use an in-memory SQL DB for the catalog
            "warehouse": f"s3a://{bucket_name}/iceberg_warehouse",
            **catalog_props,
        }
    )
    table_identifier = ("default_db", "iceberg_table")
    # Create a namespace (like a schema or database) for the table
    try:
        catalog.drop_namespace("default_db")
    except Exception:
        pass  # Ignore if namespace doesn't exist
    catalog.create_namespace("default_db")
    try:
        catalog.load_table(table_identifier)
        catalog.drop_table(table_identifier)
    except:
        pass

    # Create the table schema and object first
    schema = df.to_arrow().schema
    table = catalog.create_table(identifier=table_identifier, schema=schema)

    # Use the simplified write_iceberg method from Polars
    df.write_iceberg(table, mode='overwrite')

    # NOW CREATE WHAT SHOULD EXIST BY DEFAULT - SANE METADATA POINTERS
    # Get the current metadata location from the table
    current_metadata = table.metadata_location
    logger.info(f"Original metadata location: {current_metadata}")

    # Extract just the path part
    if current_metadata.startswith("s3a://"):
        current_metadata_key = current_metadata.replace(f"s3a://{bucket_name}/", "")
    else:
        current_metadata_key = current_metadata.replace(f"s3://{bucket_name}/", "")

    # Read the current metadata
    response = s3_client.get_object(Bucket=bucket_name, Key=current_metadata_key)
    metadata_content = response['Body'].read()

    # Get the metadata directory
    metadata_dir = "/".join(current_metadata_key.split("/")[:-1])

    # Write it to standardized locations
    # 1. metadata.json in the metadata folder (this is what pl.scan_iceberg expects)
    s3_client.put_object(
        Bucket=bucket_name,
        Key=f"{metadata_dir}/metadata.json",
        Body=metadata_content
    )
    logger.info(f"Created stable metadata.json at: s3://{bucket_name}/{metadata_dir}/metadata.json")

    # 2. current.json as an additional pointer
    s3_client.put_object(
        Bucket=bucket_name,
        Key=f"{metadata_dir}/current.json",
        Body=metadata_content
    )

    # 3. VERSION file that contains the current metadata filename
    current_metadata_filename = current_metadata_key.split("/")[-1]
    s3_client.put_object(
        Bucket=bucket_name,
        Key=f"{metadata_dir}/VERSION",
        Body=current_metadata_filename.encode()
    )

    # 4. version-hint.text (some Iceberg readers look for this)
    s3_client.put_object(
        Bucket=bucket_name,
        Key=f"{metadata_dir}/version-hint.text",
        Body=current_metadata_filename.encode()
    )

    table_base = "iceberg_warehouse/default_db.db/my_iceberg_table"
    logger.info(f"""
    ✅ Iceberg table created with SANE access patterns:
       - Versioned metadata: s3://{bucket_name}/{current_metadata_key}
       - Latest metadata: s3://{bucket_name}/{table_base}/metadata/metadata.json
       - Current pointer: s3://{bucket_name}/{table_base}/metadata/current.json
       - Version hint: s3://{bucket_name}/{table_base}/metadata/version-hint.text

    Read with: pl.scan_iceberg('s3://{bucket_name}/{table_base}/metadata/metadata.json').collect()
    """)


def populate_test_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
    """
    Populates a MinIO bucket with a variety of large-scale test data formats.

    Args:
        endpoint_url (str): The S3 endpoint URL for the MinIO instance.
        access_key (str): The access key for MinIO.
        secret_key (str): The secret key for MinIO.
        bucket_name (str): The name of the bucket to populate.
    """
    logger.info("🚀 Starting data population...")
    # --- S3 Client and Storage Options ---
    s3_client = boto3.client(
        's3',
        endpoint_url=endpoint_url,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        config=Config(signature_version='s3v4'),
        region_name='us-east-1'
    )
    storage_options = {
        "AWS_ENDPOINT_URL": endpoint_url,
        "AWS_ACCESS_KEY_ID": access_key,
        "AWS_SECRET_ACCESS_KEY": secret_key,
        "AWS_REGION": "us-east-1",
        "AWS_ALLOW_HTTP": "true",
        "AWS_S3_ALLOW_UNSAFE_RENAME": "true"
    }

    # --- Data Generation ---
    data_size = 100_000
    df = pl.DataFrame({
        "id": range(1, data_size + 1),
        "name": [f"user_{i}" for i in range(1, data_size + 1)],
        "value": [i * 10.5 for i in range(1, data_size + 1)],
        "category": ["A", "B", "C", "D", "E"] * (data_size // 5)
    })
    logger.info(f"Generated a Polars DataFrame with {data_size} rows.")

    # --- Execute Data Population Scenarios ---
    _create_single_csv_file(s3_client, df, bucket_name)
    _create_multi_file_csv(s3_client, df, bucket_name)
    _create_single_file_json(s3_client, df, bucket_name)
    _create_multi_file_json(s3_client, df, bucket_name)
    _create_single_parquet_file(s3_client, df, bucket_name)
    _create_multi_parquet_file(s3_client, df, bucket_name)

    # Convert to PyArrow table once for Delta and Iceberg
    arrow_table = df.to_arrow()

    _create_delta_lake_table(arrow_table, bucket_name, storage_options)
    _create_iceberg_table(df, bucket_name, endpoint_url, access_key, secret_key, s3_client)

    logger.info("✅ All test data populated successfully.")


if __name__ == '__main__':
    populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
                       access_key=MINIO_ACCESS_KEY,
                       secret_key=MINIO_SECRET_KEY,
                       bucket_name="test-bucket")
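For orientation, here is a minimal sketch of how a test could read the generated objects back with Polars against the same MinIO endpoint. It only reuses the bucket layout and default credentials defined above; the script name and the lowercase `storage_options` key names are assumptions, not part of this release.

```python
# verify_test_data.py -- hypothetical reader-side sketch, not shipped in flowfile 0.3.6
import polars as pl

ENDPOINT_URL = "http://localhost:9000"  # MINIO_ENDPOINT_URL default from data_generator.py
STORAGE_OPTIONS = {                     # assumed to be accepted by Polars' object-store backend
    "aws_endpoint_url": ENDPOINT_URL,
    "aws_access_key_id": "minioadmin",
    "aws_secret_access_key": "minioadmin",
    "aws_region": "us-east-1",
    "aws_allow_http": "true",
}

# Single Parquet object written by _create_single_parquet_file
single = pl.scan_parquet(
    "s3://test-bucket/single-file-parquet/data.parquet",
    storage_options=STORAGE_OPTIONS,
).collect()

# Glob over the ten part files written by _create_multi_parquet_file
parts = pl.scan_parquet(
    "s3://test-bucket/multi-file-parquet/*.parquet",
    storage_options=STORAGE_OPTIONS,
).collect()

# The generator wrote 100_000 rows in total for both layouts
assert single.height == 100_000 and parts.height == 100_000

# The Iceberg table is meant to be read via the stable metadata.json that
# _create_iceberg_table force-creates; the exact S3 path is printed in that
# function's log output (pl.scan_iceberg(...).collect()).
```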
test_utils/s3/fixtures.py
@@ -0,0 +1,209 @@
import os
import time
import subprocess
import logging
from contextlib import contextmanager
from typing import Dict, Generator
import shutil
import boto3
from botocore.client import Config
from test_utils.s3.data_generator import populate_test_data

logger = logging.getLogger("s3_fixture")

MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"

# Operating system detection
IS_MACOS = os.uname().sysname == 'Darwin' if hasattr(os, 'uname') else False
IS_WINDOWS = os.name == 'nt'

def get_minio_client():
    """Get boto3 client for MinIO"""
    return boto3.client(
        's3',
        endpoint_url=MINIO_ENDPOINT_URL,
        aws_access_key_id=MINIO_ACCESS_KEY,
        aws_secret_access_key=MINIO_SECRET_KEY,
        config=Config(signature_version='s3v4'),
        region_name='us-east-1'
    )


def wait_for_minio(max_retries=30, interval=1):
    """Wait for MinIO to be ready"""
    for i in range(max_retries):
        try:
            client = get_minio_client()
            client.list_buckets()
            logger.info("MinIO is ready")
            return True
        except Exception:
            if i < max_retries - 1:
                time.sleep(interval)
                continue
    return False

def is_container_running(container_name: str) -> bool:
    """Check if MinIO container is already running"""
    try:
        result = subprocess.run(
            ["docker", "ps", "--filter", f"name={container_name}", "--format", "{{.Names}}"],
            capture_output=True,
            text=True,
            check=True
        )
        return container_name in result.stdout.strip()
    except subprocess.CalledProcessError:
        return False


def stop_minio_container() -> bool:
    """Stop the MinIO container and remove its data volume for a clean shutdown."""
    container_name = MINIO_CONTAINER_NAME
    volume_name = f"{container_name}-data"

    if not is_container_running(container_name):
        logger.info(f"Container '{container_name}' is not running.")
        # Attempt to remove the volume in case it was left orphaned
        try:
            subprocess.run(["docker", "volume", "rm", volume_name], check=False, capture_output=True)
        except Exception:
            pass  # Ignore errors if volume doesn't exist
        return True

    logger.info(f"Stopping and cleaning up container '{container_name}' and volume '{volume_name}'...")
    try:
        # Stop and remove the container
        subprocess.run(["docker", "stop", container_name], check=True, capture_output=True)
        subprocess.run(["docker", "rm", container_name], check=True, capture_output=True)

        # Remove the associated volume to clear all data
        subprocess.run(["docker", "volume", "rm", volume_name], check=True, capture_output=True)

        logger.info("✅ MinIO container and data volume successfully removed.")
        return True
    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode()
        if "no such volume" in stderr:
            logger.info("Volume was already removed or never created.")
            return True
        logger.error(f"❌ Failed to clean up MinIO resources: {stderr}")
        return False


def create_test_buckets():
    """Create test buckets and populate with sample data"""
    client = get_minio_client()

    # Create test buckets
    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket']
    for bucket in buckets:
        try:
            client.create_bucket(Bucket=bucket)
            logger.info(f"Created bucket: {bucket}")
        except client.exceptions.BucketAlreadyExists:
            logger.info(f"Bucket already exists: {bucket}")
        except client.exceptions.BucketAlreadyOwnedByYou:
            logger.info(f"Bucket already owned: {bucket}")


def is_docker_available() -> bool:
    """
    Check if Docker is available on the system.

    Returns:
        bool: True if Docker is available and working, False otherwise
    """
    # Skip Docker on macOS and Windows in CI
    if (IS_MACOS or IS_WINDOWS) and os.environ.get('CI', '').lower() in ('true', '1', 'yes'):
        logger.info("Skipping Docker on macOS/Windows in CI environment")
        return False

    # If docker executable is not in PATH
    if shutil.which("docker") is None:
        logger.warning("Docker executable not found in PATH")
        return False

    # Try a simple docker command
    try:
        result = subprocess.run(
            ["docker", "info"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=5,
            check=False  # Don't raise exception on non-zero return code
        )

        if result.returncode != 0:
            logger.warning("Docker is not operational")
            return False

        return True
    except (subprocess.SubprocessError, OSError):
        logger.warning("Error running Docker command")
        return False


def start_minio_container() -> bool:
    """Start MinIO container with initialization"""
    if is_container_running(MINIO_CONTAINER_NAME):
        logger.info(f"Container {MINIO_CONTAINER_NAME} is already running")
        return True

    try:
        # Start MinIO with volume for persistence
        subprocess.run([
            "docker", "run", "-d",
            "--name", MINIO_CONTAINER_NAME,
            "-p", f"{MINIO_PORT}:9000",
            "-p", f"{MINIO_CONSOLE_PORT}:9001",
            "-e", f"MINIO_ROOT_USER={MINIO_ACCESS_KEY}",
            "-e", f"MINIO_ROOT_PASSWORD={MINIO_SECRET_KEY}",
            "-v", f"{MINIO_CONTAINER_NAME}-data:/data",
            "minio/minio", "server", "/data", "--console-address", ":9001"
        ], check=True)

        # Wait for MinIO to be ready
        if wait_for_minio():
            create_test_buckets()
            populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
                               access_key=MINIO_ACCESS_KEY,
                               secret_key=MINIO_SECRET_KEY,
                               bucket_name="test-bucket")
            return True
        return False

    except Exception as e:
        logger.error(f"Failed to start MinIO: {e}")
        stop_minio_container()
        return False


@contextmanager
def managed_minio() -> Generator[Dict[str, any], None, None]:
    """Context manager for MinIO container with full connection info"""
    if not start_minio_container():
        yield {}
        return

    try:
        connection_info = {
            "endpoint_url": MINIO_ENDPOINT_URL,
            "access_key": MINIO_ACCESS_KEY,
            "secret_key": MINIO_SECRET_KEY,
            "host": MINIO_HOST,
            "port": MINIO_PORT,
            "console_port": MINIO_CONSOLE_PORT,
            "connection_string": f"s3://{MINIO_ACCESS_KEY}:{MINIO_SECRET_KEY}@{MINIO_HOST}:{MINIO_PORT}"
        }
        yield connection_info
    finally:
        # Optionally keep container running for debugging
        if os.environ.get("KEEP_MINIO_RUNNING", "false").lower() != "true":
            stop_minio_container()
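Below is a usage sketch showing how these helpers might be wired into a pytest suite. The conftest.py placement and fixture name are assumptions; the functions it calls (is_docker_available, managed_minio, get_minio_client) are the ones defined in fixtures.py above.

```python
# conftest.py -- hypothetical wiring, not shipped in flowfile 0.3.6
import pytest

from test_utils.s3.fixtures import get_minio_client, is_docker_available, managed_minio


@pytest.fixture(scope="session")
def minio():
    """Session-scoped MinIO instance driven by the managed_minio() context manager."""
    if not is_docker_available():
        pytest.skip("Docker is not available; skipping S3 integration tests")
    with managed_minio() as connection_info:
        if not connection_info:
            pytest.skip("MinIO container failed to start")
        # Container and data volume are removed on exit unless KEEP_MINIO_RUNNING=true
        yield connection_info


def test_standard_buckets_exist(minio):
    # create_test_buckets() runs inside start_minio_container(), so the
    # standard buckets should already be present
    names = {b["Name"] for b in get_minio_client().list_buckets()["Buckets"]}
    assert {"test-bucket", "flowfile-test", "sample-data"} <= names
```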