Flowfile 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (145)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  5. flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
  6. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  7. flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
  8. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  9. flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
  10. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
  11. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
  14. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
  15. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
  16. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
  17. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
  20. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
  21. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
  22. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
  23. flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
  24. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  25. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
  26. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  27. flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
  28. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
  29. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
  30. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
  31. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
  32. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
  33. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
  34. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
  35. flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
  36. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
  37. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
  38. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
  39. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
  40. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
  41. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
  42. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
  43. flowfile/web/static/assets/api-6ef0dcef.js +80 -0
  44. flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
  45. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  46. flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
  47. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
  48. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
  49. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
  50. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
  51. flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
  52. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
  53. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
  54. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
  55. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
  56. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
  57. flowfile/web/static/index.html +1 -1
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
  59. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
  60. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
  61. flowfile_core/__init__.py +3 -0
  62. flowfile_core/auth/jwt.py +39 -0
  63. flowfile_core/configs/node_store/nodes.py +9 -6
  64. flowfile_core/configs/settings.py +6 -5
  65. flowfile_core/database/connection.py +63 -15
  66. flowfile_core/database/init_db.py +0 -1
  67. flowfile_core/database/models.py +49 -2
  68. flowfile_core/flowfile/code_generator/code_generator.py +472 -17
  69. flowfile_core/flowfile/connection_manager/models.py +1 -1
  70. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  71. flowfile_core/flowfile/extensions.py +1 -1
  72. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  73. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  74. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
  75. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  76. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  77. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  78. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  79. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  80. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  81. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  82. flowfile_core/flowfile/flow_graph.py +718 -253
  83. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  84. flowfile_core/flowfile/flow_node/flow_node.py +563 -117
  85. flowfile_core/flowfile/flow_node/models.py +154 -20
  86. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  87. flowfile_core/flowfile/handler.py +2 -33
  88. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  89. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  90. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  91. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  92. flowfile_core/flowfile/utils.py +35 -26
  93. flowfile_core/main.py +35 -15
  94. flowfile_core/routes/cloud_connections.py +77 -0
  95. flowfile_core/routes/logs.py +2 -7
  96. flowfile_core/routes/public.py +1 -0
  97. flowfile_core/routes/routes.py +130 -90
  98. flowfile_core/routes/secrets.py +72 -14
  99. flowfile_core/schemas/__init__.py +8 -0
  100. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  101. flowfile_core/schemas/input_schema.py +121 -71
  102. flowfile_core/schemas/output_model.py +19 -3
  103. flowfile_core/schemas/schemas.py +150 -12
  104. flowfile_core/schemas/transform_schema.py +175 -35
  105. flowfile_core/utils/utils.py +40 -1
  106. flowfile_core/utils/validate_setup.py +41 -0
  107. flowfile_frame/__init__.py +9 -1
  108. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  109. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  110. flowfile_frame/expr.py +28 -1
  111. flowfile_frame/expr.pyi +76 -61
  112. flowfile_frame/flow_frame.py +481 -208
  113. flowfile_frame/flow_frame.pyi +140 -91
  114. flowfile_frame/flow_frame_methods.py +160 -22
  115. flowfile_frame/group_frame.py +3 -0
  116. flowfile_frame/utils.py +25 -3
  117. flowfile_worker/external_sources/s3_source/main.py +216 -0
  118. flowfile_worker/external_sources/s3_source/models.py +142 -0
  119. flowfile_worker/funcs.py +51 -6
  120. flowfile_worker/models.py +22 -2
  121. flowfile_worker/routes.py +40 -38
  122. flowfile_worker/utils.py +1 -1
  123. test_utils/s3/commands.py +46 -0
  124. test_utils/s3/data_generator.py +292 -0
  125. test_utils/s3/demo_data_generator.py +186 -0
  126. test_utils/s3/fixtures.py +214 -0
  127. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  128. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  129. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  130. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  131. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  132. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  133. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  134. flowfile_core/schemas/defaults.py +0 -9
  135. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  136. flowfile_core/schemas/models.py +0 -193
  137. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  138. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  139. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  140. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  141. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  142. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  143. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
  144. {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  145. {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_worker/funcs.py CHANGED
@@ -6,7 +6,9 @@ from flowfile_worker.polars_fuzzy_match.matcher import fuzzy_match_dfs
 from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
 from flowfile_worker.flow_logger import get_worker_logger
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
-from flowfile_worker.external_sources.sql_source.main import write_serialized_df_to_database, write_df_to_database
+from flowfile_worker.external_sources.sql_source.main import write_df_to_database
+from flowfile_worker.external_sources.s3_source.main import write_df_to_cloud
+from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings
 from base64 import encodebytes
 from logging import Logger
 import logging
@@ -205,9 +207,9 @@ def execute_write_method(write_method: Callable, path: str, data_type: str = Non
         logger.info('Writing as csv file')
         if write_mode == 'append':
             with open(path, 'ab') as f:
-                write_method(file=f, separator=delimiter, quote_style='always')
+                write_method(f, separator=delimiter, quote_style='always')
         else:
-            write_method(file=path, separator=delimiter, quote_style='always')
+            write_method(path, separator=delimiter, quote_style='always')
     elif data_type == 'parquet':
         logger.info('Writing as parquet file')
         write_method(path)
@@ -243,6 +245,49 @@ def write_to_database(polars_serializable_object: bytes,
             progress.value = -1


+def write_to_cloud_storage(polars_serializable_object: bytes,
+                           progress: Value,
+                           error_message: Array,
+                           queue: Queue,
+                           file_path: str,
+                           cloud_write_settings: CloudStorageWriteSettings,
+                           flowfile_flow_id: int = -1,
+                           flowfile_node_id: int | str = -1
+                           ) -> None:
+    """
+    Writes a Polars DataFrame to cloud storage using the provided settings.
+    Args:
+        polars_serializable_object (): # Serialized Polars DataFrame object
+        progress (): Multiprocessing Value to track progress
+        error_message (): Array to store error messages
+        queue (): Queue to send results back
+        file_path (): Path to the file where the DataFrame will be written
+        cloud_write_settings (): CloudStorageWriteSettings object containing write settings and connection details
+        flowfile_flow_id (): Flowfile flow ID for logging
+        flowfile_node_id (): Flowfile node ID for logging
+
+    Returns:
+        None
+    """
+    flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+    flowfile_logger.info(f"Starting write operation to: {cloud_write_settings.write_settings.resource_path}")
+    df = pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object))
+    flowfile_logger.info(f"Starting to sync the data to cloud, execution plan: \n"
+                         f"{df.explain(format='plain')}")
+    try:
+        write_df_to_cloud(df, cloud_write_settings, flowfile_logger)
+        flowfile_logger.info("Write operation completed successfully")
+        with progress.get_lock():
+            progress.value = 100
+    except Exception as e:
+        error_msg = str(e).encode()[:1024]
+        flowfile_logger.error(f'Error during write operation: {str(e)}')
+        with error_message.get_lock():
+            error_message[:len(error_msg)] = error_msg
+        with progress.get_lock():
+            progress.value = -1
+
+
 def write_output(polars_serializable_object: bytes,
                  progress: Value,
                  error_message: Array,
@@ -263,16 +308,16 @@ def write_output(polars_serializable_object: bytes,
     if isinstance(df, pl.LazyFrame):
         flowfile_logger.info(f'Execution plan explanation:\n{df.explain(format="plain")}')
     flowfile_logger.info("Successfully deserialized dataframe")
-    is_lazy = False
     sink_method_str = 'sink_'+data_type
     write_method_str = 'write_'+data_type
     has_sink_method = hasattr(df, sink_method_str)
     write_method = None
     if os.path.exists(path) and write_mode == 'create':
         raise Exception('File already exists')
-    if has_sink_method and is_lazy:
+    if has_sink_method and write_method != 'append':
+        flowfile_logger.info(f'Using sink method: {sink_method_str}')
         write_method = getattr(df, 'sink_' + data_type)
-    elif not is_lazy or not has_sink_method:
+    elif not has_sink_method:
         if isinstance(df, pl.LazyFrame):
             df = collect_lazy_frame(df)
         write_method = getattr(df, write_method_str)
flowfile_worker/models.py CHANGED
@@ -3,11 +3,12 @@ from typing import Optional, Literal, Any
 from base64 import decodebytes
 from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
+from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings


 OperationType = Literal[
     'store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'fuzzy', 'store_sample',
-    'write_to_database']
+    'write_to_database', "write_to_cloud_storage",]
 ResultType = Literal['polars', 'other']


@@ -55,7 +56,6 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
         Returns:
             DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
         """
-
         return DatabaseWriteSettings(
             connection=self.connection,
             table_name=self.table_name,
@@ -65,6 +65,26 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
         )


+class CloudStorageScriptWrite(CloudStorageWriteSettings):
+    operation: bytes
+
+    def polars_serializable_object(self):
+        return decodebytes(self.operation)
+
+    def get_cloud_storage_write_settings(self) -> CloudStorageWriteSettings:
+        """
+        Converts the current instance to a DatabaseWriteSettings object.
+        Returns:
+            DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
+        """
+        return CloudStorageWriteSettings(
+            write_settings=self.write_settings,
+            connection=self.connection,
+            flowfile_flow_id=self.flowfile_flow_id,
+            flowfile_node_id=self.flowfile_node_id
+        )
+
+
 class FuzzyJoinInput(BaseModel):
     task_id: Optional[str] = None
     cache_dir: Optional[str] = None
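CloudStorageScriptWrite carries the serialized Polars plan as base64 in its operation field and decodes it with base64.decodebytes; funcs.py then rebuilds the frame with pl.LazyFrame.deserialize. A small sketch of that round trip, assuming a recent Polars where LazyFrame.serialize() returns the binary plan (the variable names are illustrative):

import io
from base64 import encodebytes, decodebytes

import polars as pl

# Build a tiny LazyFrame and serialize its logical plan to bytes.
lf = pl.LazyFrame({"id": [1, 2, 3], "value": [10.5, 21.0, 31.5]})
serialized = lf.serialize()            # binary representation of the plan

# What a client would place in CloudStorageScriptWrite.operation:
operation = encodebytes(serialized)    # base64-encoded bytes

# What the worker does on the other side (see polars_serializable_object()
# above and write_to_cloud_storage in funcs.py):
restored = pl.LazyFrame.deserialize(io.BytesIO(decodebytes(operation)))
print(restored.collect())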
flowfile_worker/routes.py CHANGED
@@ -10,10 +10,8 @@ from flowfile_worker import models
 from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
 from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
 from flowfile_worker.configs import logger
-from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
 from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
 from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
-from flowfile_worker.external_sources.airbyte_sources.main import read_airbyte_source


 router = APIRouter()
@@ -74,6 +72,44 @@ def store_sample(polars_script: models.PolarsScriptSample, background_tasks: Bac
         raise HTTPException(status_code=500, detail=str(e))


+@router.post("/write_data_to_cloud/")
+def write_data_to_cloud(cloud_storage_script_write: models.CloudStorageScriptWrite,
+                        background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Write polars dataframe to a file in cloud storage.
+    Args:
+        cloud_storage_script_write (): Contains dataframe and write options for cloud storage
+        background_tasks (): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the write operation
+    """
+    try:
+        logger.info("Starting write operation to: cloud storage")
+        task_id = str(uuid.uuid4())
+        polars_serializable_object = cloud_storage_script_write.polars_serializable_object()
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref='',
+                               result_type="other")
+        status_dict[task_id] = status
+        background_tasks.add_task(
+            start_process,
+            polars_serializable_object=polars_serializable_object,
+            task_id=task_id,
+            operation="write_to_cloud_storage",
+            file_ref='',
+            flowfile_flow_id=cloud_storage_script_write.flowfile_flow_id,
+            flowfile_node_id=cloud_storage_script_write.flowfile_node_id,
+            kwargs=dict(cloud_write_settings=cloud_storage_script_write.get_cloud_storage_write_settings()),
+        )
+        logger.info(
+            f"Started write task: {task_id} to database"
+        )
+        return status
+    except Exception as e:
+        logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.post('/store_database_write_result/')
 def store_in_database(database_script_write: models.DatabaseScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
     """
@@ -158,44 +194,10 @@ def write_results(polars_script_write: models.PolarsScriptWrite, background_task
         raise HTTPException(status_code=500, detail=str(e))


-@router.post('/store_airbyte_result')
-def store_airbyte_result(airbyte_settings: AirbyteSettings, background_tasks: BackgroundTasks) -> models.Status:
-    """
-    Store the result of an Airbyte source operation.
-
-    Args:
-        airbyte_settings (AirbyteSettings): Settings for the Airbyte source operation
-        background_tasks (BackgroundTasks): FastAPI background tasks handler
-
-    Returns:
-        models.Status: Status object tracking the Airbyte source operation
-    """
-    logger.info("Processing Airbyte source operation")
-
-    try:
-        task_id = str(uuid.uuid4())
-        file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
-        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
-                               result_type="polars")
-        status_dict[task_id] = status
-        logger.info(f"Starting Airbyte source task: {task_id}")
-        background_tasks.add_task(start_generic_process, func_ref=read_airbyte_source, file_ref=file_path,
-                                  flowfile_flow_id=airbyte_settings.flowfile_flow_id,
-                                  flowfile_node_id=airbyte_settings.flowfile_node_id,
-                                  task_id=task_id, kwargs=dict(airbyte_settings=airbyte_settings))
-        logger.info(f"Started Airbyte source task: {task_id}")
-
-        return status
-
-    except Exception as e:
-        logger.error(f"Error processing Airbyte source: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
-
-
 @router.post('/store_database_read_result')
 def store_sql_db_result(database_read_settings: DatabaseReadSettings, background_tasks: BackgroundTasks) -> models.Status:
     """
-    Store the result of an Airbyte source operation.
+    Store the result of an sql source operation.

     Args:
         database_read_settings (SQLSourceSettings): Settings for the SQL source operation
@@ -204,7 +206,7 @@ def store_sql_db_result(database_read_settings: DatabaseReadSettings, background
     Returns:
         models.Status: Status object tracking the Sql operation
     """
-    logger.info("Processing Airbyte source operation")
+    logger.info("Processing Sql source operation")

     try:
         task_id = str(uuid.uuid4())
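The new /write_data_to_cloud/ route follows the worker's established pattern: mint a task_id, register a Status object, schedule the heavy work as a background task, and return immediately so the caller can poll. A minimal, self-contained FastAPI sketch of that pattern (generic models and task body, not Flowfile's actual spawner or schemas):

import uuid
from typing import Dict

from fastapi import BackgroundTasks, FastAPI
from pydantic import BaseModel

app = FastAPI()


class Status(BaseModel):
    background_task_id: str
    status: str = "Starting"


status_store: Dict[str, Status] = {}


def long_running_write(task_id: str) -> None:
    # Placeholder for the real work (spawning a process that writes the data).
    status_store[task_id].status = "Completed"


@app.post("/write_data_to_cloud/")
def write_data_to_cloud(background_tasks: BackgroundTasks) -> Status:
    task_id = str(uuid.uuid4())
    status = Status(background_task_id=task_id)
    status_store[task_id] = status                    # register before scheduling
    background_tasks.add_task(long_running_write, task_id)
    return status                                     # caller polls the store later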
flowfile_worker/utils.py CHANGED
@@ -7,7 +7,7 @@ def collect_lazy_frame(lf: pl.LazyFrame) -> pl.DataFrame:
     try:
         return lf.collect(engine="streaming")
     except PanicException:
-        return lf.collect(engine="auto")
+        return lf.collect(engine="in-memory")


 @dataclass
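The retry in collect_lazy_frame is now pinned to the in-memory engine rather than "auto"; presumably this guarantees the fallback cannot re-select the streaming engine that just panicked, though the diff does not state the motivation. The pattern in isolation, assuming a Polars version that accepts engine="streaming" and engine="in-memory":

import polars as pl
from polars.exceptions import PanicException


def collect_with_fallback(lf: pl.LazyFrame) -> pl.DataFrame:
    """Try the streaming engine first; fall back to the in-memory engine on a panic."""
    try:
        return lf.collect(engine="streaming")
    except PanicException:
        return lf.collect(engine="in-memory")


print(collect_with_fallback(pl.LazyFrame({"a": [1, 2, 3]})).shape)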
test_utils/s3/commands.py ADDED
@@ -0,0 +1,46 @@
+import logging
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger("postgres_commands")
+
+
+def start_minio():
+    """Start MinIO container for S3 testing"""
+    from . import fixtures
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot start PostgreSQL container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0  # Return success to allow pipeline to continue
+
+
+    if fixtures.start_minio_container():
+        print(f"MinIO started at http://localhost:{fixtures.MINIO_PORT}")
+        print(f"Access Key: {fixtures.MINIO_ACCESS_KEY}")
+        return 0
+    return 1
+
+
+def stop_minio():
+    """Stop MinIO container"""
+    from . import fixtures
+
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot stop MinIO container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0
+
+    if fixtures.stop_minio_container():
+        print("MinIO stopped successfully")
+        return 0
+    return 1
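start_minio and stop_minio degrade gracefully when Docker is missing, so they can bracket a test session. A hypothetical pytest wiring (the fixture name and skip message are illustrative; only is_docker_available, MINIO_PORT, and the start/stop helpers are taken from this diff):

import pytest

from test_utils.s3 import commands, fixtures


@pytest.fixture(scope="session")
def minio_server():
    """Start MinIO once for the test session and tear it down afterwards."""
    if not fixtures.is_docker_available():
        pytest.skip("Docker is not available; skipping MinIO-backed tests")
    assert commands.start_minio() == 0
    yield f"http://localhost:{fixtures.MINIO_PORT}"
    commands.stop_minio()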
test_utils/s3/data_generator.py ADDED
@@ -0,0 +1,292 @@
+
+import logging
+import io
+import os
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from deltalake import write_deltalake
+from pyiceberg.catalog import load_catalog
+
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+
+def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single CSV file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file CSV...")
+    csv_buffer = io.BytesIO()
+    df.write_csv(csv_buffer)
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-csv/data.csv',
+        Body=csv_buffer.getvalue()
+    )
+
+
+def _create_multi_file_csv(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple CSV files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} CSV files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        csv_buffer = io.BytesIO()
+        sub_df.write_csv(csv_buffer)
+        csv_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-csv/part_{i:02d}.csv',
+            Body=csv_buffer.getvalue()
+        )
+
+
+def _create_single_file_json(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single JSON file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file JSON...")
+    json_buffer = io.BytesIO()
+    df.write_ndjson(json_buffer)
+    json_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-json/data.json',
+        Body=json_buffer.getvalue()
+    )
+
+
+def _create_multi_file_json(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple JSON files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} JSON files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        json_buffer = io.BytesIO()
+        sub_df.write_ndjson(json_buffer)
+        json_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-json/part_{i:02d}.json',
+            Body=json_buffer.getvalue()
+        )
+
+
+def _create_single_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single Parquet file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file Parquet...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-parquet/data.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+
+
+def _create_multi_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple Parquet files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} Parquet files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        parquet_buffer = io.BytesIO()
+        sub_df.write_parquet(parquet_buffer)
+        parquet_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-parquet/part_{i:02d}.parquet',
+            Body=parquet_buffer.getvalue()
+        )
+
+
+def _create_delta_lake_table(arrow_table: pa.Table, bucket_name: str, storage_options: dict):
+    """Creates a Delta Lake table from a PyArrow table in S3."""
+    logger.info("Writing Delta Lake table...")
+    delta_table_path = f"s3://{bucket_name}/delta-lake-table"
+    write_deltalake(
+        delta_table_path,
+        arrow_table,
+        mode='overwrite',
+        storage_options=storage_options
+    )
+
+
+def _create_iceberg_table(df: pl.DataFrame, bucket_name: str, endpoint_url: str, access_key: str, secret_key: str,
+                          s3_client):
+    """Creates an Apache Iceberg table and FORCES sane metadata pointers."""
+    logger.info("Writing Apache Iceberg table with SANE metadata access...")
+    # Configure the catalog properties for S3 access
+    catalog_props = {
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3.endpoint": endpoint_url,
+        "s3.access-key-id": access_key,
+        "s3.secret-access-key": secret_key,
+    }
+    # Use the SQL catalog with an in-memory SQLite database for storing metadata pointers
+    catalog = load_catalog(
+        "default",
+        **{
+            "type": "sql",
+            "uri": "sqlite:///:memory:",  # Use an in-memory SQL DB for the catalog
+            "warehouse": f"s3a://{bucket_name}/iceberg_warehouse",
+            **catalog_props,
+        }
+    )
+    table_identifier = ("default_db", "iceberg_table")
+    # Create a namespace (like a schema or database) for the table
+    try:
+        catalog.drop_namespace("default_db")
+    except Exception:
+        pass  # Ignore if namespace doesn't exist
+    catalog.create_namespace("default_db")
+    try:
+        catalog.load_table(table_identifier)
+        catalog.drop_table(table_identifier)
+    except:
+        pass
+
+    # Create the table schema and object first
+    schema = df.to_arrow().schema
+    table = catalog.create_table(identifier=table_identifier, schema=schema)
+
+    # Use the simplified write_iceberg method from Polars
+    df.write_iceberg(table, mode='overwrite')
+
+    # NOW CREATE WHAT SHOULD EXIST BY DEFAULT - SANE METADATA POINTERS
+    # Get the current metadata location from the table
+    current_metadata = table.metadata_location
+    logger.info(f"Original metadata location: {current_metadata}")
+
+    # Extract just the path part
+    if current_metadata.startswith("s3a://"):
+        current_metadata_key = current_metadata.replace(f"s3a://{bucket_name}/", "")
+    else:
+        current_metadata_key = current_metadata.replace(f"s3://{bucket_name}/", "")
+
+    # Read the current metadata
+    response = s3_client.get_object(Bucket=bucket_name, Key=current_metadata_key)
+    metadata_content = response['Body'].read()
+
+    # Get the metadata directory
+    metadata_dir = "/".join(current_metadata_key.split("/")[:-1])
+
+    # Write it to standardized locations
+    # 1. metadata.json in the metadata folder (this is what pl.scan_iceberg expects)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/metadata.json",
+        Body=metadata_content
+    )
+    logger.info(f"Created stable metadata.json at: s3://{bucket_name}/{metadata_dir}/metadata.json")
+
+    # 2. current.json as an additional pointer
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/current.json",
+        Body=metadata_content
+    )
+
+    # 3. VERSION file that contains the current metadata filename
+    current_metadata_filename = current_metadata_key.split("/")[-1]
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/VERSION",
+        Body=current_metadata_filename.encode()
+    )
+
+    # 4. version-hint.text (some Iceberg readers look for this)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/version-hint.text",
+        Body=current_metadata_filename.encode()
+    )
+
+    table_base = "iceberg_warehouse/default_db.db/my_iceberg_table"
+    logger.info(f"""
+    ✅ Iceberg table created with SANE access patterns:
+    - Versioned metadata: s3://{bucket_name}/{current_metadata_key}
+    - Latest metadata: s3://{bucket_name}/{table_base}/metadata/metadata.json
+    - Current pointer: s3://{bucket_name}/{table_base}/metadata/current.json
+    - Version hint: s3://{bucket_name}/{table_base}/metadata/version-hint.text
+
+    Read with: pl.scan_iceberg('s3://{bucket_name}/{table_base}/metadata/metadata.json').collect()
+    """)
+
+
+def populate_test_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with a variety of large-scale test data formats.
+
+    Args:
+        endpoint_url (str): The S3 endpoint URL for the MinIO instance.
+        access_key (str): The access key for MinIO.
+        secret_key (str): The secret key for MinIO.
+        bucket_name (str): The name of the bucket to populate.
+    """
+    logger.info("🚀 Starting data population...")
+    # --- S3 Client and Storage Options ---
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+    storage_options = {
+        "AWS_ENDPOINT_URL": endpoint_url,
+        "AWS_ACCESS_KEY_ID": access_key,
+        "AWS_SECRET_ACCESS_KEY": secret_key,
+        "AWS_REGION": "us-east-1",
+        "AWS_ALLOW_HTTP": "true",
+        "AWS_S3_ALLOW_UNSAFE_RENAME": "true"
+    }
+
+    # --- Data Generation ---
+    data_size = 100_000
+    df = pl.DataFrame({
+        "id": range(1, data_size + 1),
+        "name": [f"user_{i}" for i in range(1, data_size + 1)],
+        "value": [i * 10.5 for i in range(1, data_size + 1)],
+        "category": ["A", "B", "C", "D", "E"] * (data_size // 5)
+    })
+    logger.info(f"Generated a Polars DataFrame with {data_size} rows.")
+    #
+    # # --- Execute Data Population Scenarios ---
+    _create_single_csv_file(s3_client, df, bucket_name)
+    _create_multi_file_csv(s3_client, df, bucket_name)
+    _create_single_file_json(s3_client, df, bucket_name)
+    _create_multi_file_json(s3_client, df, bucket_name)
+    _create_single_parquet_file(s3_client, df, bucket_name)
+    _create_multi_parquet_file(s3_client, df, bucket_name)
+
+    # Convert to PyArrow table once for Delta and Iceberg
+    arrow_table = df.to_arrow()
+
+    _create_delta_lake_table(arrow_table, bucket_name, storage_options)
+    _create_iceberg_table(df, bucket_name, endpoint_url, access_key, secret_key, s3_client)
+
+    logger.info("✅ All test data populated successfully.")
+
+
+if __name__ == '__main__':
+    populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
+                       access_key=MINIO_ACCESS_KEY,
+                       secret_key=MINIO_SECRET_KEY,
+                       bucket_name="test-bucket")
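A hedged sketch of driving the generator end to end against a locally running MinIO: the bucket is created first because populate_test_data assumes it already exists, and the upload is then spot-checked with boto3 (the prefixes come from the helpers above; everything else uses the module's own defaults):

import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

from test_utils.s3 import data_generator as gen

BUCKET = "test-bucket"

# Create the target bucket first; populate_test_data does not create it.
s3 = boto3.client(
    "s3",
    endpoint_url=gen.MINIO_ENDPOINT_URL,
    aws_access_key_id=gen.MINIO_ACCESS_KEY,
    aws_secret_access_key=gen.MINIO_SECRET_KEY,
    config=Config(signature_version="s3v4"),
    region_name="us-east-1",
)
try:
    s3.create_bucket(Bucket=BUCKET)
except ClientError:
    pass  # bucket already exists

gen.populate_test_data(
    endpoint_url=gen.MINIO_ENDPOINT_URL,
    access_key=gen.MINIO_ACCESS_KEY,
    secret_key=gen.MINIO_SECRET_KEY,
    bucket_name=BUCKET,
)

# Spot-check a few of the uploaded prefixes.
for prefix in ("single-file-parquet/", "multi-file-csv/", "delta-lake-table/"):
    listing = s3.list_objects_v2(Bucket=BUCKET, Prefix=prefix)
    print(prefix, listing.get("KeyCount", 0), "objects")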