Flowfile 0.3.1.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (100)
  1. flowfile/__init__.py +2 -1
  2. flowfile/api.py +5 -3
  3. flowfile/web/__init__.py +3 -0
  4. flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
  5. flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
  6. flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
  7. flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
  8. flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
  9. flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
  10. flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
  11. flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
  12. flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
  13. flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
  14. flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
  15. flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
  18. flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
  19. flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
  20. flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
  22. flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
  23. flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
  24. flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
  26. flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
  27. flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
  28. flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
  29. flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
  31. flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
  33. flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
  34. flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
  35. flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
  37. flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
  38. flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
  39. flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
  40. flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
  41. flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
  42. flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
  43. flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
  44. flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
  45. flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
  49. flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -3
  52. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +97 -88
  53. flowfile_core/configs/__init__.py +15 -4
  54. flowfile_core/configs/node_store/nodes.py +2 -4
  55. flowfile_core/configs/settings.py +5 -3
  56. flowfile_core/configs/utils.py +18 -0
  57. flowfile_core/flowfile/FlowfileFlow.py +84 -29
  58. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +55 -18
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  61. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  62. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +34 -2
  63. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  64. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  65. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  66. flowfile_core/flowfile/flow_graph_utils.py +320 -0
  67. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  68. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  69. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
  70. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  71. flowfile_core/flowfile/utils.py +34 -3
  72. flowfile_core/main.py +2 -3
  73. flowfile_core/routes/secrets.py +1 -1
  74. flowfile_core/schemas/input_schema.py +12 -14
  75. flowfile_core/schemas/transform_schema.py +25 -47
  76. flowfile_frame/__init__.py +11 -4
  77. flowfile_frame/adding_expr.py +280 -0
  78. flowfile_frame/config.py +9 -0
  79. flowfile_frame/expr.py +301 -83
  80. flowfile_frame/expr.pyi +2174 -0
  81. flowfile_frame/expr_name.py +258 -0
  82. flowfile_frame/flow_frame.py +616 -627
  83. flowfile_frame/flow_frame.pyi +336 -0
  84. flowfile_frame/flow_frame_methods.py +617 -0
  85. flowfile_frame/group_frame.py +89 -42
  86. flowfile_frame/join.py +1 -2
  87. flowfile_frame/lazy.py +704 -0
  88. flowfile_frame/lazy_methods.py +201 -0
  89. flowfile_frame/list_name_space.py +324 -0
  90. flowfile_frame/selectors.py +3 -0
  91. flowfile_frame/series.py +70 -0
  92. flowfile_frame/utils.py +80 -4
  93. flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
  94. flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
  95. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
  96. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
  97. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
  98. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
  99. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  100. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0

flowfile_core/configs/settings.py
@@ -5,10 +5,10 @@ import os
 import tempfile
 import argparse
 
-from databases import DatabaseURL
 from passlib.context import CryptContext
 from starlette.config import Config
-from starlette.datastructures import Secret
+
+from flowfile_core.configs.utils import MutableBool
 
 
 # Constants for server and worker configuration
@@ -18,6 +18,9 @@ DEFAULT_WORKER_PORT = 63579
 SINGLE_FILE_MODE: bool = os.environ.get("SINGLE_FILE_MODE", "0") == "1"
 
 
+OFFLOAD_TO_WORKER = MutableBool(True)
+
+
 def parse_args():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="Flowfile Backend Server")
@@ -79,7 +82,6 @@ args = parse_args()
 SERVER_HOST = args.host if args.host is not None else DEFAULT_SERVER_HOST
 SERVER_PORT = args.port if args.port is not None else DEFAULT_SERVER_PORT
 WORKER_PORT = args.worker_port if args.worker_port is not None else int(os.getenv("WORKER_PORT", DEFAULT_WORKER_PORT))
-# Worker configuration
 WORKER_HOST = os.getenv("WORKER_HOST", "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1")
 
 config = Config(".env")

flowfile_core/configs/utils.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class MutableBool:
+    value: bool
+
+    def __bool__(self) -> bool:
+        """Allow direct boolean evaluation"""
+        return self.value
+
+    def __eq__(self, other) -> bool:
+        """Allow equality comparison with booleans"""
+        if isinstance(other, bool):
+            return self.value == other
+        elif isinstance(other, MutableBool):
+            return self.value == other.value
+        return NotImplemented
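
The wrapper exists because a plain module-level bool cannot be toggled at runtime for other modules: "from settings import FLAG" binds the current value, while importing a MutableBool object lets every importer observe in-place changes. A minimal sketch of the intended usage (the toggling call site below is illustrative, not code from the package):

    from dataclasses import dataclass

    @dataclass
    class MutableBool:
        value: bool

        def __bool__(self) -> bool:
            return self.value

    # settings.py defines the shared flag once:
    OFFLOAD_TO_WORKER = MutableBool(True)

    # another module holds a reference to the same object,
    # so an in-place mutation is visible everywhere:
    flag = OFFLOAD_TO_WORKER
    OFFLOAD_TO_WORKER.value = False
    assert not flag  # __bool__ reflects the in-place change
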
flowfile_core/flowfile/FlowfileFlow.py
@@ -2,6 +2,8 @@ import datetime
 import pickle
 import polars as pl
 import fastexcel
+import copy
+
 from fastapi.exceptions import HTTPException
 from time import time
 from functools import partial
@@ -13,7 +15,7 @@ from flowfile_core.configs import logger
 from flowfile_core.configs.flow_logger import FlowLogger
 from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
 from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import type_to_polars_str, FlowfileColumn
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
                                                                                        pre_calculate_pivot_schema)
 from flowfile_core.utils.arrow_reader import get_read_top_n
@@ -23,7 +25,7 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
 from flowfile_core.flowfile.sources import external_sources
 from flowfile_core.schemas import input_schema, schemas, transform_schema
 from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
-from flowfile_core.flowfile.utils import snake_case_to_camel_case
+from flowfile_core.flowfile.utils import snake_case_to_camel_case, _handle_raw_data
 from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
 from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
@@ -32,7 +34,7 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
     ExternalDatabaseFetcher,
     ExternalDatabaseWriter,
     ExternalDfFetcher)
-from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
+from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
 from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
 from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
 from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
@@ -203,28 +205,20 @@ class FlowGraph:
         sample_size: int = 10000
 
         def analysis_preparation(flowfile_table: FlowDataEngine):
-
-            if flowfile_table.number_of_records<0:
-
-                number_of_records = ExternalDfFetcher(
-                    lf=flowfile_table.data_frame,
-                    operation_type="calculate_number_of_records",
-                    flow_id=self.flow_id,
-                    node_id=node.node_id,
-                ).result
+            if flowfile_table.number_of_records <= 0:
+                number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
             else:
                 number_of_records = flowfile_table.number_of_records
             if number_of_records > sample_size:
                 flowfile_table = flowfile_table.get_sample(sample_size, random=True)
-
             external_sampler = ExternalDfFetcher(
                 lf=flowfile_table.data_frame,
-                file_ref=node.hash,
+                file_ref="__gf_walker"+node.hash,
                 wait_on_completion=True,
                 node_id=node.node_id,
                 flow_id=self.flow_id,
             )
-            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref, 10000)
+            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
             return flowfile_table
 
         def schema_callback():
@@ -439,11 +433,11 @@ class FlowGraph:
 
     def add_formula(self, function_settings: input_schema.NodeFormula):
         error = ""
-        if function_settings.function.field.data_type is not None:
-            output_type = type_to_polars_str(function_settings.function.field.data_type)
+        if function_settings.function.field.data_type not in (None, "Auto"):
+            output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
         else:
            output_type = None
-        if output_type is not None:
+        if output_type not in (None, "Auto"):
            new_col = [FlowfileColumn.from_input(column_name=function_settings.function.field.name,
                                                 data_type=str(output_type))]
        else:
@@ -485,7 +479,8 @@ class FlowGraph:
                       function=_func,
                       input_columns=[],
                       node_type='cross_join',
-                      setting_input=cross_join_settings)
+                      setting_input=cross_join_settings,
+                      input_node_ids=cross_join_settings.depending_on_ids)
         return self
 
     def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
@@ -587,6 +582,8 @@ class FlowGraph:
         input_cols = set(f.name for f in table.schema)
         ids_to_remove = []
         for i, select_col in enumerate(select_cols):
+            if select_col.data_type is None:
+                select_col.data_type = table.get_schema_column(select_col.old_name).data_type
             if select_col.old_name not in input_cols:
                 select_col.is_available = False
             if not select_col.keep:
900
897
  if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
901
898
  logger.info('Using provided schema in the node')
902
899
 
903
- def add_google_sheet(self, external_source_input: input_schema.NodeExternalSource):
904
- logger.info('Adding google sheet reader')
905
- self.add_external_source(external_source_input)
906
900
 
907
901
  def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
908
902
  logger.info('Adding sql source')
@@ -1044,11 +1038,10 @@
         return self
 
     def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
-
         if isinstance(input_file, input_schema.NodeManualInput):
-            input_data = FlowDataEngine(input_file.raw_data)
+            _handle_raw_data(input_file)
+            input_data = FlowDataEngine(input_file.raw_data_format)
             ref = 'manual_input'
-
         else:
             input_data = FlowDataEngine(path_ref=input_file.file_ref)
             ref = 'datasource'
@@ -1061,7 +1054,9 @@
 
         if not input_file.node_id in set(start_node.node_id for start_node in self._flow_starts):
             self._flow_starts.append(node)
+
         else:
+            input_data.collect()
             node = FlowNode(input_file.node_id, function=input_data,
                             setting_input=input_file,
                             name=ref, node_type=ref, parent_uuid=self.uuid)
@@ -1083,7 +1078,7 @@
         self._output_cols += cols_available
 
     @property
-    def input_data_columns(self) -> List[str]:
+    def input_data_columns(self) -> List[str] | None:
         if self._input_cols:
             return list(set([col for col in self._input_cols if
                              col in [table_col.name for table_col in self._input_data.schema]]))
@@ -1102,7 +1097,7 @@
         return implicit_starting_nodes
 
     @execution_mode.setter
-    def execution_mode(self, mode: str):
+    def execution_mode(self, mode: schemas.ExecutionModeLiteral):
         self.flow_settings.execution_mode = mode
 
     @property
@@ -1158,13 +1153,13 @@
                     continue
                 node_result.success = node.results.errors is None
                 node_result.end_timestamp = time()
-                node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
+                node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
                 node_result.is_running = False
             except Exception as e:
                 node_result.error = 'Node did not run'
                 node_result.success = False
                 node_result.end_timestamp = time()
-                node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
+                node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
                 node_result.is_running = False
                 node_logger.error(f'Error in node {node.node_id}: {e}')
             if not node_result.success:
@@ -1352,6 +1347,66 @@ class FlowGraph:
         getattr(self, f"add_{node_type}")(combined_settings)
 
 
+def combine_flow_graphs(*flow_graphs: FlowGraph) -> FlowGraph:
+    """
+    Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
+
+    Args:
+        *flow_graphs: Multiple FlowGraph instances to combine
+
+    Returns:
+        A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
+
+    Raises:
+        ValueError: If any flow_ids overlap
+    """
+    # Validate flow IDs are unique
+    _validate_unique_flow_ids(flow_graphs)
+
+    # Create ID mapping for all nodes
+    node_id_mapping = _create_node_id_mapping(flow_graphs)
+
+    # Remap and combine nodes
+    all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
+
+    # Create a new combined flow graph
+    combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
+    # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
+
+
+def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
+    """Ensure all flow graphs have unique flow_ids."""
+    all_flow_ids = [fg.flow_id for fg in flow_graphs]
+    if len(all_flow_ids) != len(set(all_flow_ids)):
+        raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
+
+
+def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
+    """Create a mapping from original node IDs to new unique node IDs."""
+    node_id_mapping: Dict[int, Dict[int, int]] = {}
+    next_node_id = 0
+
+    for fg in flow_graphs:
+        node_id_mapping[fg.flow_id] = {}
+        for node in fg.nodes:
+            node_id_mapping[fg.flow_id][node.node_id] = next_node_id
+            next_node_id += 1
+
+    return node_id_mapping
+
+
+def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
+                 node_id_mapping: Dict[int, Dict[int, int]]) -> List:
+    """Create new nodes with remapped IDs."""
+    all_nodes = []
+    for fg in flow_graphs:
+        for node in fg.nodes:
+            new_node = copy.deepcopy(node)
+            new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
+            all_nodes.append(new_node)
+    return all_nodes
+
+
 def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
     """Combine excopy_nodeisting settings with new settings from a NodePromise."""
     copied_setting_input = deepcopy(setting_input)
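
The new helpers make node IDs unique across graphs by walking every graph in order and handing out fresh sequential IDs (note that combine_flow_graphs itself still ships with its return statement commented out in this release). The remapping scheme in isolation, with plain tuples standing in for FlowGraph objects:

    from typing import Dict, List, Tuple

    # Each "graph" is (flow_id, [node_ids]); mirrors _create_node_id_mapping above.
    graphs: List[Tuple[int, List[int]]] = [(1, [1, 2, 3]), (2, [1, 2])]

    mapping: Dict[int, Dict[int, int]] = {}
    next_id = 0
    for flow_id, node_ids in graphs:
        mapping[flow_id] = {}
        for node_id in node_ids:
            mapping[flow_id][node_id] = next_id  # per-graph id -> new global id
            next_id += 1

    # Node 1 of graph 1 and node 1 of graph 2 no longer collide:
    assert mapping[1][1] == 0 and mapping[2][1] == 3
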

flowfile_core/flowfile/database_connection_manager/db_connections.py
@@ -1,7 +1,7 @@
 from flowfile_core.schemas.input_schema import FullDatabaseConnection, FullDatabaseConnectionInterface
 from sqlalchemy.orm import Session
 from flowfile_core.database.models import DatabaseConnection as DBConnectionModel, Secret
-from flowfile_core.secrets.secrets import store_secret, SecretInput, decrypt_secret
+from flowfile_core.secret_manager.secret_manager import store_secret, SecretInput, decrypt_secret
 from flowfile_core.database.connection import get_db_context
 
 

flowfile_core/flowfile/flow_data_engine/flow_data_engine.py
@@ -17,6 +17,7 @@ from pyarrow.parquet import ParquetFile
 # Local imports - Core
 from flowfile_core.configs import logger
 from flowfile_core.configs.flow_logger import NodeLogger
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.schemas import (
     input_schema,
     transform_schema as transform_schemas
@@ -29,7 +30,7 @@ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
     convert_stats_to_column_info
 )
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
     verify_join_select_integrity,
@@ -109,7 +110,7 @@ class FlowDataEngine:
     # flow_id: int = None # TODO: Implement flow_id
 
     def __init__(self,
-                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame] = None,
+                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
                  path_ref: str = None,
                  name: str = None,
                  optimize_memory: bool = True,
@@ -147,7 +148,10 @@
 
     def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
         """Process different types of input data."""
-        if isinstance(raw_data, pl.DataFrame):
+
+        if isinstance(raw_data, input_schema.RawData):
+            self._handle_raw_data_format(raw_data)
+        elif isinstance(raw_data, pl.DataFrame):
             self._handle_polars_dataframe(raw_data, number_of_records)
         elif isinstance(raw_data, pl.LazyFrame):
             self._handle_polars_lazy_frame(raw_data, number_of_records, optimize_memory)
@@ -190,6 +194,20 @@
         self.number_of_records = 1
         self.data_frame = pl.DataFrame([data])
 
+    def _handle_raw_data_format(self, raw_data: input_schema.RawData):
+        """Create a FlowDataEngine from a RawData object."""
+        flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
+        polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+                                   for flowfile_column in flowfile_schema])
+        try:
+            df = pl.DataFrame(raw_data.data, polars_schema)
+        except TypeError as e:
+            logger.warning(f"Could not parse the data with the schema:\n{e}")
+            df = pl.DataFrame(raw_data.data)
+        self.number_of_records = len(df)
+        self.data_frame = df.lazy()
+        self.lazy = True
+
     def _handle_list_input(self, data: List):
         """Handle list input."""
         number_of_records = len(data)
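
_handle_raw_data_format tries a schema-first construction and falls back to Polars' own type inference when the payload does not match the declared schema. The same pattern in isolation (plain Polars, no Flowfile types):

    import polars as pl

    data = {"id": [1, 2], "name": ["a", "b"]}
    schema = pl.Schema([("id", pl.Int64), ("name", pl.String)])

    try:
        df = pl.DataFrame(data, schema)  # strict: honour the declared schema
    except TypeError:
        df = pl.DataFrame(data)          # fallback: let Polars infer the types

    lf = df.lazy()                       # downstream code keeps working lazily
    print(df.schema)
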
@@ -462,6 +480,9 @@
             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
         return self.data_frame.to_dicts()
 
+    def to_dict(self) -> Dict[str, List]:
+        return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+
     @classmethod
     def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
         """Create a FlowDataEngine from an external data source."""
@@ -484,7 +505,7 @@
         """Create a FlowDataEngine from a schema definition."""
         pl_schema = []
         for i, flow_file_column in enumerate(schema):
-            pl_schema.append((flow_file_column.name, type_to_polars(flow_file_column.data_type)))
+            pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
             schema[i].col_index = i
         df = pl.LazyFrame(schema=pl_schema)
         return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
@@ -824,7 +845,7 @@
         Returns:
             FlowDataEngine: New instance with sampled data
         """
-        n_records = min(n_rows, self.number_of_records)
+        n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
         logging.info(f'Getting sample of {n_rows} rows')
 
         if random:
@@ -1158,14 +1179,25 @@
         self.number_of_records = 0
         self._lazy = True
 
-    def get_number_of_records(self, warn: bool = False, force_calculate: bool = False) -> int:
+    def _calculate_number_of_records_in_worker(self) -> int:
+        number_of_records = ExternalDfFetcher(
+            lf=self.data_frame,
+            operation_type="calculate_number_of_records",
+            flow_id=-1,
+            node_id=-1,
+            wait_on_completion=True
+        ).result
+        return number_of_records
+
+    def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
+                              calculate_in_worker_process: bool = False) -> int:
         """
         Get the total number of records in the DataFrame.
 
         Args:
             warn: Whether to warn about expensive operations
             force_calculate: Whether to force recalculation
-
+            calculate_in_worker_process: Whether to offload compute to the worker process
         Returns:
             int: Number of records
 
@@ -1174,22 +1206,24 @@
         """
         if self.is_future and not self.is_collected:
             return -1
-
+        calculate_in_worker_process = False if not OFFLOAD_TO_WORKER.value else calculate_in_worker_process
         if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
             if self._number_of_records_callback is not None:
                 self._number_of_records_callback(self)
 
             if self.lazy:
-                if warn:
-                    logger.warning('Calculating the number of records this can be expensive on a lazy frame')
-                try:
-                    self.number_of_records = self.data_frame.select(pl.len()).collect(
-                        engine="streaming" if self._streamable else "auto")[0, 0]
-                except Exception:
-                    raise Exception('Could not get number of records')
+                if calculate_in_worker_process:
+                    self.number_of_records = self._calculate_number_of_records_in_worker()
+                else:
+                    if warn:
+                        logger.warning('Calculating the number of records this can be expensive on a lazy frame')
+                    try:
+                        self.number_of_records = self.data_frame.select(pl.len()).collect(
+                            engine="streaming" if self._streamable else "auto")[0, 0]
+                    except Exception:
+                        raise ValueError('Could not get number of records')
             else:
                 self.number_of_records = self.data_frame.__len__()
-
         return self.number_of_records
 
     # Properties
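
Row counting on a lazy frame can now be shipped to the worker process, with OFFLOAD_TO_WORKER acting as a global veto over the per-call argument. A rough, self-contained equivalent of the gating logic (the worker round trip is replaced here by a local count):

    import polars as pl

    OFFLOAD_TO_WORKER = True  # stands in for the MutableBool from settings


    def count_in_worker(lf: pl.LazyFrame) -> int:
        # Placeholder for the ExternalDfFetcher "calculate_number_of_records" job.
        return lf.select(pl.len()).collect()[0, 0]


    def get_number_of_records(lf: pl.LazyFrame, calculate_in_worker_process: bool = False) -> int:
        # The global flag can veto offloading even when the caller requests it.
        calculate_in_worker_process = calculate_in_worker_process and OFFLOAD_TO_WORKER
        if calculate_in_worker_process:
            return count_in_worker(lf)
        return lf.select(pl.len()).collect()[0, 0]


    print(get_number_of_records(pl.LazyFrame({"a": [1, 2, 3]}), calculate_in_worker_process=True))  # 3
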
@@ -1345,7 +1379,7 @@
             FlowDataEngine: New instance with added column
         """
         expr = to_expr(func)
-        if output_data_type is not None:
+        if output_data_type not in (None, "Auto"):
             df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
         else:
             df = self.data_frame.with_columns(expr.alias(col_name))
@@ -1518,4 +1552,7 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
         kwargs = {'input_df': flowfile_tables[0].data_frame}
     else:
         kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
-    return FlowDataEngine(polars_executable(**kwargs))
+    df = polars_executable(**kwargs)
+    if isinstance(df, pl.DataFrame):
+        logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
+    return FlowDataEngine(df)
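
The new warning fires when user code materializes the frame instead of returning a lazy one. The two shapes of user code the executor can receive, side by side (input_df is the name the parser injects):

    import polars as pl

    input_df = pl.LazyFrame({"a": [1, 2, 3]})

    # Lazy: returns a LazyFrame, no warning, execution stays deferred.
    good = input_df.filter(pl.col("a") > 1)

    # Eager: .collect() yields a DataFrame and would trigger the warning above.
    bad = input_df.filter(pl.col("a") > 1).collect()

    print(type(good).__name__, type(bad).__name__)  # LazyFrame DataFrame
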

flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Optional, Any, List, Dict, Literal
 from flowfile_core.schemas import input_schema
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
 from polars import datatypes
 import polars as pl
@@ -9,6 +9,37 @@ import polars as pl
 DataTypeGroup = Literal['numeric', 'str', 'date']
 
 
+def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
+    if isinstance(pl_type, pl.List):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.List({inner_str})"
+    elif isinstance(pl_type, pl.Array):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.Array({inner_str})"
+    elif isinstance(pl_type, pl.Decimal):
+        precision = pl_type.precision if hasattr(pl_type, 'precision') else None
+        scale = pl_type.scale if hasattr(pl_type, 'scale') else None
+        if precision is not None and scale is not None:
+            return f"pl.Decimal({precision}, {scale})"
+        elif precision is not None:
+            return f"pl.Decimal({precision})"
+        else:
+            return "pl.Decimal()"
+    elif isinstance(pl_type, pl.Struct):
+        # Handle Struct with field definitions
+        fields = []
+        if hasattr(pl_type, 'fields'):
+            for field in pl_type.fields:
+                field_name = field.name
+                field_type = convert_pl_type_to_string(field.dtype, inner=True)
+                fields.append(f'pl.Field("{field_name}", {field_type})')
+        field_str = ", ".join(fields)
+        return f"pl.Struct([{field_str}])"
+    else:
+        # For base types, we want the full pl.TypeName format
+        return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
+
+
 @dataclass
 class FlowfileColumn:
     column_name: str
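
Previously FlowfileColumn stored only str(dtype.base_type()), which collapses pl.List(pl.Int64) to just "List" and drops the inner type; the serializer above keeps nested types round-trippable. A quick check of the difference (the commented lines assume the function above):

    import polars as pl

    print(str(pl.List(pl.Int64).base_type()))        # "List" - inner type lost
    # convert_pl_type_to_string(pl.List(pl.Int64))   # "pl.List(pl.Int64)"
    # convert_pl_type_to_string(pl.Decimal(38, 2))   # "pl.Decimal(38, 2)"
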
@@ -28,7 +59,7 @@ class FlowfileColumn:
     __perc_unique: Optional[float]
 
     def __init__(self, polars_type: PlType):
-        self.data_type = str(polars_type.pl_datatype.base_type())
+        self.data_type = convert_pl_type_to_string(polars_type.pl_datatype)
         self.size = polars_type.count - polars_type.null_count
         self.max_value = polars_type.max
         self.min_value = polars_type.min
@@ -53,7 +84,7 @@
 
     @classmethod
     def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
-        pl_type = type_to_polars_str(data_type)
+        pl_type = cast_str_to_polars_type(data_type)
         if pl_type is not None:
             data_type = pl_type
         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
@@ -129,12 +160,9 @@
             return 'date'
 
     def get_polars_type(self) -> PlType:
-        if hasattr(datatypes, self.data_type):
-            pl_datatype = getattr(datatypes, self.data_type)
-        else:
-            pl_datatype = None
-
-        return PlType(pl_datatype=pl_datatype, **self.__dict__)
+        pl_datatype = cast_str_to_polars_type(self.data_type)
+        pl_type = PlType(pl_datatype=pl_datatype, **self.__dict__)
+        return pl_type
 
     def update_type_from_polars_type(self, pl_type: PlType):
         self.data_type = str(pl_type.pl_datatype.base_type())
@@ -142,3 +170,8 @@
 
 def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
     return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
+
+
+def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
+    return [FlowfileColumn.create_from_polars_type(PlType(column_name=k, pl_datatype=v)).get_minimal_field_info()
+            for k, v in pl_schema.items()]

flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py
@@ -18,10 +18,45 @@ dtype_to_pl = {
     'time': pl.Time,
 }
 
+
+def safe_eval_pl_type(type_string: str):
+    """
+    Safely evaluate a Polars type string with restricted namespace.
+    Only allows Polars types and basic Python literals.
+    """
+    # Define allowed names in the evaluation namespace
+    safe_dict = {
+        # Polars module and types
+        'pl': pl,
+
+        # Basic Python built-ins for literals
+        'int': int,
+        'str': str,
+        'float': float,
+        'bool': bool,
+        'list': list,
+        'dict': dict,
+        'tuple': tuple,
+
+        # Disable dangerous built-ins
+        '__builtins__': {},
+    }
+
+    try:
+        return eval(type_string, safe_dict, {})
+    except Exception as e:
+        raise ValueError(f"Failed to safely evaluate type string '{type_string}': {e}")
+
+
 dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
 
 
-def type_to_polars(dtype: str):
+def get_polars_type(dtype: str):
+    if 'pl.' in dtype:
+        try:
+            return safe_eval_pl_type(dtype)
+        except Exception as e:
+            return pl.String
     pl_datetype = dtype_to_pl.get(dtype.lower())
     if pl_datetype is not None:
         return pl_datetype
@@ -31,6 +66,10 @@ def type_to_polars(dtype: str):
     return pl.String
 
 
-def type_to_polars_str(dtype: str) -> pl.DataType:
-    return type_to_polars(dtype)()
+def cast_str_to_polars_type(dtype: str) -> pl.DataType:
+    pl_type = get_polars_type(dtype)
+    if hasattr(pl_type, '__call__'):
+        return pl_type()
+    else:
+        return pl_type
 
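Type strings containing "pl." (the format convert_pl_type_to_string now emits) are evaluated in a namespace restricted to the polars module plus a few literal constructors, so nested types parse back while built-ins stay unreachable; anything else falls through to the plain name lookup. The restricted-eval idea, trimmed to a standalone sketch:

    import polars as pl


    def parse_pl_type(type_string: str) -> pl.DataType:
        # Only pl is visible and built-ins are disabled inside the eval.
        safe_dict = {"pl": pl, "__builtins__": {}}
        return eval(type_string, safe_dict, {})


    print(parse_pl_type("pl.List(pl.Int64)"))  # List(Int64)
    print(parse_pl_type("pl.Decimal(38, 2)"))  # Decimal(precision=38, scale=2)
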
flowfile_core/flowfile/flow_data_engine/polars_code_parser.py
@@ -3,6 +3,7 @@ from typing import Dict, Any, Callable
 import textwrap
 import ast
 import time
+from io import BytesIO
 
 
 def remove_comments_and_docstrings(source: str) -> str:
@@ -126,6 +127,37 @@ class PolarsCodeParser:
         'col': pl.col,
         'lit': pl.lit,
         'expr': pl.expr,
+
+        # Polars datatypes - added directly
+        'Int8': pl.Int8,
+        'Int16': pl.Int16,
+        'Int32': pl.Int32,
+        'Int64': pl.Int64,
+        'Int128': pl.Int128,
+        'UInt8': pl.UInt8,
+        'UInt16': pl.UInt16,
+        'UInt32': pl.UInt32,
+        'UInt64': pl.UInt64,
+        'Float32': pl.Float32,
+        'Float64': pl.Float64,
+        'Boolean': pl.Boolean,
+        'String': pl.String,
+        'Utf8': pl.Utf8,
+        'Binary': pl.Binary,
+        'Null': pl.Null,
+        'List': pl.List,
+        'Array': pl.Array,
+        'Struct': pl.Struct,
+        'Object': pl.Object,
+        'Date': pl.Date,
+        'Time': pl.Time,
+        'Datetime': pl.Datetime,
+        'Duration': pl.Duration,
+        'Categorical': pl.Categorical,
+        'Decimal': pl.Decimal,
+        'Enum': pl.Enum,
+        'Unknown': pl.Unknown,
+
         # Basic Python built-ins
         'print': print,
         'len': len,
@@ -142,7 +174,8 @@ class PolarsCodeParser:
         'True': True,
         'False': False,
         'None': None,
-        'time': time
+        'time': time,
+        'BytesIO': BytesIO
     }
 
     @staticmethod
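
With the dtype names injected, node scripts can write casts such as col('price').cast(Int64) without importing polars themselves. A simulation of how names resolve from such a namespace (the namespace and user_code below are illustrative, not the parser's actual wiring):

    import polars as pl

    # Simulated slice of the parser namespace: bare dtype names resolve directly.
    namespace = {
        "col": pl.col,
        "Int64": pl.Int64,
        "input_df": pl.LazyFrame({"price": ["1", "2"]}),
    }

    user_code = "output_df = input_df.with_columns(col('price').cast(Int64))"
    exec(user_code, namespace)
    print(namespace["output_df"].collect())  # price column is now Int64
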
@@ -225,7 +258,6 @@ class PolarsCodeParser:
 
         # Wrap the code in a function
         wrapped_code = self._wrap_in_function(code, num_inputs)
-
         try:
             # Create namespace for execution
             local_namespace: Dict[str, Any] = {}