Flowfile 0.3.2__py3-none-any.whl → 0.3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. flowfile/__init__.py +3 -2
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +584 -1002
  32. flowfile_frame/flow_frame.pyi +368 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0

flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py CHANGED
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Optional, Any, List, Dict, Literal
 from flowfile_core.schemas import input_schema
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
 from polars import datatypes
 import polars as pl
@@ -9,6 +9,37 @@ import polars as pl
 DataTypeGroup = Literal['numeric', 'str', 'date']
 
 
+def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
+    if isinstance(pl_type, pl.List):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.List({inner_str})"
+    elif isinstance(pl_type, pl.Array):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.Array({inner_str})"
+    elif isinstance(pl_type, pl.Decimal):
+        precision = pl_type.precision if hasattr(pl_type, 'precision') else None
+        scale = pl_type.scale if hasattr(pl_type, 'scale') else None
+        if precision is not None and scale is not None:
+            return f"pl.Decimal({precision}, {scale})"
+        elif precision is not None:
+            return f"pl.Decimal({precision})"
+        else:
+            return "pl.Decimal()"
+    elif isinstance(pl_type, pl.Struct):
+        # Handle Struct with field definitions
+        fields = []
+        if hasattr(pl_type, 'fields'):
+            for field in pl_type.fields:
+                field_name = field.name
+                field_type = convert_pl_type_to_string(field.dtype, inner=True)
+                fields.append(f'pl.Field("{field_name}", {field_type})')
+        field_str = ", ".join(fields)
+        return f"pl.Struct([{field_str}])"
+    else:
+        # For base types, we want the full pl.TypeName format
+        return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
+
+
 @dataclass
 class FlowfileColumn:
     column_name: str
@@ -28,7 +59,7 @@ class FlowfileColumn:
     __perc_unique: Optional[float]
 
     def __init__(self, polars_type: PlType):
-        self.data_type = str(polars_type.pl_datatype.base_type())
+        self.data_type = convert_pl_type_to_string(polars_type.pl_datatype)
         self.size = polars_type.count - polars_type.null_count
         self.max_value = polars_type.max
         self.min_value = polars_type.min
@@ -53,7 +84,7 @@ class FlowfileColumn:
 
     @classmethod
     def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
-        pl_type = type_to_polars_str(data_type)
+        pl_type = cast_str_to_polars_type(data_type)
         if pl_type is not None:
             data_type = pl_type
         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
@@ -129,12 +160,9 @@ class FlowfileColumn:
         return 'date'
 
     def get_polars_type(self) -> PlType:
-        if hasattr(datatypes, self.data_type):
-            pl_datatype = getattr(datatypes, self.data_type)
-        else:
-            pl_datatype = None
-
-        return PlType(pl_datatype=pl_datatype, **self.__dict__)
+        pl_datatype = cast_str_to_polars_type(self.data_type)
+        pl_type = PlType(pl_datatype=pl_datatype, **self.__dict__)
+        return pl_type
 
     def update_type_from_polars_type(self, pl_type: PlType):
         self.data_type = str(pl_type.pl_datatype.base_type())
@@ -142,3 +170,8 @@
 
 def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
     return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
+
+
+def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
+    return [FlowfileColumn.create_from_polars_type(PlType(column_name=k, pl_datatype=v)).get_minimal_field_info()
+            for k, v in pl_schema.items()]
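
Review note: replacing `str(pl_datatype.base_type())` with `convert_pl_type_to_string` preserves the inner types of nested dtypes, and `cast_str_to_polars_type` (see the flow_file_column/utils.py hunk below) can rebuild the dtype from that string. A minimal round-trip sketch, standalone and outside Flowfile, using only public Polars APIs:

    import polars as pl

    # A nested dtype the old str(base_type()) call would flatten to just "List".
    nested = pl.List(pl.Struct({"city": pl.String, "population": pl.Int64}))

    # The string form convert_pl_type_to_string is written to emit:
    type_str = 'pl.List(pl.Struct([pl.Field("city", pl.String), pl.Field("population", pl.Int64)]))'

    # Rebuilding is an eval against the polars module (the sandboxed version
    # of this eval is safe_eval_pl_type in the next file).
    rebuilt = eval(type_str, {"pl": pl}, {})
    assert rebuilt == nested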

flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py CHANGED
@@ -18,10 +18,45 @@ dtype_to_pl = {
     'time': pl.Time,
 }
 
+
+def safe_eval_pl_type(type_string: str):
+    """
+    Safely evaluate a Polars type string with restricted namespace.
+    Only allows Polars types and basic Python literals.
+    """
+    # Define allowed names in the evaluation namespace
+    safe_dict = {
+        # Polars module and types
+        'pl': pl,
+
+        # Basic Python built-ins for literals
+        'int': int,
+        'str': str,
+        'float': float,
+        'bool': bool,
+        'list': list,
+        'dict': dict,
+        'tuple': tuple,
+
+        # Disable dangerous built-ins
+        '__builtins__': {},
+    }
+
+    try:
+        return eval(type_string, safe_dict, {})
+    except Exception as e:
+        raise ValueError(f"Failed to safely evaluate type string '{type_string}': {e}")
+
+
 dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
 
 
-def type_to_polars(dtype: str):
+def get_polars_type(dtype: str):
+    if 'pl.' in dtype:
+        try:
+            return safe_eval_pl_type(dtype)
+        except Exception as e:
+            return pl.String
     pl_datetype = dtype_to_pl.get(dtype.lower())
     if pl_datetype is not None:
         return pl_datetype
@@ -31,6 +66,10 @@ def get_polars_type(dtype: str):
     return pl.String
 
 
-def type_to_polars_str(dtype: str) -> pl.DataType:
-    return type_to_polars(dtype)()
+def cast_str_to_polars_type(dtype: str) -> pl.DataType:
+    pl_type = get_polars_type(dtype)
+    if hasattr(pl_type, '__call__'):
+        return pl_type()
+    else:
+        return pl_type
 
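Review note: `safe_eval_pl_type` is `eval` with an allow-list namespace and `__builtins__` emptied out, so plain name lookups such as `__import__` or `open` fail. Attribute chains on the exposed `pl` module remain reachable, so this is a mitigation rather than a hard sandbox. An illustrative sketch, assuming the function is importable as defined above:

    import polars as pl
    from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import safe_eval_pl_type

    safe_eval_pl_type("pl.Decimal(38, 2)")  # -> Decimal(precision=38, scale=2)
    safe_eval_pl_type("pl.List(pl.Int64)")  # -> List(Int64)

    # Escape attempts via builtins do not resolve and surface as ValueError:
    try:
        safe_eval_pl_type("__import__('os').system('id')")
    except ValueError as exc:
        print(exc)  # Failed to safely evaluate type string ...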

flowfile_core/flowfile/flow_data_engine/polars_code_parser.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Any, Callable
 import textwrap
 import ast
 import time
+from io import BytesIO
 
 
 def remove_comments_and_docstrings(source: str) -> str:
@@ -174,6 +175,7 @@ class PolarsCodeParser:
         'False': False,
         'None': None,
         'time': time,
+        'BytesIO': BytesIO
     }
 
     @staticmethod
@@ -256,7 +258,6 @@ class PolarsCodeParser:
 
         # Wrap the code in a function
         wrapped_code = self._wrap_in_function(code, num_inputs)
-
        try:
             # Create namespace for execution
             local_namespace: Dict[str, Any] = {}
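
Review note: adding `BytesIO` to the parser's execution namespace lets user-supplied Polars code build in-memory buffers. A hypothetical snippet a Polars-code node could now run (the `input_df` name is an assumption for illustration, and `pl` is presumed to be exposed by the same namespace; neither is shown in this hunk):

    # Inside a user code block executed by PolarsCodeParser:
    buf = BytesIO()
    input_df.write_csv(buf)       # serialize the incoming DataFrame to memory
    buf.seek(0)
    output_df = pl.read_csv(buf)  # read it back without touching disk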

flowfile_core/flowfile/flow_data_engine/sample_data.py CHANGED
@@ -1,14 +1,27 @@
 from faker import Faker
 from functools import partial
+from math import ceil
 from random import randint
 import polars as pl
 from typing import List, Dict, Any, Generator
 
 
-def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
+def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
+    """
+
+    Args:
+        n_records (): Number of records to return
+        optimized (): Indicator if creation should be optimized, will result in more identical rows when True
+
+    Returns:
+        pl.DataFrame
+    """
     fake = Faker()
-    selector = partial(randint,0)
-    min_range = partial(min, n_records)
+    selector = partial(randint, 0)
+
+    max_n_records = min(10_000, n_records) if optimized else n_records
+
+    min_range = partial(min, max_n_records)
     # Pre-generation of static data
     cities = [fake.city() for _ in range(min_range(7000))]
     companies = [fake.company() for _ in range(min_range(100_000))]
@@ -19,7 +32,7 @@ def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
     first_names = [fake.first_name() for _ in range(min_range(100_000))]
     last_names = [fake.last_name() for _ in range(min_range(50_000))]
     domain_names = [fake.domain_name() for _ in range(10)]
-    sales_data = [fake.random_int(0, 1000) for _ in range(n_records)]
+    sales_data = [fake.random_int(0, 1000) for _ in range(max_n_records)]
 
     def generate_name():
         return f"{first_names[selector(min_range(100_000))-1]} {last_names[selector(min_range(50_000))-1]}"
@@ -32,9 +45,8 @@ def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
 
     def generate_phone_number():
         return fake.phone_number()
-
     data = []
-    for i in range(n_records):
+    for i in range(max_n_records):
         name = generate_name()
         data.append(dict(
             ID=randint(1, 1000000),
@@ -47,8 +59,14 @@ def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
             Work=companies[selector(min_range(100_000))-1],
             Zipcode=zipcodes[selector(min_range(200_000))-1],
             Country=countries[selector(min_range(50))-1],
-            sales_data=sales_data[selector(n_records)-1]
+            sales_data=sales_data[selector(max_n_records)-1]
         ))
+    if max_n_records < n_records:
+        n_duplicates: int = ceil(n_records / max_n_records)
+        output = []
+        for _ in range(n_duplicates):
+            output.extend(data)
+        data = output[:n_records]
 
     return pl.DataFrame(data)
 
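Review note: with the new `optimized=True` default, Faker generates at most `min(10_000, n_records)` distinct rows; larger requests are filled by tiling that base list and truncating, which is what "more identical rows" means in the docstring. The arithmetic, worked through on stand-in data:

    from math import ceil

    # e.g. requesting 25_000 optimized records:
    n_records, max_n_records = 25_000, 10_000
    n_duplicates = ceil(n_records / max_n_records)    # 3 copies of the base list
    rows = list(range(max_n_records)) * n_duplicates  # 30_000 stand-in rows
    rows = rows[:n_records]                           # truncated back to 25_000
    assert len(rows) == n_records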

flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py CHANGED
@@ -190,7 +190,7 @@ class BaseFetcher:
             logger.info('Already running the fetching')
             return
 
-        sleep_time = 1
+        sleep_time = .5
         self.running = True
         while not self.stop_event.is_set():
             try:
@@ -205,7 +205,8 @@ class BaseFetcher:
                     break
                 elif status.status == 'Unknown Error':
                     self._handle_error(-1,
-                                       'There was an unknown error with the process, and the process got killed by the server')
+                                       'There was an unknown error with the process, '
+                                       'and the process got killed by the server')
                     break
                 else:
                     self._handle_error(2, r.text)
@@ -284,7 +285,7 @@ class ExternalDfFetcher(BaseFetcher):
 
     def __init__(self, flow_id: int, node_id: int | str, lf: pl.LazyFrame | pl.DataFrame, file_ref: str = None,
                  wait_on_completion: bool = True,
-                 operation_type: OperationType = 'store'):
+                 operation_type: OperationType = 'store', offload_to_worker: bool = True):
         super().__init__(file_ref=file_ref)
         lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
         r = trigger_df_operation(lf=lf, file_ref=self.file_ref, operation_type=operation_type,

flowfile_core/flowfile/flow_data_engine/utils.py CHANGED
@@ -210,3 +210,4 @@ def match_order(l: List[str], ref: List[str]) -> List[str]:
         org_order = ref_order.get(v, float('inf'))
         order.append(org_order)
     return [v for _, v in sorted(zip(order, l))]
+

flowfile_core/flowfile/flow_node/flow_node.py CHANGED
@@ -146,6 +146,7 @@ class FlowNode:
             self.node_settings.renew_schema = True
         if hasattr(setting_input, 'cache_results'):
             self.node_settings.cache_results = setting_input.cache_results
+
         self.setting_input = setting_input
         self.results.errors = None
         self.add_lead_to_in_depend_source()
@@ -174,7 +175,7 @@ class FlowNode:
         self.set_node_information()
         if self.node_type == 'manual_input' and isinstance(self._setting_input, input_schema.NodeManualInput):
             if self.hash != self.calculate_hash(setting_input) or not self.node_stats.has_run:
-                self.function = self.function.__class__(setting_input.raw_data)
+                self.function = self.function.__class__(setting_input.raw_data_format)
                 self.reset()
             self.get_predicted_schema()
         elif self._setting_input is not None:

flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py CHANGED
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union
 from pydantic import BaseModel, field_validator, ConfigDict
 import polars as pl
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
 from flowfile_core.schemas.input_schema import MinimalFieldInfo
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
@@ -56,7 +56,7 @@ class JsonSchema(BaseModel):
             dtype = 'string'
         else:
             dtype = type_mapping.get(self.type[0] if isinstance(self.type, list) else self.type, 'string')
-        return type_to_polars_str(dtype)
+        return cast_str_to_polars_type(dtype)
 
 
 class AirbyteProperty(BaseModel):

flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py CHANGED
@@ -4,7 +4,7 @@ from flowfile_core.configs import logger
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
 from flowfile_core.schemas.input_schema import MinimalFieldInfo, DatabaseSettings
 from sqlalchemy import Engine, inspect, create_engine, text
-from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
+from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
 
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
 from flowfile_core.flowfile.sources.external_sources.sql_source.utils import get_polars_type, construct_sql_uri

flowfile_core/flowfile/utils.py CHANGED
@@ -1,9 +1,15 @@
-from typing import List
 import os
-import shutil
 import hashlib
-from datetime import datetime
 import json
+import polars as pl
+import shutil
+
+from datetime import datetime, date, time
+from typing import List
+from decimal import Decimal
+
+from flowfile_core.flowfile.flow_data_engine.utils import standardize_col_dtype
+from flowfile_core.schemas import input_schema
 
 
 def generate_sha256_hash(data: bytes):
@@ -26,8 +32,16 @@ def snake_case_to_camel_case(text: str) -> str:
 def json_default(val):
     if isinstance(val, datetime):
         return val.isoformat(timespec='microseconds')
+    elif isinstance(val, date):
+        return val.isoformat()
+    elif isinstance(val, time):
+        return val.isoformat()
     elif hasattr(val, '__dict__'):
         return val.__dict__
+    elif isinstance(val, Decimal):
+        if val.as_integer_ratio()[1] == 1:
+            return int(val)
+        return float(val)
     else:
         raise Exception('Value is not serializable')
 
@@ -104,3 +118,20 @@ def batch_generator(input_list: List, batch_size: int = 10000):
         input_list = []
         run = False
 
+
+def _handle_raw_data(node_manual_input: input_schema.NodeManualInput):
+    """Ensure compatibility with the new typed raw data and the old dict form data type"""
+    if (not (hasattr(node_manual_input, "raw_data_format") and node_manual_input.raw_data_format)
+            and (hasattr(node_manual_input, 'raw_data') and node_manual_input.raw_data)):
+        values = [standardize_col_dtype([vv for vv in c]) for c in zip(*(r.values()
+                                                                         for r in node_manual_input.raw_data))]
+        data_types = (pl.DataType.from_python(type(next((v for v in column_values), None))) for column_values in values)
+        _columns = [input_schema.MinimalFieldInfo(name=c, data_type=str(next(data_types))) for c in
+                    node_manual_input.raw_data[0].keys()]
+
+        node_manual_input.raw_data_format = input_schema.RawData(columns=_columns, data=values)
+    elif ((hasattr(node_manual_input, "raw_data_format") and node_manual_input.raw_data_format)
+          and not (hasattr(node_manual_input, 'raw_data') and node_manual_input.raw_data)):
+        node_manual_input.raw_data = [{c.name: node_manual_input.raw_data_format.data[ci][ri] for ci, c in
+                                       enumerate(node_manual_input.raw_data_format.columns)}
+                                      for ri in range(len(node_manual_input.raw_data_format.data[0]))]
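
Review note: `_handle_raw_data` bridges the legacy row-oriented `raw_data` (one dict per row) and the new column-oriented `raw_data_format`. The core of both branches is a transpose; a standalone sketch of the same moves:

    # Legacy form: one dict per row.
    raw_data = [{"name": "a", "qty": 1}, {"name": "b", "qty": 2}]

    # Column-oriented form, as the first branch derives it:
    columns = [list(c) for c in zip(*(r.values() for r in raw_data))]
    # -> [['a', 'b'], [1, 2]]: one inner list per column (the RawData.data layout)

    # And back again, as the elif branch does:
    names = list(raw_data[0].keys())
    rows = [{n: columns[ci][ri] for ci, n in enumerate(names)}
            for ri in range(len(columns[0]))]
    assert rows == raw_data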

flowfile_core/main.py CHANGED
@@ -8,7 +8,8 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
 from flowfile_core import ServerRun
-from flowfile_core.configs.settings import SERVER_HOST, SERVER_PORT, WORKER_HOST, WORKER_PORT, WORKER_URL
+from flowfile_core.configs.settings import (SERVER_HOST, SERVER_PORT, WORKER_HOST, WORKER_PORT, WORKER_URL,
+                                            OFFLOAD_TO_WORKER)
 
 from flowfile_core.routes.auth import router as auth_router
 from flowfile_core.routes.secrets import router as secrets_router
@@ -107,7 +108,6 @@ def run(host: str = None, port: int = None):
         host = SERVER_HOST
     if port is None:
         port = SERVER_PORT
-
     print(f"Starting server on {host}:{port}")
     print(f"Worker configured at {WORKER_URL} (host: {WORKER_HOST}, port: {WORKER_PORT})")
@@ -120,7 +120,6 @@
         host=host,
         port=port,
         loop="asyncio",
-        log_level="warning",
     )
     server = uvicorn.Server(config)
     server_instance = server  # Store server instance globally

flowfile_core/routes/secrets.py CHANGED
@@ -10,7 +10,7 @@ from flowfile_core.auth.jwt import get_current_active_user
 from flowfile_core.auth.models import Secret, SecretInput
 from flowfile_core.database import models as db_models
 from flowfile_core.database.connection import get_db
-from flowfile_core.secrets.secrets import encrypt_secret, store_secret, delete_secret as delete_secret_action
+from flowfile_core.secret_manager.secret_manager import encrypt_secret, store_secret, delete_secret as delete_secret_action
 
 router = APIRouter(dependencies=[Depends(get_current_active_user)])
 

flowfile_core/schemas/input_schema.py CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Literal
+from typing import List, Optional, Literal, Iterator
 from flowfile_core.schemas import transform_schema
 from pathlib import Path
 import os
@@ -61,7 +61,7 @@ class ReceivedTableBase(BaseModel):
         return self.path
 
     def set_absolute_filepath(self):
-        base_path = Path(self.path)
+        base_path = Path(self.path).expanduser()
         # Check if the path is relative, resolve it with the current working directory
         if not base_path.is_absolute():
             base_path = Path.cwd() / base_path
@@ -97,7 +97,7 @@ class ReceivedJsonTable(ReceivedCsvTable):
     pass
 
 
-class ReceivedParquetTable(BaseModel):
+class ReceivedParquetTable(ReceivedTableBase):
     file_type: str = 'parquet'
 
 
@@ -247,8 +247,14 @@ class NodeDatasource(NodeBase):
     file_ref: str = None
 
 
+class RawData(BaseModel):
+    columns: List[MinimalFieldInfo] = None
+    data: List[List]  # List of list where each inner list is a column of data. This ensures more efficient storage
+
+
 class NodeManualInput(NodeBase):
-    raw_data: List = None
+    raw_data: Optional[List] = None
+    raw_data_format: Optional[RawData] = None
 
 
 class NodeRead(NodeBase):
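
Review note: `RawData` stores values column-major: `data` holds one inner list per column, parallel to `columns`. A minimal construction sketch (the field names come from this diff; the example values and dtype strings are made up):

    raw = RawData(
        columns=[MinimalFieldInfo(name="name", data_type="String"),
                 MinimalFieldInfo(name="qty", data_type="Int64")],
        data=[["a", "b"], [1, 2]],  # column 0, then column 1, not rows
    )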

flowfile_core/schemas/transform_schema.py CHANGED
@@ -166,10 +166,8 @@ class FuzzyMap(JoinMap):
         self.output_column_name = f'fuzzy_score_{self.left_col}_{self.right_col}'
 
 
-@dataclass
-class CrossJoinInput:
-    left_select: SelectInputs = None
-    right_select: SelectInputs = None
+class JoinSelectMixin:
+    """Mixin for common join selection functionality"""
 
     @staticmethod
     def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
@@ -184,6 +182,26 @@ class JoinSelectMixin:
         elif all(isinstance(c, str) for c in select):
             return SelectInputs([SelectInput(s, s) for s in select])
 
+    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
+        current_names = self.left_select.new_cols & self.right_select.new_cols
+        if old_col_name not in current_names:
+            return old_col_name
+        while True:
+            if old_col_name not in current_names:
+                return old_col_name
+            old_col_name = f'{side}_{old_col_name}'
+
+    def add_new_select_column(self, select_input: SelectInput, side: str):
+        selects = self.right_select if side == 'right' else self.left_select
+        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
+        selects.__add__(select_input)
+
+
+@dataclass
+class CrossJoinInput(JoinSelectMixin):
+    left_select: SelectInputs = None
+    right_select: SelectInputs = None
+
     def __init__(self, left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str]):
         self.left_select = self.parse_select(left_select)
@@ -201,23 +219,9 @@ class CrossJoinInput:
             right_col.new_name = 'right_' + right_col.new_name
         overlapping_records = self.overlapping_records
 
-    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
-        current_names = self.left_select.new_cols & self.right_select.new_cols
-        if old_col_name not in current_names:
-            return old_col_name
-        while True:
-            if old_col_name not in current_names:
-                return old_col_name
-            old_col_name = f'{side}_{old_col_name}'
-
-    def add_new_select_column(self, select_input: SelectInput, side: str):
-        selects = self.right_select if side == 'right' else self.left_select
-        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
-        selects.__add__(select_input)
-
 
 @dataclass
-class JoinInput:
+class JoinInput(JoinSelectMixin):
     join_mapping: List[JoinMap]
     left_select: SelectInputs = None
     right_select: SelectInputs = None
@@ -243,20 +247,8 @@ class JoinInput:
             raise Exception('No valid join mapping as input')
         return join_mapping
 
-    @staticmethod
-    def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
-        if all(isinstance(c, SelectInput) for c in select):
-            return SelectInputs(select)
-        elif all(isinstance(c, dict) for c in select):
-            return SelectInputs([SelectInput(**c) for c in select])
-        elif isinstance(select, dict):
-            renames = select.get('renames')
-            if renames:
-                return SelectInputs([SelectInput(**c) for c in renames])
-        elif all(isinstance(c, str) for c in select):
-            return SelectInputs([SelectInput(s, s) for s in select])
-
-    def __init__(self, join_mapping: List[JoinMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
+    def __init__(self, join_mapping: List[JoinMap] | Tuple[str, str] | str,
+                 left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str],
                  how: JoinStrategy = 'inner'):
         self.join_mapping = self.parse_join_mapping(join_mapping)
@@ -311,20 +303,6 @@ class JoinInput:
         )
         return new_mappings
 
-    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
-        current_names = self.left_select.new_cols & self.right_select.new_cols
-        if old_col_name not in current_names:
-            return old_col_name
-        while True:
-            if old_col_name not in current_names:
-                return old_col_name
-            old_col_name = f'{side}_{old_col_name}'
-
-    def add_new_select_column(self, select_input: SelectInput, side: str):
-        selects = self.right_select if side == 'right' else self.left_select
-        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
-        selects.__add__(select_input)
-
 
 @dataclass
 class FuzzyMatchInput(JoinInput):
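
Review note: extracting `parse_select`, `auto_generate_new_col_name`, and `add_new_select_column` into `JoinSelectMixin` removes two verbatim copies from `CrossJoinInput` and `JoinInput`. The renaming loop keeps prefixing the side until the name is free among columns that appear on both sides; a standalone copy of that loop for illustration:

    def auto_generate_new_col_name(old_col_name: str, side: str, current_names: set) -> str:
        # Same loop as the mixin, with the name set passed in explicitly.
        while old_col_name in current_names:
            old_col_name = f"{side}_{old_col_name}"
        return old_col_name

    assert auto_generate_new_col_name("id", "right", {"id", "right_id"}) == "right_right_id"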

flowfile_frame/__init__.py CHANGED
@@ -1,6 +1,10 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""
 
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
+
+OFFLOAD_TO_WORKER.value = False
+
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
 
@@ -10,9 +14,11 @@ from flowfile_frame.utils import create_flow_graph  # noqa: F401
 from flowfile_frame.expr import (  # noqa: F401
     col, lit, column,
     cum_count, len,
-    sum, min, max, mean, count, when
+    sum, min, max, mean, count, when, implode, last, corr, cov, first
 )
 
+from flowfile_frame.lazy import (fold)
+
 # Selector utilities
 from flowfile_frame.selectors import (  # noqa: F401
     numeric, float_, integer, string, temporal,
@@ -21,10 +27,11 @@ from flowfile_frame.selectors import (  # noqa: F401
     by_dtype, contains, starts_with, ends_with, matches
 )
 
+from flowfile_frame.series import Series
+
 # File I/O
-from flowfile_frame.flow_frame import (  # noqa: F401
-    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet
-)
+from flowfile_frame.flow_frame_methods import (  # noqa: F401
+    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet)
 
 from polars.datatypes import (  # noqa: F401
     # Integer types