Flowfile: flowfile-0.3.2-py3-none-any.whl → flowfile-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile has been flagged as possibly problematic.
- flowfile/__init__.py +2 -1
- flowfile/web/__init__.py +3 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -1
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +46 -35
- flowfile_core/configs/__init__.py +15 -4
- flowfile_core/configs/settings.py +5 -3
- flowfile_core/configs/utils.py +18 -0
- flowfile_core/flowfile/FlowfileFlow.py +13 -18
- flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
- flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
- flowfile_core/flowfile/flow_node/flow_node.py +2 -1
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
- flowfile_core/flowfile/utils.py +34 -3
- flowfile_core/main.py +2 -3
- flowfile_core/routes/secrets.py +1 -1
- flowfile_core/schemas/input_schema.py +10 -4
- flowfile_core/schemas/transform_schema.py +25 -47
- flowfile_frame/__init__.py +11 -4
- flowfile_frame/adding_expr.py +280 -0
- flowfile_frame/config.py +9 -0
- flowfile_frame/expr.py +301 -83
- flowfile_frame/expr.pyi +2174 -0
- flowfile_frame/expr_name.py +258 -0
- flowfile_frame/flow_frame.py +587 -1002
- flowfile_frame/flow_frame.pyi +336 -0
- flowfile_frame/flow_frame_methods.py +617 -0
- flowfile_frame/group_frame.py +89 -42
- flowfile_frame/join.py +1 -2
- flowfile_frame/lazy.py +704 -0
- flowfile_frame/lazy_methods.py +201 -0
- flowfile_frame/list_name_space.py +324 -0
- flowfile_frame/selectors.py +3 -0
- flowfile_frame/series.py +70 -0
- flowfile_frame/utils.py +80 -4
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
- /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
- /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py
CHANGED

@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Optional, Any, List, Dict, Literal
 from flowfile_core.schemas import input_schema
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import …
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
 from polars import datatypes
 import polars as pl
@@ -9,6 +9,37 @@ import polars as pl
 DataTypeGroup = Literal['numeric', 'str', 'date']
 
 
+def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
+    if isinstance(pl_type, pl.List):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.List({inner_str})"
+    elif isinstance(pl_type, pl.Array):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.Array({inner_str})"
+    elif isinstance(pl_type, pl.Decimal):
+        precision = pl_type.precision if hasattr(pl_type, 'precision') else None
+        scale = pl_type.scale if hasattr(pl_type, 'scale') else None
+        if precision is not None and scale is not None:
+            return f"pl.Decimal({precision}, {scale})"
+        elif precision is not None:
+            return f"pl.Decimal({precision})"
+        else:
+            return "pl.Decimal()"
+    elif isinstance(pl_type, pl.Struct):
+        # Handle Struct with field definitions
+        fields = []
+        if hasattr(pl_type, 'fields'):
+            for field in pl_type.fields:
+                field_name = field.name
+                field_type = convert_pl_type_to_string(field.dtype, inner=True)
+                fields.append(f'pl.Field("{field_name}", {field_type})')
+        field_str = ", ".join(fields)
+        return f"pl.Struct([{field_str}])"
+    else:
+        # For base types, we want the full pl.TypeName format
+        return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
+
+
 @dataclass
 class FlowfileColumn:
     column_name: str
@@ -28,7 +59,7 @@ class FlowfileColumn:
     __perc_unique: Optional[float]
 
     def __init__(self, polars_type: PlType):
-        self.data_type = …
+        self.data_type = convert_pl_type_to_string(polars_type.pl_datatype)
         self.size = polars_type.count - polars_type.null_count
         self.max_value = polars_type.max
         self.min_value = polars_type.min
@@ -53,7 +84,7 @@ class FlowfileColumn:
 
     @classmethod
     def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
-        pl_type = …
+        pl_type = cast_str_to_polars_type(data_type)
         if pl_type is not None:
             data_type = pl_type
         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
@@ -129,12 +160,9 @@ class FlowfileColumn:
         return 'date'
 
     def get_polars_type(self) -> PlType:
-
-
-
-        pl_datatype = None
-
-        return PlType(pl_datatype=pl_datatype, **self.__dict__)
+        pl_datatype = cast_str_to_polars_type(self.data_type)
+        pl_type = PlType(pl_datatype=pl_datatype, **self.__dict__)
+        return pl_type
 
     def update_type_from_polars_type(self, pl_type: PlType):
         self.data_type = str(pl_type.pl_datatype.base_type())
@@ -142,3 +170,8 @@ class FlowfileColumn:
 
 def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
     return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
+
+
+def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
+    return [FlowfileColumn.create_from_polars_type(PlType(column_name=k, pl_datatype=v)).get_minimal_field_info()
+            for k, v in pl_schema.items()]
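The new `convert_pl_type_to_string` serializes a Polars dtype into an evaluable `pl.`-prefixed string, and `cast_str_to_polars_type` (in the utils diff below) turns such a string back into a real dtype. A minimal sketch of the intended round-trip, using only public Polars APIs:

```python
import polars as pl

# String forms the serializer above is written to emit:
#   pl.List(pl.Int64)   -> "pl.List(pl.Int64)"
#   pl.Decimal(12, 3)   -> "pl.Decimal(12, 3)"
#   pl.Struct fields    -> 'pl.Struct([pl.Field("a", pl.Int64)])'
type_string = "pl.List(pl.Int64)"

# cast_str_to_polars_type effectively evaluates the string in a namespace
# that exposes only the polars module:
restored = eval(type_string, {"pl": pl, "__builtins__": {}}, {})
assert restored == pl.List(pl.Int64)
```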
flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py
CHANGED

@@ -18,10 +18,45 @@ dtype_to_pl = {
     'time': pl.Time,
 }
 
+
+def safe_eval_pl_type(type_string: str):
+    """
+    Safely evaluate a Polars type string with restricted namespace.
+    Only allows Polars types and basic Python literals.
+    """
+    # Define allowed names in the evaluation namespace
+    safe_dict = {
+        # Polars module and types
+        'pl': pl,
+
+        # Basic Python built-ins for literals
+        'int': int,
+        'str': str,
+        'float': float,
+        'bool': bool,
+        'list': list,
+        'dict': dict,
+        'tuple': tuple,
+
+        # Disable dangerous built-ins
+        '__builtins__': {},
+    }
+
+    try:
+        return eval(type_string, safe_dict, {})
+    except Exception as e:
+        raise ValueError(f"Failed to safely evaluate type string '{type_string}': {e}")
+
+
 dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
 
 
-def …
+def get_polars_type(dtype: str):
+    if 'pl.' in dtype:
+        try:
+            return safe_eval_pl_type(dtype)
+        except Exception as e:
+            return pl.String
     pl_datetype = dtype_to_pl.get(dtype.lower())
     if pl_datetype is not None:
         return pl_datetype
@@ -31,6 +66,10 @@ def type_to_polars(dtype: str):
     return pl.String
 
 
-def …
-
+def cast_str_to_polars_type(dtype: str) -> pl.DataType:
+    pl_type = get_polars_type(dtype)
+    if hasattr(pl_type, '__call__'):
+        return pl_type()
+    else:
+        return pl_type
 
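`safe_eval_pl_type` replaces an unconstrained `eval` with one that exposes only the `pl` module, a few literal constructors, and an empty `__builtins__`, so a stored type string cannot reach `__import__` or other dangerous names. A quick illustration of the behavior this whitelist is meant to guarantee (a sketch mirroring the function above, not Flowfile's test suite):

```python
import polars as pl

def demo_safe_eval(type_string: str):
    # Same shape as safe_eval_pl_type above: whitelist plus disabled builtins.
    safe_dict = {"pl": pl, "int": int, "float": float, "__builtins__": {}}
    try:
        return eval(type_string, safe_dict, {})
    except Exception as e:
        raise ValueError(f"Failed to safely evaluate type string '{type_string}': {e}")

print(demo_safe_eval("pl.Decimal(10, 2)"))  # Decimal(precision=10, scale=2)

# Names outside the whitelist raise instead of executing:
try:
    demo_safe_eval("__import__('os').getcwd()")
except ValueError as err:
    print(err)  # Failed to safely evaluate type string ...
```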
flowfile_core/flowfile/flow_data_engine/polars_code_parser.py
CHANGED

@@ -3,6 +3,7 @@ from typing import Dict, Any, Callable
 import textwrap
 import ast
 import time
+from io import BytesIO
 
 
 def remove_comments_and_docstrings(source: str) -> str:
@@ -174,6 +175,7 @@ class PolarsCodeParser:
             'False': False,
             'None': None,
             'time': time,
+            'BytesIO': BytesIO
         }
 
     @staticmethod
@@ -256,7 +258,6 @@ class PolarsCodeParser:
 
         # Wrap the code in a function
         wrapped_code = self._wrap_in_function(code, num_inputs)
-
         try:
             # Create namespace for execution
             local_namespace: Dict[str, Any] = {}
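`PolarsCodeParser` executes user code against a fixed dictionary of allowed globals; this release simply adds `BytesIO` to that dictionary. A minimal sketch of the allowed-globals pattern (the namespace contents here are illustrative, not the parser's full list):

```python
import time
from io import BytesIO

user_code = "buf = BytesIO(b'a,b\\n1,2\\n'); result = buf.read()"

# Only names placed in this dict are visible to the executed code.
allowed_globals = {"True": True, "False": False, "None": None,
                   "time": time, "BytesIO": BytesIO,
                   "__builtins__": {}}  # illustrative: block ambient builtins
local_namespace: dict = {}
exec(user_code, allowed_globals, local_namespace)
print(local_namespace["result"])  # b'a,b\n1,2\n'
```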
flowfile_core/flowfile/flow_data_engine/sample_data.py
CHANGED

@@ -1,14 +1,27 @@
 from faker import Faker
 from functools import partial
+from math import ceil
 from random import randint
 import polars as pl
 from typing import List, Dict, Any, Generator
 
 
-def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
+def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
+    """
+
+    Args:
+        n_records (): Number of records to return
+        optimized (): Indicator if creation should be optimized, will result in more identical rows when True
+
+    Returns:
+        pl.DataFrame
+    """
     fake = Faker()
-    selector = partial(randint,0)
-
+    selector = partial(randint, 0)
+
+    max_n_records = min(10_000, n_records) if optimized else n_records
+
+    min_range = partial(min, max_n_records)
     # Pre-generation of static data
     cities = [fake.city() for _ in range(min_range(7000))]
     companies = [fake.company() for _ in range(min_range(100_000))]
@@ -19,7 +32,7 @@ def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
     first_names = [fake.first_name() for _ in range(min_range(100_000))]
     last_names = [fake.last_name() for _ in range(min_range(50_000))]
     domain_names = [fake.domain_name() for _ in range(10)]
-    sales_data = [fake.random_int(0, 1000) for _ in range(…
+    sales_data = [fake.random_int(0, 1000) for _ in range(max_n_records)]
 
     def generate_name():
         return f"{first_names[selector(min_range(100_000))-1]} {last_names[selector(min_range(50_000))-1]}"
@@ -32,9 +45,8 @@ def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
 
     def generate_phone_number():
         return fake.phone_number()
-
     data = []
-    for i in range(…
+    for i in range(max_n_records):
        name = generate_name()
        data.append(dict(
            ID=randint(1, 1000000),
@@ -47,8 +59,14 @@ def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
             Work=companies[selector(min_range(100_000))-1],
             Zipcode=zipcodes[selector(min_range(200_000))-1],
             Country=countries[selector(min_range(50))-1],
-            sales_data=sales_data[selector(…
+            sales_data=sales_data[selector(max_n_records)-1]
         ))
+    if max_n_records < n_records:
+        n_duplicates: int = ceil(n_records / max_n_records)
+        output = []
+        for _ in range(n_duplicates):
+            output.extend(data)
+        data = output[:n_records]
 
     return pl.DataFrame(data)
 
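The new `optimized` flag caps Faker generation at 10,000 unique rows and then tiles the generated batch up to `n_records`, trading uniqueness for speed. The duplication step in isolation, as a runnable sketch:

```python
from math import ceil

def tile_rows(rows: list, n_records: int) -> list:
    # Repeat the generated batch enough times to cover n_records, then trim.
    if len(rows) >= n_records:
        return rows[:n_records]
    n_duplicates = ceil(n_records / len(rows))
    output = []
    for _ in range(n_duplicates):
        output.extend(rows)
    return output[:n_records]

batch = [{"ID": i} for i in range(10_000)]  # stand-in for the Faker rows
print(len(tile_rows(batch, 25_000)))         # 25000, rows repeated as needed
```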
flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py
CHANGED

@@ -190,7 +190,7 @@ class BaseFetcher:
             logger.info('Already running the fetching')
             return
 
-        sleep_time = …
+        sleep_time = .5
         self.running = True
         while not self.stop_event.is_set():
             try:
@@ -205,7 +205,8 @@ class BaseFetcher:
                     break
                 elif status.status == 'Unknown Error':
                     self._handle_error(-1,
-                                       'There was an unknown error with the process, …
+                                       'There was an unknown error with the process, '
+                                       'and the process got killed by the server')
                     break
                 else:
                     self._handle_error(2, r.text)
@@ -284,7 +285,7 @@ class ExternalDfFetcher(BaseFetcher):
 
     def __init__(self, flow_id: int, node_id: int | str, lf: pl.LazyFrame | pl.DataFrame, file_ref: str = None,
                  wait_on_completion: bool = True,
-                 operation_type: OperationType = 'store'):
+                 operation_type: OperationType = 'store', offload_to_worker: bool = True):
         super().__init__(file_ref=file_ref)
         lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
         r = trigger_df_operation(lf=lf, file_ref=self.file_ref, operation_type=operation_type,
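`BaseFetcher` now polls with a fixed half-second interval and reports a complete error message when the worker kills a process. A stripped-down sketch of that poll-until-done loop (the status strings mirror the ones visible in this diff; everything else is illustrative):

```python
import threading
import time

def poll_until_done(get_status, stop_event: threading.Event, sleep_time: float = .5):
    # Poll the worker until a terminal state arrives or we are told to stop.
    while not stop_event.is_set():
        status = get_status()
        if status == "Completed":
            return True
        if status == "Unknown Error":
            raise RuntimeError("There was an unknown error with the process, "
                               "and the process got killed by the server")
        time.sleep(sleep_time)
    return False

states = iter(["Running", "Running", "Completed"])
print(poll_until_done(lambda: next(states), threading.Event()))  # True
```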
flowfile_core/flowfile/flow_node/flow_node.py
CHANGED

@@ -146,6 +146,7 @@ class FlowNode:
             self.node_settings.renew_schema = True
         if hasattr(setting_input, 'cache_results'):
             self.node_settings.cache_results = setting_input.cache_results
+
         self.setting_input = setting_input
         self.results.errors = None
         self.add_lead_to_in_depend_source()
@@ -174,7 +175,7 @@ class FlowNode:
         self.set_node_information()
         if self.node_type == 'manual_input' and isinstance(self._setting_input, input_schema.NodeManualInput):
             if self.hash != self.calculate_hash(setting_input) or not self.node_stats.has_run:
-                self.function = self.function.__class__(setting_input.…
+                self.function = self.function.__class__(setting_input.raw_data_format)
                 self.reset()
             self.get_predicted_schema()
         elif self._setting_input is not None:
flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union
 from pydantic import BaseModel, field_validator, ConfigDict
 import polars as pl
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import …
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
 from flowfile_core.schemas.input_schema import MinimalFieldInfo
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
@@ -56,7 +56,7 @@ class JsonSchema(BaseModel):
             dtype = 'string'
         else:
             dtype = type_mapping.get(self.type[0] if isinstance(self.type, list) else self.type, 'string')
-        return …
+        return cast_str_to_polars_type(dtype)
 
 
 class AirbyteProperty(BaseModel):
flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py
CHANGED

@@ -4,7 +4,7 @@ from flowfile_core.configs import logger
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
 from flowfile_core.schemas.input_schema import MinimalFieldInfo, DatabaseSettings
 from sqlalchemy import Engine, inspect, create_engine, text
-from flowfile_core.…
+from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
 
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
 from flowfile_core.flowfile.sources.external_sources.sql_source.utils import get_polars_type, construct_sql_uri
flowfile_core/flowfile/utils.py
CHANGED

@@ -1,9 +1,15 @@
-from typing import List
 import os
-import shutil
 import hashlib
-from datetime import datetime
 import json
+import polars as pl
+import shutil
+
+from datetime import datetime, date, time
+from typing import List
+from decimal import Decimal
+
+from flowfile_core.flowfile.flow_data_engine.utils import standardize_col_dtype
+from flowfile_core.schemas import input_schema
 
 
 def generate_sha256_hash(data: bytes):
@@ -26,8 +32,16 @@ def snake_case_to_camel_case(text: str) -> str:
 def json_default(val):
     if isinstance(val, datetime):
         return val.isoformat(timespec='microseconds')
+    elif isinstance(val, date):
+        return val.isoformat()
+    elif isinstance(val, time):
+        return val.isoformat()
     elif hasattr(val, '__dict__'):
         return val.__dict__
+    elif isinstance(val, Decimal):
+        if val.as_integer_ratio()[1] == 1:
+            return int(val)
+        return float(val)
     else:
         raise Exception('Value is not serializable')
 
@@ -104,3 +118,20 @@ def batch_generator(input_list: List, batch_size: int = 10000):
             input_list = []
             run = False
 
+
+def _handle_raw_data(node_manual_input: input_schema.NodeManualInput):
+    """Ensure compatibility with the new typed raw data and the old dict form data type"""
+    if (not (hasattr(node_manual_input, "raw_data_format") and node_manual_input.raw_data_format)
+            and (hasattr(node_manual_input, 'raw_data') and node_manual_input.raw_data)):
+        values = [standardize_col_dtype([vv for vv in c]) for c in zip(*(r.values()
+                                                                         for r in node_manual_input.raw_data))]
+        data_types = (pl.DataType.from_python(type(next((v for v in column_values), None))) for column_values in values)
+        _columns = [input_schema.MinimalFieldInfo(name=c, data_type=str(next(data_types))) for c in
+                    node_manual_input.raw_data[0].keys()]
+
+        node_manual_input.raw_data_format = input_schema.RawData(columns=_columns, data=values)
+    elif ((hasattr(node_manual_input, "raw_data_format") and node_manual_input.raw_data_format)
+          and not (hasattr(node_manual_input, 'raw_data') and node_manual_input.raw_data)):
+        node_manual_input.raw_data = [{c.name: node_manual_input.raw_data_format.data[ci][ri] for ci, c in
+                                       enumerate(node_manual_input.raw_data_format.columns)}
+                                      for ri in range(len(node_manual_input.raw_data_format.data[0]))]
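`_handle_raw_data` keeps the old row-oriented `raw_data` (list of dicts) and the new column-oriented `raw_data_format` in sync by transposing one into the other. The core transposition, stripped of the Flowfile schema objects:

```python
# Old representation: one dict per row.
raw_data = [{"name": "a", "x": 1}, {"name": "b", "x": 2}]

# New representation: one list per column (the zip(*...) from _handle_raw_data).
columns = [list(c) for c in zip(*(r.values() for r in raw_data))]
assert columns == [["a", "b"], [1, 2]]

# And back again, mirroring the elif branch:
names = list(raw_data[0].keys())
rows = [{names[ci]: col[ri] for ci, col in enumerate(columns)}
        for ri in range(len(columns[0]))]
assert rows == raw_data
```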
flowfile_core/main.py
CHANGED

@@ -8,7 +8,8 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
 from flowfile_core import ServerRun
-from flowfile_core.configs.settings import SERVER_HOST, SERVER_PORT, WORKER_HOST, WORKER_PORT, WORKER_URL
+from flowfile_core.configs.settings import (SERVER_HOST, SERVER_PORT, WORKER_HOST, WORKER_PORT, WORKER_URL,
+                                            OFFLOAD_TO_WORKER)
 
 from flowfile_core.routes.auth import router as auth_router
 from flowfile_core.routes.secrets import router as secrets_router
@@ -107,7 +108,6 @@ def run(host: str = None, port: int = None):
         host = SERVER_HOST
     if port is None:
         port = SERVER_PORT
-
     print(f"Starting server on {host}:{port}")
     print(f"Worker configured at {WORKER_URL} (host: {WORKER_HOST}, port: {WORKER_PORT})")
 
@@ -120,7 +120,6 @@ def run(host: str = None, port: int = None):
         host=host,
         port=port,
         loop="asyncio",
-        log_level="warning",
     )
     server = uvicorn.Server(config)
     server_instance = server  # Store server instance globally
flowfile_core/routes/secrets.py
CHANGED

@@ -10,7 +10,7 @@ from flowfile_core.auth.jwt import get_current_active_user
 from flowfile_core.auth.models import Secret, SecretInput
 from flowfile_core.database import models as db_models
 from flowfile_core.database.connection import get_db
-from flowfile_core.…
+from flowfile_core.secret_manager.secret_manager import encrypt_secret, store_secret, delete_secret as delete_secret_action
 
 router = APIRouter(dependencies=[Depends(get_current_active_user)])
 
flowfile_core/schemas/input_schema.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import List, Optional, Literal
+from typing import List, Optional, Literal, Iterator
 from flowfile_core.schemas import transform_schema
 from pathlib import Path
 import os
@@ -61,7 +61,7 @@ class ReceivedTableBase(BaseModel):
         return self.path
 
     def set_absolute_filepath(self):
-        base_path = Path(self.path)
+        base_path = Path(self.path).expanduser()
         # Check if the path is relative, resolve it with the current working directory
         if not base_path.is_absolute():
             base_path = Path.cwd() / base_path
@@ -97,7 +97,7 @@ class ReceivedJsonTable(ReceivedCsvTable):
     pass
 
 
-class ReceivedParquetTable(…
+class ReceivedParquetTable(ReceivedTableBase):
     file_type: str = 'parquet'
 
 
@@ -247,8 +247,14 @@ class NodeDatasource(NodeBase):
     file_ref: str = None
 
 
+class RawData(BaseModel):
+    columns: List[MinimalFieldInfo] = None
+    data: List[List]  # List of list where each inner list is a column of data. This ensures more efficient storage
+
+
 class NodeManualInput(NodeBase):
-    raw_data: List = None
+    raw_data: Optional[List] = None
+    raw_data_format: Optional[RawData] = None
 
 
 class NodeRead(NodeBase):
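The new `RawData` model stores manual-input data column-wise: field names and dtypes live once in `columns`, and `data` holds one inner list per column rather than repeating keys in every row dict. A small sketch of the model and its intended shape, with `MinimalFieldInfo` reduced to a stand-in:

```python
from typing import List, Optional
from pydantic import BaseModel

class MinimalFieldInfo(BaseModel):  # stand-in for flowfile_core's schema class
    name: str
    data_type: str

class RawData(BaseModel):
    columns: Optional[List[MinimalFieldInfo]] = None
    data: List[List]  # one inner list per column

rd = RawData(
    columns=[MinimalFieldInfo(name="city", data_type="String"),
             MinimalFieldInfo(name="sales", data_type="Int64")],
    data=[["Utrecht", "Delft"], [10, 20]],
)
# Column names appear once instead of once per row, which is why this
# layout is more compact than the old list-of-dicts raw_data.
print(rd.data[1])  # [10, 20]
```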
flowfile_core/schemas/transform_schema.py
CHANGED

@@ -166,10 +166,8 @@ class FuzzyMap(JoinMap):
             self.output_column_name = f'fuzzy_score_{self.left_col}_{self.right_col}'
 
 
-
-
-    left_select: SelectInputs = None
-    right_select: SelectInputs = None
+class JoinSelectMixin:
+    """Mixin for common join selection functionality"""
 
     @staticmethod
     def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
@@ -184,6 +182,26 @@ class CrossJoinInput:
         elif all(isinstance(c, str) for c in select):
             return SelectInputs([SelectInput(s, s) for s in select])
 
+    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
+        current_names = self.left_select.new_cols & self.right_select.new_cols
+        if old_col_name not in current_names:
+            return old_col_name
+        while True:
+            if old_col_name not in current_names:
+                return old_col_name
+            old_col_name = f'{side}_{old_col_name}'
+
+    def add_new_select_column(self, select_input: SelectInput, side: str):
+        selects = self.right_select if side == 'right' else self.left_select
+        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
+        selects.__add__(select_input)
+
+
+@dataclass
+class CrossJoinInput(JoinSelectMixin):
+    left_select: SelectInputs = None
+    right_select: SelectInputs = None
+
     def __init__(self, left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str]):
         self.left_select = self.parse_select(left_select)
@@ -201,23 +219,9 @@ class CrossJoinInput:
             right_col.new_name = 'right_' + right_col.new_name
         overlapping_records = self.overlapping_records
 
-    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
-        current_names = self.left_select.new_cols & self.right_select.new_cols
-        if old_col_name not in current_names:
-            return old_col_name
-        while True:
-            if old_col_name not in current_names:
-                return old_col_name
-            old_col_name = f'{side}_{old_col_name}'
-
-    def add_new_select_column(self, select_input: SelectInput, side: str):
-        selects = self.right_select if side == 'right' else self.left_select
-        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
-        selects.__add__(select_input)
-
 
 @dataclass
-class JoinInput:
+class JoinInput(JoinSelectMixin):
     join_mapping: List[JoinMap]
     left_select: SelectInputs = None
     right_select: SelectInputs = None
@@ -243,20 +247,8 @@ class JoinInput:
             raise Exception('No valid join mapping as input')
         return join_mapping
 
-
-
-        if all(isinstance(c, SelectInput) for c in select):
-            return SelectInputs(select)
-        elif all(isinstance(c, dict) for c in select):
-            return SelectInputs([SelectInput(**c) for c in select])
-        elif isinstance(select, dict):
-            renames = select.get('renames')
-            if renames:
-                return SelectInputs([SelectInput(**c) for c in renames])
-        elif all(isinstance(c, str) for c in select):
-            return SelectInputs([SelectInput(s, s) for s in select])
-
-    def __init__(self, join_mapping: List[JoinMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
+    def __init__(self, join_mapping: List[JoinMap] | Tuple[str, str] | str,
+                 left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str],
                  how: JoinStrategy = 'inner'):
         self.join_mapping = self.parse_join_mapping(join_mapping)
@@ -311,20 +303,6 @@ class JoinInput:
         )
         return new_mappings
 
-    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
-        current_names = self.left_select.new_cols & self.right_select.new_cols
-        if old_col_name not in current_names:
-            return old_col_name
-        while True:
-            if old_col_name not in current_names:
-                return old_col_name
-            old_col_name = f'{side}_{old_col_name}'
-
-    def add_new_select_column(self, select_input: SelectInput, side: str):
-        selects = self.right_select if side == 'right' else self.left_select
-        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
-        selects.__add__(select_input)
-
 
 @dataclass
 class FuzzyMatchInput(JoinInput):
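The duplicated `auto_generate_new_col_name` / `add_new_select_column` pair moves into a shared `JoinSelectMixin`; the renamer prefixes a clashing column with the join side until it no longer collides. Its behavior, as a standalone sketch:

```python
def auto_generate_new_col_name(old_col_name: str, side: str, current_names: set) -> str:
    # Keep prefixing with the join side until the name is unique.
    while old_col_name in current_names:
        old_col_name = f"{side}_{old_col_name}"
    return old_col_name

print(auto_generate_new_col_name("id", "right", {"id", "name"}))      # right_id
print(auto_generate_new_col_name("id", "right", {"id", "right_id"}))  # right_right_id
```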
flowfile_frame/__init__.py
CHANGED

@@ -1,6 +1,10 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""
 
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
+
+OFFLOAD_TO_WORKER.value = False
+
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
 
@@ -10,9 +14,11 @@ from flowfile_frame.utils import create_flow_graph  # noqa: F401
 from flowfile_frame.expr import (  # noqa: F401
     col, lit, column,
     cum_count, len,
-    sum, min, max, mean, count, when
+    sum, min, max, mean, count, when, implode, last, corr, cov, first
 )
 
+from flowfile_frame.lazy import (fold)
+
 # Selector utilities
 from flowfile_frame.selectors import (  # noqa: F401
     numeric, float_, integer, string, temporal,
@@ -21,10 +27,11 @@ from flowfile_frame.selectors import (  # noqa: F401
     by_dtype, contains, starts_with, ends_with, matches
 )
 
+from flowfile_frame.series import Series
+
 # File I/O
-from flowfile_frame.…
-    read_csv, read_parquet, from_dict, concat,
-)
+from flowfile_frame.flow_frame_methods import (  # noqa: F401
+    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet)
 
 from polars.datatypes import (  # noqa: F401
     # Integer types