flowfile-0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile has been flagged as potentially problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py: File without changes

flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py
@@ -0,0 +1,144 @@
from dataclasses import dataclass
from typing import Optional, Any, List, Dict, Literal
from flowfile_core.schemas import input_schema
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
from polars import datatypes
import polars as pl

DataTypeGroup = Literal['numeric', 'str', 'date']


@dataclass
class FlowfileColumn:
    column_name: str
    data_type: str
    size: int
    max_value: str
    min_value: str
    col_index: int
    number_of_empty_values: int
    number_of_unique_values: int
    example_values: str
    __sql_type: Optional[Any]
    __is_unique: Optional[bool]
    __nullable: Optional[bool]
    __has_values: Optional[bool]
    average_value: Optional[str]
    __perc_unique: Optional[float]

    def __init__(self, polars_type: PlType):
        self.data_type = str(polars_type.pl_datatype.base_type())
        self.size = polars_type.count - polars_type.null_count
        self.max_value = polars_type.max
        self.min_value = polars_type.min
        self.number_of_unique_values = polars_type.n_unique
        self.number_of_empty_values = polars_type.null_count
        self.example_values = polars_type.examples
        self.column_name = polars_type.column_name
        self.average_value = polars_type.mean
        self.col_index = polars_type.col_index
        self.__has_values = None
        self.__nullable = None
        self.__is_unique = None
        self.__sql_type = None
        self.__perc_unique = None

    @classmethod
    def create_from_polars_type(cls, polars_type: PlType, **kwargs) -> "FlowfileColumn":
        for k, v in kwargs.items():
            if hasattr(polars_type, k):
                setattr(polars_type, k, v)
        return cls(polars_type)

    @classmethod
    def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
        pl_type = type_to_polars_str(data_type)
        if pl_type is not None:
            data_type = pl_type
        return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))

    @classmethod
    def create_from_polars_dtype(cls, column_name: str, data_type: pl.DataType, **kwargs):
        return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))

    def get_minimal_field_info(self) -> input_schema.MinimalFieldInfo:
        return input_schema.MinimalFieldInfo(name=self.column_name, data_type=self.data_type)

    @classmethod
    def create_from_minimal_field_info(cls, minimal_field_info: input_schema.MinimalFieldInfo) -> "FlowfileColumn":
        return cls.from_input(column_name=minimal_field_info.name,
                              data_type=minimal_field_info.data_type)

    @property
    def is_unique(self) -> bool:
        if self.__is_unique is None:
            if self.has_values:
                self.__is_unique = self.number_of_unique_values == self.number_of_filled_values
            else:
                self.__is_unique = False
        return self.__is_unique

    @property
    def perc_unique(self) -> float:
        if self.__perc_unique is None:
            self.__perc_unique = self.number_of_unique_values / self.number_of_filled_values
        return self.__perc_unique

    @property
    def has_values(self) -> bool:
        if not self.__has_values:
            self.__has_values = self.number_of_unique_values > 0
        return self.__has_values

    @property
    def number_of_filled_values(self):
        return self.size

    @property
    def nullable(self):
        if self.__nullable is None:
            self.__nullable = self.number_of_empty_values > 0
        return self.__nullable

    @property
    def name(self):
        return self.column_name

    def get_column_repr(self):
        return dict(name=self.name,
                    size=self.size,
                    data_type=str(self.data_type),
                    has_values=self.has_values,
                    is_unique=self.is_unique,
                    max_value=str(self.max_value),
                    min_value=str(self.min_value),
                    number_of_unique_values=self.number_of_unique_values,
                    number_of_filled_values=self.number_of_filled_values,
                    number_of_empty_values=self.number_of_empty_values,
                    average_size=self.average_value)

    def generic_datatype(self) -> DataTypeGroup:
        if self.data_type in ('Utf8', 'VARCHAR', 'CHAR', 'NVARCHAR', 'String'):
            return 'str'
        elif self.data_type in ('fixed_decimal', 'decimal', 'float', 'integer', 'boolean', 'double', 'Int16', 'Int32',
                                'Int64', 'Float32', 'Float64', 'Decimal', 'Binary', 'Boolean', 'UInt8', 'UInt16',
                                'UInt32', 'UInt64'):
            return 'numeric'
        elif self.data_type in ('datetime', 'date', 'Date', 'Datetime', 'Time'):
            return 'date'

    def get_polars_type(self) -> PlType:
        if hasattr(datatypes, self.data_type):
            pl_datatype = getattr(datatypes, self.data_type)
        else:
            pl_datatype = None

        return PlType(pl_datatype=pl_datatype, **self.__dict__)

    def update_type_from_polars_type(self, pl_type: PlType):
        self.data_type = str(pl_type.pl_datatype.base_type())


def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
    return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
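For orientation, a minimal usage sketch of the FlowfileColumn API above; the column name, type, and statistics are invented, and it assumes flowfile_core and its dependencies are importable.

# Sketch: build a column description from a name/type pair and inspect it.
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn

city = FlowfileColumn.from_input('city', 'str', examples='Amsterdam',
                                 n_unique=40, count=100, null_count=0)
print(city.generic_datatype())  # 'str'
print(city.nullable)            # False: no empty values recorded
print(city.get_column_repr())   # dict with size, uniqueness, min/max, ...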
flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py
@@ -0,0 +1,24 @@
from typing import Optional, Any
from pydantic import BaseModel


class ColumnInfo:
    pass


class PlType(BaseModel):
    column_name: str
    col_index: int = -1
    count: Optional[int] = -1
    null_count: Optional[int] = -1
    mean: Optional[str] = ""
    std: Optional[float] = -1
    min: Optional[str] = ""
    max: Optional[str] = ""
    median: Optional[str] = 0
    pl_datatype: Optional[Any]
    n_unique: Optional[int] = -1
    examples: Optional[str] = ""

    class Config:
        arbitrary_types_allowed = True
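PlType is a permissive Pydantic container for per-column statistics; arbitrary_types_allowed lets a live Polars dtype ride along. A minimal sketch with invented values:

# Sketch: constructing a PlType directly (values invented).
import polars as pl
from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType

stats = PlType(column_name='price', pl_datatype=pl.Float64, count=1000,
               null_count=25, min='0.99', max='199.99', n_unique=412)
print(stats.column_name, stats.null_count)  # price 25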
flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py
@@ -0,0 +1,36 @@
import polars as pl


dtype_to_pl = {
    'int': pl.Int64,
    'integer': pl.Int64,
    'char': pl.String,
    'fixed decimal': pl.Float32,
    'double': pl.Float64,
    'float': pl.Float64,
    'bool': pl.Boolean,
    'byte': pl.UInt8,
    'bit': pl.Binary,
    'date': pl.Date,
    'datetime': pl.Datetime,
    'string': pl.String,
    'str': pl.String,
    'time': pl.Time,
}

dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}


def type_to_polars(dtype: str):
    pl_datatype = dtype_to_pl.get(dtype.lower())
    if pl_datatype is not None:
        return pl_datatype
    elif hasattr(pl, dtype):
        return getattr(pl, dtype)
    else:
        return pl.String


def type_to_polars_str(dtype: str) -> pl.DataType:
    return type_to_polars(dtype)()
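The lookup above tries the lowercase alias table first, then attribute access on the polars module, and finally falls back to String. A short sketch of that expected behaviour, assuming the package is importable:

# Sketch: the three lookup paths of type_to_polars.
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars

print(type_to_polars('INTEGER'))  # pl.Int64 via the lowercase alias table
print(type_to_polars('Int32'))    # pl.Int32 via getattr(pl, 'Int32')
print(type_to_polars('mystery'))  # pl.String, the catch-all default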
flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py: File without changes
flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py
@@ -0,0 +1,38 @@
from flowfile_core.schemas.transform_schema import FuzzyMatchInput
from flowfile_core.flowfile.flow_data_engine.join import verify_join_select_integrity, verify_join_map_integrity
import polars as pl
from typing import TYPE_CHECKING, Tuple

if TYPE_CHECKING:
    from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine


def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
                            fuzzy_match_input: FuzzyMatchInput) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
    """
    Prepare two FlowDataEngines for fuzzy matching.

    Args:
        left: Left FlowDataEngine for fuzzy join
        right: Right FlowDataEngine for fuzzy join
        fuzzy_match_input: Parameters for fuzzy matching configuration
    Returns:
        Tuple[pl.LazyFrame, pl.LazyFrame]: Prepared left and right lazy frames
    """

    left.lazy = True
    right.lazy = True
    verify_join_select_integrity(fuzzy_match_input, left_columns=left.columns, right_columns=right.columns)
    if not verify_join_map_integrity(fuzzy_match_input, left_columns=left.schema, right_columns=right.schema):
        raise Exception('Join is not valid for the given data fields')
    fuzzy_match_input.auto_rename()
    right_select = [v.old_name for v in fuzzy_match_input.right_select.renames if
                    (v.keep or v.join_key) and v.is_available]
    left_select = [v.old_name for v in fuzzy_match_input.left_select.renames if
                   (v.keep or v.join_key) and v.is_available]
    left_df: pl.LazyFrame | pl.DataFrame = left.data_frame.select(left_select).rename(
        fuzzy_match_input.left_select.rename_table)
    right_df: pl.LazyFrame | pl.DataFrame = right.data_frame.select(right_select).rename(
        fuzzy_match_input.right_select.rename_table)
    return left_df, right_df
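The heart of the preparation step is a plain select-then-rename on each lazy frame. A standalone Polars sketch of that pattern, with invented column names and data:

# Sketch: select the kept/join-key columns, then apply the rename table.
import polars as pl

left = pl.LazyFrame({'company_name': ['Acme Inc', 'Globex'], 'country': ['US', 'US']})
left_prepared = left.select(['company_name']).rename({'company_name': 'name_left'})
print(left_prepared.collect())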
flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py
@@ -0,0 +1,90 @@

from typing import List
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
from flowfile_core.schemas import transform_schema
from flowfile_core.schemas import input_schema
from polars import datatypes
import polars as pl
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import fetch_unique_values
from flowfile_core.configs.flow_logger import main_logger


def calculate_uniqueness(a: float, b: float) -> float:
    return ((pow(a + 0.5, 2) + pow(b + 0.5, 2)) / 2 - pow(0.5, 2)) + 0.5 * abs(a - b)


def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
                                 left_schema: List[FlowfileColumn],
                                 right_schema: List[FlowfileColumn]):
    print('calculating fuzzy match schema')
    left_schema_dict, right_schema_dict = ({ls.name: ls for ls in left_schema}, {rs.name: rs for rs in right_schema})
    fm_input.auto_rename()

    output_schema = []
    for column in fm_input.left_select.renames:
        column_schema = left_schema_dict.get(column.old_name)
        if column_schema and column.keep:
            output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
                                                           example_values=column_schema.example_values))
    for column in fm_input.right_select.renames:
        column_schema = right_schema_dict.get(column.old_name)
        if column_schema and column.keep:
            output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
                                                           example_values=column_schema.example_values))

    for i, fm in enumerate(fm_input.join_mappings):
        output_schema.append(FlowfileColumn.from_input(f'fuzzy_score_{i}', 'Float64'))
    return output_schema


def get_schema_of_column(node_input_schema: List[FlowfileColumn], col_name: str) -> FlowfileColumn | None:
    for s in node_input_schema:
        if s.name == col_name:
            return s


class InvalidSetup(ValueError):
    """Error raised when pivot column has too many unique values."""
    pass


def get_output_data_type_pivot(schema: FlowfileColumn, agg_type: str) -> datatypes:
    if agg_type in ('count', 'n_unique'):
        output_type = datatypes.Float64  # count is always float
    elif schema.generic_datatype() == 'numeric':
        output_type = datatypes.Float64
    elif schema.generic_datatype() == 'str':
        output_type = datatypes.Utf8
    elif schema.generic_datatype() == 'date':
        output_type = datatypes.Datetime
    else:
        output_type = datatypes.Utf8
    return output_type


def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
                               pivot_input: transform_schema.PivotInput,
                               output_fields: List[input_schema.MinimalFieldInfo] = None,
                               input_lf: pl.LazyFrame = None) -> List[FlowfileColumn]:
    index_columns_schema = [get_schema_of_column(node_input_schema, index_col) for index_col in
                            pivot_input.index_columns]
    val_column_schema = get_schema_of_column(node_input_schema, pivot_input.value_col)
    if output_fields is not None and len(output_fields) > 0:
        return index_columns_schema + [FlowfileColumn(PlType(column_name=output_field.name,
                                                             pl_datatype=output_field.data_type))
                                       for output_field in output_fields]
    else:
        max_unique_vals = 200
        unique_vals = fetch_unique_values(input_lf.select(pivot_input.pivot_column)
                                          .unique()
                                          .sort(pivot_input.pivot_column)
                                          .limit(max_unique_vals).cast(pl.String))
        if len(unique_vals) >= max_unique_vals:
            main_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
                                f' Max unique values: {max_unique_vals}')
        pl_output_fields = []
        for val in unique_vals:
            for agg in pivot_input.aggregations:
                output_type = get_output_data_type_pivot(val_column_schema, agg)
                pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
        return index_columns_schema + [FlowfileColumn(pl_output_field) for pl_output_field in pl_output_fields]
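When no explicit output fields are given, the schema is predicted by enumerating up to 200 unique pivot values and crossing them with the aggregations, producing names of the form value_agg. A small sketch of that naming rule with invented inputs:

# Sketch: the '{value}_{agg}' naming rule for predicted pivot columns.
unique_vals = ['NL', 'US']      # stand-in for fetch_unique_values(...)
aggregations = ['sum', 'count']
predicted = [f'{val}_{agg}' for val in unique_vals for agg in aggregations]
print(predicted)  # ['NL_sum', 'NL_count', 'US_sum', 'US_count']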
flowfile_core/flowfile/flow_data_engine/join/__init__.py
@@ -0,0 +1 @@
from flowfile_core.flowfile.flow_data_engine.join.verify_integrity import *
flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py
@@ -0,0 +1,54 @@

from typing import List
from flowfile_core.schemas import transform_schema
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn


def verify_join_select_integrity(join_input: transform_schema.JoinInput | transform_schema.CrossJoinInput,
                                 left_columns: List[str],
                                 right_columns: List[str]):
    """
    Verify column availability for join selection and update availability flags.

    Args:
        join_input: Join configuration input containing column selections
        left_columns: List of available column names in left table
        right_columns: List of available column names in right table
    """
    for c in join_input.left_select.renames:
        if c.old_name not in left_columns:
            c.is_available = False
        else:
            c.is_available = True
    for c in join_input.right_select.renames:
        if c.old_name not in right_columns:
            c.is_available = False
        else:
            c.is_available = True


def verify_join_map_integrity(join_input: transform_schema.JoinInput,
                              left_columns: List[FlowfileColumn],
                              right_columns: List[FlowfileColumn]
                              ):
    """
    Verify data type compatibility for join mappings between tables.

    Args:
        join_input: Join configuration with mappings between columns
        left_columns: Schema columns from left table
        right_columns: Schema columns from right table
    Returns:
        bool: True if join mapping is valid, False otherwise
    """
    join_mappings = join_input.join_mapping
    left_column_dict = {lc.name: lc for lc in left_columns}
    right_column_dict = {rc.name: rc for rc in right_columns}
    for join_mapping in join_mappings:
        left_column_info: FlowfileColumn | None = left_column_dict.get(join_mapping.left_col)
        right_column_info: FlowfileColumn | None = right_column_dict.get(join_mapping.right_col)
        if not left_column_info or not right_column_info:
            return False
        if left_column_info.generic_datatype() != right_column_info.generic_datatype():
            return False
    return True
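The mapping check reduces to comparing generic_datatype() on both sides of each join key. A minimal sketch of that rule using FlowfileColumn.from_input, with invented columns:

# Sketch: the type-compatibility rule behind verify_join_map_integrity.
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn

left_col = FlowfileColumn.from_input('customer_id', 'Int64')
right_col = FlowfileColumn.from_input('customer_id', 'String')
print(left_col.generic_datatype() == right_col.generic_datatype())  # False: int vs str keys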
flowfile_core/flowfile/flow_data_engine/pivot_table.py
@@ -0,0 +1,20 @@
import polars as pl
from polars.expr import Expr
from dataclasses import dataclass
from typing import List


@dataclass
class AggFunc:
    __slots__ = ['func_name', 'func_expr']
    func_name: str
    func_expr: Expr


AggFuncs = List[AggFunc]

agg_funcs = ['sum', 'max', 'min', 'count', 'first', 'last', 'std', 'var', 'n_unique', 'list', 'list_agg']
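AggFunc pairs an aggregation name with a ready-made Polars expression. A sketch of building a few, where the target column 'sales' is invented:

# Sketch: pairing supported aggregation names with Polars expressions.
import polars as pl
from flowfile_core.flowfile.flow_data_engine.pivot_table import AggFunc

funcs = [AggFunc('sum', pl.col('sales').sum()),
         AggFunc('max', pl.col('sales').max()),
         AggFunc('n_unique', pl.col('sales').n_unique())]
print([f.func_name for f in funcs])  # ['sum', 'max', 'n_unique']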
flowfile_core/flowfile/flow_data_engine/polars_code_parser.py
@@ -0,0 +1,249 @@
import polars as pl
from typing import Dict, Any, Callable
import textwrap
import ast
import time


def remove_comments_and_docstrings(source: str) -> str:
    """
    Remove comments and docstrings from Python source code.

    Args:
        source: Python source code as string

    Returns:
        Cleaned Python source code
    """
    if not source.strip():
        return ""

    def remove_comments_from_line(line: str) -> str:
        """Remove comments while preserving string literals."""
        result = []
        i = 0
        in_string = False
        string_char = None

        while i < len(line):
            char = line[i]

            # Handle string boundaries
            if char in ('"', "'"):
                # Check for escaped quotes
                if i > 0 and line[i - 1] == '\\':
                    result.append(char)
                    i += 1
                    continue

                if not in_string:
                    # Check if it's the start of a string
                    in_string = True
                    string_char = char
                elif string_char == char:
                    # Check if it's the end of a string
                    in_string = False
                    string_char = None

            # Only process comment characters outside strings
            elif char == '#' and not in_string:
                break

            result.append(char)
            i += 1

        return ''.join(result).rstrip()

    # First pass: handle comments
    lines = [remove_comments_from_line(line) for line in source.splitlines()]
    source = '\n'.join(line for line in lines if line.strip())

    # Second pass: handle docstrings using AST
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return source

    class DocstringRemover(ast.NodeTransformer):
        def visit_Module(self, node):
            # Remove module-level docstrings
            while (node.body and isinstance(node.body[0], ast.Expr)
                   and isinstance(node.body[0].value, ast.Constant)
                   and isinstance(node.body[0].value.value, str)):
                node.body.pop(0)
            return self.generic_visit(node)

        def visit_FunctionDef(self, node):
            # Remove function docstrings
            if (node.body and isinstance(node.body[0], ast.Expr)
                    and isinstance(node.body[0].value, ast.Constant)
                    and isinstance(node.body[0].value.value, str)):
                node.body.pop(0)
            return self.generic_visit(node)

        def visit_ClassDef(self, node):
            # Remove class docstrings
            if (node.body and isinstance(node.body[0], ast.Expr)
                    and isinstance(node.body[0].value, ast.Constant)
                    and isinstance(node.body[0].value.value, str)):
                node.body.pop(0)
            return self.generic_visit(node)

        def visit_AsyncFunctionDef(self, node):
            # Remove async function docstrings
            if (node.body and isinstance(node.body[0], ast.Expr)
                    and isinstance(node.body[0].value, ast.Constant)
                    and isinstance(node.body[0].value.value, str)):
                node.body.pop(0)
            return self.generic_visit(node)

        def visit_Expr(self, node):
            # Remove standalone string literals
            if isinstance(node.value, (ast.Str, ast.Constant)) and isinstance(getattr(node.value, 'value', None), str):
                return None
            return self.generic_visit(node)

    try:
        tree = DocstringRemover().visit(tree)
        ast.fix_missing_locations(tree)
        result = ast.unparse(tree)
        # Remove empty lines
        return '\n'.join(line for line in result.splitlines() if line.strip())
    except Exception:
        return source


class PolarsCodeParser:
    """
    Securely executes Polars code with restricted access to Python functionality.
    Supports multiple input DataFrames or no input DataFrames.
    """

    def __init__(self):
        self.safe_globals = {
            # Polars functionality
            'pl': pl,
            'col': pl.col,
            'lit': pl.lit,
            'expr': pl.expr,
            # Basic Python built-ins
            'print': print,
            'len': len,
            'range': range,
            'enumerate': enumerate,
            'zip': zip,
            'list': list,
            'dict': dict,
            'set': set,
            'str': str,
            'int': int,
            'float': float,
            'bool': bool,
            'True': True,
            'False': False,
            'None': None,
            'time': time
        }

    @staticmethod
    def _validate_code(code: str) -> None:
        """
        Validate code for security concerns before execution.
        """
        try:
            tree = ast.parse(code)
            for node in ast.walk(tree):
                # Block imports
                if isinstance(node, (ast.Import, ast.ImportFrom)):
                    raise ValueError("Import statements are not allowed")

                # Block exec/eval
                if isinstance(node, ast.Call):
                    if isinstance(node.func, ast.Name):
                        if node.func.id in {'exec', 'eval', 'compile', '__import__'}:
                            raise ValueError(f"Function '{node.func.id}' is not allowed")

                # Block access to system attributes
                if isinstance(node, ast.Attribute):
                    if node.attr.startswith('__'):
                        raise ValueError(f"Access to '{node.attr}' is not allowed")

        except SyntaxError as e:
            raise ValueError(f"Invalid Python syntax: {str(e)}")

    def _wrap_in_function(self, code: str, num_inputs: int = 1) -> str:
        """
        Wraps code in a function definition that can accept multiple input DataFrames or none.

        Args:
            code: The code to wrap
            num_inputs: Number of expected input DataFrames (0 for none)

        Returns:
            Wrapped code as a function
        """
        # Dedent the code first to handle various indentation styles
        code = textwrap.dedent(code).strip()

        # Create appropriate function signature based on number of inputs
        if num_inputs == 0:
            function_def = "def _transform():\n"
        elif num_inputs == 1:
            function_def = "def _transform(input_df):\n"
        else:
            params = ", ".join([f"input_df_{i+1}" for i in range(num_inputs)])
            function_def = f"def _transform({params}):\n"

        # Handle single line expressions
        if '\n' not in code:
            # For expressions that should return directly
            if any(code.startswith(prefix) for prefix in ['pl.', 'col(', 'input_df', 'expr(']):
                return function_def + f"    return {code}"
            # For assignments
            else:
                return function_def + f"    {code}\n    return output_df"

        # For multi-line code
        indented_code = '\n'.join(f"    {line}" for line in code.split('\n'))
        return function_def + indented_code + '\n    return output_df'

    def get_executable(self, code: str, num_inputs: int = 1) -> Callable:
        """
        Securely get a function that can be executed with multiple DataFrames or none.

        Args:
            code: The code to execute
            num_inputs: Number of expected input DataFrames (0 for none)

        Returns:
            Callable: A function that takes the specified number of DataFrames
        """
        # Validate and clean the code
        code = remove_comments_and_docstrings(code)
        code = textwrap.dedent(code).strip()
        self._validate_code(code)

        # Wrap the code in a function
        wrapped_code = self._wrap_in_function(code, num_inputs)

        try:
            # Create namespace for execution
            local_namespace: Dict[str, Any] = {}

            exec(wrapped_code, self.safe_globals, local_namespace)

            transform_func = local_namespace['_transform']
            return transform_func
        except Exception as e:
            raise ValueError(f"Error executing code: {str(e)}")

    def validate_code(self, code: str):
        """
        Validate code for security concerns before execution
        """
        code = remove_comments_and_docstrings(code)
        code = textwrap.dedent(code).strip()
        self._validate_code(code)


polars_code_parser = PolarsCodeParser()
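To illustrate the wrapping rules: a single-line expression starting with input_df is returned directly, while other code is expected to assign output_df, which the wrapper returns. A minimal usage sketch with invented data:

# Sketch: executing user-supplied Polars code through the module-level parser.
import polars as pl
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser

df = pl.DataFrame({'a': [1, 2, 3]})

double = polars_code_parser.get_executable("input_df.with_columns(pl.col('a') * 2)")
print(double(df))  # expression form: wrapped as `return input_df...`

filtered = polars_code_parser.get_executable("output_df = input_df.filter(pl.col('a') > 1)")
print(filtered(df))  # assignment form: wrapper appends `return output_df`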