Flowfile 0.2.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py
@@ -0,0 +1,144 @@
+ from dataclasses import dataclass
+ from typing import Optional, Any, List, Dict, Literal
+ from flowfile_core.schemas import input_schema
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
+ from polars import datatypes
+ import polars as pl
+
+ DataTypeGroup = Literal['numeric', 'str', 'date']
+
+
+ @dataclass
+ class FlowfileColumn:
+     column_name: str
+     data_type: str
+     size: int
+     max_value: str
+     min_value: str
+     col_index: int
+     number_of_empty_values: int
+     number_of_unique_values: int
+     example_values: str
+     __sql_type: Optional[Any]
+     __is_unique: Optional[bool]
+     __nullable: Optional[bool]
+     __has_values: Optional[bool]
+     average_value: Optional[str]
+     __perc_unique: Optional[float]
+
+     def __init__(self, polars_type: PlType):
+         self.data_type = str(polars_type.pl_datatype.base_type())
+         self.size = polars_type.count - polars_type.null_count
+         self.max_value = polars_type.max
+         self.min_value = polars_type.min
+         self.number_of_unique_values = polars_type.n_unique
+         self.number_of_empty_values = polars_type.null_count
+         self.example_values = polars_type.examples
+         self.column_name = polars_type.column_name
+         self.average_value = polars_type.mean
+         self.col_index = polars_type.col_index
+         self.__has_values = None
+         self.__nullable = None
+         self.__is_unique = None
+         self.__sql_type = None
+         self.__perc_unique = None
+
+     @classmethod
+     def create_from_polars_type(cls, polars_type: PlType, **kwargs) -> "FlowfileColumn":
+         for k, v in kwargs.items():
+             if hasattr(polars_type, k):
+                 setattr(polars_type, k, v)
+         return cls(polars_type)
+
+     @classmethod
+     def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
+         pl_type = type_to_polars_str(data_type)
+         if pl_type is not None:
+             data_type = pl_type
+         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
+
+     @classmethod
+     def create_from_polars_dtype(cls, column_name: str, data_type: pl.DataType, **kwargs):
+         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
+
+     def get_minimal_field_info(self) -> input_schema.MinimalFieldInfo:
+         return input_schema.MinimalFieldInfo(name=self.column_name, data_type=self.data_type)
+
+     @classmethod
+     def create_from_minimal_field_info(cls, minimal_field_info: input_schema.MinimalFieldInfo) -> "FlowfileColumn":
+         return cls.from_input(column_name=minimal_field_info.name,
+                               data_type=minimal_field_info.data_type)
+
+     @property
+     def is_unique(self) -> bool:
+         if self.__is_unique is None:
+             if self.has_values:
+                 self.__is_unique = self.number_of_unique_values == self.number_of_filled_values
+             else:
+                 self.__is_unique = False
+         return self.__is_unique
+
+     @property
+     def perc_unique(self) -> float:
+         if self.__perc_unique is None:
+             self.__perc_unique = self.number_of_unique_values / self.number_of_filled_values
+         return self.__perc_unique
+
+     @property
+     def has_values(self) -> bool:
+         if not self.__has_values:
+             self.__has_values = self.number_of_unique_values > 0
+         return self.__has_values
+
+     @property
+     def number_of_filled_values(self):
+         return self.size
+
+     @property
+     def nullable(self):
+         if self.__nullable is None:
+             self.__nullable = self.number_of_empty_values > 0
+         return self.__nullable
+
+     @property
+     def name(self):
+         return self.column_name
+
+     def get_column_repr(self):
+         return dict(name=self.name,
+                     size=self.size,
+                     data_type=str(self.data_type),
+                     has_values=self.has_values,
+                     is_unique=self.is_unique,
+                     max_value=str(self.max_value),
+                     min_value=str(self.min_value),
+                     number_of_unique_values=self.number_of_unique_values,
+                     number_of_filled_values=self.number_of_filled_values,
+                     number_of_empty_values=self.number_of_empty_values,
+                     average_size=self.average_value)
+
+     def generic_datatype(self) -> DataTypeGroup:
+         if self.data_type in ('Utf8', 'VARCHAR', 'CHAR', 'NVARCHAR', 'String'):
+             return 'str'
+         elif self.data_type in ('fixed_decimal', 'decimal', 'float', 'integer', 'boolean', 'double', 'Int16', 'Int32',
+                                 'Int64', 'Float32', 'Float64', 'Decimal', 'Binary', 'Boolean', 'Uint8', 'Uint16',
+                                 'Uint32', 'Uint64'):
+             return 'numeric'
+         elif self.data_type in ('datetime', 'date', 'Date', 'Datetime', 'Time'):
+             return 'date'
+
+     def get_polars_type(self) -> PlType:
+         if hasattr(datatypes, self.data_type):
+             pl_datatype = getattr(datatypes, self.data_type)
+         else:
+             pl_datatype = None
+
+         return PlType(pl_datatype=pl_datatype, **self.__dict__)
+
+     def update_type_from_polars_type(self, pl_type: PlType):
+         self.data_type = str(pl_type.pl_datatype.base_type())
+
+
+ def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
+     return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
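For orientation, here is how the schema class above is typically constructed; a minimal sketch assuming only this file's imports (the column name and dtype string are invented):

    col = FlowfileColumn.from_input('city', 'string')
    col.data_type                 # 'String', normalized through type_to_polars_str
    col.generic_datatype()        # 'str'
    col.get_minimal_field_info()  # MinimalFieldInfo(name='city', data_type='String')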
flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py
@@ -0,0 +1,24 @@
+ from typing import Optional, Any
+ from pydantic import BaseModel
+
+
+ class ColumnInfo:
+     pass
+
+
+ class PlType(BaseModel):
+     column_name: str
+     col_index: int = -1
+     count: Optional[int] = -1
+     null_count: Optional[int] = -1
+     mean: Optional[str] = ""
+     std: Optional[float] = -1
+     min: Optional[str] = ""
+     max: Optional[str] = ""
+     median: Optional[str] = 0
+     pl_datatype: Optional[Any]
+     n_unique: Optional[int] = -1
+     examples: Optional[str] = ""
+
+     class Config:
+         arbitrary_types_allowed = True
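PlType is the raw-statistics carrier that FlowfileColumn consumes. A sketch of the round trip through convert_stats_to_column_info from the previous file, with invented stat values:

    import polars as pl

    stats = [dict(column_name='age', pl_datatype=pl.Int64, count=100, null_count=2, n_unique=37)]
    columns = convert_stats_to_column_info(stats)
    columns[0].size  # 98, i.e. count minus null_count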
flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py
@@ -0,0 +1,36 @@
+ import polars as pl
+
+
+ dtype_to_pl = {
+     'int': pl.Int64,
+     'integer': pl.Int64,
+     'char': pl.String,
+     'fixed decimal': pl.Float32,
+     'double': pl.Float64,
+     'float': pl.Float64,
+     'bool': pl.Boolean,
+     'byte': pl.UInt8,
+     'bit': pl.Binary,
+     'date': pl.Date,
+     'datetime': pl.Datetime,
+     'string': pl.String,
+     'str': pl.String,
+     'time': pl.Time,
+ }
+
+ dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
+
+
+ def type_to_polars(dtype: str):
+     pl_datatype = dtype_to_pl.get(dtype.lower())
+     if pl_datatype is not None:
+         return pl_datatype
+     elif hasattr(pl, dtype):
+         return getattr(pl, dtype)
+     else:
+         return pl.String
+
+
+ def type_to_polars_str(dtype: str) -> pl.DataType:
+     return type_to_polars(dtype)()
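The lookup falls back in three steps: the table above, then Polars' own attribute names, then pl.String. A quick sketch (input strings invented):

    type_to_polars('double')   # pl.Float64, via the lookup table
    type_to_polars('Utf8')     # resolved through getattr(pl, 'Utf8')
    type_to_polars('mystery')  # unknown names fall back to pl.String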
flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py
@@ -0,0 +1,38 @@
+ from flowfile_core.schemas.transform_schema import FuzzyMatchInput
+ from flowfile_core.flowfile.flow_data_engine.join import verify_join_select_integrity, verify_join_map_integrity
+ import polars as pl
+ from typing import TYPE_CHECKING, Tuple
+
+ if TYPE_CHECKING:
+     from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
+
+
+ def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
+                             fuzzy_match_input: FuzzyMatchInput) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
+     """
+     Prepare two FlowDataEngines for fuzzy matching.
+
+     Args:
+         left: Left FlowDataEngine for the fuzzy join
+         right: Right FlowDataEngine for the fuzzy join
+         fuzzy_match_input: Parameters for the fuzzy matching configuration
+
+     Returns:
+         Tuple[pl.LazyFrame, pl.LazyFrame]: Prepared left and right lazy frames
+     """
+     left.lazy = True
+     right.lazy = True
+     verify_join_select_integrity(fuzzy_match_input, left_columns=left.columns, right_columns=right.columns)
+     if not verify_join_map_integrity(fuzzy_match_input, left_columns=left.schema, right_columns=right.schema):
+         raise Exception('Join is not valid for the given data fields')
+     fuzzy_match_input.auto_rename()
+     right_select = [v.old_name for v in fuzzy_match_input.right_select.renames if
+                     (v.keep or v.join_key) and v.is_available]
+     left_select = [v.old_name for v in fuzzy_match_input.left_select.renames if
+                    (v.keep or v.join_key) and v.is_available]
+     left_df: pl.LazyFrame | pl.DataFrame = left.data_frame.select(left_select).rename(
+         fuzzy_match_input.left_select.rename_table)
+     right_df: pl.LazyFrame | pl.DataFrame = right.data_frame.select(right_select).rename(
+         fuzzy_match_input.right_select.rename_table)
+     return left_df, right_df
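The final select-then-rename step is plain Polars; a standalone sketch of that step with an invented frame and rename table:

    import polars as pl

    lf = pl.LazyFrame({'name': ['Acme Inc', 'Acme'], 'city': ['NYC', 'NY']})
    prepared = lf.select(['name']).rename({'name': 'name_left'})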
flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py
@@ -0,0 +1,90 @@
+ from typing import List
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
+ from flowfile_core.schemas import transform_schema
+ from flowfile_core.schemas import input_schema
+ from polars import datatypes
+ import polars as pl
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import fetch_unique_values
+ from flowfile_core.configs.flow_logger import main_logger
+
+
+ def calculate_uniqueness(a: float, b: float) -> float:
+     return ((pow(a + 0.5, 2) + pow(b + 0.5, 2)) / 2 - pow(0.5, 2)) + 0.5 * abs(a - b)
+
+
+ def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
+                                  left_schema: List[FlowfileColumn],
+                                  right_schema: List[FlowfileColumn]):
+     main_logger.info('Calculating fuzzy match schema')
+     left_schema_dict, right_schema_dict = ({ls.name: ls for ls in left_schema}, {rs.name: rs for rs in right_schema})
+     fm_input.auto_rename()
+
+     output_schema = []
+     for column in fm_input.left_select.renames:
+         column_schema = left_schema_dict.get(column.old_name)
+         if column_schema and column.keep:
+             output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
+                                                            example_values=column_schema.example_values))
+     for column in fm_input.right_select.renames:
+         column_schema = right_schema_dict.get(column.old_name)
+         if column_schema and column.keep:
+             output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
+                                                            example_values=column_schema.example_values))
+
+     for i, _ in enumerate(fm_input.join_mappings):
+         output_schema.append(FlowfileColumn.from_input(f'fuzzy_score_{i}', 'Float64'))
+     return output_schema
+
+
+ def get_schema_of_column(node_input_schema: List[FlowfileColumn], col_name: str) -> FlowfileColumn | None:
+     for s in node_input_schema:
+         if s.name == col_name:
+             return s
+
+
+ class InvalidSetup(ValueError):
+     """Error raised when a pivot column has too many unique values."""
+     pass
+
+
+ def get_output_data_type_pivot(schema: FlowfileColumn, agg_type: str) -> pl.DataType:
+     if agg_type in ('count', 'n_unique'):
+         output_type = datatypes.Float64  # counts are always returned as float
+     elif schema.generic_datatype() == 'numeric':
+         output_type = datatypes.Float64
+     elif schema.generic_datatype() == 'str':
+         output_type = datatypes.Utf8
+     elif schema.generic_datatype() == 'date':
+         output_type = datatypes.Datetime
+     else:
+         output_type = datatypes.Utf8
+     return output_type
+
+
+ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
+                                pivot_input: transform_schema.PivotInput,
+                                output_fields: List[input_schema.MinimalFieldInfo] = None,
+                                input_lf: pl.LazyFrame = None) -> List[FlowfileColumn]:
+     index_columns_schema = [get_schema_of_column(node_input_schema, index_col) for index_col in
+                             pivot_input.index_columns]
+     val_column_schema = get_schema_of_column(node_input_schema, pivot_input.value_col)
+     if output_fields is not None and len(output_fields) > 0:
+         return index_columns_schema + [FlowfileColumn(PlType(column_name=output_field.name,
+                                                              pl_datatype=output_field.data_type))
+                                        for output_field in output_fields]
+     else:
+         max_unique_vals = 200
+         unique_vals = fetch_unique_values(input_lf.select(pivot_input.pivot_column)
+                                           .unique()
+                                           .sort(pivot_input.pivot_column)
+                                           .limit(max_unique_vals).cast(pl.String))
+         if len(unique_vals) >= max_unique_vals:
+             main_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
+                                 f' Max unique values: {max_unique_vals}')
+         pl_output_fields = []
+         for val in unique_vals:
+             for agg in pivot_input.aggregations:
+                 output_type = get_output_data_type_pivot(val_column_schema, agg)
+                 pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
+         return index_columns_schema + [FlowfileColumn(pl_output_field) for pl_output_field in pl_output_fields]
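A quick sketch of the pivot output-type rule above, using FlowfileColumn.from_input from this package (the column name is invented):

    col = FlowfileColumn.from_input('amount', 'double')
    get_output_data_type_pivot(col, 'sum')    # datatypes.Float64: numeric input stays numeric
    get_output_data_type_pivot(col, 'count')  # datatypes.Float64: counts are always float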
flowfile_core/flowfile/flow_data_engine/join/__init__.py
@@ -0,0 +1 @@
+ from flowfile_core.flowfile.flow_data_engine.join.verify_integrity import *
flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py
@@ -0,0 +1,54 @@
+ from typing import List
+ from flowfile_core.schemas import transform_schema
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
+
+
+ def verify_join_select_integrity(join_input: transform_schema.JoinInput | transform_schema.CrossJoinInput,
+                                  left_columns: List[str],
+                                  right_columns: List[str]):
+     """
+     Verify column availability for the join selection and update the availability flags in place.
+
+     Args:
+         join_input: Join configuration containing the column selections
+         left_columns: Available column names in the left table
+         right_columns: Available column names in the right table
+     """
+     for c in join_input.left_select.renames:
+         c.is_available = c.old_name in left_columns
+     for c in join_input.right_select.renames:
+         c.is_available = c.old_name in right_columns
+
+
+ def verify_join_map_integrity(join_input: transform_schema.JoinInput,
+                               left_columns: List[FlowfileColumn],
+                               right_columns: List[FlowfileColumn]) -> bool:
+     """
+     Verify data type compatibility for the join mappings between two tables.
+
+     Args:
+         join_input: Join configuration with mappings between columns
+         left_columns: Schema columns from the left table
+         right_columns: Schema columns from the right table
+
+     Returns:
+         bool: True if the join mapping is valid, False otherwise
+     """
+     join_mappings = join_input.join_mapping
+     left_column_dict = {lc.name: lc for lc in left_columns}
+     right_column_dict = {rc.name: rc for rc in right_columns}
+     for join_mapping in join_mappings:
+         left_column_info: FlowfileColumn | None = left_column_dict.get(join_mapping.left_col)
+         right_column_info: FlowfileColumn | None = right_column_dict.get(join_mapping.right_col)
+         if not left_column_info or not right_column_info:
+             return False
+         if left_column_info.generic_datatype() != right_column_info.generic_datatype():
+             return False
+     return True
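To illustrate the type-compatibility check, a sketch that uses types.SimpleNamespace as a hypothetical stand-in for the transform_schema.JoinInput object (the real schema exposes the same join_mapping attribute):

    from types import SimpleNamespace

    left = [FlowfileColumn.from_input('id', 'Int64')]    # generic type: numeric
    right = [FlowfileColumn.from_input('id', 'string')]  # generic type: str
    join_input = SimpleNamespace(join_mapping=[SimpleNamespace(left_col='id', right_col='id')])
    verify_join_map_integrity(join_input, left, right)   # False: numeric vs str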
flowfile_core/flowfile/flow_data_engine/pivot_table.py
@@ -0,0 +1,20 @@
+ from polars.expr import Expr
+ from dataclasses import dataclass
+ from typing import List
+
+
+ @dataclass
+ class AggFunc:
+     __slots__ = ['func_name', 'func_expr']
+     func_name: str
+     func_expr: Expr
+
+
+ AggFuncs = List[AggFunc]
+
+ agg_funcs = ['sum', 'max', 'min', 'count', 'first', 'last', 'std', 'var', 'n_unique', 'list', 'list_agg']
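A sketch of pairing an aggregation name with its Polars expression (the column name is invented):

    import polars as pl

    agg = AggFunc(func_name='sum', func_expr=pl.col('amount').sum())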
flowfile_core/flowfile/flow_data_engine/polars_code_parser.py
@@ -0,0 +1,249 @@
+ import polars as pl
+ from typing import Dict, Any, Callable
+ import textwrap
+ import ast
+ import time
+
+
+ def remove_comments_and_docstrings(source: str) -> str:
+     """
+     Remove comments and docstrings from Python source code.
+
+     Args:
+         source: Python source code as a string
+
+     Returns:
+         Cleaned Python source code
+     """
+     if not source.strip():
+         return ""
+
+     def remove_comments_from_line(line: str) -> str:
+         """Remove comments while preserving string literals."""
+         result = []
+         i = 0
+         in_string = False
+         string_char = None
+
+         while i < len(line):
+             char = line[i]
+
+             # Handle string boundaries
+             if char in ('"', "'"):
+                 # Check for escaped quotes
+                 if i > 0 and line[i - 1] == '\\':
+                     result.append(char)
+                     i += 1
+                     continue
+
+                 if not in_string:
+                     # Start of a string
+                     in_string = True
+                     string_char = char
+                 elif string_char == char:
+                     # End of the string
+                     in_string = False
+                     string_char = None
+
+             # Only treat '#' as a comment marker outside strings
+             elif char == '#' and not in_string:
+                 break
+
+             result.append(char)
+             i += 1
+
+         return ''.join(result).rstrip()
+
+     # First pass: strip comments
+     lines = [remove_comments_from_line(line) for line in source.splitlines()]
+     source = '\n'.join(line for line in lines if line.strip())
+
+     # Second pass: strip docstrings using the AST
+     try:
+         tree = ast.parse(source)
+     except SyntaxError:
+         return source
+
+     class DocstringRemover(ast.NodeTransformer):
+         def visit_Module(self, node):
+             # Remove module-level docstrings
+             while (node.body and isinstance(node.body[0], ast.Expr)
+                    and isinstance(node.body[0].value, ast.Constant)
+                    and isinstance(node.body[0].value.value, str)):
+                 node.body.pop(0)
+             return self.generic_visit(node)
+
+         def visit_FunctionDef(self, node):
+             # Remove function docstrings
+             if (node.body and isinstance(node.body[0], ast.Expr)
+                     and isinstance(node.body[0].value, ast.Constant)
+                     and isinstance(node.body[0].value.value, str)):
+                 node.body.pop(0)
+             return self.generic_visit(node)
+
+         def visit_ClassDef(self, node):
+             # Remove class docstrings
+             if (node.body and isinstance(node.body[0], ast.Expr)
+                     and isinstance(node.body[0].value, ast.Constant)
+                     and isinstance(node.body[0].value.value, str)):
+                 node.body.pop(0)
+             return self.generic_visit(node)
+
+         def visit_AsyncFunctionDef(self, node):
+             # Remove async function docstrings
+             if (node.body and isinstance(node.body[0], ast.Expr)
+                     and isinstance(node.body[0].value, ast.Constant)
+                     and isinstance(node.body[0].value.value, str)):
+                 node.body.pop(0)
+             return self.generic_visit(node)
+
+         def visit_Expr(self, node):
+             # Remove standalone string literals
+             if isinstance(node.value, ast.Constant) and isinstance(node.value.value, str):
+                 return None
+             return self.generic_visit(node)
+
+     try:
+         tree = DocstringRemover().visit(tree)
+         ast.fix_missing_locations(tree)
+         result = ast.unparse(tree)
+         # Remove empty lines
+         return '\n'.join(line for line in result.splitlines() if line.strip())
+     except Exception:
+         return source
+
+
+ class PolarsCodeParser:
+     """
+     Securely executes Polars code with restricted access to Python functionality.
+     Supports multiple input DataFrames or no input DataFrames.
+     """
+
+     def __init__(self):
+         self.safe_globals = {
+             # Polars functionality
+             'pl': pl,
+             'col': pl.col,
+             'lit': pl.lit,
+             'expr': pl.expr,
+             # Basic Python built-ins
+             'print': print,
+             'len': len,
+             'range': range,
+             'enumerate': enumerate,
+             'zip': zip,
+             'list': list,
+             'dict': dict,
+             'set': set,
+             'str': str,
+             'int': int,
+             'float': float,
+             'bool': bool,
+             'True': True,
+             'False': False,
+             'None': None,
+             'time': time
+         }
+
+     @staticmethod
+     def _validate_code(code: str) -> None:
+         """
+         Validate code for security concerns before execution.
+         """
+         try:
+             tree = ast.parse(code)
+             for node in ast.walk(tree):
+                 # Block imports
+                 if isinstance(node, (ast.Import, ast.ImportFrom)):
+                     raise ValueError("Import statements are not allowed")
+
+                 # Block exec/eval
+                 if isinstance(node, ast.Call):
+                     if isinstance(node.func, ast.Name):
+                         if node.func.id in {'exec', 'eval', 'compile', '__import__'}:
+                             raise ValueError(f"Function '{node.func.id}' is not allowed")
+
+                 # Block access to dunder attributes
+                 if isinstance(node, ast.Attribute):
+                     if node.attr.startswith('__'):
+                         raise ValueError(f"Access to '{node.attr}' is not allowed")
+
+         except SyntaxError as e:
+             raise ValueError(f"Invalid Python syntax: {str(e)}")
+
+     def _wrap_in_function(self, code: str, num_inputs: int = 1) -> str:
+         """
+         Wraps code in a function definition that can accept multiple input DataFrames or none.
+
+         Args:
+             code: The code to wrap
+             num_inputs: Number of expected input DataFrames (0 for none)
+
+         Returns:
+             Wrapped code as a function
+         """
+         # Dedent the code first to handle various indentation styles
+         code = textwrap.dedent(code).strip()
+
+         # Create the function signature based on the number of inputs
+         if num_inputs == 0:
+             function_def = "def _transform():\n"
+         elif num_inputs == 1:
+             function_def = "def _transform(input_df):\n"
+         else:
+             params = ", ".join([f"input_df_{i + 1}" for i in range(num_inputs)])
+             function_def = f"def _transform({params}):\n"
+
+         # Handle single-line expressions
+         if '\n' not in code:
+             # Expressions that should be returned directly
+             if any(code.startswith(prefix) for prefix in ['pl.', 'col(', 'input_df', 'expr(']):
+                 return function_def + f"    return {code}"
+             # Assignments: the code is expected to assign to output_df
+             else:
+                 return function_def + f"    {code}\n    return output_df"
+
+         # Multi-line code: the code is expected to assign to output_df
+         indented_code = '\n'.join(f"    {line}" for line in code.split('\n'))
+         return function_def + indented_code + '\n    return output_df'
+
+     def get_executable(self, code: str, num_inputs: int = 1) -> Callable:
+         """
+         Securely build a function that can be executed with multiple DataFrames or none.
+
+         Args:
+             code: The code to execute
+             num_inputs: Number of expected input DataFrames (0 for none)
+
+         Returns:
+             Callable: A function that takes the specified number of DataFrames
+         """
+         # Validate and clean the code
+         code = remove_comments_and_docstrings(code)
+         code = textwrap.dedent(code).strip()
+         self._validate_code(code)
+
+         # Wrap the code in a function
+         wrapped_code = self._wrap_in_function(code, num_inputs)
+
+         try:
+             # Create a namespace for execution
+             local_namespace: Dict[str, Any] = {}
+
+             exec(wrapped_code, self.safe_globals, local_namespace)
+
+             transform_func = local_namespace['_transform']
+             return transform_func
+         except Exception as e:
+             raise ValueError(f"Error executing code: {str(e)}")
+
+     def validate_code(self, code: str):
+         """
+         Validate code for security concerns before execution.
+         """
+         code = remove_comments_and_docstrings(code)
+         code = textwrap.dedent(code).strip()
+         self._validate_code(code)
+
+
+ polars_code_parser = PolarsCodeParser()
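End to end, the parser validates, wraps, and compiles user code into a callable; a minimal usage sketch with an invented snippet:

    import polars as pl

    transform = polars_code_parser.get_executable("output_df = input_df.filter(pl.col('a') > 1)")
    transform(pl.DataFrame({'a': [1, 2, 3]}))  # returns the two rows where a > 1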