Flowfile 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
from flowfile_core.configs.settings import AVAILABLE_RAM, WORKER_URL
|
|
3
|
+
from flowfile_core.configs import logger
|
|
4
|
+
from flowfile_core.flowfile.flow_data_engine.subprocess_operations import ExternalDfFetcher
|
|
5
|
+
from flowfile_core.flowfile.flow_data_engine.subprocess_operations import Status
|
|
6
|
+
import os
|
|
7
|
+
from typing import List, Dict, Iterable, Callable, Any
|
|
8
|
+
from itertools import chain
|
|
9
|
+
import requests
|
|
10
|
+
from base64 import encodebytes
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def convert_to_string(v):
    """Return ``str(v)``, or ``None`` when the value cannot be rendered as text.

    :param v: any value
    :return: the string form of ``v``; ``None`` if ``str(v)`` raises
    """
    try:
        return str(v)
    except Exception:
        # Best-effort conversion: a failing __str__ yields None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being turned into a None cell value.
        return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def standardize_col_dtype(vals):
    """Make the values of a single column type-consistent.

    Values that all share one Python type, or that consist of exactly ``int``
    and ``float``, are returned unchanged. Any other mix of types is converted
    to strings with ``convert_to_string``.

    :param vals: the values of one column
    :return: ``vals`` when it is already consistent (a one-shot iterable is
        returned as a list), otherwise a list of converted values
    """
    if not isinstance(vals, (list, tuple)):
        # The type scan below would exhaust a one-shot iterable (generator,
        # zip object, ...), so materialise it before inspecting it.
        vals = list(vals)
    types = {type(val) for val in vals}
    # Only a pure int/float mix may stay numeric; a mix that merely contains
    # int and float next to other types (e.g. str) must be stringified too.
    if len(types) == 1 or types == {int, float}:
        return vals
    return [convert_to_string(v) for v in vals]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_data_type(vals: Iterable[Any]):
    """Name the Python type that describes a collection of values.

    :param vals: values to inspect
    :return: the type's ``__name__`` when every value has the same type,
        ``'float'`` for a mix of exactly ``int`` and ``float``, otherwise ``'str'``
    """
    seen = {type(item) for item in vals}
    if len(seen) == 1:
        (only_type,) = seen
        return only_type.__name__
    return 'float' if seen == {int, float} else 'str'
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True):
    """Give every record the same set of keys, filling missing ones with ``None``.

    :param datas: records (dictionaries) that may have differing keys
    :param respect_order: when True the keys appear in first-seen order across
        the records; when False the keys are gathered in a set
    :return: a list of new dictionaries, one per record, all with the same keys
    """
    key_views = (record.keys() for record in datas)
    if respect_order:
        # dict.fromkeys keeps only the first occurrence of each key, in order.
        columns = list(dict.fromkeys(chain.from_iterable(key_views)))
    else:
        columns = set(chain.from_iterable(key_views))
    return [{column: record.get(column) for column in columns} for record in datas]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def calculate_schema(lf: pl.LazyFrame) -> List[Dict]:
    """Fetch schema statistics for a LazyFrame through ``ExternalDfFetcher``.

    Each returned entry's ``'pl_datatype'`` value is replaced, in place, by the
    attribute of ``pl.datatypes`` that carries that name.

    :param lf: lazy frame to describe
    :return: the list of schema-stat dictionaries returned by the fetcher
    """
    fetcher = ExternalDfFetcher(lf=lf, operation_type='calculate_schema', wait_on_completion=False,
                                flow_id=-1, node_id=-1)
    stats: List[Dict] = fetcher.get_result()
    for entry in stats:
        type_name = entry['pl_datatype']
        entry['pl_datatype'] = getattr(pl.datatypes, type_name)
    return stats
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def write_polars_frame(_df: pl.LazyFrame | pl.DataFrame, path: str, data_type: str = 'parquet',
                       estimated_size: int = 0):
    """Write a polars frame to ``path``, streaming to disk when possible.

    A LazyFrame with a positive ``estimated_size`` that scales to less than
    ``AVAILABLE_RAM`` is collected up front. A frame that is still lazy is
    written with its ``sink_<data_type>`` method; if that method is missing or
    fails, the frame is collected and written with ``write_<data_type>``.

    :param _df: frame to write
    :param path: destination file path
    :param data_type: suffix selecting the ``sink_``/``write_`` method
    :param estimated_size: estimated size of the data; 0 skips the memory check
    :return: True when the file was written, False when the final write failed
    """
    is_lazy = isinstance(_df, pl.LazyFrame)
    logger.info('Caching data frame')
    if is_lazy and estimated_size > 0:
        # NOTE(review): the divisor mixes 1024 and 1000 - presumably this scales
        # bytes to the unit AVAILABLE_RAM is expressed in; confirm.
        fit_memory = estimated_size / 1024 / 1000 / 1000 < AVAILABLE_RAM
        if fit_memory:
            _df = _df.collect()
            is_lazy = False

    if is_lazy:
        logger.info("Writing in memory efficient mode")
        # A format without a sink_* method falls through to collect + write_*
        # instead of raising AttributeError.
        sink_method = getattr(_df, 'sink_' + data_type, None)
        if sink_method is not None:
            try:
                sink_method(path)
                return True
            except Exception as e:
                # Keep the best-effort fallback, but leave a trace of why the
                # streaming write was abandoned.
                logger.warning(f'Streaming write failed, falling back to in-memory write: {e}')
        _df = _df.collect()
    try:
        write_method = getattr(_df, 'write_' + data_type)
        write_method(path)
        return True
    except Exception as e:
        logger.error(f'Could not write data frame to {path}: {e}')
        return False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def collect(df: pl.LazyFrame, streamable: bool = True):
    """Collect a LazyFrame, preferring the streaming engine when allowed.

    :param df: lazy frame to materialise
    :param streamable: when True, ``engine="streaming"`` is tried first and
        ``engine="auto"`` is used only if that attempt raises
    :return: the result of ``df.collect``
    :raises Exception: whatever ``df.collect(engine="auto")`` raises
    """
    if streamable:
        try:
            return df.collect(engine="streaming")
        except Exception:
            # Streaming failed; fall through to the default engine.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            pass
    # Reached directly when streaming was not requested, so a failing "auto"
    # collect is executed once rather than being retried with the same engine.
    return df.collect(engine="auto")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def cache_polars_frame_to_temp(_df: pl.LazyFrame | pl.DataFrame, tempdir: str = None) -> pl.LazyFrame:
    """Write a frame to a file in ``tempdir`` and read it back as a LazyFrame.

    :param _df: frame to cache
    :param tempdir: directory for the cache file; the system temporary
        directory is used when None
    :return: a LazyFrame over the data read back from the cache file
    :raises Exception: when the frame could not be written
    """
    import tempfile  # local import: only needed to resolve the default directory

    if tempdir is None:
        # Previously None was interpolated into the path as the text "None".
        tempdir = tempfile.gettempdir()
    # os.path.join rather than a hard-coded '\\', so the file lands inside
    # tempdir on every platform (on Windows the resulting path is unchanged).
    path = os.path.join(tempdir, f'fl_file_{id(_df)}')
    if not write_polars_frame(_df, path):
        raise Exception('Could not cache the data')
    return pl.read_parquet(path).lazy()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def define_pl_col_transformation(col_name: str, col_type: pl.DataType) -> pl.Expr:
    """Build the expression that converts a column to the requested type.

    :param col_name: name of the column to convert
    :param col_type: target polars data type
    :return: a non-strict string-to-datetime parse for ``pl.Datetime``, a
        non-strict string-to-date parse for ``pl.Date``, and a non-strict cast
        for every other type
    """
    column = pl.col(col_name)
    if col_type == pl.Datetime:
        return column.str.to_datetime(strict=False)
    if col_type == pl.Date:
        return column.str.to_date(strict=False)
    return column.cast(col_type, strict=False)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def execute_write_method(write_method: Callable, path: str, data_type: str = None, sheet_name: str = None,
                         delimiter: str = None,
                         write_mode: str = 'create'):
    """Call a frame's write/sink method with the arguments its format needs.

    :param write_method: callable that performs the write
    :param path: destination file path
    :param data_type: ``'excel'``, ``'csv'`` or ``'parquet'``; any other value
        writes nothing and logs a warning
    :param sheet_name: worksheet name, used for excel output only
    :param delimiter: csv separator; a comma is used when None
    :param write_mode: ``'append'`` appends csv output to an existing file;
        other values write to ``path`` directly
    """
    if data_type == 'excel':
        logger.info('Writing as excel file')
        write_method(path, worksheet=sheet_name)
    elif data_type == 'csv':
        logger.info('Writing as csv file')
        # The parameter default (None) is not a usable separator, so fall back
        # to a comma instead of handing None to the writer.
        separator = ',' if delimiter is None else delimiter
        if write_mode == 'append':
            # NOTE(review): no header option is passed, so the writer's default
            # applies on every append - check for repeated header rows.
            with open(path, 'ab') as f:
                write_method(path=f, separator=separator, quote_style='always')
        else:
            write_method(path=path, separator=separator, quote_style='always')
    elif data_type == 'parquet':
        logger.info('Writing as parquet file')
        write_method(path)
    else:
        # Still a no-op for unknown formats, but no longer a silent one.
        logger.warning(f'Unsupported data type {data_type!r}; nothing was written to {path}')
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def write_output(_df: pl.LazyFrame,
                 data_type: str, path: str, write_mode: str, sheet_name: str = None,
                 delimiter: str = None, flow_id: int = -1, node_id: int | str = -1) -> Status:
    """Post a serialized LazyFrame to the worker's ``write_results`` endpoint.

    :param _df: lazy frame to write; sent serialized and base64-encoded
    :param data_type: output format, forwarded to the worker
    :param path: destination path, forwarded to the worker
    :param write_mode: write mode, forwarded to the worker
    :param sheet_name: worksheet name, forwarded to the worker
    :param delimiter: delimiter, forwarded to the worker
    :param flow_id: sent as ``flowfile_flow_id``
    :param node_id: sent as ``flowfile_node_id``
    :return: a ``Status`` built from the worker's JSON response
    :raises Exception: when the response is not OK
    """
    payload = {
        'operation': encodebytes(_df.serialize()).decode(),
        'data_type': data_type,
        'path': path,
        'write_mode': write_mode,
        'sheet_name': sheet_name,
        'delimiter': delimiter,
        'flowfile_node_id': node_id,
        'flowfile_flow_id': flow_id,
    }
    response = requests.post(f'{WORKER_URL}/write_results/', json=payload)
    if not response.ok:
        raise Exception(f'Could not cache the data, {response.text}')
    return Status(**response.json())
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def local_write_output(_df: pl.LazyFrame | pl.DataFrame, data_type: str, path: str, write_mode: str,
                       sheet_name: str = None, delimiter: str = None, flow_id: int = -1, node_id: int | str = -1):
    """Write a frame to ``path`` in this process.

    Nothing is written when ``path`` already exists and ``write_mode`` is
    ``'create'``. A LazyFrame that has a ``sink_<data_type>`` method is written
    with it; otherwise the frame is collected if needed and written with
    ``write_<data_type>``.

    :param _df: frame to write
    :param data_type: suffix selecting the ``sink_``/``write_`` method
    :param path: destination file path
    :param write_mode: write mode passed on to ``execute_write_method``
    :param sheet_name: worksheet name passed on to ``execute_write_method``
    :param delimiter: delimiter passed on to ``execute_write_method``
    :param flow_id: not used in this function
    :param node_id: not used in this function
    :return: None
    """
    if os.path.exists(path) and write_mode == 'create':
        return None

    sink_name = 'sink_' + data_type
    if isinstance(_df, pl.LazyFrame) and hasattr(_df, sink_name):
        writer = getattr(_df, sink_name)
    else:
        # No streaming sink available: materialise lazy frames first.
        if isinstance(_df, pl.LazyFrame):
            _df = _df.collect()
        writer = getattr(_df, 'write_' + data_type)

    execute_write_method(writer, path=path, data_type=data_type, sheet_name=sheet_name,
                         delimiter=delimiter, write_mode=write_mode)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def create_pl_df_type_save(raw_data: Iterable[Iterable], orient: str = 'row') -> pl.DataFrame:
    """Build a polars DataFrame from raw values after standardizing each column.

    With ``orient == 'row'`` (the default) the input is transposed so that each
    inner iterable becomes one row; with any other value the inner iterables
    are used as columns directly. Every column is passed through
    ``standardize_col_dtype`` before the frame is constructed.

    :param raw_data: iterables with values
    :param orient: ``'row'`` to treat ``raw_data`` as rows
    :return: polars dataframe
    """
    columns = zip(*raw_data) if orient == 'row' else raw_data
    standardized = [standardize_col_dtype(column) for column in columns]
    return pl.DataFrame(standardized, orient='col')
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def find_first_positions(lst: List[str]) -> Dict[str, int]:
    """Map each distinct value to the index of its first occurrence.

    :param lst: values to index
    :return: dictionary from value to first index, in first-seen order
    """
    positions: Dict[str, int] = {}
    for index, item in enumerate(lst):
        # setdefault leaves an already recorded (earlier) index untouched.
        positions.setdefault(item, index)
    return positions
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def match_order(l: List[str], ref: List[str]) -> List[str]:
    """Sort the values of ``l`` by where they first appear in ``ref``.

    Values that do not occur in ``ref`` are ranked infinity and so come last;
    values with the same rank are ordered by comparing the values themselves.

    :param l: values to reorder
    :param ref: reference list defining the order
    :return: a new list with the values of ``l`` in reference order
    """
    positions = find_first_positions(ref)
    not_found = float('inf')
    ranks = [positions.get(item, not_found) for item in l]
    return [item for _, item in sorted(zip(ranks, l))]
|
|
File without changes
|