Flowfile 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
from typing import Generator, List
|
|
2
|
+
from openpyxl import Workbook, load_workbook
|
|
3
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
4
|
+
import gc
|
|
5
|
+
import polars as pl
|
|
6
|
+
from flowfile_core.flowfile.flow_data_engine.utils import create_pl_df_type_save, get_data_type
|
|
7
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
|
|
8
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import dtype_to_pl_str
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def raw_data_openpyxl(file_path: str,
                      sheet_name: str = None,
                      min_row: int = None,
                      max_row: int = None,
                      min_col: int = None,
                      max_col: int = None
                      ) -> Generator[List, None, None]:
    """Stream rows of an xlsx sheet as tuples of cell values.

    Opens the workbook in read-only, values-only mode and yields one row
    (tuple of cell values) at a time for the requested window.

    Args:
        file_path: Path to the .xlsx file.
        sheet_name: Sheet to read; defaults to the first sheet when None.
        min_row/max_row/min_col/max_col: Optional 1-based window bounds,
            passed straight through to ``Worksheet.iter_rows``.

    Yields:
        Tuples of cell values, one per row.
    """
    workbook: Workbook = load_workbook(file_path, data_only=True, read_only=True)
    try:
        sheet_name = workbook.sheetnames[0] if sheet_name is None else sheet_name
        sheet: Worksheet = workbook[sheet_name]
        yield from sheet.iter_rows(min_row=min_row,
                                   max_row=max_row,
                                   min_col=min_col,
                                   max_col=max_col,
                                   values_only=True)
    finally:
        # Bug fix: the cleanup used to run only after the loop completed, so a
        # consumer that abandoned the generator early leaked the read-only
        # workbook handle. try/finally runs it on GeneratorExit as well.
        workbook.close()
        del workbook
        gc.collect()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_calamine_xlsx_data_types(file_path: str, sheet_name: str, start_row: int = 0, end_row: int = 0):
    """Infer a FlowfileColumn schema for an xlsx sheet read via calamine.

    Reads the sheet into a DataFrame and maps each column name/dtype pair to a
    ``FlowfileColumn`` carrying its positional index.
    """
    frame = df_from_calamine_xlsx(file_path, sheet_name, start_row, end_row)
    schema = []
    for idx, (col_name, col_dtype) in enumerate(zip(frame.columns, frame.dtypes)):
        schema.append(FlowfileColumn.from_input(col_name, str(col_dtype), col_index=idx))
    return schema
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def df_from_calamine_xlsx(file_path: str, sheet_name: str, start_row: int = 0, end_row: int = 0) -> pl.DataFrame:
    """Read one xlsx sheet into a polars DataFrame using the calamine engine.

    ``start_row`` (when > 0) becomes the header row; ``end_row`` (when > 0)
    bounds the number of rows read. An empty sheet yields an empty DataFrame
    rather than raising.
    """
    options = {}
    if start_row > 0:
        options['header_row'] = start_row
    if end_row > 0:
        options['n_rows'] = end_row - start_row
    return pl.read_excel(
        source=file_path,
        engine='calamine',
        sheet_name=sheet_name,
        read_options=options,
        raise_if_empty=False,
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def df_from_openpyxl(file_path: str,
                     sheet_name: str = None,
                     min_row: int = None,
                     max_row: int = None,
                     min_col: int = None,
                     max_col: int = None,
                     has_headers: bool = True
                     ) -> pl.DataFrame:
    """Load a window of an xlsx sheet into a polars DataFrame via openpyxl.

    When ``has_headers`` is True the first row supplies the column names
    (None cells become ``_unnamed_column_<i>``, non-strings are stringified,
    duplicates are de-duplicated via ``ensure_unique``). An empty sheet
    produces an empty DataFrame.
    """
    rows = list(raw_data_openpyxl(file_path=file_path,
                                  sheet_name=sheet_name,
                                  min_row=min_row,
                                  max_row=max_row,
                                  min_col=min_col,
                                  max_col=max_col))
    if not rows:
        return pl.DataFrame()
    if not has_headers:
        return create_pl_df_type_save(rows)
    header_cells = []
    for i, cell in enumerate(rows[0]):
        if cell is None:
            header_cells.append(f'_unnamed_column_{i}')
        elif isinstance(cell, str):
            header_cells.append(cell)
        else:
            header_cells.append(str(cell))
    header_cells = ensure_unique(header_cells)
    frame = create_pl_df_type_save(rows[1:])
    # Map the auto-generated column names onto the header-derived names.
    return frame.rename({old: new for old, new in zip(frame.columns, header_cells)})
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_open_xlsx_datatypes(file_path: str,
                            sheet_name: str = None,
                            min_row: int = None,
                            max_row: int = None,
                            min_col: int = None,
                            max_col: int = None,
                            has_headers: bool = True) -> List[FlowfileColumn]:
    """Infer a FlowfileColumn schema for an xlsx sheet by scanning its values.

    Streams the sheet through openpyxl and derives one polars dtype string per
    column via ``get_data_type``; unmapped dtypes fall back to 'String'.

    Returns:
        A list of FlowfileColumn, one per column, with positional indices.
    """
    rows = list(raw_data_openpyxl(file_path=file_path,
                                  sheet_name=sheet_name,
                                  min_row=min_row,
                                  max_row=max_row,
                                  min_col=min_col,
                                  max_col=max_col))
    # Robustness fix: an empty sheet now returns an empty schema instead of
    # raising from next() on an exhausted iterator.
    if not rows:
        return []
    if has_headers:
        # Consistency fix: coerce non-string header cells to str, matching
        # df_from_openpyxl's header handling.
        columns = [f'_unnamed_column_{i}' if col is None else str(col)
                   for i, col in enumerate(rows[0])]
        value_rows = rows[1:]
    else:
        columns = [f'column_{i}' for i in range(len(rows[0]))]
        # Bug fix: without headers the first row is data; the old code
        # consumed it for the column count and excluded it from inference.
        value_rows = rows
    data_types = (dtype_to_pl_str.get(get_data_type(vals), 'String') for vals in zip(*value_rows))
    return [FlowfileColumn.from_input(n, d, col_index=i) for i, (n, d) in enumerate(zip(columns, data_types))]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def ensure_unique(lst: List[str]) -> List[str]:
    """Return a copy of *lst* where duplicates get a ``_vN`` suffix.

    The first occurrence of a name is kept as-is; each subsequent occurrence
    is renamed to ``<name>_vN`` with the smallest N that does not collide
    with anything already seen.

    Args:
        lst: A list of strings that may contain duplicates.

    Returns:
        A new list where all elements are unique.
    """
    seen = {}
    result = []

    for item in lst:
        if item in seen:
            seen[item] += 1
            new_item = f"{item}_v{seen[item]}"
            # Bug fix: on a collision, bump the counter for *item* (which
            # feeds the candidate name). The old code incremented
            # seen[new_item] while recomputing the candidate from the
            # unchanged seen[item], spinning forever on inputs such as
            # ['a_v2', 'a', 'a'].
            while new_item in seen:
                seen[item] += 1
                new_item = f"{item}_v{seen[item]}"
            result.append(new_item)
            seen[new_item] = 1  # Mark the new unique item as seen
        else:
            result.append(item)
            seen[item] = 1  # First occurrence of the item

    return result
|
|
143
|
+
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from faker import Faker
|
|
2
|
+
from functools import partial
|
|
3
|
+
from random import randint
|
|
4
|
+
import polars as pl
|
|
5
|
+
from typing import List, Dict, Any, Generator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
    """Build a polars DataFrame with ``n_records`` rows of fake personal data.

    Static values (cities, companies, names, ...) are pre-generated into
    bounded pools and sampled per record, which is far cheaper than calling
    Faker for every row.

    Args:
        n_records: Number of rows to generate.

    Returns:
        DataFrame with columns ID, Name, Address, City, Email, Phone, DOB,
        Work, Zipcode, Country and sales_data.
    """
    fake = Faker()
    min_range = partial(min, n_records)

    def pick(pool):
        # Bug fix: the old selector used `randint(0, len(pool)) - 1`, which
        # yields indices in [-1, len(pool) - 1]; index -1 wraps to the last
        # element, giving it double the selection probability. This is a
        # uniform draw over the pool.
        return pool[randint(0, len(pool) - 1)]

    # Pre-generation of static data pools (each bounded by n_records).
    cities = [fake.city() for _ in range(min_range(7000))]
    companies = [fake.company() for _ in range(min_range(100_000))]
    zipcodes = [fake.zipcode() for _ in range(min_range(200_000))]
    countries = [fake.country() for _ in range(min_range(50))]
    street_names = [fake.street_name() for _ in range(min_range(100000))]
    dob = [fake.date_of_birth() for _ in range(min_range(100_000))]
    first_names = [fake.first_name() for _ in range(min_range(100_000))]
    last_names = [fake.last_name() for _ in range(min_range(50_000))]
    domain_names = [fake.domain_name() for _ in range(10)]
    sales_data = [fake.random_int(0, 1000) for _ in range(n_records)]

    def generate_name():
        return f"{pick(first_names)} {pick(last_names)}"

    def generate_address():
        return f"{randint(100, 999)} {pick(street_names)}"

    def generate_email(name):
        return f"{name.lower().replace(' ', '_')}.{randint(1, 99)}@{pick(domain_names)}"

    def generate_phone_number():
        return fake.phone_number()

    data = []
    for _ in range(n_records):
        name = generate_name()
        data.append(dict(
            ID=randint(1, 1000000),
            Name=name,
            Address=generate_address(),
            City=pick(cities),
            Email=generate_email(name),
            Phone=generate_phone_number(),
            DOB=pick(dob),
            Work=pick(companies),
            Zipcode=pick(zipcodes),
            Country=pick(countries),
            sales_data=pick(sales_data)
        ))

    return pl.DataFrame(data)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def create_fake_data_raw(n_records: int = 1000, col_selection: List[str] = None) -> Generator[Dict[str, Any], None, None]:
    """Yield ``n_records`` fake records one at a time as plain dicts.

    Unlike :func:`create_fake_data` this streams records lazily and calls
    Faker per value instead of pre-generating pools, so memory stays flat
    regardless of ``n_records``.

    Args:
        n_records: Number of records to yield.
        col_selection: Optional subset of column names to include; unknown
            names are silently ignored. None means all columns.

    Yields:
        One dict per record with the selected columns.
    """
    fake = Faker()

    # Lazy per-call value factories.
    cities = partial(fake.city)
    companies = partial(fake.company)
    zipcodes = partial(fake.zipcode)
    countries = partial(fake.country)
    street_names = partial(fake.street_name)
    dob = partial(fake.date_of_birth)
    first_names = partial(fake.first_name)
    last_names = partial(fake.last_name)
    domain_names = [fake.domain_name() for _ in range(10)]  # Pre-generate a small list
    sales_data = lambda: fake.random_int(0, 1000)

    def generate_name():
        return f"{first_names()} {last_names()}"

    def generate_address():
        return f"{randint(100, 999)} {street_names()}"

    def generate_email(_name):
        # Bug fix: the old index `randint(0, 10) - 1` could be -1, which
        # wraps to the last domain and double-weights it; draw uniformly.
        return f"{_name.lower().replace(' ', '_')}.{randint(1, 99)}@{domain_names[randint(0, len(domain_names) - 1)]}"

    def generate_phone_number():
        return fake.phone_number()

    # Default columns if no selection is provided.
    all_columns = {
        "ID": lambda: randint(1, 1000000),
        "Name": generate_name,
        "Address": generate_address,
        "City": cities,
        "Email": lambda name: generate_email(name),  # invoked via the special case below
        "Phone": generate_phone_number,
        "DOB": dob,
        "Work": companies,
        "Zipcode": zipcodes,
        "Country": countries,
        "sales_data": sales_data
    }

    # Filter the available columns based on col_selection.
    if col_selection is not None:
        all_columns = {col: all_columns[col] for col in col_selection if col in all_columns}

    # Yield one record at a time.
    for _ in range(n_records):
        record = {}
        for col, generator in all_columns.items():
            if col == "Email":  # Email depends on the Name column
                # Bug fix: reuse the already-generated Name when present; the
                # old `record.get("Name", generate_name())` evaluated its
                # default eagerly, wasting two Faker calls per record even
                # when Name existed.
                name = record["Name"] if "Name" in record else generate_name()
                record["Email"] = generate_email(name)
            else:
                record[col] = generator()
        yield record
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def write_fake_data():
    """Generate a default-sized fake dataset and persist it to the test-data
    folder in parquet, csv and xlsx formats."""
    frame = create_fake_data()
    frame.write_parquet('backend/tests/data/fake_data.parquet')
    frame.write_csv('backend/tests/data/fake_data.csv')
    frame.write_excel('backend/tests/data/fake_data.xlsx')
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import *
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import Any, Optional, Literal
|
|
2
|
+
from pydantic import BaseModel
|
|
3
|
+
from flowfile_core.schemas.transform_schema import FuzzyMap
|
|
4
|
+
|
|
5
|
+
# Kinds of work a worker can be asked to perform on a serialized polars
# operation (see PolarsScript.operation_type).
OperationType = Literal['store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'store_sample']
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PolarsOperation(BaseModel):
    """Wrapper around a serialized polars operation payload."""

    # Serialized payload bytes; the receiving worker deserializes and runs it.
    operation: bytes
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PolarsScript(PolarsOperation):
    """A serialized polars operation plus the metadata needed to schedule it."""

    # Identifier used to track the background task; may be assigned later.
    task_id: Optional[str] = None
    # Directory where intermediate results are cached, if any.
    cache_dir: Optional[str] = None
    # What the worker should do with the operation (see OperationType).
    operation_type: OperationType
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FuzzyJoinInput(BaseModel):
    """Inputs for a fuzzy join executed as a background task."""

    # Identifier used to track the background task; may be assigned later.
    task_id: Optional[str] = None
    # Directory where intermediate results are cached, if any.
    cache_dir: Optional[str] = None
    # Serialized left/right inputs to be joined.
    left_df_operation: PolarsOperation
    right_df_operation: PolarsOperation
    # Column-pair matching rules for the fuzzy join.
    fuzzy_maps: list[FuzzyMap]
    # Identifiers of the flowfile node/flow this join belongs to.
    flowfile_node_id: int|str
    flowfile_flow_id: int
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Status(BaseModel):
    """Progress/result snapshot of a background task."""

    # Identifier of the background task this status describes.
    background_task_id: str
    # Lifecycle state of the task.
    status: Literal['Processing', 'Completed', 'Error', 'Unknown Error', 'Starting', 'Cancelled']
    # Reference to the file holding the task's output.
    file_ref: str
    # Progress indicator, 0 by default.
    progress: int = 0
    # Human-readable error description when status is an error state.
    error_message: Optional[str] = None
    # Task result payload; shape depends on result_type.
    results: Any
    # Discriminator for how to interpret `results`.
    result_type: Literal['polars', 'other'] = 'polars'
|
|
36
|
+
|