flowfile-0.2.2-py3-none-any.whl
This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of Flowfile might be problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
flowfile_core/utils/arrow_reader.py
@@ -0,0 +1,247 @@
import pyarrow as pa
from typing import List, Iterator, Tuple, Callable
from flowfile_core.configs import logger


def open_validated_file(file_path: str, n: int) -> pa.OSFile:
    """
    Validate and open an Arrow file with input parameter checking.

    This function performs validation on the input parameters and opens the Arrow file
    in binary read mode. It includes checks for file existence and parameter types.

    Args:
        file_path (str): Path to the Arrow file to be opened.
        n (int): Number of rows to be read. Used for validation purposes.
            Must be non-negative.

    Returns:
        pa.OSFile: An open PyArrow file object ready for reading.

    Raises:
        ValueError: If n is negative.
        TypeError: If file_path is not a string.
        FileNotFoundError: If the specified file does not exist.

    Example:
        >>> file = open_validated_file("data.arrow", 1000)
        >>> # Use the file object
        >>> file.close()
    """
    logger.debug(f"Attempting to open file: {file_path} with n={n}")
    if n < 0:
        logger.error(f"Invalid negative row count requested: {n}")
        raise ValueError("Number of rows must be non-negative")
    if not isinstance(file_path, str):
        logger.error(f"Invalid file_path type: {type(file_path)}")
        raise TypeError("file_path must be a string")
    try:
        file = pa.OSFile(file_path, 'rb')
        logger.info(f"Successfully opened file: {file_path}")
        return file
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise FileNotFoundError(f"Could not find file: {file_path}")


def create_reader(source: pa.OSFile) -> pa.ipc.RecordBatchFileReader:
    """
    Create a RecordBatchFileReader from an open Arrow file.

    This function initializes a reader that can process Arrow IPC file formats.
    It handles the creation of the reader and validates the file format.

    Args:
        source (pa.OSFile): An open PyArrow file object.

    Returns:
        pa.ipc.RecordBatchFileReader: A reader object for processing Arrow record batches.

    Raises:
        ValueError: If the file is not a valid Arrow format.

    Example:
        >>> with open_validated_file("data.arrow", 1000) as source:
        ...     reader = create_reader(source)
        ...     # Use the reader
    """
    try:
        reader = pa.ipc.open_file(source)
        logger.debug(f"Created reader with {reader.num_record_batches} batches")
        return reader
    except pa.ArrowInvalid:
        logger.error("Failed to create reader: Invalid Arrow file format")
        raise ValueError("Invalid Arrow file format")


def iter_batches(reader: pa.ipc.RecordBatchFileReader, n: int, rows_collected: int) -> Iterator[pa.RecordBatch]:
    """
    Iterator over record batches with row limit handling.

    Yields record batches from the reader, handling the case where the last batch
    needs to be sliced to meet the requested row count. This function provides
    efficient batch-wise processing of Arrow data.

    Args:
        reader (pa.ipc.RecordBatchFileReader): The Arrow file reader.
        n (int): Maximum number of rows to read in total.
        rows_collected (int): Number of rows already collected before this iteration.

    Yields:
        pa.RecordBatch: Record batches containing the data, with the last batch
            potentially sliced to meet the row count requirement.

    Example:
        >>> reader = create_reader(source)
        >>> for batch in iter_batches(reader, 1000, 0):
        ...     # Process each batch
        ...     process_batch(batch)
    """
    logger.debug(f"Starting batch iteration: target={n}, collected={rows_collected}")
    for i in range(reader.num_record_batches):
        batch = reader.get_record_batch(i)
        batch_rows = batch.num_rows
        logger.debug(f"Processing batch {i}: {batch_rows} rows")

        if rows_collected + batch_rows <= n:
            yield batch
        else:
            remaining_rows = n - rows_collected
            logger.debug(f"Slicing final batch to {remaining_rows} rows")
            yield batch.slice(0, remaining_rows)
            break


def collect_batches(reader: pa.ipc.RecordBatchFileReader, n: int) -> Tuple[List[pa.RecordBatch], int]:
    """
    Collect record batches from a reader up to a specified number of rows.

    This function aggregates record batches while respecting the total row count
    limit. It's useful for building a complete dataset from multiple batches.

    Args:
        reader (pa.ipc.RecordBatchFileReader): The Arrow file reader.
        n (int): Maximum number of rows to collect.

    Returns:
        Tuple[List[pa.RecordBatch], int]: A tuple containing:
            - List of collected record batches
            - Total number of rows collected

    Example:
        >>> reader = create_reader(source)
        >>> batches, row_count = collect_batches(reader, 1000)
        >>> print(f"Collected {row_count} rows in {len(batches)} batches")
    """
    logger.debug(f"Collecting batches up to {n} rows")
    batches: List[pa.RecordBatch] = []
    rows_collected = 0

    for batch in iter_batches(reader, n, rows_collected):
        batches.append(batch)
        rows_collected += batch.num_rows
        logger.debug(f"Collected batch: total rows now {rows_collected}")
        if rows_collected >= n:
            break

    logger.info(f"Finished collecting {len(batches)} batches with {rows_collected} total rows")
    return batches, rows_collected


def read(file_path: str) -> pa.Table:
    """
    Read an entire Arrow file into a Table.

    This function provides a simple interface to read all data from an Arrow file.
    It handles file opening, reading, and proper resource cleanup.

    Args:
        file_path (str): Path to the Arrow file to read.

    Returns:
        pa.Table: PyArrow Table containing all data from the file.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        ValueError: If the file is not a valid Arrow format.

    Example:
        >>> table = read("data.arrow")
        >>> print(f"Read {len(table)} rows")
        >>> print(f"Columns: {table.column_names}")
    """
    logger.info(f"Reading entire file: {file_path}")
    with open_validated_file(file_path, 0) as source:
        reader = create_reader(source)
        batches, total_rows = collect_batches(reader, float('inf'))
        table = pa.Table.from_batches(batches)  # type: ignore
        logger.info(f"Successfully read {total_rows} rows from {file_path}")
        return table


def read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> pa.Table:
    """
    Read the first N rows from an Arrow file.

    This function provides efficient reading of a limited number of rows from an
    Arrow file. It's useful for data sampling and preview operations.

    Args:
        file_path (str): Path to the Arrow file to read.
        n (int, optional): Number of rows to read. Defaults to 1000.
        strict (bool, optional): If True, raises an error when fewer than n rows
            are available. Defaults to False.

    Returns:
        pa.Table: PyArrow Table containing up to n rows of data.

    Raises:
        ValueError: If strict=True and fewer than n rows are available,
            or if n is negative.
        FileNotFoundError: If the file doesn't exist.

    Example:
        >>> # Read first 1000 rows
        >>> table = read_top_n("data.arrow")
        >>> # Read exactly 500 rows with strict checking
        >>> table = read_top_n("data.arrow", n=500, strict=True)
    """
    logger.info(f"Reading top {n} rows from {file_path} (strict={strict})")
    with open_validated_file(file_path, n) as source:
        reader = create_reader(source)
        batches, rows_collected = collect_batches(reader, n)

        if strict and rows_collected < n:
            logger.error(f"Strict mode: requested {n} rows but only {rows_collected} available")
            raise ValueError(f"Requested {n} rows but only {rows_collected} available")

        table = pa.Table.from_batches(batches)  # type: ignore
        logger.info(f"Successfully read {rows_collected} rows from {file_path}")
        return table


def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Callable[[], pa.Table]:
    """
    Create a callable that reads the first N rows from an Arrow file.

    This function returns a closure that can be called later to read data.
    It's useful for creating reusable data reading functions with fixed parameters.

    Args:
        file_path (str): Path to the Arrow file to read.
        n (int, optional): Number of rows to read. Defaults to 1000.
        strict (bool, optional): If True, raises an error when fewer than n rows
            are available. Defaults to False.

    Returns:
        Callable[[], pa.Table]: A function that when called, reads and returns
            the data as a PyArrow Table.

    Example:
        >>> # Create a reader function for the first 500 rows
        >>> reader_func = get_read_top_n("data.arrow", n=500)
        >>> # Later, use the function to read the data
        >>> table = reader_func()
    """
    logger.info(f"Creating reader function for {file_path} with n={n}, strict={strict}")
    return lambda: read_top_n(file_path, n, strict)
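
A minimal end-to-end sketch of how these helpers compose. The synthetic table, the data.arrow path, and the column name are illustrative only; read_top_n and get_read_top_n come straight from the module above, and the writer calls use the standard pyarrow IPC API.

import pyarrow as pa
from flowfile_core.utils.arrow_reader import read_top_n, get_read_top_n

# Write a small Arrow IPC file so the example is self-contained.
table = pa.table({"a": list(range(10_000))})
with pa.OSFile("data.arrow", "wb") as sink:
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)

# Preview the first 1000 rows (the default n).
preview = read_top_n("data.arrow")
print(preview.num_rows, preview.column_names)  # 1000 ['a']

# Defer the read: get_read_top_n returns a zero-argument closure.
reader_func = get_read_top_n("data.arrow", n=500, strict=True)
print(reader_func().num_rows)  # 500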
flowfile_core/utils/excel_file_manager.py
@@ -0,0 +1,18 @@
import os
import fastexcel
from typing import List
from functools import lru_cache

from flowfile_core.configs import logger


@lru_cache(maxsize=32)
def get_sheet_names(file_path: str) -> List[str] | None:
    if not os.path.exists(file_path):
        logger.error(f"File does not exist: {file_path}")
        return
    try:
        return fastexcel.read_excel(file_path).sheet_names
    except Exception as e:
        logger.error(e)
        return
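
A short usage sketch; the workbook path is illustrative. Worth noting: lru_cache memoizes results per path, so a workbook that changes on disk keeps returning the stale sheet list until the cache is cleared.

from flowfile_core.utils.excel_file_manager import get_sheet_names

sheets = get_sheet_names("workbook.xlsx")  # None if the file is missing or unreadable
if sheets is not None:
    print(f"Sheets: {sheets}")

# Clear the memoized results when files change on disk.
get_sheet_names.cache_clear()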
flowfile_core/utils/fileManager.py
@@ -0,0 +1,45 @@
import os
from flowfile_core.schemas.input_schema import NewDirectory, RemoveItem, RemoveItemsInput
from typing import Tuple, Optional
from flowfile_core.configs import logger

local_database_connection = None


def create_dir(new_directory: NewDirectory) -> Tuple[bool, Optional[Exception]]:
    full_path: str = os.path.join(new_directory.source_path, new_directory.dir_name)
    try:
        os.mkdir(full_path)
        logger.info("Successfully created a new folder")
        return True, None
    except Exception as e:
        return False, e


def remove_path(path: str) -> Tuple[bool, Optional[Exception]]:
    try:
        os.remove(path)
        logger.info(f"Succesfully removed {path}")
        return True, None
    except Exception as e:
        return False, e


def remove_item(item_to_remove: RemoveItem):
    if item_to_remove.id >= 0:
        os.remove(item_to_remove.path)

    elif os.path.isfile(item_to_remove.path):
        os.remove(item_to_remove.path)
    elif os.path.isdir(item_to_remove.path):
        os.rmdir(item_to_remove.path)


def remove_paths(remove_items: RemoveItemsInput) -> Tuple[bool, Optional[Exception]]:
    try:
        for path in remove_items.paths:
            remove_item(path)
        logger.info(f'Successfully removed {remove_items.paths}')
        return True, None
    except Exception as e:
        return False, e
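
A minimal sketch of the (success, error) return convention these helpers share. It assumes NewDirectory accepts source_path and dir_name keyword arguments; only those two fields are visible in this diff, so the constructor call is an assumption.

import tempfile
from flowfile_core.schemas.input_schema import NewDirectory
from flowfile_core.utils.fileManager import create_dir, remove_path

base = tempfile.mkdtemp()
ok, err = create_dir(NewDirectory(source_path=base, dir_name="staging"))  # assumed kwargs
print(ok, err)  # True, None on success

# remove_path wraps os.remove, so it only removes files:
ok, err = remove_path(f"{base}/staging")
print(ok, err)  # False, IsADirectoryError(...) on POSIX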
flowfile_core/utils/fl_executor.py
@@ -0,0 +1,38 @@
from flowfile_core.configs import logger
from inspect import isfunction
from loky import get_reusable_executor
from functools import wraps
from concurrent.futures import ThreadPoolExecutor
import atexit


# process_executor: Uses loky for process-based parallelism
def process_executor(wait_on_completion: bool = False, max_workers: int = 12):
    max_workers = max_workers if not wait_on_completion else 1

    def executor(f):
        @wraps(f)
        def inner(*args, **kwargs):
            logger.debug(f'Added task {f.__name__} to a process executor')
            logger.debug(f'max_workers: {max_workers}')

            # Create a new executor with the required number of workers
            func_executor = get_reusable_executor(max_workers=max_workers, timeout=2, kill_workers=False, reuse=True)
            r = func_executor.submit(f, *args, **kwargs)
            if wait_on_completion:
                result = r.result()
                logger.info(f'done executing {f.__name__}')
                return result

            logger.info(f'done submitting {f.__name__} to a process executor')
            return r

        return inner

    if isfunction(wait_on_completion):
        f = wait_on_completion
        wait_on_completion = False
        return executor(f)
    return executor
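
The isfunction check at the bottom lets the decorator work both bare and parameterized. A sketch of the two forms, assuming loky is installed (note the bare form also ends up with max_workers=1, because the max_workers reassignment runs while wait_on_completion still holds the decorated function):

from flowfile_core.utils.fl_executor import process_executor

@process_executor                             # bare form: calls return a Future
def slow_square(x: int) -> int:
    return x * x

@process_executor(wait_on_completion=True)    # parameterized form: calls block and return the result
def slow_cube(x: int) -> int:
    return x ** 3

if __name__ == "__main__":                    # guard the entry point; loky spawns worker processes
    future = slow_square(4)
    print(future.result())  # 16
    print(slow_cube(3))     # 27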
flowfile_core/utils/utils.py
@@ -0,0 +1,8 @@
import re


def camel_case_to_snake_case(text: str) -> str:
    # Use a regular expression to find capital letters and replace them with _ followed by the lowercase letter
    transformed_text = re.sub(r'(?<!^)(?=[A-Z])', '_', text).lower()
    return transformed_text
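
The lookbehind (?<!^) skips the start of the string and the lookahead (?=[A-Z]) matches before every other capital, so an underscore lands before each non-leading capital; consecutive capitals (acronyms) are therefore split letter by letter:

from flowfile_core.utils.utils import camel_case_to_snake_case

print(camel_case_to_snake_case("FlowDataEngine"))  # flow_data_engine
print(camel_case_to_snake_case("readCSVFile"))     # read_c_s_v_file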
flowfile_frame/__init__.py
@@ -0,0 +1,56 @@
# flowframe/__init__.py
"""A Polars-like API for building ETL graphs."""

# Core classes
from flowfile_frame.flow_frame import FlowFrame  # noqa: F401

from flowfile_frame.utils import create_etl_graph  # noqa: F401

# Commonly used functions
from flowfile_frame.expr import (  # noqa: F401
    col, lit, column,
    cum_count, len,
    sum, min, max, mean, count, when
)

# Selector utilities
from flowfile_frame.selectors import (  # noqa: F401
    numeric, float_, integer, string, temporal,
    datetime, date, time, duration, boolean,
    categorical, object_, list_, struct, all_,
    by_dtype, contains, starts_with, ends_with, matches
)

# File I/O
from flowfile_frame.flow_frame import (  # noqa: F401
    read_csv, read_parquet, from_dict, concat
)

# Import Polars data types for user convenience
from polars.datatypes import (  # noqa: F401
    # Integer types
    Int8, Int16, Int32, Int64, Int128,
    UInt8, UInt16, UInt32, UInt64,
    IntegerType,

    # Float types
    Float32, Float64,

    # Other primitive types
    Boolean, String, Utf8, Binary, Null,

    # Complex types
    List, Array, Struct, Object,

    # Date/time types
    Date, Time, Datetime, Duration,
    TemporalType,

    # Special types
    Categorical, Decimal, Enum, Unknown,

    # Type classes
    DataType, DataTypeClass, Field
)

__version__ = "0.1.0"
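
A sketch of the Polars-style usage these exports enable. from_dict, filter, col, and collect appear in the package's own __main__.py usage banner below (which still says import flowframe, while the installed package is flowfile_frame); the selector call is an assumption modeled on polars.selectors.

import flowfile_frame as ff

df = ff.from_dict({"name": ["a", "b", "c"], "value": [1, 2, 3]})
result = df.filter(ff.col("value") > 1)
print(result.collect())

# Selectors are re-exported at the top level; assumed to behave as in polars.selectors:
numeric_cols = ff.numeric()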
flowfile_frame/__main__.py
@@ -0,0 +1,12 @@
"""Main entry point for the FlowFrame CLI."""

def main():
    """Main entry point for the FlowFrame CLI."""
    print("FlowFrame - A Polars-like API for building ETL graphs")
    print("Usage: import flowframe as ff")
    print("       df = ff.from_dict({'a': [1, 2, 3]})")
    print("       result = df.filter(ff.col('a') > 1)")
    print("       print(result.collect())")

if __name__ == "__main__":
    main()
flowfile_frame/adapters.py
@@ -0,0 +1,17 @@
# flowframe/adapters.py
"""Adapters to connect FlowFrame with the flowfile-core library."""

# Import from your existing project
from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
from flowfile_core.schemas import input_schema, schemas, transform_schema

# Export these for use in FlowFrame
__all__ = [
    'FlowGraph',
    'add_connection',
    'FlowDataEngine',
    'input_schema',
    'schemas',
    'transform_schema'
]
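
Since adapters.py is a pure re-export layer, downstream FlowFrame code can take these symbols from one stable location instead of deep flowfile_core paths; a sketch of the intended import pattern:

from flowfile_frame.adapters import FlowGraph, FlowDataEngine, add_connection

# Identical to the flowfile_core originals; the adapter module gives FlowFrame
# a single import point to update if the core package layout ever changes.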