flowfile-0.3.2-py3-none-any.whl → flowfile-0.3.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. flowfile/__init__.py +3 -2
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +584 -1002
  32. flowfile_frame/flow_frame.pyi +368 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
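
The largest addition in this release is the polars-style API surface in flowfile_frame (new expr.pyi and flow_frame.pyi stubs, flow_frame_methods.py, lazy.py, lazy_methods.py, series.py, and related modules). The hunk below appears to be the new flowfile_frame/flow_frame_methods.py, the only file with exactly 617 added lines. As a quick orientation, here is a minimal usage sketch of the functions it adds; the top-level `import flowfile_frame as ff` alias is an assumption for illustration, while read_csv, read_parquet, from_dict, concat, col and the sum/mean/min/max/count helpers are taken directly from this diff.

    # Minimal sketch, assuming the new helpers are re-exported at package level.
    import flowfile_frame as ff  # assumed import alias, not confirmed by this diff

    # read_csv builds a CSV read node; description labels the node in the frontend.
    orders = ff.read_csv("orders.csv", separator=";", description="Load raw orders")

    # from_dict turns in-memory data into a manual-input node.
    extra = ff.from_dict({"order_id": [1, 2], "amount": [9.5, 12.0]})

    # concat routes through the first frame's concat method.
    combined = ff.concat([orders, extra], how="vertical")

    # The aggregation helpers wrap a column name into an expression via col().
    total_amount = ff.sum("amount")  # equivalent to ff.col("amount").sum()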
flowfile_frame/flow_frame_methods.py (new file)
@@ -0,0 +1,617 @@
+ import logging
+ import os
+ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
+ from pathlib import Path
+
+ import io
+ import polars as pl
+ from polars._typing import (SchemaDict, IO, PolarsDataType,
+                             Sequence, CsvEncoding)
+
+ from flowfile_core.flowfile.FlowfileFlow import FlowGraph
+ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
+ from flowfile_core.schemas import input_schema, transform_schema
+
+ from flowfile_frame.expr import col
+
+ from flowfile_frame.utils import create_flow_graph
+ from flowfile_frame.flow_frame import generate_node_id, FlowFrame
+ from flowfile_frame.config import logger
+
+ def sum(expr):
+     """Sum aggregation function."""
+     if isinstance(expr, str):
+         expr = col(expr)
+     return expr.sum()
+
+
+ def mean(expr):
+     """Mean aggregation function."""
+     if isinstance(expr, str):
+         expr = col(expr)
+     return expr.mean()
+
+
+ def min(expr):
+     """Min aggregation function."""
+     if isinstance(expr, str):
+         expr = col(expr)
+     return expr.min()
+
+
+ def max(expr):
+     """Max aggregation function."""
+     if isinstance(expr, str):
+         expr = col(expr)
+     return expr.max()
+
+
+ def count(expr):
+     """Count aggregation function."""
+     if isinstance(expr, str):
+         expr = col(expr)
+     return expr.count()
+
+
+ def read_csv(
+         source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+         *,
+         flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
+         separator: str = ',',
+         convert_to_absolute_path: bool = True,
+         description: Optional[str] = None,
+         has_header: bool = True,
+         new_columns: Optional[List[str]] = None,
+         comment_prefix: Optional[str] = None,
+         quote_char: Optional[str] = '"',
+         skip_rows: int = 0,
+         skip_lines: int = 0,
+         schema: Optional[SchemaDict] = None,
+         schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
+         null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
+         missing_utf8_is_empty_string: bool = False,
+         ignore_errors: bool = False,
+         try_parse_dates: bool = False,
+         infer_schema: bool = True,
+         infer_schema_length: Optional[int] = 100,
+         n_rows: Optional[int] = None,
+         encoding: CsvEncoding = 'utf8',
+         low_memory: bool = False,
+         rechunk: bool = False,
+         storage_options: Optional[Dict[str, Any]] = None,
+         skip_rows_after_header: int = 0,
+         row_index_name: Optional[str] = None,
+         row_index_offset: int = 0,
+         eol_char: str = '\n',
+         raise_if_empty: bool = True,
+         truncate_ragged_lines: bool = False,
+         decimal_comma: bool = False,
+         glob: bool = True,
+         cache: bool = True,
+         with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
+         **other_options: Any
+ ) -> FlowFrame:
+     """
+     Read a CSV file into a FlowFrame.
+
+     This function uses the native FlowGraph implementation when the parameters
+     fall within the supported range, and falls back to using Polars' scan_csv implementation
+     for more advanced features.
+
+     Args:
+         source: Path(s) to CSV file(s), or a file-like object.
+         flow_graph: if you want to add it to an existing graph
+         separator: Single byte character to use as separator in the file.
+         convert_to_absolute_path: If the path needs to be set to a fixed location
+         description: if you want to add a readable name in the frontend (advised)
+
+         # Polars.scan_csv aligned parameters
+         has_header: Indicate if the first row of the dataset is a header or not.
+         new_columns: Rename columns after selection.
+         comment_prefix: String that indicates a comment line if found at beginning of line.
+         quote_char: Character used for quoting. None to disable.
+         skip_rows: Start reading after this many rows.
+         skip_lines: Skip this many lines by newline char only.
+         schema: Schema to use when reading the CSV.
+         schema_overrides: Schema overrides for specific columns.
+         null_values: Values to interpret as null.
+         missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
+         ignore_errors: Try to keep reading lines if some parsing errors occur.
+         try_parse_dates: Try to automatically parse dates.
+         infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
+         infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
+         n_rows: Stop reading after this many rows.
+         encoding: Character encoding to use.
+         low_memory: Reduce memory usage at the cost of performance.
+         rechunk: Ensure data is in contiguous memory layout after parsing.
+         storage_options: Options for fsspec for cloud storage.
+         skip_rows_after_header: Skip rows after header.
+         row_index_name: Name of the row index column.
+         row_index_offset: Start value for the row index.
+         eol_char: End of line character.
+         raise_if_empty: Raise error if file is empty.
+         truncate_ragged_lines: Truncate lines with too many values.
+         decimal_comma: Parse floats with decimal comma.
+         glob: Use glob pattern for file path (if source is a string).
+         cache: Cache the result after reading (Polars default True).
+         with_column_names: Apply a function over the column names.
+         other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
+
+     Returns:
+         A FlowFrame with the CSV data.
+     """
+     node_id = generate_node_id()  # Assuming generate_node_id is defined
+     if flow_graph is None:
+         flow_graph = create_flow_graph()  # Assuming create_flow_graph is defined
+     flow_id = flow_graph.flow_id
+
+     current_source_path_for_native = None
+     if isinstance(source, (str, os.PathLike)):
+         current_source_path_for_native = str(source)
+         if '~' in current_source_path_for_native:
+             current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
+     elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
+         current_source_path_for_native = str(source[0]) if source else None
+         if current_source_path_for_native and '~' in current_source_path_for_native:
+             current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
+     elif isinstance(source, (io.BytesIO, io.StringIO)):
+         logger.warning("Read from bytes io from csv not supported, converting data to raw data")
+         return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
+     actual_infer_schema_length: Optional[int]
+     if not infer_schema:
+         actual_infer_schema_length = 0
+     else:
+         actual_infer_schema_length = infer_schema_length
+     can_use_native = (
+         current_source_path_for_native is not None and
+         comment_prefix is None and
+         skip_lines == 0 and
+         schema is None and
+         schema_overrides is None and
+         null_values is None and
+         not missing_utf8_is_empty_string and
+         not try_parse_dates and
+         n_rows is None and
+         not low_memory and
+         not rechunk and
+         storage_options is None and
+         skip_rows_after_header == 0 and
+         row_index_name is None and
+         row_index_offset == 0 and
+         eol_char == '\n' and
+         not decimal_comma and
+         new_columns is None and
+         glob is True
+     )
+     if can_use_native and current_source_path_for_native:
+         received_table = input_schema.ReceivedTable(
+             file_type='csv',
+             path=current_source_path_for_native,
+             name=Path(current_source_path_for_native).name,
+             delimiter=separator,
+             has_headers=has_header,
+             encoding=encoding,
+             starting_from_line=skip_rows,
+             quote_char=quote_char if quote_char is not None else '"',
+             infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
+             truncate_ragged_lines=truncate_ragged_lines,
+             ignore_errors=ignore_errors,
+             row_delimiter=eol_char
+         )
+         if convert_to_absolute_path:
+             try:
+                 received_table.set_absolute_filepath()
+                 received_table.path = received_table.abs_file_path
+             except Exception as e:
+                 logger.warning(f"Could not determine absolute path for {current_source_path_for_native}: {e}")
+
+         read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
+         read_node = input_schema.NodeRead(
+             flow_id=flow_id,
+             node_id=node_id,
+             received_file=received_table,
+             pos_x=100,
+             pos_y=100,
+             is_setup=True,
+             description=read_node_description
+         )
+         flow_graph.add_read(read_node)
+         result_frame = FlowFrame(
+             data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+             flow_graph=flow_graph,
+             node_id=node_id
+         )
+         return result_frame
+     else:
+         polars_source_arg = source
+         polars_code = _build_polars_code_args(
+             source=polars_source_arg,
+             separator=separator,
+             has_header=has_header,
+             new_columns=new_columns,
+             comment_prefix=comment_prefix,
+             quote_char=quote_char,
+             skip_rows=skip_rows,
+             skip_lines=skip_lines,
+             schema=schema,
+             schema_overrides=schema_overrides,
+             null_values=null_values,
+             missing_utf8_is_empty_string=missing_utf8_is_empty_string,
+             ignore_errors=ignore_errors,
+             try_parse_dates=try_parse_dates,
+             infer_schema_length=actual_infer_schema_length,
+             n_rows=n_rows,
+             encoding=encoding,
+             low_memory=low_memory,
+             rechunk=rechunk,
+             storage_options=storage_options,
+             skip_rows_after_header=skip_rows_after_header,
+             row_index_name=row_index_name,
+             row_index_offset=row_index_offset,
+             eol_char=eol_char,
+             raise_if_empty=raise_if_empty,
+             truncate_ragged_lines=truncate_ragged_lines,
+             decimal_comma=decimal_comma,
+             glob=glob,
+             cache=cache,
+             with_column_names=with_column_names,
+             **other_options
+         )
+         polars_code_node_description = description or "Read CSV with Polars scan_csv"
+         if isinstance(source, (str, os.PathLike)):
+             polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
+         elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
+             polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
+
+         # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
+         polars_code_settings = input_schema.NodePolarsCode(
+             flow_id=flow_id,
+             node_id=node_id,
+             polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
+             is_setup=True,
+             description=polars_code_node_description
+         )
+         flow_graph.add_polars_code(polars_code_settings)
+         return FlowFrame(
+             data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+             flow_graph=flow_graph,
+             node_id=node_id,
+         )
+
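
Editorial note, not part of the package source: the docstring above describes a two-path dispatch, and the `can_use_native` check decides which path a given call takes. A small sketch of both outcomes, reusing the assumed `ff` alias from the earlier example:

    # All defaults satisfy the can_use_native conditions, so this becomes a native read node.
    df_native = ff.read_csv("events.csv", separator=";")

    # n_rows must be None for the native path, so this call is routed through
    # _build_polars_code_args and executed as a generated pl.scan_csv(...) code node.
    df_sampled = ff.read_csv("events.csv", n_rows=1000)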
+ def _build_polars_code_args(
+         source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+         separator: str,
+         has_header: bool,
+         new_columns: Optional[List[str]],
+         comment_prefix: Optional[str],
+         quote_char: Optional[str],
+         skip_rows: int,
+         skip_lines: int,
+         schema: Optional[SchemaDict],
+         schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
+         null_values: Optional[Union[str, List[str], Dict[str, str]]],
+         missing_utf8_is_empty_string: bool,
+         ignore_errors: bool,
+         try_parse_dates: bool,
+         infer_schema_length: Optional[int],
+         n_rows: Optional[int],
+         encoding: CsvEncoding,
+         low_memory: bool,
+         rechunk: bool,
+         storage_options: Optional[Dict[str, Any]],
+         skip_rows_after_header: int,
+         row_index_name: Optional[str],
+         row_index_offset: int,
+         eol_char: str,
+         raise_if_empty: bool,
+         truncate_ragged_lines: bool,
+         decimal_comma: bool,
+         glob: bool,
+         cache: bool,
+         with_column_names: Optional[Callable[[List[str]], List[str]]],
+         **other_options: Any
+ ) -> str:
+     source_repr: str
+     if isinstance(source, (str, Path)):
+         source_repr = repr(str(source))
+     elif isinstance(source, list):
+         source_repr = repr([str(p) for p in source])
+     elif isinstance(source, bytes):
+         source_repr = "source_bytes_obj"
+     elif hasattr(source, 'read'):
+         source_repr = "source_file_like_obj"
+     else:
+         source_repr = repr(source)
+
+     param_mapping = {
+         'has_header': (True, lambda x: str(x)),
+         'separator': (',', lambda x: repr(str(x))),
+         'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
+         'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
+         'skip_rows': (0, str),
+         'skip_lines': (0, str),
+         'schema': (None, lambda x: repr(x) if x is not None else 'None'),
+         'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
+         'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
+         'missing_utf8_is_empty_string': (False, str),
+         'ignore_errors': (False, str),
+         'cache': (True, str),
+         'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
+         'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
+         'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
+         'encoding': ('utf8', lambda x: repr(str(x))),
+         'low_memory': (False, str),
+         'rechunk': (False, str),
+         'skip_rows_after_header': (0, str),
+         'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
+         'row_index_offset': (0, str),
+         'try_parse_dates': (False, str),
+         'eol_char': ('\n', lambda x: repr(str(x))),
+         'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
+         'raise_if_empty': (True, str),
+         'truncate_ragged_lines': (False, str),
+         'decimal_comma': (False, str),
+         'glob': (True, str),
+         'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
+     }
+
+     all_vars = locals()
+     kwargs_list = []
+
+     for param_name_key, (default_value, format_func) in param_mapping.items():
+         value = all_vars.get(param_name_key)
+         formatted_value = format_func(value)
+         kwargs_list.append(f"{param_name_key}={formatted_value}")
+
+     if other_options:
+         for k, v in other_options.items():
+             kwargs_list.append(f"{k}={repr(v)}")
+
+     kwargs_str = ",\n ".join(kwargs_list)
+
+     if kwargs_str:
+         polars_code = f"output_df = pl.scan_csv(\n {source_repr},\n {kwargs_str}\n)"
+     else:
+         polars_code = f"output_df = pl.scan_csv({source_repr})"
+
+     return polars_code
+
+
+ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
+                  convert_to_absolute_path: bool = True, **options) -> FlowFrame:
+     """
+     Read a Parquet file into a FlowFrame.
+
+     Args:
+         file_path: Path to Parquet file
+         flow_graph: if you want to add it to an existing graph
+         description: if you want to add a readable name in the frontend (advised)
+         convert_to_absolute_path: If the path needs to be set to a fixed location
+         **options: Options for polars.read_parquet
+
+     Returns:
+         A FlowFrame with the Parquet data
+     """
+     if '~' in file_path:
+         file_path = os.path.expanduser(file_path)
+     node_id = generate_node_id()
+
+     if flow_graph is None:
+         flow_graph = create_flow_graph()
+
+     flow_id = flow_graph.flow_id
+
+     received_table = input_schema.ReceivedTable(
+         file_type='parquet',
+         path=file_path,
+         name=Path(file_path).name,
+     )
+     if convert_to_absolute_path:
+         received_table.path = received_table.abs_file_path
+
+     read_node = input_schema.NodeRead(
+         flow_id=flow_id,
+         node_id=node_id,
+         received_file=received_table,
+         pos_x=100,
+         pos_y=100,
+         is_setup=True,
+         description=description
+     )
+
+     flow_graph.add_read(read_node)
+
+     return FlowFrame(
+         data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+         flow_graph=flow_graph,
+         node_id=node_id
+     )
+
+
+ def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
+     """
+     Create a FlowFrame from a dictionary or list of dictionaries.
+
+     Args:
+         data: Dictionary of lists or list of dictionaries
+         flow_graph: if you want to add it to an existing graph
+         description: if you want to add a readable name in the frontend (advised)
+     Returns:
+         A FlowFrame with the data
+     """
+     # Create new node ID
+     node_id = generate_node_id()
+
+     if not flow_graph:
+         flow_graph = create_flow_graph()
+     flow_id = flow_graph.flow_id
+
+     input_node = input_schema.NodeManualInput(
+         flow_id=flow_id,
+         node_id=node_id,
+         raw_data=FlowDataEngine(data).to_pylist(),
+         pos_x=100,
+         pos_y=100,
+         is_setup=True,
+         description=description
+     )
+
+     # Add to graph
+     flow_graph.add_manual_input(input_node)
+
+     # Return new frame
+     return FlowFrame(
+         data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+         flow_graph=flow_graph,
+         node_id=node_id
+     )
+
+
+ def concat(frames: List['FlowFrame'],
+            how: str = 'vertical',
+            rechunk: bool = False,
+            parallel: bool = True,
+            description: str = None) -> 'FlowFrame':
+     """
+     Concatenate multiple FlowFrames into one.
+
+     Parameters
+     ----------
+     frames : List[FlowFrame]
+         List of FlowFrames to concatenate
+     how : str, default 'vertical'
+         How to combine the FlowFrames (see concat method documentation)
+     rechunk : bool, default False
+         Whether to ensure contiguous memory in result
+     parallel : bool, default True
+         Whether to use parallel processing for the operation
+     description : str, optional
+         Description of this operation
+
+     Returns
+     -------
+     FlowFrame
+         A new FlowFrame with the concatenated data
+     """
+     if not frames:
+         raise ValueError("No frames provided to concat_frames")
+     if len(frames) == 1:
+         return frames[0]
+     # Use first frame's concat method with remaining frames
+     first_frame = frames[0]
+     remaining_frames = frames[1:]
+
+     return first_frame.concat(remaining_frames, how=how,
+                               rechunk=rechunk, parallel=parallel,
+                               description=description)
+
+
+ def scan_csv(
+         source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+         *,
+         flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
+         separator: str = ',',
+         convert_to_absolute_path: bool = True,
+         description: Optional[str] = None,
+         has_header: bool = True,
+         new_columns: Optional[List[str]] = None,
+         comment_prefix: Optional[str] = None,
+         quote_char: Optional[str] = '"',
+         skip_rows: int = 0,
+         skip_lines: int = 0,
+         schema: Optional[SchemaDict] = None,
+         schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
+         null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
+         missing_utf8_is_empty_string: bool = False,
+         ignore_errors: bool = False,
+         try_parse_dates: bool = False,
+         infer_schema: bool = True,
+         infer_schema_length: Optional[int] = 100,
+         n_rows: Optional[int] = None,
+         encoding: CsvEncoding = 'utf8',
+         low_memory: bool = False,
+         rechunk: bool = False,
+         storage_options: Optional[Dict[str, Any]] = None,
+         skip_rows_after_header: int = 0,
+         row_index_name: Optional[str] = None,
+         row_index_offset: int = 0,
+         eol_char: str = '\n',
+         raise_if_empty: bool = True,
+         truncate_ragged_lines: bool = False,
+         decimal_comma: bool = False,
+         glob: bool = True,
+         cache: bool = True,
+         with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
+         **other_options: Any
+ ) -> FlowFrame:
+     """
+     Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
+
+     This method is the same as read_csv but is provided for compatibility with
+     the polars API where scan_csv returns a LazyFrame.
+
+     See read_csv for full documentation.
+     """
+     return read_csv(
+         source=source,
+         flow_graph=flow_graph,
+         separator=separator,
+         convert_to_absolute_path=convert_to_absolute_path,
+         description=description,
+         has_header=has_header,
+         new_columns=new_columns,
+         comment_prefix=comment_prefix,
+         quote_char=quote_char,
+         skip_rows=skip_rows,
+         skip_lines=skip_lines,
+         schema=schema,
+         schema_overrides=schema_overrides,
+         null_values=null_values,
+         missing_utf8_is_empty_string=missing_utf8_is_empty_string,
+         ignore_errors=ignore_errors,
+         try_parse_dates=try_parse_dates,
+         infer_schema=infer_schema,
+         infer_schema_length=infer_schema_length,
+         n_rows=n_rows,
+         encoding=encoding,
+         low_memory=low_memory,
+         rechunk=rechunk,
+         storage_options=storage_options,
+         skip_rows_after_header=skip_rows_after_header,
+         row_index_name=row_index_name,
+         row_index_offset=row_index_offset,
+         eol_char=eol_char,
+         raise_if_empty=raise_if_empty,
+         truncate_ragged_lines=truncate_ragged_lines,
+         decimal_comma=decimal_comma,
+         glob=glob,
+         cache=cache,
+         with_column_names=with_column_names,
+         **other_options
+     )
+
+
+ def scan_parquet(
+         file_path,
+         *,
+         flow_graph: FlowGraph = None,
+         description: str = None,
+         convert_to_absolute_path: bool = True,
+         **options
+ ) -> FlowFrame:
+     """
+     Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
+
+     This method is the same as read_parquet but is provided for compatibility with
+     the polars API where scan_parquet returns a LazyFrame.
+
+     See read_parquet for full documentation.
+     """
+     return read_parquet(
+         file_path=file_path,
+         flow_graph=flow_graph,
+         description=description,
+         convert_to_absolute_path=convert_to_absolute_path,
+         **options
+     )
+
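
Since every reader in this module accepts a flow_graph argument ("if you want to add it to an existing graph"), several sources can be registered as nodes of one graph. A minimal sketch, again assuming the `ff` alias; create_flow_graph comes from flowfile_frame.utils, as in the module's own imports:

    from flowfile_frame.utils import create_flow_graph

    graph = create_flow_graph()
    customers = ff.read_parquet("customers.parquet", flow_graph=graph,
                                description="Customer master data")
    orders = ff.scan_csv("orders.csv", flow_graph=graph,
                         description="Raw orders")
    # Both readers now live as nodes in the same FlowGraph and show up together
    # in the frontend with the descriptions given above.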