Flowfile 0.3.1.2__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/api.py +5 -3
- flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
- flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
- flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
- flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
- flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
- flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
- flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
- flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
- flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
- flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
- flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
- flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
- flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
- flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
- flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
- flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
- flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
- flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
- flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
- flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
- flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
- flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
- flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
- flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
- flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
- flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
- flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
- flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
- flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
- flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
- flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
- flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
- flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
- flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
- flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
- flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/METADATA +1 -3
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/RECORD +62 -64
- flowfile_core/configs/node_store/nodes.py +2 -4
- flowfile_core/flowfile/FlowfileFlow.py +72 -12
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1 -1
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +32 -1
- flowfile_core/flowfile/flow_graph_utils.py +320 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
- flowfile_core/schemas/input_schema.py +2 -10
- flowfile_frame/__init__.py +1 -1
- flowfile_frame/flow_frame.py +455 -51
- flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
- flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/LICENSE +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/WHEEL +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/entry_points.txt +0 -0
flowfile_frame/flow_frame.py
CHANGED
```diff
@@ -1,14 +1,17 @@
-import
+import logging
 import os
-from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
+from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
 from pathlib import Path
 
+import io
 import re
 import polars as pl
-from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation
+from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation, IO, Mapping, PolarsDataType,
+                            Sequence, CsvEncoding)
 
 # Assume these imports are correct from your original context
 from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
+from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
 from flowfile_core.schemas import input_schema, transform_schema
```
```diff
@@ -22,6 +25,14 @@ from flowfile_frame.join import _normalize_columns_to_list, _create_join_mapping
 node_id_counter = 0
 
 
+logging.basicConfig(
+    level=logging.INFO,
+    format='[%(levelname)s] %(message)s'
+)
+
+# Create and export the logger
+logger = logging.getLogger('flow_frame')
+
 def _to_string_val(v) -> str:
     if isinstance(v, str):
         return f"'{v}'"
```
```diff
@@ -478,14 +489,25 @@ class FlowFrame:
         FlowFrame
             New FlowFrame with join operation applied.
         """
-        new_node_id = generate_node_id()
-        print('new node id', new_node_id)
         use_polars_code = not(maintain_order is None and
                               coalesce is None and
                               nulls_equal is False and
                               validate is None and
                               suffix == '_right')
         join_mappings = None
+        if self.flow_graph.flow_id != other.flow_graph.flow_id:
+            combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
+            new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
+            new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
+            if new_other_node_id is None or new_self_node_id is None:
+                raise ValueError("Cannot remap the nodes")
+            self.node_id = new_self_node_id
+            other.node_id = new_other_node_id
+            self.flow_graph = combined_graph
+            other.flow_graph = combined_graph
+            global node_id_counter
+            node_id_counter += len(combined_graph.nodes)
+        new_node_id = generate_node_id()
         if on is not None:
             left_columns = right_columns = _normalize_columns_to_list(on)
         elif left_on is not None and right_on is not None:
```
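The hunk above is the new cross-graph handling in `FlowFrame.join`: when the two frames live on different `FlowGraph`s, the graphs are merged via `combine_flow_graphs_with_mapping` (added in `flowfile_core/flowfile/flow_graph_utils.py`, +320 lines in this release) and both frames are remapped onto the combined graph before the join node is created. A minimal usage sketch follows; the `ff` import alias, the top-level `from_dict` export, and the default of one graph per frame are assumptions based on names visible in this diff, not confirmed API:

```python
import flowfile_frame as ff  # import alias assumed

# Two frames built independently, each on its own FlowGraph (assumed default).
left = ff.from_dict({"id": [1, 2], "name": ["a", "b"]})
right = ff.from_dict({"id": [1, 2], "score": [10, 20]})

# As of 0.3.2, join() detects the differing flow_ids, merges both graphs,
# remaps each frame's node_id, and adds the join node to the combined graph.
joined = left.join(right, on="id")
```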
```diff
@@ -597,10 +619,8 @@ class FlowFrame:
         if (len(columns) == 1 and isinstance(columns[0], Expr)
                 and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
             return self._add_number_of_records(new_node_id, description)
-
-        # Handle simple column names
         if all(isinstance(col_, (str, Column)) for col_ in columns):
-
+
             select_inputs = [
                 transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
                 for col_ in columns
```
```diff
@@ -946,7 +966,7 @@ class FlowFrame:
             input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
                                      function=transform_schema.FunctionInput(
                                          function=flowfile_formula,
-                                         field=transform_schema.FieldInput(name=output_column_name)),
+                                         field=transform_schema.FieldInput(name=output_column_name, data_type='Auto')),
                                      description=description))
         self.flow_graph.add_formula(function_settings)
         return self._create_child_frame(new_node_id)
```
```diff
@@ -1241,14 +1261,24 @@ class FlowFrame:
         FlowFrame
             A new FlowFrame with the concatenated data
         """
-        new_node_id = generate_node_id()
-
         # Convert single FlowFrame to list
         if isinstance(other, FlowFrame):
            others = [other]
         else:
             others = other
-
+        all_graphs = []
+        all_graph_ids = []
+        for g in [self.flow_graph] + [f.flow_graph for f in others]:
+            if g.flow_id not in all_graph_ids:
+                all_graph_ids.append(g.flow_id)
+                all_graphs.append(g)
+        if len(all_graphs) > 1:
+            combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
+            for f in [self] + other:
+                f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
+            global node_id_counter
+            node_id_counter += len(combined_graph.nodes)
+        new_node_id = generate_node_id()
         use_native = how == "diagonal_relaxed" and parallel and not rechunk
 
         if use_native:
```
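`concat` gets the same treatment: distinct source graphs are deduplicated by `flow_id` and merged with `combine_flow_graphs_with_mapping` before the concat node is added. A sketch under the same assumptions as the join example above:

```python
# Three frames on three separate graphs (assumed one-graph-per-frame default).
frames = [ff.from_dict({"x": [i], "y": [i * 2]}) for i in range(3)]

# As of 0.3.2, concat() collects the distinct graphs, combines them,
# and remaps every frame's node_id into the combined graph.
combined = ff.concat(frames, how="diagonal_relaxed")
```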
```diff
@@ -1902,64 +1932,328 @@ def count(expr):
     return expr.count()
 
 
-def read_csv(
-
-
+def read_csv(
+        source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+        *,
+        flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
+        separator: str = ',',
+        convert_to_absolute_path: bool = True,
+        description: Optional[str] = None,
+        has_header: bool = True,
+        new_columns: Optional[List[str]] = None,
+        comment_prefix: Optional[str] = None,
+        quote_char: Optional[str] = '"',
+        skip_rows: int = 0,
+        skip_lines: int = 0,
+        schema: Optional[SchemaDict] = None,
+        schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
+        null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
+        missing_utf8_is_empty_string: bool = False,
+        ignore_errors: bool = False,
+        try_parse_dates: bool = False,
+        infer_schema: bool = True,
+        infer_schema_length: Optional[int] = 100,
+        n_rows: Optional[int] = None,
+        encoding: CsvEncoding = 'utf8',
+        low_memory: bool = False,
+        rechunk: bool = False,
+        storage_options: Optional[Dict[str, Any]] = None,
+        skip_rows_after_header: int = 0,
+        row_index_name: Optional[str] = None,
+        row_index_offset: int = 0,
+        eol_char: str = '\n',
+        raise_if_empty: bool = True,
+        truncate_ragged_lines: bool = False,
+        decimal_comma: bool = False,
+        glob: bool = True,
+        cache: bool = True,
+        with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
+        **other_options: Any
+) -> FlowFrame:
     """
     Read a CSV file into a FlowFrame.
 
+    This function uses the native FlowGraph implementation when the parameters
+    fall within the supported range, and falls back to using Polars' scan_csv implementation
+    for more advanced features.
+
     Args:
-
+        source: Path(s) to CSV file(s), or a file-like object.
         flow_graph: if you want to add it to an existing graph
         separator: Single byte character to use as separator in the file.
         convert_to_absolute_path: If the path needs to be set to a fixed location
         description: if you want to add a readable name in the frontend (advised)
-
+
+        # Polars.scan_csv aligned parameters
+        has_header: Indicate if the first row of the dataset is a header or not.
+        new_columns: Rename columns after selection.
+        comment_prefix: String that indicates a comment line if found at beginning of line.
+        quote_char: Character used for quoting. None to disable.
+        skip_rows: Start reading after this many rows.
+        skip_lines: Skip this many lines by newline char only.
+        schema: Schema to use when reading the CSV.
+        schema_overrides: Schema overrides for specific columns.
+        null_values: Values to interpret as null.
+        missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
+        ignore_errors: Try to keep reading lines if some parsing errors occur.
+        try_parse_dates: Try to automatically parse dates.
+        infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
+        infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
+        n_rows: Stop reading after this many rows.
+        encoding: Character encoding to use.
+        low_memory: Reduce memory usage at the cost of performance.
+        rechunk: Ensure data is in contiguous memory layout after parsing.
+        storage_options: Options for fsspec for cloud storage.
+        skip_rows_after_header: Skip rows after header.
+        row_index_name: Name of the row index column.
+        row_index_offset: Start value for the row index.
+        eol_char: End of line character.
+        raise_if_empty: Raise error if file is empty.
+        truncate_ragged_lines: Truncate lines with too many values.
+        decimal_comma: Parse floats with decimal comma.
+        glob: Use glob pattern for file path (if source is a string).
+        cache: Cache the result after reading (Polars default True).
+        with_column_names: Apply a function over the column names.
+        other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
 
     Returns:
-        A FlowFrame with the CSV data
+        A FlowFrame with the CSV data.
     """
-    #
-    node_id = generate_node_id()
+    node_id = generate_node_id()  # Assuming generate_node_id is defined
     if flow_graph is None:
-        flow_graph = create_flow_graph()
-
+        flow_graph = create_flow_graph()  # Assuming create_flow_graph is defined
     flow_id = flow_graph.flow_id
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    current_source_path_for_native = None
+    if isinstance(source, (str, os.PathLike)):
+        current_source_path_for_native = str(source)
+        if '~' in current_source_path_for_native:
+            current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
+    elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
+        current_source_path_for_native = str(source[0]) if source else None
+        if current_source_path_for_native and '~' in current_source_path_for_native:
+            current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
+    elif isinstance(source, (io.BytesIO, io.StringIO)):
+        logger.warning("Read from bytes io from csv not supported, converting data to raw data")
+        return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
+    actual_infer_schema_length: Optional[int]
+    if not infer_schema:
+        actual_infer_schema_length = 0
+    else:
+        actual_infer_schema_length = infer_schema_length
+    can_use_native = (
+        current_source_path_for_native is not None and
+        comment_prefix is None and
+        skip_lines == 0 and
+        schema is None and
+        schema_overrides is None and
+        null_values is None and
+        not missing_utf8_is_empty_string and
+        not try_parse_dates and
+        n_rows is None and
+        not low_memory and
+        not rechunk and
+        storage_options is None and
+        skip_rows_after_header == 0 and
+        row_index_name is None and
+        row_index_offset == 0 and
+        eol_char == '\n' and
+        not decimal_comma and
+        new_columns is None and
+        glob is True
     )
+    if can_use_native and current_source_path_for_native:
+        received_table = input_schema.ReceivedTable(
+            file_type='csv',
+            path=current_source_path_for_native,
+            name=Path(current_source_path_for_native).name,
+            delimiter=separator,
+            has_headers=has_header,
+            encoding=encoding,
+            starting_from_line=skip_rows,
+            quote_char=quote_char if quote_char is not None else '"',
+            infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
+            truncate_ragged_lines=truncate_ragged_lines,
+            ignore_errors=ignore_errors,
+            row_delimiter=eol_char
+        )
+        if convert_to_absolute_path:
+            try:
+                received_table.set_absolute_filepath()
+                received_table.path = received_table.abs_file_path
+            except Exception as e:
+                print(f"Warning: Could not determine absolute path for {current_source_path_for_native}: {e}")
 
-
-
+        read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
+        read_node = input_schema.NodeRead(
+            flow_id=flow_id,
+            node_id=node_id,
+            received_file=received_table,
+            pos_x=100,
+            pos_y=100,
+            is_setup=True,
+            description=read_node_description
+        )
+        flow_graph.add_read(read_node)
+        result_frame = FlowFrame(
+            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+            flow_graph=flow_graph,
+            node_id=node_id
+        )
+        return result_frame
+    else:
+        polars_source_arg = source
+        polars_code = _build_polars_code_args(
+            source=polars_source_arg,
+            separator=separator,
+            has_header=has_header,
+            new_columns=new_columns,
+            comment_prefix=comment_prefix,
+            quote_char=quote_char,
+            skip_rows=skip_rows,
+            skip_lines=skip_lines,
+            schema=schema,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+            missing_utf8_is_empty_string=missing_utf8_is_empty_string,
+            ignore_errors=ignore_errors,
+            try_parse_dates=try_parse_dates,
+            infer_schema_length=actual_infer_schema_length,
+            n_rows=n_rows,
+            encoding=encoding,
+            low_memory=low_memory,
+            rechunk=rechunk,
+            storage_options=storage_options,
+            skip_rows_after_header=skip_rows_after_header,
+            row_index_name=row_index_name,
+            row_index_offset=row_index_offset,
+            eol_char=eol_char,
+            raise_if_empty=raise_if_empty,
+            truncate_ragged_lines=truncate_ragged_lines,
+            decimal_comma=decimal_comma,
+            glob=glob,
+            cache=cache,
+            with_column_names=with_column_names,
+            **other_options
+        )
+        polars_code_node_description = description or "Read CSV with Polars scan_csv"
+        if isinstance(source, (str, os.PathLike)):
+            polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
+        elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
+            polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
 
-
-
-
-
-
-
-
-
+        # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
+        polars_code_settings = input_schema.NodePolarsCode(
+            flow_id=flow_id,
+            node_id=node_id,
+            polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
+            is_setup=True,
+            description=polars_code_node_description
+        )
+        flow_graph.add_polars_code(polars_code_settings)
+        return FlowFrame(
+            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+            flow_graph=flow_graph,
+            node_id=node_id,
+        )
 
-
+def _build_polars_code_args(
+        source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+        separator: str,
+        has_header: bool,
+        new_columns: Optional[List[str]],
+        comment_prefix: Optional[str],
+        quote_char: Optional[str],
+        skip_rows: int,
+        skip_lines: int,
+        schema: Optional[SchemaDict],
+        schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
+        null_values: Optional[Union[str, List[str], Dict[str, str]]],
+        missing_utf8_is_empty_string: bool,
+        ignore_errors: bool,
+        try_parse_dates: bool,
+        infer_schema_length: Optional[int],
+        n_rows: Optional[int],
+        encoding: CsvEncoding,
+        low_memory: bool,
+        rechunk: bool,
+        storage_options: Optional[Dict[str, Any]],
+        skip_rows_after_header: int,
+        row_index_name: Optional[str],
+        row_index_offset: int,
+        eol_char: str,
+        raise_if_empty: bool,
+        truncate_ragged_lines: bool,
+        decimal_comma: bool,
+        glob: bool,
+        cache: bool,
+        with_column_names: Optional[Callable[[List[str]], List[str]]],
+        **other_options: Any
+) -> str:
+    source_repr: str
+    if isinstance(source, (str, Path)):
+        source_repr = repr(str(source))
+    elif isinstance(source, list):
+        source_repr = repr([str(p) for p in source])
+    elif isinstance(source, bytes):
+        source_repr = "source_bytes_obj"
+    elif hasattr(source, 'read'):
+        source_repr = "source_file_like_obj"
+    else:
+        source_repr = repr(source)
+
+    param_mapping = {
+        'has_header': (True, lambda x: str(x)),
+        'separator': (',', lambda x: repr(str(x))),
+        'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
+        'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
+        'skip_rows': (0, str),
+        'skip_lines': (0, str),
+        'schema': (None, lambda x: repr(x) if x is not None else 'None'),
+        'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
+        'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
+        'missing_utf8_is_empty_string': (False, str),
+        'ignore_errors': (False, str),
+        'cache': (True, str),
+        'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
+        'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
+        'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
+        'encoding': ('utf8', lambda x: repr(str(x))),
+        'low_memory': (False, str),
+        'rechunk': (False, str),
+        'skip_rows_after_header': (0, str),
+        'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
+        'row_index_offset': (0, str),
+        'try_parse_dates': (False, str),
+        'eol_char': ('\n', lambda x: repr(str(x))),
+        'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
+        'raise_if_empty': (True, str),
+        'truncate_ragged_lines': (False, str),
+        'decimal_comma': (False, str),
+        'glob': (True, str),
+        'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
+    }
+
+    all_vars = locals()
+    kwargs_list = []
+
+    for param_name_key, (default_value, format_func) in param_mapping.items():
+        value = all_vars.get(param_name_key)
+        formatted_value = format_func(value)
+        kwargs_list.append(f"{param_name_key}={formatted_value}")
+
+    if other_options:
+        for k, v in other_options.items():
+            kwargs_list.append(f"{k}={repr(v)}")
+
+    kwargs_str = ",\n    ".join(kwargs_list)
+
+    if kwargs_str:
+        polars_code = f"output_df = pl.scan_csv(\n    {source_repr},\n    {kwargs_str}\n)"
+    else:
+        polars_code = f"output_df = pl.scan_csv({source_repr})"
 
-    return
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
+    return polars_code
 
 
 def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
```
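The rewritten `read_csv` decides between two node types: if every option sits inside the `can_use_native` envelope it emits a native read node (`input_schema.NodeRead`); otherwise it renders a `pl.scan_csv(...)` string via `_build_polars_code_args` and wraps it in a polars-code node. A sketch of both paths, under the same assumed `ff` import alias as above (file names are illustrative):

```python
# Native path: all parameters fall inside the can_use_native check,
# so a NodeRead with a ReceivedTable is added to the graph.
df_native = ff.read_csv("sales.csv", separator=";", has_header=True)

# Fallback path: null_values is one of the options that fails the
# can_use_native check, so a generated pl.scan_csv(...) polars-code
# node is added instead.
df_lazy = ff.read_csv("sales.csv", null_values=["NA", "n/a"])
```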
```diff
@@ -2091,3 +2385,113 @@ def concat(frames: List['FlowFrame'],
     return first_frame.concat(remaining_frames, how=how,
                               rechunk=rechunk, parallel=parallel,
                               description=description)
+
+
+def scan_csv(
+        source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+        *,
+        flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
+        separator: str = ',',
+        convert_to_absolute_path: bool = True,
+        description: Optional[str] = None,
+        has_header: bool = True,
+        new_columns: Optional[List[str]] = None,
+        comment_prefix: Optional[str] = None,
+        quote_char: Optional[str] = '"',
+        skip_rows: int = 0,
+        skip_lines: int = 0,
+        schema: Optional[SchemaDict] = None,
+        schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
+        null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
+        missing_utf8_is_empty_string: bool = False,
+        ignore_errors: bool = False,
+        try_parse_dates: bool = False,
+        infer_schema: bool = True,
+        infer_schema_length: Optional[int] = 100,
+        n_rows: Optional[int] = None,
+        encoding: CsvEncoding = 'utf8',
+        low_memory: bool = False,
+        rechunk: bool = False,
+        storage_options: Optional[Dict[str, Any]] = None,
+        skip_rows_after_header: int = 0,
+        row_index_name: Optional[str] = None,
+        row_index_offset: int = 0,
+        eol_char: str = '\n',
+        raise_if_empty: bool = True,
+        truncate_ragged_lines: bool = False,
+        decimal_comma: bool = False,
+        glob: bool = True,
+        cache: bool = True,
+        with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
+        **other_options: Any
+) -> FlowFrame:
+    """
+    Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
+
+    This method is the same as read_csv but is provided for compatibility with
+    the polars API where scan_csv returns a LazyFrame.
+
+    See read_csv for full documentation.
+    """
+    return read_csv(
+        source=source,
+        flow_graph=flow_graph,
+        separator=separator,
+        convert_to_absolute_path=convert_to_absolute_path,
+        description=description,
+        has_header=has_header,
+        new_columns=new_columns,
+        comment_prefix=comment_prefix,
+        quote_char=quote_char,
+        skip_rows=skip_rows,
+        skip_lines=skip_lines,
+        schema=schema,
+        schema_overrides=schema_overrides,
+        null_values=null_values,
+        missing_utf8_is_empty_string=missing_utf8_is_empty_string,
+        ignore_errors=ignore_errors,
+        try_parse_dates=try_parse_dates,
+        infer_schema=infer_schema,
+        infer_schema_length=infer_schema_length,
+        n_rows=n_rows,
+        encoding=encoding,
+        low_memory=low_memory,
+        rechunk=rechunk,
+        storage_options=storage_options,
+        skip_rows_after_header=skip_rows_after_header,
+        row_index_name=row_index_name,
+        row_index_offset=row_index_offset,
+        eol_char=eol_char,
+        raise_if_empty=raise_if_empty,
+        truncate_ragged_lines=truncate_ragged_lines,
+        decimal_comma=decimal_comma,
+        glob=glob,
+        cache=cache,
+        with_column_names=with_column_names,
+        **other_options
+    )
+
+
+def scan_parquet(
+        file_path,
+        *,
+        flow_graph: FlowGraph = None,
+        description: str = None,
+        convert_to_absolute_path: bool = True,
+        **options
+) -> FlowFrame:
+    """
+    Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
+
+    This method is the same as read_parquet but is provided for compatibility with
+    the polars API where scan_parquet returns a LazyFrame.
+
+    See read_parquet for full documentation.
+    """
+    return read_parquet(
+        file_path=file_path,
+        flow_graph=flow_graph,
+        description=description,
+        convert_to_absolute_path=convert_to_absolute_path,
+        **options
+    )
```
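The new `scan_csv` / `scan_parquet` functions are thin aliases that delegate to `read_csv` / `read_parquet`, mirroring polars' lazy-API naming. Usage is interchangeable; the `ff` import alias is assumed as above:

```python
# Both pairs are equivalent; the scan_* names exist for polars API parity.
a = ff.scan_csv("events.csv")
b = ff.read_csv("events.csv")

p = ff.scan_parquet("events.parquet")
q = ff.read_parquet("events.parquet")
```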