Flowfile 0.3.1.2__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. flowfile/api.py +5 -3
  2. flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
  3. flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
  4. flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
  5. flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
  6. flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
  7. flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
  8. flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
  9. flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
  10. flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
  11. flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
  12. flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
  13. flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
  14. flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
  15. flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
  16. flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
  17. flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
  18. flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
  19. flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
  20. flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
  21. flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
  22. flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
  23. flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
  24. flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
  25. flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
  26. flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
  27. flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
  28. flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
  29. flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
  30. flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
  31. flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
  32. flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
  33. flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
  34. flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
  35. flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
  36. flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
  37. flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
  38. flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
  39. flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
  40. flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
  41. flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
  42. flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
  43. flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
  44. flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
  45. flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
  46. flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
  47. flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
  48. flowfile/web/static/index.html +1 -1
  49. {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/METADATA +1 -3
  50. {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/RECORD +62 -64
  51. flowfile_core/configs/node_store/nodes.py +2 -4
  52. flowfile_core/flowfile/FlowfileFlow.py +72 -12
  53. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1 -1
  54. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +32 -1
  55. flowfile_core/flowfile/flow_graph_utils.py +320 -0
  56. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
  57. flowfile_core/schemas/input_schema.py +2 -10
  58. flowfile_frame/__init__.py +1 -1
  59. flowfile_frame/flow_frame.py +455 -51
  60. flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
  61. flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
  62. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
  63. {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/LICENSE +0 -0
  64. {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/WHEEL +0 -0
  65. {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,17 @@
1
- import uuid
1
+ import logging
2
2
  import os
3
- from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
3
+ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
4
4
  from pathlib import Path
5
5
 
6
+ import io
6
7
  import re
7
8
  import polars as pl
8
- from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation
9
+ from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation, IO, Mapping, PolarsDataType,
10
+ Sequence, CsvEncoding)
9
11
 
10
12
  # Assume these imports are correct from your original context
11
13
  from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
14
+ from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
12
15
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
13
16
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
14
17
  from flowfile_core.schemas import input_schema, transform_schema
@@ -22,6 +25,14 @@ from flowfile_frame.join import _normalize_columns_to_list, _create_join_mapping
22
25
  node_id_counter = 0
23
26
 
24
27
 
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='[%(levelname)s] %(message)s'
31
+ )
32
+
33
+ # Create and export the logger
34
+ logger = logging.getLogger('flow_frame')
35
+
25
36
  def _to_string_val(v) -> str:
26
37
  if isinstance(v, str):
27
38
  return f"'{v}'"
@@ -478,14 +489,25 @@ class FlowFrame:
478
489
  FlowFrame
479
490
  New FlowFrame with join operation applied.
480
491
  """
481
- new_node_id = generate_node_id()
482
- print('new node id', new_node_id)
483
492
  use_polars_code = not(maintain_order is None and
484
493
  coalesce is None and
485
494
  nulls_equal is False and
486
495
  validate is None and
487
496
  suffix == '_right')
488
497
  join_mappings = None
498
+ if self.flow_graph.flow_id != other.flow_graph.flow_id:
499
+ combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
500
+ new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
501
+ new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
502
+ if new_other_node_id is None or new_self_node_id is None:
503
+ raise ValueError("Cannot remap the nodes")
504
+ self.node_id = new_self_node_id
505
+ other.node_id = new_other_node_id
506
+ self.flow_graph = combined_graph
507
+ other.flow_graph = combined_graph
508
+ global node_id_counter
509
+ node_id_counter += len(combined_graph.nodes)
510
+ new_node_id = generate_node_id()
489
511
  if on is not None:
490
512
  left_columns = right_columns = _normalize_columns_to_list(on)
491
513
  elif left_on is not None and right_on is not None:
@@ -597,10 +619,8 @@ class FlowFrame:
597
619
  if (len(columns) == 1 and isinstance(columns[0], Expr)
598
620
  and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
599
621
  return self._add_number_of_records(new_node_id, description)
600
-
601
- # Handle simple column names
602
622
  if all(isinstance(col_, (str, Column)) for col_ in columns):
603
- # Create select inputs
623
+
604
624
  select_inputs = [
605
625
  transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
606
626
  for col_ in columns
@@ -946,7 +966,7 @@ class FlowFrame:
946
966
  input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
947
967
  function=transform_schema.FunctionInput(
948
968
  function=flowfile_formula,
949
- field=transform_schema.FieldInput(name=output_column_name)),
969
+ field=transform_schema.FieldInput(name=output_column_name, data_type='Auto')),
950
970
  description=description))
951
971
  self.flow_graph.add_formula(function_settings)
952
972
  return self._create_child_frame(new_node_id)
@@ -1241,14 +1261,24 @@ class FlowFrame:
1241
1261
  FlowFrame
1242
1262
  A new FlowFrame with the concatenated data
1243
1263
  """
1244
- new_node_id = generate_node_id()
1245
-
1246
1264
  # Convert single FlowFrame to list
1247
1265
  if isinstance(other, FlowFrame):
1248
1266
  others = [other]
1249
1267
  else:
1250
1268
  others = other
1251
-
1269
+ all_graphs = []
1270
+ all_graph_ids = []
1271
+ for g in [self.flow_graph] + [f.flow_graph for f in others]:
1272
+ if g.flow_id not in all_graph_ids:
1273
+ all_graph_ids.append(g.flow_id)
1274
+ all_graphs.append(g)
1275
+ if len(all_graphs) > 1:
1276
+ combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
1277
+ for f in [self] + other:
1278
+ f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
1279
+ global node_id_counter
1280
+ node_id_counter += len(combined_graph.nodes)
1281
+ new_node_id = generate_node_id()
1252
1282
  use_native = how == "diagonal_relaxed" and parallel and not rechunk
1253
1283
 
1254
1284
  if use_native:
@@ -1902,64 +1932,328 @@ def count(expr):
1902
1932
  return expr.count()
1903
1933
 
1904
1934
 
1905
- def read_csv(file_path, *, flow_graph: FlowGraph = None, separator: str = ';',
1906
- convert_to_absolute_path: bool = True,
1907
- description: str = None, **options):
1935
+ def read_csv(
1936
+ source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
1937
+ *,
1938
+ flow_graph: Optional[Any] = None, # Using Any for FlowGraph placeholder
1939
+ separator: str = ',',
1940
+ convert_to_absolute_path: bool = True,
1941
+ description: Optional[str] = None,
1942
+ has_header: bool = True,
1943
+ new_columns: Optional[List[str]] = None,
1944
+ comment_prefix: Optional[str] = None,
1945
+ quote_char: Optional[str] = '"',
1946
+ skip_rows: int = 0,
1947
+ skip_lines: int = 0,
1948
+ schema: Optional[SchemaDict] = None,
1949
+ schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
1950
+ null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
1951
+ missing_utf8_is_empty_string: bool = False,
1952
+ ignore_errors: bool = False,
1953
+ try_parse_dates: bool = False,
1954
+ infer_schema: bool = True,
1955
+ infer_schema_length: Optional[int] = 100,
1956
+ n_rows: Optional[int] = None,
1957
+ encoding: CsvEncoding = 'utf8',
1958
+ low_memory: bool = False,
1959
+ rechunk: bool = False,
1960
+ storage_options: Optional[Dict[str, Any]] = None,
1961
+ skip_rows_after_header: int = 0,
1962
+ row_index_name: Optional[str] = None,
1963
+ row_index_offset: int = 0,
1964
+ eol_char: str = '\n',
1965
+ raise_if_empty: bool = True,
1966
+ truncate_ragged_lines: bool = False,
1967
+ decimal_comma: bool = False,
1968
+ glob: bool = True,
1969
+ cache: bool = True,
1970
+ with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
1971
+ **other_options: Any
1972
+ ) -> FlowFrame:
1908
1973
  """
1909
1974
  Read a CSV file into a FlowFrame.
1910
1975
 
1976
+ This function uses the native FlowGraph implementation when the parameters
1977
+ fall within the supported range, and falls back to using Polars' scan_csv implementation
1978
+ for more advanced features.
1979
+
1911
1980
  Args:
1912
- file_path: Path to CSV file
1981
+ source: Path(s) to CSV file(s), or a file-like object.
1913
1982
  flow_graph: if you want to add it to an existing graph
1914
1983
  separator: Single byte character to use as separator in the file.
1915
1984
  convert_to_absolute_path: If the path needs to be set to a fixed location
1916
1985
  description: if you want to add a readable name in the frontend (advised)
1917
- **options: Options for polars.read_csv
1986
+
1987
+ # Polars.scan_csv aligned parameters
1988
+ has_header: Indicate if the first row of the dataset is a header or not.
1989
+ new_columns: Rename columns after selection.
1990
+ comment_prefix: String that indicates a comment line if found at beginning of line.
1991
+ quote_char: Character used for quoting. None to disable.
1992
+ skip_rows: Start reading after this many rows.
1993
+ skip_lines: Skip this many lines by newline char only.
1994
+ schema: Schema to use when reading the CSV.
1995
+ schema_overrides: Schema overrides for specific columns.
1996
+ null_values: Values to interpret as null.
1997
+ missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
1998
+ ignore_errors: Try to keep reading lines if some parsing errors occur.
1999
+ try_parse_dates: Try to automatically parse dates.
2000
+ infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
2001
+ infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
2002
+ n_rows: Stop reading after this many rows.
2003
+ encoding: Character encoding to use.
2004
+ low_memory: Reduce memory usage at the cost of performance.
2005
+ rechunk: Ensure data is in contiguous memory layout after parsing.
2006
+ storage_options: Options for fsspec for cloud storage.
2007
+ skip_rows_after_header: Skip rows after header.
2008
+ row_index_name: Name of the row index column.
2009
+ row_index_offset: Start value for the row index.
2010
+ eol_char: End of line character.
2011
+ raise_if_empty: Raise error if file is empty.
2012
+ truncate_ragged_lines: Truncate lines with too many values.
2013
+ decimal_comma: Parse floats with decimal comma.
2014
+ glob: Use glob pattern for file path (if source is a string).
2015
+ cache: Cache the result after reading (Polars default True).
2016
+ with_column_names: Apply a function over the column names.
2017
+ other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
1918
2018
 
1919
2019
  Returns:
1920
- A FlowFrame with the CSV data
2020
+ A FlowFrame with the CSV data.
1921
2021
  """
1922
- # Create new node ID
1923
- node_id = generate_node_id()
2022
+ node_id = generate_node_id() # Assuming generate_node_id is defined
1924
2023
  if flow_graph is None:
1925
- flow_graph = create_flow_graph()
1926
-
2024
+ flow_graph = create_flow_graph() # Assuming create_flow_graph is defined
1927
2025
  flow_id = flow_graph.flow_id
1928
2026
 
1929
- has_headers = options.get('has_header', True)
1930
- encoding = options.get('encoding', 'utf-8')
1931
-
1932
- if '~' in file_path:
1933
- file_path = os.path.expanduser(file_path)
1934
-
1935
- received_table = input_schema.ReceivedTable(
1936
- file_type='csv',
1937
- path=file_path,
1938
- name=Path(file_path).name,
1939
- delimiter=separator,
1940
- has_headers=has_headers,
1941
- encoding=encoding
2027
+ current_source_path_for_native = None
2028
+ if isinstance(source, (str, os.PathLike)):
2029
+ current_source_path_for_native = str(source)
2030
+ if '~' in current_source_path_for_native:
2031
+ current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
2032
+ elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
2033
+ current_source_path_for_native = str(source[0]) if source else None
2034
+ if current_source_path_for_native and '~' in current_source_path_for_native:
2035
+ current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
2036
+ elif isinstance(source, (io.BytesIO, io.StringIO)):
2037
+ logger.warning("Read from bytes io from csv not supported, converting data to raw data")
2038
+ return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
2039
+ actual_infer_schema_length: Optional[int]
2040
+ if not infer_schema:
2041
+ actual_infer_schema_length = 0
2042
+ else:
2043
+ actual_infer_schema_length = infer_schema_length
2044
+ can_use_native = (
2045
+ current_source_path_for_native is not None and
2046
+ comment_prefix is None and
2047
+ skip_lines == 0 and
2048
+ schema is None and
2049
+ schema_overrides is None and
2050
+ null_values is None and
2051
+ not missing_utf8_is_empty_string and
2052
+ not try_parse_dates and
2053
+ n_rows is None and
2054
+ not low_memory and
2055
+ not rechunk and
2056
+ storage_options is None and
2057
+ skip_rows_after_header == 0 and
2058
+ row_index_name is None and
2059
+ row_index_offset == 0 and
2060
+ eol_char == '\n' and
2061
+ not decimal_comma and
2062
+ new_columns is None and
2063
+ glob is True
1942
2064
  )
2065
+ if can_use_native and current_source_path_for_native:
2066
+ received_table = input_schema.ReceivedTable(
2067
+ file_type='csv',
2068
+ path=current_source_path_for_native,
2069
+ name=Path(current_source_path_for_native).name,
2070
+ delimiter=separator,
2071
+ has_headers=has_header,
2072
+ encoding=encoding,
2073
+ starting_from_line=skip_rows,
2074
+ quote_char=quote_char if quote_char is not None else '"',
2075
+ infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
2076
+ truncate_ragged_lines=truncate_ragged_lines,
2077
+ ignore_errors=ignore_errors,
2078
+ row_delimiter=eol_char
2079
+ )
2080
+ if convert_to_absolute_path:
2081
+ try:
2082
+ received_table.set_absolute_filepath()
2083
+ received_table.path = received_table.abs_file_path
2084
+ except Exception as e:
2085
+ print(f"Warning: Could not determine absolute path for {current_source_path_for_native}: {e}")
1943
2086
 
1944
- if convert_to_absolute_path:
1945
- received_table.path = received_table.abs_file_path
2087
+ read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
2088
+ read_node = input_schema.NodeRead(
2089
+ flow_id=flow_id,
2090
+ node_id=node_id,
2091
+ received_file=received_table,
2092
+ pos_x=100,
2093
+ pos_y=100,
2094
+ is_setup=True,
2095
+ description=read_node_description
2096
+ )
2097
+ flow_graph.add_read(read_node)
2098
+ result_frame = FlowFrame(
2099
+ data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2100
+ flow_graph=flow_graph,
2101
+ node_id=node_id
2102
+ )
2103
+ return result_frame
2104
+ else:
2105
+ polars_source_arg = source
2106
+ polars_code = _build_polars_code_args(
2107
+ source=polars_source_arg,
2108
+ separator=separator,
2109
+ has_header=has_header,
2110
+ new_columns=new_columns,
2111
+ comment_prefix=comment_prefix,
2112
+ quote_char=quote_char,
2113
+ skip_rows=skip_rows,
2114
+ skip_lines=skip_lines,
2115
+ schema=schema,
2116
+ schema_overrides=schema_overrides,
2117
+ null_values=null_values,
2118
+ missing_utf8_is_empty_string=missing_utf8_is_empty_string,
2119
+ ignore_errors=ignore_errors,
2120
+ try_parse_dates=try_parse_dates,
2121
+ infer_schema_length=actual_infer_schema_length,
2122
+ n_rows=n_rows,
2123
+ encoding=encoding,
2124
+ low_memory=low_memory,
2125
+ rechunk=rechunk,
2126
+ storage_options=storage_options,
2127
+ skip_rows_after_header=skip_rows_after_header,
2128
+ row_index_name=row_index_name,
2129
+ row_index_offset=row_index_offset,
2130
+ eol_char=eol_char,
2131
+ raise_if_empty=raise_if_empty,
2132
+ truncate_ragged_lines=truncate_ragged_lines,
2133
+ decimal_comma=decimal_comma,
2134
+ glob=glob,
2135
+ cache=cache,
2136
+ with_column_names=with_column_names,
2137
+ **other_options
2138
+ )
2139
+ polars_code_node_description = description or "Read CSV with Polars scan_csv"
2140
+ if isinstance(source, (str, os.PathLike)):
2141
+ polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
2142
+ elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
2143
+ polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
1946
2144
 
1947
- read_node = input_schema.NodeRead(
1948
- flow_id=flow_id,
1949
- node_id=node_id,
1950
- received_file=received_table,
1951
- pos_x=100,
1952
- pos_y=100,
1953
- is_setup=True
1954
- )
2145
+ # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
2146
+ polars_code_settings = input_schema.NodePolarsCode(
2147
+ flow_id=flow_id,
2148
+ node_id=node_id,
2149
+ polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
2150
+ is_setup=True,
2151
+ description=polars_code_node_description
2152
+ )
2153
+ flow_graph.add_polars_code(polars_code_settings)
2154
+ return FlowFrame(
2155
+ data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2156
+ flow_graph=flow_graph,
2157
+ node_id=node_id,
2158
+ )
1955
2159
 
1956
- flow_graph.add_read(read_node)
2160
+ def _build_polars_code_args(
2161
+ source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
2162
+ separator: str,
2163
+ has_header: bool,
2164
+ new_columns: Optional[List[str]],
2165
+ comment_prefix: Optional[str],
2166
+ quote_char: Optional[str],
2167
+ skip_rows: int,
2168
+ skip_lines: int,
2169
+ schema: Optional[SchemaDict],
2170
+ schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
2171
+ null_values: Optional[Union[str, List[str], Dict[str, str]]],
2172
+ missing_utf8_is_empty_string: bool,
2173
+ ignore_errors: bool,
2174
+ try_parse_dates: bool,
2175
+ infer_schema_length: Optional[int],
2176
+ n_rows: Optional[int],
2177
+ encoding: CsvEncoding,
2178
+ low_memory: bool,
2179
+ rechunk: bool,
2180
+ storage_options: Optional[Dict[str, Any]],
2181
+ skip_rows_after_header: int,
2182
+ row_index_name: Optional[str],
2183
+ row_index_offset: int,
2184
+ eol_char: str,
2185
+ raise_if_empty: bool,
2186
+ truncate_ragged_lines: bool,
2187
+ decimal_comma: bool,
2188
+ glob: bool,
2189
+ cache: bool,
2190
+ with_column_names: Optional[Callable[[List[str]], List[str]]],
2191
+ **other_options: Any
2192
+ ) -> str:
2193
+ source_repr: str
2194
+ if isinstance(source, (str, Path)):
2195
+ source_repr = repr(str(source))
2196
+ elif isinstance(source, list):
2197
+ source_repr = repr([str(p) for p in source])
2198
+ elif isinstance(source, bytes):
2199
+ source_repr = "source_bytes_obj"
2200
+ elif hasattr(source, 'read'):
2201
+ source_repr = "source_file_like_obj"
2202
+ else:
2203
+ source_repr = repr(source)
2204
+
2205
+ param_mapping = {
2206
+ 'has_header': (True, lambda x: str(x)),
2207
+ 'separator': (',', lambda x: repr(str(x))),
2208
+ 'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
2209
+ 'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
2210
+ 'skip_rows': (0, str),
2211
+ 'skip_lines': (0, str),
2212
+ 'schema': (None, lambda x: repr(x) if x is not None else 'None'),
2213
+ 'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
2214
+ 'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
2215
+ 'missing_utf8_is_empty_string': (False, str),
2216
+ 'ignore_errors': (False, str),
2217
+ 'cache': (True, str),
2218
+ 'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
2219
+ 'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
2220
+ 'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
2221
+ 'encoding': ('utf8', lambda x: repr(str(x))),
2222
+ 'low_memory': (False, str),
2223
+ 'rechunk': (False, str),
2224
+ 'skip_rows_after_header': (0, str),
2225
+ 'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
2226
+ 'row_index_offset': (0, str),
2227
+ 'try_parse_dates': (False, str),
2228
+ 'eol_char': ('\n', lambda x: repr(str(x))),
2229
+ 'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
2230
+ 'raise_if_empty': (True, str),
2231
+ 'truncate_ragged_lines': (False, str),
2232
+ 'decimal_comma': (False, str),
2233
+ 'glob': (True, str),
2234
+ 'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
2235
+ }
2236
+
2237
+ all_vars = locals()
2238
+ kwargs_list = []
2239
+
2240
+ for param_name_key, (default_value, format_func) in param_mapping.items():
2241
+ value = all_vars.get(param_name_key)
2242
+ formatted_value = format_func(value)
2243
+ kwargs_list.append(f"{param_name_key}={formatted_value}")
2244
+
2245
+ if other_options:
2246
+ for k, v in other_options.items():
2247
+ kwargs_list.append(f"{k}={repr(v)}")
2248
+
2249
+ kwargs_str = ",\n ".join(kwargs_list)
2250
+
2251
+ if kwargs_str:
2252
+ polars_code = f"output_df = pl.scan_csv(\n {source_repr},\n {kwargs_str}\n)"
2253
+ else:
2254
+ polars_code = f"output_df = pl.scan_csv({source_repr})"
1957
2255
 
1958
- return FlowFrame(
1959
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
1960
- flow_graph=flow_graph,
1961
- node_id=node_id
1962
- )
2256
+ return polars_code
1963
2257
 
1964
2258
 
1965
2259
  def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
@@ -2091,3 +2385,113 @@ def concat(frames: List['FlowFrame'],
2091
2385
  return first_frame.concat(remaining_frames, how=how,
2092
2386
  rechunk=rechunk, parallel=parallel,
2093
2387
  description=description)
2388
+
2389
+
2390
+ def scan_csv(
2391
+ source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
2392
+ *,
2393
+ flow_graph: Optional[Any] = None, # Using Any for FlowGraph placeholder
2394
+ separator: str = ',',
2395
+ convert_to_absolute_path: bool = True,
2396
+ description: Optional[str] = None,
2397
+ has_header: bool = True,
2398
+ new_columns: Optional[List[str]] = None,
2399
+ comment_prefix: Optional[str] = None,
2400
+ quote_char: Optional[str] = '"',
2401
+ skip_rows: int = 0,
2402
+ skip_lines: int = 0,
2403
+ schema: Optional[SchemaDict] = None,
2404
+ schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
2405
+ null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
2406
+ missing_utf8_is_empty_string: bool = False,
2407
+ ignore_errors: bool = False,
2408
+ try_parse_dates: bool = False,
2409
+ infer_schema: bool = True,
2410
+ infer_schema_length: Optional[int] = 100,
2411
+ n_rows: Optional[int] = None,
2412
+ encoding: CsvEncoding = 'utf8',
2413
+ low_memory: bool = False,
2414
+ rechunk: bool = False,
2415
+ storage_options: Optional[Dict[str, Any]] = None,
2416
+ skip_rows_after_header: int = 0,
2417
+ row_index_name: Optional[str] = None,
2418
+ row_index_offset: int = 0,
2419
+ eol_char: str = '\n',
2420
+ raise_if_empty: bool = True,
2421
+ truncate_ragged_lines: bool = False,
2422
+ decimal_comma: bool = False,
2423
+ glob: bool = True,
2424
+ cache: bool = True,
2425
+ with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
2426
+ **other_options: Any
2427
+ ) -> FlowFrame:
2428
+ """
2429
+ Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
2430
+
2431
+ This method is the same as read_csv but is provided for compatibility with
2432
+ the polars API where scan_csv returns a LazyFrame.
2433
+
2434
+ See read_csv for full documentation.
2435
+ """
2436
+ return read_csv(
2437
+ source=source,
2438
+ flow_graph=flow_graph,
2439
+ separator=separator,
2440
+ convert_to_absolute_path=convert_to_absolute_path,
2441
+ description=description,
2442
+ has_header=has_header,
2443
+ new_columns=new_columns,
2444
+ comment_prefix=comment_prefix,
2445
+ quote_char=quote_char,
2446
+ skip_rows=skip_rows,
2447
+ skip_lines=skip_lines,
2448
+ schema=schema,
2449
+ schema_overrides=schema_overrides,
2450
+ null_values=null_values,
2451
+ missing_utf8_is_empty_string=missing_utf8_is_empty_string,
2452
+ ignore_errors=ignore_errors,
2453
+ try_parse_dates=try_parse_dates,
2454
+ infer_schema=infer_schema,
2455
+ infer_schema_length=infer_schema_length,
2456
+ n_rows=n_rows,
2457
+ encoding=encoding,
2458
+ low_memory=low_memory,
2459
+ rechunk=rechunk,
2460
+ storage_options=storage_options,
2461
+ skip_rows_after_header=skip_rows_after_header,
2462
+ row_index_name=row_index_name,
2463
+ row_index_offset=row_index_offset,
2464
+ eol_char=eol_char,
2465
+ raise_if_empty=raise_if_empty,
2466
+ truncate_ragged_lines=truncate_ragged_lines,
2467
+ decimal_comma=decimal_comma,
2468
+ glob=glob,
2469
+ cache=cache,
2470
+ with_column_names=with_column_names,
2471
+ **other_options
2472
+ )
2473
+
2474
+
2475
+ def scan_parquet(
2476
+ file_path,
2477
+ *,
2478
+ flow_graph: FlowGraph = None,
2479
+ description: str = None,
2480
+ convert_to_absolute_path: bool = True,
2481
+ **options
2482
+ ) -> FlowFrame:
2483
+ """
2484
+ Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
2485
+
2486
+ This method is the same as read_parquet but is provided for compatibility with
2487
+ the polars API where scan_parquet returns a LazyFrame.
2488
+
2489
+ See read_parquet for full documentation.
2490
+ """
2491
+ return read_parquet(
2492
+ file_path=file_path,
2493
+ flow_graph=flow_graph,
2494
+ description=description,
2495
+ convert_to_absolute_path=convert_to_absolute_path,
2496
+ **options
2497
+ )