sqlframe 3.21.1__tar.gz → 3.22.0__tar.gz
- {sqlframe-3.21.1 → sqlframe-3.22.0}/PKG-INFO +1 -1
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/duckdb.md +1 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/snowflake.md +1 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/_version.py +2 -2
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/dataframe.py +2 -2
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/functions.py +2 -1
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/mixins/readwriter_mixins.py +4 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/readerwriter.py +40 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/util.py +19 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/readwriter.py +1 -0
- sqlframe-3.22.0/sqlframe/spark/readwriter.py +163 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe.egg-info/PKG-INFO +1 -1
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/databricks/test_databricks_dataframe.py +8 -8
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_dataframe.py +8 -8
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/postgres/test_postgres_dataframe.py +2 -2
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_dataframe.py +5 -1
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_reader.py +24 -7
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_writer.py +31 -16
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_int_functions.py +1 -1
- sqlframe-3.21.1/sqlframe/spark/readwriter.py +0 -30
- {sqlframe-3.21.1 → sqlframe-3.22.0}/.github/CODEOWNERS +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/.github/workflows/main.workflow.yaml +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/.github/workflows/publish.workflow.yaml +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/.gitignore +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/.pre-commit-config.yaml +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/.readthedocs.yaml +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/LICENSE +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/Makefile +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/README.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/add_chatgpt_support.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/adding_ai_to_meal.jpeg +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/hype_train.gif +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/marvin_paranoid_robot.gif +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/nonsense_sql.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/openai_full_rewrite.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/openai_replacing_cte_names.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/sqlglot_optimized_code.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/add_chatgpt_support/sunny_shake_head_no.gif +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/but_wait_theres_more.gif +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/cake.gif +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/images/you_get_pyspark_api.gif +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/blogs/sqlframe_universal_dataframe_api.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/bigquery.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/configuration.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/databricks.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/bigquery.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/duckdb.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/images/SF.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/images/favicon.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/images/favicon_old.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/images/sqlframe_diagram.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/images/sqlframe_logo.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/docs/postgres.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/images/SF.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/images/favicon.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/images/favicon_old.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/images/sqlframe_diagram.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/images/sqlframe_logo.png +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/index.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/postgres.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/redshift.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/requirements.txt +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/spark.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/standalone.md +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/docs/stylesheets/extra.css +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/mkdocs.yml +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/pytest.ini +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/renovate.json +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/setup.cfg +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/setup.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/LICENSE +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/_typing.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/decorators.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/exceptions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/function_alternatives.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/mixins/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/mixins/catalog_mixins.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/mixins/dataframe_mixins.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/mixins/table_mixins.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/normalize.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/operations.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/transforms.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/functions.pyi +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/readwriter.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/bigquery/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/functions.pyi +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/readwriter.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/databricks/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/functions.pyi +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/functions.pyi +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/readwriter.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/postgres/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/readwriter.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/redshift/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/functions.pyi +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/readwriter.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/snowflake/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/functions.pyi +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/spark/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/group.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/readwriter.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/standalone/window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/testing/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/testing/utils.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe.egg-info/SOURCES.txt +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe.egg-info/dependency_links.txt +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe.egg-info/requires.txt +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe.egg-info/top_level.txt +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/common_fixtures.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/conftest.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee.csv +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee.json +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/.part-00000-e5965c7b-e58f-4d3c-ad56-002876814e3a-c000.snappy.parquet.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/.part-00002-3fed7f18-370f-4b16-b232-504d6194eb52-c000.snappy.parquet.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/.part-00004-143c5da1-d5ab-4706-8e84-0d2a324c6894-c000.snappy.parquet.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/.part-00006-64f07e25-c30e-4075-acc6-b3c69c4ce80b-c000.snappy.parquet.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/.part-00008-89ccad8d-df73-4ad5-8850-82ef3884db60-c000.snappy.parquet.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/.part-00010-812b3382-8c7f-4c4e-9bcd-09ce8664f6e0-c000.snappy.parquet.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/_delta_log/.00000000000000000000.json.crc +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/_delta_log/00000000000000000000.json +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/part-00000-e5965c7b-e58f-4d3c-ad56-002876814e3a-c000.snappy.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/part-00002-3fed7f18-370f-4b16-b232-504d6194eb52-c000.snappy.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/part-00004-143c5da1-d5ab-4706-8e84-0d2a324c6894-c000.snappy.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/part-00006-64f07e25-c30e-4075-acc6-b3c69c4ce80b-c000.snappy.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/part-00008-89ccad8d-df73-4ad5-8850-82ef3884db60-c000.snappy.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_delta/part-00010-812b3382-8c7f-4c4e-9bcd-09ce8664f6e0-c000.snappy.parquet +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/employee_extra_line.csv +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/issue_219.csv +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds1.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds10.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds11.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds12.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds13.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds14.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds15.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds16.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds17.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds18.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds19.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds2.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds20.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds21.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds22.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds23.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds24.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds25.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds26.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds27.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds28.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds29.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds3.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds30.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds31.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds32.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds33.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds34.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds35.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds36.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds37.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds38.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds39.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds4.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds40.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds41.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds42.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds43.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds44.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds45.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds46.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds47.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds48.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds49.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds5.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds50.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds51.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds52.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds53.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds54.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds55.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds56.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds57.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds58.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds59.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds6.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds60.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds61.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds62.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds63.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds64.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds65.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds66.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds67.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds68.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds69.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds7.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds70.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds71.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds72.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds73.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds74.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds75.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds76.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds77.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds78.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds79.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds8.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds80.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds81.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds82.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds83.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds84.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds85.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds86.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds87.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds88.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds89.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds9.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds90.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds91.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds92.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds93.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds94.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds95.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds96.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds97.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds98.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/fixtures/tpcds/tpcds99.sql +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/bigquery/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/bigquery/test_bigquery_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/bigquery/test_bigquery_dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/bigquery/test_bigquery_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/databricks/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/databricks/test_databricks_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/databricks/test_databricks_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_reader.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_udf.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_tpcds.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/postgres/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/postgres/test_postgres_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/postgres/test_postgres_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/postgres/test_postgres_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/redshift/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/redshift/test_redshift_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/redshift/test_redshift_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/snowflake/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/snowflake/test_snowflake_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/snowflake/test_snowflake_dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/snowflake/test_snowflake_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/spark/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/spark/test_spark_catalog.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/spark/test_spark_dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_table.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_int_testing.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/fixtures.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/test_int_dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/test_int_dataframe_stats.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/test_int_grouped_data.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/test_int_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/bigquery/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/bigquery/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/conftest.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/databricks/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/databricks/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/duck/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/duck/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/postgres/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/postgres/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/redshift/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/redshift/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/snowflake/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/snowflake/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/spark/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/spark/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/__init__.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/fixtures.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_column.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_dataframe.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_dataframe_writer.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_functions.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_session.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_session_case_sensitivity.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_types.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/standalone/test_window.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/test_activate.py +0 -0
- {sqlframe-3.21.1 → sqlframe-3.22.0}/tests/unit/test_util.py +0 -0
{sqlframe-3.21.1 → sqlframe-3.22.0}/docs/duckdb.md
RENAMED
@@ -406,6 +406,7 @@ See something that you would like to see supported? [Open an issue](https://gith
 * [min](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.min.html)
 * [min_by](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.min_by.html)
 * [minute](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.minute.html)
+* [mode](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.mode.html)
 * [month](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.month.html)
 * [months_between](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.months_between.html)
   * Rounded whole number is returned
{sqlframe-3.21.1 → sqlframe-3.22.0}/docs/snowflake.md
RENAMED
@@ -439,6 +439,7 @@ See something that you would like to see supported? [Open an issue](https://gith
 * [min](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.min.html)
 * [min_by](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.min_by.html)
 * [minute](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.minute.html)
+* [mode](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.mode.html)
 * [module](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.module.html)
 * [month](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.month.html)
 * [months_between](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.months_between.html)
{sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/dataframe.py
RENAMED
@@ -342,7 +342,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
         return types.StructType(
             [
                 types.StructField(
-                    c.name,
+                    self.display_name_mapping.get(c.name, c.name),
                     sqlglot_to_spark(
                         exp.DataType.build(c.dataType, dialect=self.session.output_dialect)
                     ),
@@ -1898,7 +1898,7 @@ class BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
         print("root")
         for column in self._typed_columns:
             print_schema(
-                column.name,
+                self.display_name_mapping.get(column.name, column.name),
                 exp.DataType.build(column.dataType, dialect=self.session.output_dialect),
                 column.nullable,
                 0,
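The two `display_name_mapping` lookups above make `df.schema` and `df.printSchema()` report a column's original display name (for example, one containing a space) rather than its normalized internal name. A minimal sketch of the effect, assuming a DuckDB-backed session; the table `t1` and its data are hypothetical:

```python
import duckdb

from sqlframe.duckdb import DuckDBSession

conn = duckdb.connect()
conn.execute('CREATE TABLE t1 ("an tan" INT, b INT)')
conn.execute("INSERT INTO t1 VALUES (3, 4)")

session = DuckDBSession(conn=conn)
df = session.table("t1")
assert df.schema.fields[0].name == "an tan"  # display name, not a normalized alias
df.printSchema()  # expected to print: |-- an tan: int (nullable = true)
```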
{sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/functions.py
RENAMED
@@ -4504,7 +4504,7 @@ def median(col: ColumnOrName) -> Column:
     return Column.invoke_expression_over_column(col, expression.Median)


-@meta(unsupported_engines="
+@meta(unsupported_engines=["bigquery", "postgres"])
 def mode(col: ColumnOrName) -> Column:
     """
     Returns the most frequent value in a group.
@@ -4540,6 +4540,7 @@ def mode(col: ColumnOrName) -> Column:
     |dotNET|      2012|
     +------+----------+
     """
+
     return Column.invoke_anonymous_function(col, "mode")


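Since `mode` is now marked unsupported only for BigQuery and Postgres, it can be used as a regular aggregate elsewhere. A minimal usage sketch, assuming a local DuckDB session and illustrative data:

```python
from sqlframe.duckdb import DuckDBSession
from sqlframe.duckdb import functions as F

session = DuckDBSession()
df = session.createDataFrame(
    [("Java", 2012), ("Java", 2012), ("dotNET", 2012), ("dotNET", 2013), ("dotNET", 2013)],
    ["course", "year"],
)
# mode() returns the most frequent value per group
df.groupBy("course").agg(F.mode("year").alias("mode_year")).show()
```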
{sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/mixins/readwriter_mixins.py
RENAMED
@@ -82,6 +82,10 @@ class PandasLoaderMixin(_BaseDataFrameReader, t.Generic[SESSION, DF]):
         elif format == "parquet":
             df = pd.read_parquet(path, **kwargs)  # type: ignore
         elif format == "csv":
+            kwargs.pop("inferSchema", None)
+            if "header" in kwargs:
+                if isinstance(kwargs["header"], bool) and kwargs["header"]:
+                    kwargs["header"] = "infer"
             df = pd.read_csv(path, **kwargs)  # type: ignore
         else:
             raise UnsupportedOperationError(f"Unsupported format: {format}")
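The four added lines translate Spark-style CSV options into what pandas accepts: `inferSchema` has no pandas equivalent (dtypes are inferred regardless), and a boolean `header=True` must become `header="infer"` because `pd.read_csv` expects an int, a list, `None`, or `"infer"`. The same translation as a standalone sketch:

```python
import pandas as pd

kwargs = {"header": True, "inferSchema": True}  # Spark-style reader options

kwargs.pop("inferSchema", None)  # pandas infers dtypes by default
if isinstance(kwargs.get("header"), bool) and kwargs["header"]:
    kwargs["header"] = "infer"  # pandas rejects a bare True here

df = pd.read_csv("tests/fixtures/employee.csv", **kwargs)
print(df.head())
```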
{sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/readerwriter.py
RENAMED
@@ -393,10 +393,12 @@ class _BaseDataFrameWriter(t.Generic[SESSION, DF]):
         df: DF,
         mode: t.Optional[str] = None,
         by_name: bool = False,
+        state_format_to_write: t.Optional[str] = None,
     ):
         self._df = df
         self._mode = mode
         self._by_name = by_name
+        self._state_format_to_write = state_format_to_write

     @property
     def _session(self) -> SESSION:
@@ -484,6 +486,44 @@ class _BaseDataFrameWriter(t.Generic[SESSION, DF]):
     def _write(self, path: str, mode: t.Optional[str], format: str, **options) -> None:
         raise NotImplementedError

+    def format(self, source: str) -> "Self":
+        """Specifies the input data source format.
+
+        .. versionadded:: 1.4.0
+
+        .. versionchanged:: 3.4.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        source : str
+            string, name of the data source, e.g. 'json', 'parquet'.
+
+        Examples
+        --------
+        >>> spark.read.format('json')
+        <...readwriter.DataFrameReader object ...>
+
+        Write a DataFrame into a JSON file and read it back.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     # Write a DataFrame into a JSON file
+        ...     spark.createDataFrame(
+        ...         [{"age": 100, "name": "Hyukjin Kwon"}]
+        ...     ).write.mode("overwrite").format("json").save(d)
+        ...
+        ...     # Read the JSON file as a DataFrame.
+        ...     spark.read.format('json').load(d).show()
+        +---+------------+
+        |age|        name|
+        +---+------------+
+        |100|Hyukjin Kwon|
+        +---+------------+
+        """
+        self._state_format_to_write = source
+        return self
+
     def json(
         self,
         path: str,
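A short usage sketch of the new writer-side `format()` state, assuming the Spark engine (whose `save()` in the new `sqlframe/spark/readwriter.py` below consumes `_state_format_to_write`); the output path is illustrative:

```python
from sqlframe.spark import SparkSession

session = SparkSession()  # requires pyspark to be installed
df = session.createDataFrame([{"age": 100, "name": "Hyukjin Kwon"}])

# format() only records the source format; save() later picks it up
df.write.mode("overwrite").format("json").save("/tmp/sqlframe_json_demo")
```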
{sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/base/util.py
RENAMED
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import importlib
+import random
+import string
 import typing as t
 import unicodedata

@@ -427,3 +429,20 @@ def normalize_string(
     for pos in star_positions:
         normalized_value = normalized_value[:pos] + "*" + normalized_value[pos:]
     return normalized_value
+
+
+def generate_random_identifier(size=6, chars=string.ascii_uppercase + string.digits):
+    return "_" + "".join(random.choice(chars) for _ in range(size))
+
+
+def split_filepath(filepath: str) -> tuple[str, str]:
+    if filepath.startswith("dbfs:") or filepath.startswith("/dbfs"):
+        prefix = "dbfs:"
+        return prefix, filepath[len(prefix) :]
+    if filepath.startswith("file://"):
+        prefix = "file://"
+        return "", filepath[len(prefix) :]
+    split_ = str(filepath).split("://", 1)
+    if len(split_) == 2:  # noqa: PLR2004
+        return split_[0] + "://", split_[1]
+    return "", split_[0]
{sqlframe-3.21.1 → sqlframe-3.22.0}/sqlframe/duckdb/readwriter.py
RENAMED
@@ -92,6 +92,7 @@ class DuckDBDataFrameReader(
         if format == "delta":
             from_clause = f"delta_scan('{path}')"
         elif format:
+            options.pop("inferSchema", None)
             paths = ",".join([f"'{path}'" for path in ensure_list(path)])
             from_clause = f"read_{format}([{paths}], {to_csv(options)})"
         else:
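`inferSchema` is popped here because DuckDB's `read_csv`/`read_json` table functions accept no such named parameter, so the Spark-style option must not leak into the generated `read_<format>(...)` call. A rough sketch of the resulting FROM clause (the exact rendering of `to_csv` is approximate):

```python
from sqlframe.base.util import to_csv

options = {"header": True, "inferSchema": True}  # Spark-style options
options.pop("inferSchema", None)  # not a DuckDB read_csv parameter

# Roughly what the reader builds for the FROM clause:
print(f"read_csv(['employee.csv'], {to_csv(options)})")
```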
sqlframe-3.22.0/sqlframe/spark/readwriter.py
ADDED
@@ -0,0 +1,163 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+import typing as t
+
+from sqlglot import exp
+from sqlglot.helper import ensure_list
+
+from sqlframe.base.readerwriter import (
+    _BaseDataFrameReader,
+    _BaseDataFrameWriter,
+    _infer_format,
+)
+from sqlframe.base.util import ensure_column_mapping, generate_random_identifier, to_csv
+
+if t.TYPE_CHECKING:
+    from sqlframe.base._typing import OptionalPrimitiveType, PathOrPaths
+    from sqlframe.base.types import StructType
+    from sqlframe.spark.dataframe import SparkDataFrame
+    from sqlframe.spark.session import SparkSession
+    from sqlframe.spark.table import SparkTable
+
+
+class SparkDataFrameReader(
+    _BaseDataFrameReader["SparkSession", "SparkDataFrame", "SparkTable"],
+):
+    def load(
+        self,
+        path: t.Optional[PathOrPaths] = None,
+        format: t.Optional[str] = None,
+        schema: t.Optional[t.Union[StructType, str]] = None,
+        **options: OptionalPrimitiveType,
+    ) -> SparkDataFrame:
+        """Loads data from a data source and returns it as a :class:`DataFrame`.
+
+        .. versionadded:: 1.4.0
+
+        .. versionchanged:: 3.4.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        path : str or list, t.Optional
+            t.Optional string or a list of string for file-system backed data sources.
+        format : str, t.Optional
+            t.Optional string for format of the data source. Default to 'parquet'.
+        schema : :class:`pyspark.sql.types.StructType` or str, t.Optional
+            t.Optional :class:`pyspark.sql.types.StructType` for the input schema
+            or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+        **options : dict
+            all other string options
+
+        Examples
+        --------
+        Load a CSV file with format, schema and options specified.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     # Write a DataFrame into a CSV file with a header
+        ...     df = spark.createDataFrame([{"age": 100, "name": "Hyukjin Kwon"}])
+        ...     df.write.option("header", True).mode("overwrite").format("csv").save(d)
+        ...
+        ...     # Read the CSV file as a DataFrame with 'nullValue' option set to 'Hyukjin Kwon',
+        ...     # and 'header' option set to `True`.
+        ...     df = spark.read.load(
+        ...         d, schema=df.schema, format="csv", nullValue="Hyukjin Kwon", header=True)
+        ...     df.printSchema()
+        ...     df.show()
+        root
+         |-- age: long (nullable = true)
+         |-- name: string (nullable = true)
+        +---+----+
+        |age|name|
+        +---+----+
+        |100|NULL|
+        +---+----+
+        """
+        assert path is not None, "path is required"
+        assert isinstance(path, str), "path must be a string"
+        format = format or self.state_format_to_read or _infer_format(path)
+        if schema:
+            column_mapping = ensure_column_mapping(schema)
+            select_column_mapping = column_mapping.copy()
+            select_columns = [x.expression for x in self._to_casted_columns(select_column_mapping)]
+
+            if hasattr(schema, "simpleString"):
+                schema = schema.simpleString()
+        else:
+            select_columns = [exp.Star()]
+
+        if format == "delta":
+            from_clause = f"delta.`{path}`"
+        elif format:
+            paths = ",".join([f"{path}" for path in ensure_list(path)])
+            tmp_view_key = options.get("_tmp_view_key_", f"{generate_random_identifier()}_vw")
+            options["_tmp_view_key_"] = tmp_view_key
+
+            format_options: dict[str, OptionalPrimitiveType] = {
+                k: v for k, v in options.items() if v is not None
+            }
+            format_options.pop("_tmp_view_key_")
+            format_options["path"] = paths
+            if schema:
+                format_options["schema"] = f"{schema}"
+            format_options.pop("inferSchema", None)
+            format_options = {key: f"'{val}'" for key, val in format_options.items()}
+            format_options_str = to_csv(format_options, " ")
+
+            tmp_view = f"CREATE OR REPLACE TEMPORARY VIEW {tmp_view_key} USING {format}" + (
+                f" OPTIONS ({format_options_str})" if format_options_str else ""
+            )
+            self.session.spark_session.sql(tmp_view).collect()
+
+            from_clause = f"{tmp_view_key}"
+        else:
+            from_clause = f"'{path}'"
+
+        df = self.session.sql(
+            exp.select(*select_columns).from_(from_clause, dialect=self.session.input_dialect),
+            qualify=False,
+        )
+        if select_columns == [exp.Star()] and df.schema:
+            return self.load(path=path, format=format, schema=df.schema, **options)
+        self.session._last_loaded_file = path  # type: ignore
+        return df
+
+
+class SparkDataFrameWriter(
+    _BaseDataFrameWriter["SparkSession", "SparkDataFrame"],
+):
+    def save(
+        self,
+        path: str,
+        mode: t.Optional[str] = None,
+        format: t.Optional[str] = None,
+        partitionBy: t.Optional[t.Union[str, t.List[str]]] = None,
+        **options,
+    ):
+        format = str(format or self._state_format_to_write)
+        self._write(path, mode, format, partitionBy=partitionBy, **options)
+
+    def _write(self, path: str, mode: t.Optional[str], format: str, **options):
+        spark_df = None
+        expressions = self._df._get_expressions()
+        for i, expression in enumerate(expressions):
+            if i < len(expressions) - 1:
+                self._df.session._collect(expressions)
+            else:
+                sql = self._df.session._to_sql(expression)
+                spark_df = self._session.spark_session.sql(sql)
+        if spark_df is not None:
+            options = {k: v for k, v in options.items() if v is not None}
+            mode = str(mode or self._mode or "default")
+            spark_writer = spark_df.write.format(format).mode(mode)
+            partition_columns = options.pop("partitionBy", None)
+            compression = options.pop("compression", None)
+            if partition_columns:
+                partition_columns = options.pop("partitionBy")
+                spark_writer = spark_writer.partitionBy(*partition_columns)
+            if compression:
+                spark_writer = spark_writer.option("compression", compression)
+            spark_writer.save(path=path, **options)
{sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/databricks/test_databricks_dataframe.py
RENAMED
@@ -71,16 +71,16 @@ root
  |-- bigint_col: bigint (nullable = true)
  |-- double_col: double (nullable = true)
  |-- string_col: string (nullable = true)
- |--
+ |-- map<string,bigint>_col: map<string, bigint> (nullable = true)
  |    |-- key: string (nullable = true)
  |    |-- value: bigint (nullable = true)
- |--
+ |-- array<struct<a:bigint,b:bigint>>: array<struct<a: bigint, b: bigint>> (nullable = true)
  |    |-- element: struct<a: bigint, b: bigint> (nullable = true)
  |    |    |-- a: bigint (nullable = true)
  |    |    |-- b: bigint (nullable = true)
- |--
+ |-- array<bigint>_col: array<bigint> (nullable = true)
  |    |-- element: bigint (nullable = true)
- |--
+ |-- struct<a:bigint>_col: struct<a: bigint> (nullable = true)
  |    |-- a: bigint (nullable = true)
  |-- date_col: date (nullable = true)
  |-- timestamp_col: timestamp (nullable = true)
@@ -126,12 +126,12 @@ def test_schema_nested(databricks_datatypes: DatabricksDataFrame):
     assert struct_fields[1].dataType == types.DoubleType()
     assert struct_fields[2].name == "string_col"
     assert struct_fields[2].dataType == types.StringType()
-    assert struct_fields[3].name == "
+    assert struct_fields[3].name == "map<string,bigint>_col"
     assert struct_fields[3].dataType == types.MapType(
         types.StringType(),
         types.LongType(),
     )
-    assert struct_fields[4].name == "
+    assert struct_fields[4].name == "array<struct<a:bigint,b:bigint>>"
     assert struct_fields[4].dataType == types.ArrayType(
         types.StructType(
             [
@@ -146,11 +146,11 @@ def test_schema_nested(databricks_datatypes: DatabricksDataFrame):
             ]
         ),
     )
-    assert struct_fields[5].name == "
+    assert struct_fields[5].name == "array<bigint>_col"
     assert struct_fields[5].dataType == types.ArrayType(
         types.LongType(),
     )
-    assert struct_fields[6].name == "
+    assert struct_fields[6].name == "struct<a:bigint>_col"
     assert struct_fields[6].dataType == types.StructType(
         [
             types.StructField(
{sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/duck/test_duckdb_dataframe.py
RENAMED
@@ -67,16 +67,16 @@ root
  |-- bigint_col: bigint (nullable = true)
  |-- double_col: double (nullable = true)
  |-- string_col: string (nullable = true)
- |--
+ |-- map<string,bigint>_col: map<string, bigint> (nullable = true)
  |    |-- key: string (nullable = true)
  |    |-- value: bigint (nullable = true)
- |--
+ |-- array<struct<a:bigint,b:bigint>>: array<struct<a: bigint, b: bigint>> (nullable = true)
  |    |-- element: struct<a: bigint, b: bigint> (nullable = true)
  |    |    |-- a: bigint (nullable = true)
  |    |    |-- b: bigint (nullable = true)
- |--
+ |-- array<bigint>_col: array<bigint> (nullable = true)
  |    |-- element: bigint (nullable = true)
- |--
+ |-- struct<a:bigint>_col: struct<a: bigint> (nullable = true)
  |    |-- a: bigint (nullable = true)
  |-- date_col: date (nullable = true)
  |-- timestamp_col: timestamp (nullable = true)
@@ -122,12 +122,12 @@ def test_schema_nested(duckdb_datatypes: DuckDBDataFrame):
     assert struct_fields[1].dataType == types.DoubleType()
     assert struct_fields[2].name == "string_col"
     assert struct_fields[2].dataType == types.StringType()
-    assert struct_fields[3].name == "
+    assert struct_fields[3].name == "map<string,bigint>_col"
     assert struct_fields[3].dataType == types.MapType(
         types.StringType(),
         types.LongType(),
     )
-    assert struct_fields[4].name == "
+    assert struct_fields[4].name == "array<struct<a:bigint,b:bigint>>"
     assert struct_fields[4].dataType == types.ArrayType(
         types.StructType(
             [
@@ -142,11 +142,11 @@ def test_schema_nested(duckdb_datatypes: DuckDBDataFrame):
             ]
         ),
     )
-    assert struct_fields[5].name == "
+    assert struct_fields[5].name == "array<bigint>_col"
     assert struct_fields[5].dataType == types.ArrayType(
         types.LongType(),
     )
-    assert struct_fields[6].name == "
+    assert struct_fields[6].name == "struct<a:bigint>_col"
     assert struct_fields[6].dataType == types.StructType(
         [
             types.StructField(
{sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/postgres/test_postgres_dataframe.py
RENAMED
@@ -62,7 +62,7 @@ root
  |-- bigint_col: bigint (nullable = true)
  |-- double_col: double (nullable = true)
  |-- string_col: string (nullable = true)
- |--
+ |-- array<bigint>_col: array<bigint> (nullable = true)
  |    |-- element: bigint (nullable = true)
  |-- date_col: date (nullable = true)
  |-- timestamp_col: timestamp (nullable = true)
@@ -108,7 +108,7 @@ def test_schema_nested(postgres_datatypes: PostgresDataFrame):
     assert struct_fields[1].dataType == types.DoubleType()
     assert struct_fields[2].name == "string_col"
     assert struct_fields[2].dataType == types.StringType()
-    assert struct_fields[3].name == "
+    assert struct_fields[3].name == "array<bigint>_col"
     assert struct_fields[3].dataType == types.ArrayType(
         types.LongType(),
     )
{sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_dataframe.py
RENAMED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import typing as t

 from sqlframe.base.session import _BaseSession
-from sqlframe.base.types import Row
+from sqlframe.base.types import DoubleType, LongType, Row, StructField, StructType
 from sqlframe.snowflake import SnowflakeSession
 from sqlframe.spark import SparkSession

@@ -185,3 +185,7 @@ def test_show_from_create_with_space_with_schema(get_session: t.Callable[[], _Ba
         Row(**{"an tan": 3, "b": 4, "z": 8.0}),
         Row(**{"an tan": 2, "b": 6, "z": 9.0}),
     ]
+    assert df.schema.fields[0].name == "an tan"
+    df.printSchema()
+    captured = capsys.readouterr()
+    assert "|-- an tan:" in captured.out.strip()
{sqlframe-3.21.1 → sqlframe-3.22.0}/tests/integration/engines/test_engine_reader.py
RENAMED
@@ -13,7 +13,7 @@ pytest_plugins = ["tests.integration.fixtures"]
 def test_load_no_format(get_session: t.Callable[[], _BaseSession]):
     session = get_session()
     df = session.read.load("tests/fixtures/employee.json")
-
+    expected = [
         Row(**{"employee_id": 1, "fname": "Jack", "lname": "Shephard", "age": 37, "store_id": 1}),
         Row(**{"employee_id": 2, "fname": "John", "lname": "Locke", "age": 65, "store_id": 1}),
         Row(**{"employee_id": 3, "fname": "Kate", "lname": "Austen", "age": 37, "store_id": 2}),
@@ -22,6 +22,9 @@ def test_load_no_format(get_session: t.Callable[[], _BaseSession]):
         ),
         Row(**{"employee_id": 5, "fname": "Hugo", "lname": "Reyes", "age": 29, "store_id": 100}),
     ]
+    assert sorted([sorted(row.asDict().items()) for row in df.collect()]) == sorted(
+        [sorted(row.asDict().items()) for row in expected]
+    )


 def test_load_no_format_schema(get_session: t.Callable[[], _BaseSession]):
@@ -50,7 +53,7 @@ def test_load_no_format_schema(get_session: t.Callable[[], _BaseSession]):
 def test_load_json(get_session: t.Callable[[], _BaseSession]):
     session = get_session()
     df = session.read.load("tests/fixtures/employee.json", format="json")
-
+    expected = [
         Row(**{"employee_id": 1, "fname": "Jack", "lname": "Shephard", "age": 37, "store_id": 1}),
         Row(**{"employee_id": 2, "fname": "John", "lname": "Locke", "age": 65, "store_id": 1}),
         Row(**{"employee_id": 3, "fname": "Kate", "lname": "Austen", "age": 37, "store_id": 2}),
@@ -59,12 +62,15 @@ def test_load_json(get_session: t.Callable[[], _BaseSession]):
         ),
         Row(**{"employee_id": 5, "fname": "Hugo", "lname": "Reyes", "age": 29, "store_id": 100}),
     ]
+    assert sorted([sorted(row.asDict().items()) for row in df.collect()]) == sorted(
+        [sorted(row.asDict().items()) for row in expected]
+    )


 def test_json(get_session: t.Callable[[], _BaseSession]):
     session = get_session()
     df = session.read.json("tests/fixtures/employee.json")
-
+    expected = [
         Row(**{"employee_id": 1, "fname": "Jack", "lname": "Shephard", "age": 37, "store_id": 1}),
         Row(**{"employee_id": 2, "fname": "John", "lname": "Locke", "age": 65, "store_id": 1}),
         Row(**{"employee_id": 3, "fname": "Kate", "lname": "Austen", "age": 37, "store_id": 2}),
@@ -73,6 +79,9 @@ def test_json(get_session: t.Callable[[], _BaseSession]):
         ),
         Row(**{"employee_id": 5, "fname": "Hugo", "lname": "Reyes", "age": 29, "store_id": 100}),
     ]
+    assert sorted([sorted(row.asDict().items()) for row in df.collect()]) == sorted(
+        [sorted(row.asDict().items()) for row in expected]
+    )


 def test_load_parquet(get_session: t.Callable[[], _BaseSession]):
@@ -105,8 +114,10 @@ def test_parquet(get_session: t.Callable[[], _BaseSession]):

 def test_load_csv(get_session: t.Callable[[], _BaseSession]):
     session = get_session()
-    df = session.read.load(
-
+    df = session.read.load(
+        "tests/fixtures/employee.csv", format="csv", header=True, inferSchema=True
+    )
+    expected = [
         Row(**{"employee_id": 1, "fname": "Jack", "lname": "Shephard", "age": 37, "store_id": 1}),
         Row(**{"employee_id": 2, "fname": "John", "lname": "Locke", "age": 65, "store_id": 1}),
         Row(**{"employee_id": 3, "fname": "Kate", "lname": "Austen", "age": 37, "store_id": 2}),
@@ -115,12 +126,15 @@ def test_load_csv(get_session: t.Callable[[], _BaseSession]):
         ),
         Row(**{"employee_id": 5, "fname": "Hugo", "lname": "Reyes", "age": 29, "store_id": 100}),
     ]
+    assert sorted([sorted(row.asDict().items()) for row in df.collect()]) == sorted(
+        [sorted(row.asDict().items()) for row in expected]
+    )


 def test_csv(get_session: t.Callable[[], _BaseSession]):
     session = get_session()
-    df = session.read.csv("tests/fixtures/employee.csv")
-
+    df = session.read.csv("tests/fixtures/employee.csv", header=True, inferSchema=True)
+    expected = [
         Row(**{"employee_id": 1, "fname": "Jack", "lname": "Shephard", "age": 37, "store_id": 1}),
         Row(**{"employee_id": 2, "fname": "John", "lname": "Locke", "age": 65, "store_id": 1}),
         Row(**{"employee_id": 3, "fname": "Kate", "lname": "Austen", "age": 37, "store_id": 2}),
@@ -129,3 +143,6 @@ def test_csv(get_session: t.Callable[[], _BaseSession]):
         ),
         Row(**{"employee_id": 5, "fname": "Hugo", "lname": "Reyes", "age": 29, "store_id": 100}),
     ]
+    assert sorted([sorted(row.asDict().items()) for row in df.collect()]) == sorted(
+        [sorted(row.asDict().items()) for row in expected]
+    )
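The call pattern the updated reader tests exercise, runnable against the repository fixtures (assuming a DuckDB session):

```python
from sqlframe.duckdb import DuckDBSession

session = DuckDBSession()
df = session.read.csv("tests/fixtures/employee.csv", header=True, inferSchema=True)
df.show()
```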