PyPI - spark-utils - Versions diffs - 2.0.0__tar.gz → 2.0.2__tar.gz - Mend

spark-utils 2.0.0.tar.gz → 2.0.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{spark_utils-2.0.0 → spark_utils-2.0.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spark-utils
-Version: 2.0.0
+Version: 2.0.2
 Summary: Utility classes for comfy Spark job authoriing.
 License: MIT
 License-File: LICENSE

{spark_utils-2.0.0 → spark_utils-2.0.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "spark-utils"
-version = "2.0.0"
+version = "2.0.2"
 description = "Utility classes for comfy Spark job authoriing."
 authors = ["ECCO Sneaks & Data <esdsupport@ecco.com>"]
 maintainers = ['GZU <gzu@ecco.com>', 'JRB <ext-jrb@ecco.com>']

spark_utils-2.0.2/spark_utils/_version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = '2.0.2'

{spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/functions.py RENAMED Viewed

@@ -76,7 +76,7 @@ def read_from_socket(
     :return: Spark dataframe
     """
     read_options = read_options or {}
-    if socket.data_format.startswith("hive"):
+    if socket.data_format.startswith("hive") or socket.data_format.startswith("iceberg"):
         return spark_session.table(socket.data_path)
     return spark_session.read.options(**read_options).format(socket.data_format).load(socket.data_path)
@@ -88,6 +88,7 @@ def write_to_socket(
     partition_by: Optional[List[str]] = None,
     partition_count: Optional[int] = None,
     write_options: Optional[Dict[str, str]] = None,
+    mode: str = "overwrite",
 ) -> None:
     """Writes data to socket
@@ -95,7 +96,8 @@ def write_to_socket(
     :param socket: Socket to write to
     :param partition_by: List of column names to partition by
     :param partition_count: Number of partitions to split result into.
-    :param write_options: Write options passed to spark (e.g. Parquet options
+    :param write_options: Write options passed to spark (e.g. Parquet options)
+    :param mode: Write mode
     found here: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option)
     """
     write_options = write_options or {}
@@ -103,7 +105,15 @@ def write_to_socket(
     if partition_count:
         data = data.repartition(partition_count, *partition_by)
-    writer = data.write.mode("overwrite").options(**write_options)
+    # ignore all external write options as Iceberg writer will take care of those
+    if socket.data_format.startswith("iceberg"):
+        if mode == "overwrite":
+            data.writeTo(socket.data_path).createOrReplace()
+        if mode == "append":
+            data.writeTo(socket.data_path).append()
+        return
+    writer = data.write.mode(mode).options(**write_options)
     if partition_by:
         writer = writer.partitionBy(*partition_by)