spark-utils 2.0.0__tar.gz → 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {spark_utils-2.0.0 → spark_utils-2.0.2}/PKG-INFO +1 -1
  2. {spark_utils-2.0.0 → spark_utils-2.0.2}/pyproject.toml +1 -1
  3. spark_utils-2.0.2/spark_utils/_version.py +1 -0
  4. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/functions.py +13 -3
  5. spark_utils-2.0.0/spark_utils/_version.py +0 -1
  6. {spark_utils-2.0.0 → spark_utils-2.0.2}/LICENSE +0 -0
  7. {spark_utils-2.0.0 → spark_utils-2.0.2}/README.md +0 -0
  8. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/__init__.py +0 -0
  9. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/__init__.py +0 -0
  10. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_job_args.py +0 -0
  11. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_session_provider.py +0 -0
  12. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_sql_utils.py +0 -0
  13. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_udf.py +0 -0
  14. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/__init__.py +0 -0
  15. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/functions.py +0 -0
  16. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/models.py +0 -0
  17. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/sets/__init__.py +0 -0
  18. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/sets/functions.py +0 -0
  19. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/delta_lake/__init__.py +0 -0
  20. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/delta_lake/delta_log.py +0 -0
  21. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/delta_lake/functions.py +0 -0
  22. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/__init__.py +0 -0
  23. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/delta_lake_config.py +0 -0
  24. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/hive_metastore_config.py +0 -0
  25. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/hive_table.py +0 -0
  26. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/iceberg_rest_config.py +0 -0
  27. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/job_socket.py +0 -0
  28. {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/k8s_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spark-utils
3
- Version: 2.0.0
3
+ Version: 2.0.2
4
4
  Summary: Utility classes for comfy Spark job authoring.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "spark-utils"
3
- version = "2.0.0"
3
+ version = "2.0.2"
4
4
  description = "Utility classes for comfy Spark job authoring."
5
5
  authors = ["ECCO Sneaks & Data <esdsupport@ecco.com>"]
6
6
  maintainers = ['GZU <gzu@ecco.com>', 'JRB <ext-jrb@ecco.com>']
@@ -0,0 +1 @@
1
+ __version__ = '2.0.2'
@@ -76,7 +76,7 @@ def read_from_socket(
76
76
  :return: Spark dataframe
77
77
  """
78
78
  read_options = read_options or {}
79
- if socket.data_format.startswith("hive"):
79
+ if socket.data_format.startswith("hive") or socket.data_format.startswith("iceberg"):
80
80
  return spark_session.table(socket.data_path)
81
81
 
82
82
  return spark_session.read.options(**read_options).format(socket.data_format).load(socket.data_path)
@@ -88,6 +88,7 @@ def write_to_socket(
88
88
  partition_by: Optional[List[str]] = None,
89
89
  partition_count: Optional[int] = None,
90
90
  write_options: Optional[Dict[str, str]] = None,
91
+ mode: str = "overwrite",
91
92
  ) -> None:
92
93
  """Writes data to socket
93
94
 
@@ -95,7 +96,8 @@ def write_to_socket(
95
96
  :param socket: Socket to write to
96
97
  :param partition_by: List of column names to partition by
97
98
  :param partition_count: Number of partitions to split result into.
98
- :param write_options: Write options passed to spark (e.g. Parquet options
99
+ :param write_options: Write options passed to spark (e.g. Parquet options)
100
+ :param mode: Write mode
99
101
  found here: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option)
100
102
  """
101
103
  write_options = write_options or {}
@@ -103,7 +105,15 @@ def write_to_socket(
103
105
  if partition_count:
104
106
  data = data.repartition(partition_count, *partition_by)
105
107
 
106
- writer = data.write.mode("overwrite").options(**write_options)
108
+ # ignore all external write options as Iceberg writer will take care of those
109
+ if socket.data_format.startswith("iceberg"):
110
+ if mode == "overwrite":
111
+ data.writeTo(socket.data_path).createOrReplace()
112
+ if mode == "append":
113
+ data.writeTo(socket.data_path).append()
114
+ return
115
+
116
+ writer = data.write.mode(mode).options(**write_options)
107
117
 
108
118
  if partition_by:
109
119
  writer = writer.partitionBy(*partition_by)
@@ -1 +0,0 @@
1
- __version__ = '2.0.0'
File without changes
File without changes