spark-utils 2.0.0__tar.gz → 2.0.2__tar.gz
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- {spark_utils-2.0.0 → spark_utils-2.0.2}/PKG-INFO +1 -1
- {spark_utils-2.0.0 → spark_utils-2.0.2}/pyproject.toml +1 -1
- spark_utils-2.0.2/spark_utils/_version.py +1 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/functions.py +13 -3
- spark_utils-2.0.0/spark_utils/_version.py +0 -1
- {spark_utils-2.0.0 → spark_utils-2.0.2}/LICENSE +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/README.md +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/__init__.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/__init__.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_job_args.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_session_provider.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_sql_utils.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/common/spark_udf.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/__init__.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/functions.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/models.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/sets/__init__.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/dataframes/sets/functions.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/delta_lake/__init__.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/delta_lake/delta_log.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/delta_lake/functions.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/__init__.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/delta_lake_config.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/hive_metastore_config.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/hive_table.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/iceberg_rest_config.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/job_socket.py +0 -0
- {spark_utils-2.0.0 → spark_utils-2.0.2}/spark_utils/models/k8s_config.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '2.0.2'
|
|
@@ -76,7 +76,7 @@ def read_from_socket(
|
|
|
76
76
|
:return: Spark dataframe
|
|
77
77
|
"""
|
|
78
78
|
read_options = read_options or {}
|
|
79
|
-
if socket.data_format.startswith("hive"):
|
|
79
|
+
if socket.data_format.startswith("hive") or socket.data_format.startswith("iceberg"):
|
|
80
80
|
return spark_session.table(socket.data_path)
|
|
81
81
|
|
|
82
82
|
return spark_session.read.options(**read_options).format(socket.data_format).load(socket.data_path)
|
|
@@ -88,6 +88,7 @@ def write_to_socket(
|
|
|
88
88
|
partition_by: Optional[List[str]] = None,
|
|
89
89
|
partition_count: Optional[int] = None,
|
|
90
90
|
write_options: Optional[Dict[str, str]] = None,
|
|
91
|
+
mode: str = "overwrite",
|
|
91
92
|
) -> None:
|
|
92
93
|
"""Writes data to socket
|
|
93
94
|
|
|
@@ -95,7 +96,8 @@ def write_to_socket(
|
|
|
95
96
|
:param socket: Socket to write to
|
|
96
97
|
:param partition_by: List of column names to partition by
|
|
97
98
|
:param partition_count: Number of partitions to split result into.
|
|
98
|
-
:param write_options: Write options passed to spark (e.g. Parquet options
|
|
99
|
+
:param write_options: Write options passed to spark (e.g. Parquet options)
|
|
100
|
+
:param mode: Write mode
|
|
99
101
|
found here: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#data-source-option)
|
|
100
102
|
"""
|
|
101
103
|
write_options = write_options or {}
|
|
@@ -103,7 +105,15 @@ def write_to_socket(
|
|
|
103
105
|
if partition_count:
|
|
104
106
|
data = data.repartition(partition_count, *partition_by)
|
|
105
107
|
|
|
106
|
-
|
|
108
|
+
# ignore all external write options as Iceberg writer will take care of those
|
|
109
|
+
if socket.data_format.startswith("iceberg"):
|
|
110
|
+
if mode == "overwrite":
|
|
111
|
+
data.writeTo(socket.data_path).createOrReplace()
|
|
112
|
+
if mode == "append":
|
|
113
|
+
data.writeTo(socket.data_path).append()
|
|
114
|
+
return
|
|
115
|
+
|
|
116
|
+
writer = data.write.mode(mode).options(**write_options)
|
|
107
117
|
|
|
108
118
|
if partition_by:
|
|
109
119
|
writer = writer.partitionBy(*partition_by)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = '2.0.0'
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|