sws-spark-dissemination-helper 0.0.105__tar.gz → 0.0.107__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +1 -1
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +94 -4
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/constants.py +3 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/.gitignore +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/old_requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/tests/test.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sws-spark-dissemination-helper
|
|
3
|
-
Version: 0.0.105
|
|
3
|
+
Version: 0.0.107
|
|
4
4
|
Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
|
|
5
5
|
Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
|
|
6
6
|
Author-email: Daniele Mansillo <danielemansillo@gmail.com>
|
|
@@ -89,6 +89,11 @@ class SWSGoldIcebergSparkHelper:
|
|
|
89
89
|
def keep_dim_val_attr_columns(self, df: DataFrame):
|
|
90
90
|
return df.select(*self.cols_to_keep_sws)
|
|
91
91
|
|
|
92
|
+
def read_bronze_data(self) -> DataFrame:
|
|
93
|
+
return self.spark.read.option("tag", self.tag_name).table(
|
|
94
|
+
self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
|
|
95
|
+
)
|
|
96
|
+
|
|
92
97
|
def read_silver_data(self) -> DataFrame:
|
|
93
98
|
return self.spark.read.option("tag", self.tag_name).table(
|
|
94
99
|
self.iceberg_tables.SILVER.iceberg_id
|
|
@@ -101,6 +106,9 @@ class SWSGoldIcebergSparkHelper:
|
|
|
101
106
|
.transform(self.keep_dim_val_attr_columns)
|
|
102
107
|
)
|
|
103
108
|
|
|
109
|
+
def gen_gold_sws_data(self) -> DataFrame:
|
|
110
|
+
return self.read_bronze_data().transform(self.keep_dim_val_attr_columns)
|
|
111
|
+
|
|
104
112
|
def gen_gold_sws_validated_data(self) -> DataFrame:
|
|
105
113
|
return self.read_silver_data().transform(self.keep_dim_val_attr_columns)
|
|
106
114
|
|
|
@@ -130,6 +138,39 @@ class SWSGoldIcebergSparkHelper:
|
|
|
130
138
|
|
|
131
139
|
return df
|
|
132
140
|
|
|
141
|
+
def write_gold_sws_data_to_iceberg_and_csv(
|
|
142
|
+
self, df: DataFrame
|
|
143
|
+
) -> DataFrame:
|
|
144
|
+
df.writeTo(self.iceberg_tables.GOLD_SWS.iceberg_id).createOrReplace()
|
|
145
|
+
|
|
146
|
+
logging.info(
|
|
147
|
+
f"Gold SWS table written to {self.iceberg_tables.GOLD_SWS.iceberg_id}"
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
self.spark.sql(
|
|
151
|
+
f"ALTER TABLE {self.iceberg_tables.GOLD_SWS.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
logging.info(f"gold SWS tag '{self.tag_name}' created")
|
|
155
|
+
|
|
156
|
+
df_1 = df.coalesce(1)
|
|
157
|
+
|
|
158
|
+
save_cache_csv(
|
|
159
|
+
df=df_1,
|
|
160
|
+
bucket=self.bucket,
|
|
161
|
+
prefix=self.iceberg_tables.GOLD_SWS.csv_prefix,
|
|
162
|
+
tag_name=self.tag_name,
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
return df
|
|
166
|
+
|
|
167
|
+
def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
|
|
168
|
+
self.df_gold_sws = self.gen_gold_sws_data()
|
|
169
|
+
|
|
170
|
+
self.write_gold_sws_data_to_iceberg_and_csv(self.df_gold_sws)
|
|
171
|
+
|
|
172
|
+
return self.df_gold_sws
|
|
173
|
+
|
|
133
174
|
def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
|
|
134
175
|
self.df_gold_sws_validated = self.gen_gold_sws_validated_data()
|
|
135
176
|
|
|
@@ -242,7 +283,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
242
283
|
new_iceberg_table = BaseDisseminatedTagTable(
|
|
243
284
|
id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
|
|
244
285
|
name=f"{self.domain_code} gold SWS validated Iceberg",
|
|
245
|
-
description="Gold table containing all the data
|
|
286
|
+
description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format",
|
|
246
287
|
layer=TableLayer.GOLD,
|
|
247
288
|
private=True,
|
|
248
289
|
type=TableType.ICEBERG,
|
|
@@ -263,7 +304,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
263
304
|
new_diss_table = BaseDisseminatedTagTable(
|
|
264
305
|
id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
|
|
265
306
|
name=f"{self.domain_code} gold SWS validated csv",
|
|
266
|
-
description="Gold table containing all the data
|
|
307
|
+
description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format, cached in csv",
|
|
267
308
|
layer=TableLayer.GOLD,
|
|
268
309
|
private=True,
|
|
269
310
|
type=TableType.CSV,
|
|
@@ -291,7 +332,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
291
332
|
new_iceberg_table = BaseDisseminatedTagTable(
|
|
292
333
|
id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
|
|
293
334
|
name=f"{self.domain_code} gold SWS disseminated Iceberg",
|
|
294
|
-
description="Gold table containing
|
|
335
|
+
description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format",
|
|
295
336
|
layer=TableLayer.GOLD,
|
|
296
337
|
private=True,
|
|
297
338
|
type=TableType.ICEBERG,
|
|
@@ -312,7 +353,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
312
353
|
new_diss_table = BaseDisseminatedTagTable(
|
|
313
354
|
id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
|
|
314
355
|
name=f"{self.domain_code} gold SWS disseminated csv",
|
|
315
|
-
description="Gold table containing
|
|
356
|
+
description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format, cached in csv",
|
|
316
357
|
layer=TableLayer.GOLD,
|
|
317
358
|
private=True,
|
|
318
359
|
type=TableType.CSV,
|
|
@@ -427,3 +468,52 @@ class SWSGoldIcebergSparkHelper:
|
|
|
427
468
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
428
469
|
|
|
429
470
|
return df
|
|
471
|
+
|
|
472
|
+
def write_gold_sws_dissemination_tag(
|
|
473
|
+
self, df: DataFrame, tags: Tags
|
|
474
|
+
) -> DataFrame:
|
|
475
|
+
# Get or create a new tag
|
|
476
|
+
tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
|
|
477
|
+
logging.debug(f"Tag: {tag}")
|
|
478
|
+
|
|
479
|
+
new_iceberg_table = BaseDisseminatedTagTable(
|
|
480
|
+
id=f"{self.domain_code.lower()}_gold_sws_iceberg",
|
|
481
|
+
name=f"{self.domain_code} gold SWS Iceberg",
|
|
482
|
+
description="Gold table containing the tag data without any processing",
|
|
483
|
+
layer=TableLayer.GOLD,
|
|
484
|
+
private=True,
|
|
485
|
+
type=TableType.ICEBERG,
|
|
486
|
+
database=IcebergDatabases.GOLD_DATABASE,
|
|
487
|
+
table=self.iceberg_tables.GOLD_SWS.table,
|
|
488
|
+
path=self.iceberg_tables.GOLD_SWS.path,
|
|
489
|
+
structure={"columns": df.schema.jsonValue()["fields"]},
|
|
490
|
+
)
|
|
491
|
+
tag = upsert_disseminated_table(
|
|
492
|
+
sws_tags=tags,
|
|
493
|
+
tag=tag,
|
|
494
|
+
dataset_id=self.dataset_id,
|
|
495
|
+
tag_name=self.tag_name,
|
|
496
|
+
table=new_iceberg_table,
|
|
497
|
+
)
|
|
498
|
+
logging.debug(f"Tag with Added Iceberg Table: {tag}")
|
|
499
|
+
|
|
500
|
+
new_diss_table = BaseDisseminatedTagTable(
|
|
501
|
+
id=f"{self.domain_code.lower()}_gold_pre_sdmx_csv",
|
|
502
|
+
name=f"{self.domain_code} gold pre-SDMX csv",
|
|
503
|
+
description="Gold table containing the tag data without any processing cached in csv",
|
|
504
|
+
layer=TableLayer.GOLD,
|
|
505
|
+
private=True,
|
|
506
|
+
type=TableType.CSV,
|
|
507
|
+
path=self.iceberg_tables.GOLD_SWS.csv_path,
|
|
508
|
+
structure={"columns": df.schema.jsonValue()["fields"]},
|
|
509
|
+
)
|
|
510
|
+
tag = upsert_disseminated_table(
|
|
511
|
+
sws_tags=tags,
|
|
512
|
+
tag=tag,
|
|
513
|
+
dataset_id=self.dataset_id,
|
|
514
|
+
tag_name=self.tag_name,
|
|
515
|
+
table=new_diss_table,
|
|
516
|
+
)
|
|
517
|
+
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
518
|
+
|
|
519
|
+
return df
|
|
@@ -223,6 +223,9 @@ class IcebergTables:
|
|
|
223
223
|
self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
|
|
224
224
|
|
|
225
225
|
# GOLD tables with specific suffixes
|
|
226
|
+
self.GOLD_SWS = self._create_iceberg_table(
|
|
227
|
+
"GOLD", prefix=domain, suffix="sws"
|
|
228
|
+
)
|
|
226
229
|
self.GOLD_SDMX = self._create_iceberg_table(
|
|
227
230
|
"GOLD", prefix=domain, suffix="sdmx_disseminated"
|
|
228
231
|
)
|
{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/.gitignore
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/tests/__init__.py
RENAMED
|
File without changes
|
{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/tests/test.py
RENAMED
|
File without changes
|