sws-spark-dissemination-helper 0.0.105__tar.gz → 0.0.107__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +1 -1
  4. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +94 -4
  5. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/constants.py +3 -0
  6. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/.gitignore +0 -0
  7. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/LICENSE +0 -0
  8. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/README.md +0 -0
  9. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/old_requirements.txt +0 -0
  10. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/requirements.txt +0 -0
  11. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/utils.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/tests/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sws-spark-dissemination-helper
3
- Version: 0.0.105
3
+ Version: 0.0.107
4
4
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
5
5
  Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
6
6
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "sws-spark-dissemination-helper"
7
- version = "0.0.105"
7
+ version = "0.0.107"
8
8
  dependencies = [
9
9
  "annotated-types==0.7.0",
10
10
  "boto3==1.36.18",
@@ -459,7 +459,7 @@ class SWSBronzeIcebergSparkHelper:
459
459
  col(dimension_name).isin(codes)
460
460
  )
461
461
 
462
- self.df_bronze.writeTo(
462
+ self.disseminated_tag_df.writeTo(
463
463
  self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
464
464
  ).createOrReplace()
465
465
 
@@ -89,6 +89,11 @@ class SWSGoldIcebergSparkHelper:
89
89
  def keep_dim_val_attr_columns(self, df: DataFrame):
90
90
  return df.select(*self.cols_to_keep_sws)
91
91
 
92
+ def read_bronze_data(self) -> DataFrame:
93
+ return self.spark.read.option("tag", self.tag_name).table(
94
+ self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
95
+ )
96
+
92
97
  def read_silver_data(self) -> DataFrame:
93
98
  return self.spark.read.option("tag", self.tag_name).table(
94
99
  self.iceberg_tables.SILVER.iceberg_id
@@ -101,6 +106,9 @@ class SWSGoldIcebergSparkHelper:
101
106
  .transform(self.keep_dim_val_attr_columns)
102
107
  )
103
108
 
109
+ def gen_gold_sws_data(self) -> DataFrame:
110
+ return self.read_bronze_data().transform(self.keep_dim_val_attr_columns)
111
+
104
112
  def gen_gold_sws_validated_data(self) -> DataFrame:
105
113
  return self.read_silver_data().transform(self.keep_dim_val_attr_columns)
106
114
 
@@ -130,6 +138,39 @@ class SWSGoldIcebergSparkHelper:
130
138
 
131
139
  return df
132
140
 
141
+ def write_gold_sws_data_to_iceberg_and_csv(
142
+ self, df: DataFrame
143
+ ) -> DataFrame:
144
+ df.writeTo(self.iceberg_tables.GOLD_SWS.iceberg_id).createOrReplace()
145
+
146
+ logging.info(
147
+ f"Gold SWS table written to {self.iceberg_tables.GOLD_SWS.iceberg_id}"
148
+ )
149
+
150
+ self.spark.sql(
151
+ f"ALTER TABLE {self.iceberg_tables.GOLD_SWS.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
152
+ )
153
+
154
+ logging.info(f"gold SWS tag '{self.tag_name}' created")
155
+
156
+ df_1 = df.coalesce(1)
157
+
158
+ save_cache_csv(
159
+ df=df_1,
160
+ bucket=self.bucket,
161
+ prefix=self.iceberg_tables.GOLD_SWS.csv_prefix,
162
+ tag_name=self.tag_name,
163
+ )
164
+
165
+ return df
166
+
167
+ def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
168
+ self.df_gold_sws = self.gen_gold_sws_data()
169
+
170
+ self.write_gold_sws_data_to_iceberg_and_csv(self.df_gold_sws)
171
+
172
+ return self.df_gold_sws
173
+
133
174
  def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
134
175
  self.df_gold_sws_validated = self.gen_gold_sws_validated_data()
135
176
 
@@ -242,7 +283,7 @@ class SWSGoldIcebergSparkHelper:
242
283
  new_iceberg_table = BaseDisseminatedTagTable(
243
284
  id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
244
285
  name=f"{self.domain_code} gold SWS validated Iceberg",
245
- description="Gold table containing all the data unmapped and unfiltered in SWS compatible format",
286
+ description="Gold table containing all the unfiltered tag data, with code correction applied, in SWS compatible format",
246
287
  layer=TableLayer.GOLD,
247
288
  private=True,
248
289
  type=TableType.ICEBERG,
@@ -263,7 +304,7 @@ class SWSGoldIcebergSparkHelper:
263
304
  new_diss_table = BaseDisseminatedTagTable(
264
305
  id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
265
306
  name=f"{self.domain_code} gold SWS validated csv",
266
- description="Gold table containing all the data unmapped and unfiltered in SWS compatible format cached in csv",
307
+ description="Gold table containing all the unfiltered tag data, with code correction applied, in SWS compatible format, cached in csv",
267
308
  layer=TableLayer.GOLD,
268
309
  private=True,
269
310
  type=TableType.CSV,
@@ -291,7 +332,7 @@ class SWSGoldIcebergSparkHelper:
291
332
  new_iceberg_table = BaseDisseminatedTagTable(
292
333
  id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
293
334
  name=f"{self.domain_code} gold SWS disseminated Iceberg",
294
- description="Gold table containing all the data mapped and filtered in SWS compatible format",
335
+ description="Gold table containing only the filtered tag data, with code correction applied, in SWS compatible format",
295
336
  layer=TableLayer.GOLD,
296
337
  private=True,
297
338
  type=TableType.ICEBERG,
@@ -312,7 +353,7 @@ class SWSGoldIcebergSparkHelper:
312
353
  new_diss_table = BaseDisseminatedTagTable(
313
354
  id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
314
355
  name=f"{self.domain_code} gold SWS disseminated csv",
315
- description="Gold table containing all the data mapped and filtered in SWS compatible format format cached in csv",
356
+ description="Gold table containing only the filtered tag data, with code correction applied, in SWS compatible format, cached in csv",
316
357
  layer=TableLayer.GOLD,
317
358
  private=True,
318
359
  type=TableType.CSV,
@@ -427,3 +468,52 @@ class SWSGoldIcebergSparkHelper:
427
468
  logging.debug(f"Tag with Added csv Table: {tag}")
428
469
 
429
470
  return df
471
+
472
+ def write_gold_sws_dissemination_tag(
473
+ self, df: DataFrame, tags: Tags
474
+ ) -> DataFrame:
475
+ # Get or create a new tag
476
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
477
+ logging.debug(f"Tag: {tag}")
478
+
479
+ new_iceberg_table = BaseDisseminatedTagTable(
480
+ id=f"{self.domain_code.lower()}_gold_sws_iceberg",
481
+ name=f"{self.domain_code} gold SWS Iceberg",
482
+ description="Gold table containing the tag data without any processing",
483
+ layer=TableLayer.GOLD,
484
+ private=True,
485
+ type=TableType.ICEBERG,
486
+ database=IcebergDatabases.GOLD_DATABASE,
487
+ table=self.iceberg_tables.GOLD_SWS.table,
488
+ path=self.iceberg_tables.GOLD_SWS.path,
489
+ structure={"columns": df.schema.jsonValue()["fields"]},
490
+ )
491
+ tag = upsert_disseminated_table(
492
+ sws_tags=tags,
493
+ tag=tag,
494
+ dataset_id=self.dataset_id,
495
+ tag_name=self.tag_name,
496
+ table=new_iceberg_table,
497
+ )
498
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
499
+
500
+ new_diss_table = BaseDisseminatedTagTable(
501
+ id=f"{self.domain_code.lower()}_gold_pre_sdmx_csv",
502
+ name=f"{self.domain_code} gold pre-SDMX csv",
503
+ description="Gold table containing the tag data without any processing cached in csv",
504
+ layer=TableLayer.GOLD,
505
+ private=True,
506
+ type=TableType.CSV,
507
+ path=self.iceberg_tables.GOLD_SWS.csv_path,
508
+ structure={"columns": df.schema.jsonValue()["fields"]},
509
+ )
510
+ tag = upsert_disseminated_table(
511
+ sws_tags=tags,
512
+ tag=tag,
513
+ dataset_id=self.dataset_id,
514
+ tag_name=self.tag_name,
515
+ table=new_diss_table,
516
+ )
517
+ logging.debug(f"Tag with Added csv Table: {tag}")
518
+
519
+ return df
@@ -223,6 +223,9 @@ class IcebergTables:
223
223
  self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
224
224
 
225
225
  # GOLD tables with specific suffixes
226
+ self.GOLD_SWS = self._create_iceberg_table(
227
+ "GOLD", prefix=domain, suffix="sws"
228
+ )
226
229
  self.GOLD_SDMX = self._create_iceberg_table(
227
230
  "GOLD", prefix=domain, suffix="sdmx_disseminated"
228
231
  )