PyPI - sws-spark-dissemination-helper - Versions diffs - 0.0.105__tar.gz → 0.0.107__tar.gz - Mend

sws-spark-dissemination-helper 0.0.105 (tar.gz) → 0.0.107 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.105
+Version: 0.0.107
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>

{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.105"
+version = "0.0.107"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3==1.36.18",

{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py RENAMED Viewed

@@ -459,7 +459,7 @@ class SWSBronzeIcebergSparkHelper:
                     col(dimension_name).isin(codes)
                 )
-        self.df_bronze.writeTo(
+        self.disseminated_tag_df.writeTo(
             self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
         ).createOrReplace()

{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py RENAMED Viewed

@@ -89,6 +89,11 @@ class SWSGoldIcebergSparkHelper:
     def keep_dim_val_attr_columns(self, df: DataFrame):
         """Project ``df`` onto the columns listed in ``self.cols_to_keep_sws``."""
         kept_columns = self.cols_to_keep_sws
         return df.select(*kept_columns)
+    def read_bronze_data(self) -> DataFrame:
+        """Load the BRONZE_DISS_TAG Iceberg table, passing ``self.tag_name`` as the ``tag`` read option."""
+        tagged_reader = self.spark.read.option("tag", self.tag_name)
+        bronze_table_id = self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+        return tagged_reader.table(bronze_table_id)
     def read_silver_data(self) -> DataFrame:
         return self.spark.read.option("tag", self.tag_name).table(
             self.iceberg_tables.SILVER.iceberg_id
@@ -101,6 +106,9 @@ class SWSGoldIcebergSparkHelper:
             .transform(self.keep_dim_val_attr_columns)
         )
+    def gen_gold_sws_data(self) -> DataFrame:
+        """Return the bronze data passed through ``keep_dim_val_attr_columns``."""
+        bronze_df = self.read_bronze_data()
+        return bronze_df.transform(self.keep_dim_val_attr_columns)
     def gen_gold_sws_validated_data(self) -> DataFrame:
         """Return the silver data passed through ``keep_dim_val_attr_columns``."""
         silver_df = self.read_silver_data()
         return silver_df.transform(self.keep_dim_val_attr_columns)
@@ -130,6 +138,39 @@ class SWSGoldIcebergSparkHelper:
         return df
+    def write_gold_sws_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """Write ``df`` to the GOLD_SWS Iceberg table, tag it, and cache it as csv.
+        The table is created or replaced, a tag named ``self.tag_name`` is
+        created or replaced on it, and a single-partition copy of ``df`` is
+        handed to ``save_cache_csv``. The input ``df`` is returned unchanged.
+        """
+        gold_sws = self.iceberg_tables.GOLD_SWS
+        table_id = gold_sws.iceberg_id
+        # Step 1: persist the data as an Iceberg table.
+        df.writeTo(table_id).createOrReplace()
+        logging.info(f"Gold SWS table written to {table_id}")
+        # Step 2: (re)create the tag on the freshly written table.
+        self.spark.sql(
+            f"ALTER TABLE {table_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+        logging.info(f"gold SWS tag '{self.tag_name}' created")
+        # Step 3: collapse to one partition before handing off to the csv cache helper.
+        save_cache_csv(
+            df=df.coalesce(1),
+            bucket=self.bucket,
+            prefix=gold_sws.csv_prefix,
+            tag_name=self.tag_name,
+        )
+        return df
+    def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
+        """Generate the gold SWS data, store it on ``self.df_gold_sws``, write it out, and return it."""
+        gold_df = self.gen_gold_sws_data()
+        # Keep the result on the instance before writing, as callers may read it later.
+        self.df_gold_sws = gold_df
+        self.write_gold_sws_data_to_iceberg_and_csv(gold_df)
+        return gold_df
     def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
         self.df_gold_sws_validated = self.gen_gold_sws_validated_data()
@@ -242,7 +283,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
             name=f"{self.domain_code} gold SWS validated Iceberg",
-            description="Gold table containing all the data unmapped and unfiltered in SWS compatible format",
+            description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -263,7 +304,7 @@ class SWSGoldIcebergSparkHelper:
         new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
             name=f"{self.domain_code} gold SWS validated csv",
-            description="Gold table containing all the data unmapped and unfiltered in SWS compatible format cached in csv",
+            description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -291,7 +332,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
             name=f"{self.domain_code} gold SWS disseminated Iceberg",
-            description="Gold table containing all the data mapped and filtered in SWS compatible format",
+            description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -312,7 +353,7 @@ class SWSGoldIcebergSparkHelper:
         new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
             name=f"{self.domain_code} gold SWS disseminated csv",
-            description="Gold table containing all the data mapped and filtered in SWS compatible format format cached in csv",
+            description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -427,3 +468,52 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
         return df
+    def write_gold_sws_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        """Register the gold SWS Iceberg and csv tables on the dissemination tag.
+        The tag is fetched or created via ``get_or_create_tag``; then one
+        ``BaseDisseminatedTagTable`` entry for the Iceberg table and one for the
+        csv cache are upserted with ``upsert_disseminated_table``. Both entries
+        carry the column structure taken from ``df.schema``. ``df`` itself is
+        not modified and is returned as-is.
+        """
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+        # Both entries describe the same DataFrame, so derive the structure once.
+        structure = {"columns": df.schema.jsonValue()["fields"]}
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_iceberg",
+            name=f"{self.domain_code} gold SWS Iceberg",
+            description="Gold table containing the tag data without any processing",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_SWS.table,
+            path=self.iceberg_tables.GOLD_SWS.path,
+            structure=structure,
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+        # The csv entry points at GOLD_SWS.csv_path, so its id/name must be the
+        # "gold SWS" ones (matching the *_iceberg / *_csv pairing used by the
+        # other gold tables), not the pre-SDMX ones.
+        # NOTE(review): the previous "gold_pre_sdmx_csv" id presumably collided
+        # with the pre-SDMX csv entry on upsert - confirm against the tag store.
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_csv",
+            name=f"{self.domain_code} gold SWS csv",
+            description="Gold table containing the tag data without any processing cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_SWS.csv_path,
+            structure=structure,
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+        return df

{sws_spark_dissemination_helper-0.0.105 → sws_spark_dissemination_helper-0.0.107}/src/sws_spark_dissemination_helper/constants.py RENAMED Viewed

@@ -223,6 +223,9 @@ class IcebergTables:
         self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
         # GOLD tables with specific suffixes
+        self.GOLD_SWS = self._create_iceberg_table(
+            "GOLD", prefix=domain, suffix="sws"
+        )
         self.GOLD_SDMX = self._create_iceberg_table(
             "GOLD", prefix=domain, suffix="sdmx_disseminated"
         )