PyPI - sws-spark-dissemination-helper - Versions diffs - 0.0.187__py3-none-any.whl → 0.0.191__py3-none-any.whl - Mend

sws-spark-dissemination-helper 0.0.187__py3-none-any.whl → 0.0.191__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py CHANGED Viewed

@@ -5,13 +5,19 @@ from typing import List, Tuple
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, lit
+from pyspark.sql.types import DecimalType, FloatType
 from sws_api_client import Tags
 from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
-from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
+from .constants import DatasetDatatables, IcebergDatabases, IcebergTables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .utils import get_or_create_tag, save_cache_csv, upsert_disseminated_table
+SIMPLE_NUMERIC_REGEX = r"^[+-]?\d*(\.\d+)?$"
+NUMERIC_REGEX = r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$"
+# Regex to extract decimal places: matches the decimal part and counts its length
+DECIMAL_PLACES_REGEX = r"\.(\d+)$"
 class SWSGoldIcebergSparkHelper:
     def __init__(
@@ -62,10 +68,9 @@ class SWSGoldIcebergSparkHelper:
                 if col_name in self.dim_columns
             }
-            self.display_decimals = (
-                self.sws_postgres_spark_reader.get_display_decimals_datatable(
-                    domain_code=domain_code
-                )
+            self.display_decimals_df = self.sws_postgres_spark_reader.read_pg_table(
+                pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
+                custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
             )
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
@@ -99,53 +104,153 @@ class SWSGoldIcebergSparkHelper:
             cols_to_keep_sws = cols_to_keep_sws + ["unit_of_measure_symbol"]
         return df.select(*cols_to_keep_sws)
-    def round_to_display_decimals(self, df: DataFrame):
-        col1_name, col2_name = (
-            self.display_decimals.select("column_1_name", "column_2_name")
-            .distinct()
-            .collect()[0]
-        )
-        if col1_name.lower() not in [column.lower() for column in df.columns]:
-            raise ValueError(
-                f"{col1_name} is not part of the columns available for this dataset ({df.columns})"
+    def round_to_display_decimals(
+        self,
+        df: DataFrame,
+        value_column: str = "value",
+    ) -> DataFrame:
+        df = df.withColumn("unrounded_value", col(value_column).cast("string"))
+        general_default_decimals = (
+            self.display_decimals_df.filter(col("domain") == lit("DEFAULT"))
+            .select("display_decimals")
+            .collect()[0][0]
+        )
+        domain_default_decimals = self.display_decimals_df.filter(
+            (col("domain") == lit(self.domain_code))
+            & col("column_1_name").isNull()
+            & col("column_2_name").isNull()
+        ).select("display_decimals")
+        default_decimals = int(
+            general_default_decimals
+            if domain_default_decimals.count() == 0
+            else domain_default_decimals.collect()[0][0]
+        )
+        domain_specific_rules = self.display_decimals_df.filter(
+            (col("domain") == lit(self.domain_code))
+            & (col("column_1_name").isNotNull() & col("column_1_value").isNotNull())
+            | (col("column_2_name").isNotNull() & col("column_2_value").isNotNull())
+        )
+        when_decimals = None
+        when_rounded = None
+        for rule in domain_specific_rules.collect():
+            condition = lit(True)
+            if rule["column_1_name"] != "" and rule["column_1_value"] != "":
+                column_1_name = rule["column_1_name"]
+                column_1_value_str = rule["column_1_value"]
+                column_1_value_list = [
+                    v.strip() for v in str(column_1_value_str).split(",")
+                ]
+                condition &= col(column_1_name).isin(column_1_value_list)
+            if (
+                rule["column_2_name"] is not None
+                and rule["column_2_name"] != ""
+                and rule["column_2_value"] is not None
+                and rule["column_2_value"] != ""
+            ):
+                column_2_name = rule["column_2_name"]
+                column_2_value_str = rule["column_2_value"]
+                column_2_value_list = [
+                    v.strip() for v in str(column_2_value_str).split(",")
+                ]
+                condition &= col(column_2_name).isin(column_2_value_list)
+            display_decimals = int(rule["display_decimals"])
+            # Count actual decimal places in the current value
+            # If the value already has fewer decimals than target, skip rounding
+            actual_decimals = F.length(
+                F.regexp_extract(
+                    F.col(value_column).cast("string"), DECIMAL_PLACES_REGEX, 1
+                )
             )
-        if col2_name.lower() not in [column.lower() for column in df.columns]:
-            raise ValueError(
-                f"{col2_name} is not part of the columns available for this dataset ({df.columns})"
+            # Add decimals condition
+            when_decimals = (
+                F.when(condition, lit(display_decimals))
+                if when_decimals is None
+                else when_decimals.when(condition, lit(display_decimals))
             )
-        df = (
-            df.alias("d")
-            .join(
-                self.display_decimals.alias("dd"),
-                on=(col(f"d.{col1_name}") == col("dd.column_1_value"))
-                & (col(f"d.{col2_name}") == col("dd.column_2_value")),
-                how="left",
+            # Add rounding condition based on display_decimals
+            # Only apply rounding if current decimals >= target decimals
+            if display_decimals > 6:
+                # Cast to float and round
+                rounded_value = F.round(
+                    col(value_column).cast(FloatType()), display_decimals
+                )
+            else:
+                # Cast to DECIMAL with precision 38 and decimals as display_decimals + 2
+                precision = 38
+                decimals = display_decimals + 2
+                decimal_value = col(value_column).cast(DecimalType(precision, decimals))
+                scale = pow(lit(10), lit(display_decimals)).cast(
+                    DecimalType(precision, decimals)
+                )
+                rounded_value = F.round(decimal_value * scale) / scale
+            # Only round if actual decimals >= target decimals, otherwise keep original
+            rounded_value = F.when(
+                actual_decimals >= lit(display_decimals), rounded_value
+            ).otherwise(col(value_column))
+            when_rounded = (
+                F.when(condition, rounded_value)
+                if when_rounded is None
+                else when_rounded.when(condition, rounded_value)
             )
-            .select("d.*", "dd.display_decimals")
-        )
-        df.filter(col("display_decimals").isNull()).select(
-            col1_name, col2_name
-        ).distinct()
-        logging.warning(
-            f"The following combinations of {col1_name} and {col2_name} are not available in the table {DatasetDatatables.DISPLAY_DECIMALS.name} and will be assigned to 0"
+        # Add otherwise with default value for decimals
+        when_decimals = (
+            lit(default_decimals)
+            if when_decimals is None
+            else when_decimals.otherwise(lit(default_decimals))
         )
-        df = df.withColumn(
-            "display_decimals",
-            F.coalesce(col("display_decimals"), lit("0")).cast("INT"),
-        ).withColumn(
-            "value",
-            F.round(
-                F.col("value").cast("FLOAT") * F.pow(10, F.col("display_decimals")), 0
+        # Add otherwise with default rounding for value
+        if default_decimals > 6:
+            default_rounded = F.round(
+                col(value_column).cast(FloatType()), default_decimals
+            )
+        else:
+            precision = 38
+            decimals = default_decimals + 2
+            default_decimal_value = col(value_column).cast(
+                DecimalType(precision, decimals)
             )
-            / F.pow(10, F.col("display_decimals")).cast("STRING"),
+            default_scale = pow(lit(10), lit(default_decimals)).cast(
+                DecimalType(precision, decimals)
+            )
+            default_rounded = (
+                F.round(default_decimal_value * default_scale) / default_scale
+            )
+        # Only round if actual decimals >= target decimals, otherwise keep original
+        actual_decimals_default = F.length(
+            F.regexp_extract(
+                F.col(value_column).cast("string"), DECIMAL_PLACES_REGEX, 1
+            )
+        )
+        default_rounded = F.when(
+            actual_decimals_default >= lit(default_decimals), default_rounded
+        ).otherwise(col(value_column))
+        when_rounded = (
+            default_rounded
+            if when_rounded is None
+            else when_rounded.otherwise(default_rounded)
         )
-        # F.round(
-        #     col("value").cast("FLOAT"), col("display_decimals").cast("INT")
-        # ).cast("STRING"),
+        df = df.withColumn("display_decimals", when_decimals)
+        df = df.withColumn(value_column, when_rounded)
         return df
@@ -735,3 +840,29 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
         return df
+1
+frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
+1
+1
+2
+frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
+2
+1
+1
+frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
+1
+1
+2
+frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
+2
+1
+1
+frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
+1
+1
+1
+frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
+1
+1

sws_spark_dissemination_helper/SWSPostgresSparkReader.py CHANGED Viewed

@@ -497,45 +497,3 @@ class SWSPostgresSparkReader:
                 "aggregation",
             ],
         )
-    def get_display_decimals_datatable(
-        self,
-        domain_code: str,
-    ) -> DataFrame:
-        df = self.read_pg_table(
-            pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
-            custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
-        ).filter(col("domain") == lit(domain_code))
-        pairs = df.select("column_1_name", "column_2_name").distinct().collect()
-        # If no config exists for this domain, fail early
-        if not pairs:
-            msg = (
-                f'No display-decimals configuration found for domain "{domain_code}". '
-                f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
-            )
-            logging.error(msg)
-            # raise ValueError(msg)
-        # If more than one mapping exists, it's invalid
-        if len(pairs) > 1:
-            formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
-            msg = (
-                f'Invalid configuration for domain "{domain_code}". '
-                f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
-                f"{formatted_pairs}. "
-                f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
-            )
-            logging.error(
-                "Multiple display-decimals column pairs detected",
-                extra={
-                    "domain": domain_code,
-                    "pairs_found": formatted_pairs,
-                },
-            )
-            raise ValueError(msg)
-        return df

sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py CHANGED Viewed

@@ -444,7 +444,7 @@ class SWSSilverIcebergSparkHelper:
         logging.info("Checking the dissemination flag for each dimension (except year)")
         for col_name, col_type in self.mapping_dim_col_name_type.items():
-            if col_type != "other":
+            if col_type not in ("other", "year"):
                 df = self._check_diss_dim_list(
                     df,
                     self.dfs_diss_flags[col_type],

{sws_spark_dissemination_helper-0.0.187.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.187
+Version: 0.0.191
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>

{sws_spark_dissemination_helper-0.0.187.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
 sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
-sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=atQFiY5Mmo-rzHY7WVWg-Guvg8i1ZcaaoKE4ymTaKdE,27750
-sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=qoO___xL1g1iH_KkJ0opLvtNJGU2Dm6bUn-jWem5v2U,20030
-sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=3l5zkEWksnEC-R4mJi8JEHL3ylCMbkMD9a0qbdZQU5E,26345
+sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=3JkM3u7fLG-ZTtuReR4y3q5JVbhv_XG7-faRYQaktc0,32378
+sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=5do-Cz2_GAEwNxPWRnnITjADMX8Wgi3aj_ynpQCUNmI,18467
+sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=PGZPq_oNKRGOseYGuNujbcS8y-WuLmoDMN95faq0Css,26359
 sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
 sws_spark_dissemination_helper/constants.py,sha256=MzuC7pqsXF89r-FK7hhmWaZSk5x3GB_YPVSfuK3NYVY,14056
 sws_spark_dissemination_helper/utils.py,sha256=Ge8zXsUIcvFihALDNLF5kCu_tAdRQUE04xE6Yn9xQF4,22008
-sws_spark_dissemination_helper-0.0.187.dist-info/METADATA,sha256=PPrDi-8X1HkcAjYs92VJRaAvcf27I3Aw0wljIs1UMO8,2822
-sws_spark_dissemination_helper-0.0.187.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-sws_spark_dissemination_helper-0.0.187.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
-sws_spark_dissemination_helper-0.0.187.dist-info/RECORD,,
+sws_spark_dissemination_helper-0.0.191.dist-info/METADATA,sha256=GCVYkvlKzxgFc22jEYBEc2_Hj7PN1RJAoamlnXdM4nA,2822
+sws_spark_dissemination_helper-0.0.191.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+sws_spark_dissemination_helper-0.0.191.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+sws_spark_dissemination_helper-0.0.191.dist-info/RECORD,,

{sws_spark_dissemination_helper-0.0.187.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/WHEEL RENAMED Viewed

File without changes

{sws_spark_dissemination_helper-0.0.187.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sws-spark-dissemination-helper 0.0.187__py3-none-any.whl → 0.0.191__py3-none-any.whl

sws-spark-dissemination-helper 0.0.187__py3-none-any.whl → 0.0.191__py3-none-any.whl