sws-spark-dissemination-helper 0.0.185__tar.gz → 0.0.194__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +159 -42
  4. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +1 -43
  5. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +1 -1
  6. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/constants.py +1 -1
  7. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/.gitignore +0 -0
  8. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/LICENSE +0 -0
  9. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/README.md +0 -0
  10. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
  11. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/src/sws_spark_dissemination_helper/utils.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/tests/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.185 → sws_spark_dissemination_helper-0.0.194}/tests/test.py +0 -0
--- sws_spark_dissemination_helper-0.0.185/PKG-INFO
+++ sws_spark_dissemination_helper-0.0.194/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.185
+Version: 0.0.194
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
--- sws_spark_dissemination_helper-0.0.185/pyproject.toml
+++ sws_spark_dissemination_helper-0.0.194/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.185"
+version = "0.0.194"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3>=1.40.0",
--- sws_spark_dissemination_helper-0.0.185/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
+++ sws_spark_dissemination_helper-0.0.194/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
@@ -5,13 +5,19 @@ from typing import List, Tuple
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, lit
+from pyspark.sql.types import DecimalType, FloatType
 from sws_api_client import Tags
 from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
 
-from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
+from .constants import DatasetDatatables, IcebergDatabases, IcebergTables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .utils import get_or_create_tag, save_cache_csv, upsert_disseminated_table
 
+SIMPLE_NUMERIC_REGEX = r"^[+-]?\d*(\.\d+)?$"
+NUMERIC_REGEX = r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$"
+# Regex to extract decimal places: matches the decimal part and counts its length
+DECIMAL_PLACES_REGEX = r"\.(\d+)$"
+
 
 class SWSGoldIcebergSparkHelper:
     def __init__(
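Note on the three module-level constants added above: SIMPLE_NUMERIC_REGEX accepts plain signed decimals, NUMERIC_REGEX additionally accepts scientific notation, and DECIMAL_PLACES_REGEX captures the digits after the decimal point. A minimal standalone check of their behavior (illustrative sketch only, not part of the package):

    import re

    SIMPLE_NUMERIC_REGEX = r"^[+-]?\d*(\.\d+)?$"
    NUMERIC_REGEX = r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$"
    DECIMAL_PLACES_REGEX = r"\.(\d+)$"

    for s in ["42", "-3.14", ".5", "1.2e-3", "abc"]:
        simple = bool(re.match(SIMPLE_NUMERIC_REGEX, s))
        full = bool(re.match(NUMERIC_REGEX, s))
        m = re.search(DECIMAL_PLACES_REGEX, s)
        # "1.2e-3" matches only NUMERIC_REGEX; "abc" matches neither;
        # "-3.14" has 2 decimal places per DECIMAL_PLACES_REGEX
        print(s, simple, full, len(m.group(1)) if m else 0)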
@@ -62,10 +68,12 @@ class SWSGoldIcebergSparkHelper:
             if col_name in self.dim_columns
         }
 
-        self.display_decimals = (
-            self.sws_postgres_spark_reader.get_display_decimals_datatable(
-                domain_code=domain_code
-            )
+        self.display_decimals_df = self.sws_postgres_spark_reader.read_pg_table(
+            pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
+            custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
+        ).filter(
+            (col("domain") == lit(self.domain_code))
+            | ((col("domain") == lit("DEFAULT")))
         )
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
@@ -99,53 +107,162 @@ class SWSGoldIcebergSparkHelper:
             cols_to_keep_sws = cols_to_keep_sws + ["unit_of_measure_symbol"]
         return df.select(*cols_to_keep_sws)
 
-    def round_to_display_decimals(self, df: DataFrame):
-        col1_name, col2_name = (
-            self.display_decimals.select("column_1_name", "column_2_name")
-            .distinct()
-            .collect()[0]
-        )
-        if col1_name.lower() not in [column.lower() for column in df.columns]:
-            raise ValueError(
-                f"{col1_name} is not part of the columns available for this dataset ({df.columns})"
+    def round_to_display_decimals(
+        self,
+        df: DataFrame,
+        value_column: str = "value",
+    ) -> DataFrame:
+
+        df = df.withColumn("unrounded_value", col(value_column).cast("string"))
+
+        general_default_decimals = (
+            self.display_decimals_df.filter(col("domain") == lit("DEFAULT"))
+            .select("display_decimals")
+            .collect()[0][0]
+        )
+        domain_default_decimals = self.display_decimals_df.filter(
+            (col("domain") == lit(self.domain_code))
+            & col("column_1_name").isNull()
+            & col("column_2_name").isNull()
+        ).select("display_decimals")
+
+        default_decimals = int(
+            general_default_decimals
+            if domain_default_decimals.count() == 0
+            else domain_default_decimals.collect()[0][0]
+        )
+
+        domain_specific_rules = self.display_decimals_df.filter(
+            (col("domain") == lit(self.domain_code))
+            & (col("column_1_name").isNotNull() & col("column_1_value").isNotNull())
+            | (col("column_2_name").isNotNull() & col("column_2_value").isNotNull())
+        )
+
+        when_decimals = None
+        when_rounded = None
+
+        for rule in domain_specific_rules.collect():
+            condition = lit(True)
+            if rule["column_1_name"] != "" and rule["column_1_value"] != "":
+                column_1_name = rule["column_1_name"]
+                column_1_value_str = rule["column_1_value"]
+
+                column_1_value_list = [
+                    v.strip() for v in str(column_1_value_str).split(",")
+                ]
+                condition &= col(column_1_name).isin(column_1_value_list)
+
+            if (
+                rule["column_2_name"] is not None
+                and rule["column_2_name"] != ""
+                and rule["column_2_value"] is not None
+                and rule["column_2_value"] != ""
+            ):
+                column_2_name = rule["column_2_name"]
+                column_2_value_str = rule["column_2_value"]
+
+                column_2_value_list = [
+                    v.strip() for v in str(column_2_value_str).split(",")
+                ]
+                condition &= col(column_2_name).isin(column_2_value_list)
+
+            display_decimals = int(rule["display_decimals"])
+
+            # Count actual decimal places in the current value
+            # Handle both regular decimals and scientific notation
+            # Convert scientific notation to decimal format first
+            value_str_normalized = F.when(
+                F.col(value_column).cast("string").rlike("[eE]"),
+                F.format_number(F.col(value_column).cast("double"), 20),
+            ).otherwise(F.col(value_column).cast("string"))
+
+            actual_decimals = F.length(
+                F.regexp_extract(value_str_normalized, DECIMAL_PLACES_REGEX, 1)
            )
-        if col2_name.lower() not in [column.lower() for column in df.columns]:
-            raise ValueError(
-                f"{col2_name} is not part of the columns available for this dataset ({df.columns})"
+
+            # Add decimals condition
+            when_decimals = (
+                F.when(condition, lit(display_decimals))
+                if when_decimals is None
+                else when_decimals.when(condition, lit(display_decimals))
            )
 
-        df = (
-            df.alias("d")
-            .join(
-                self.display_decimals.alias("dd"),
-                on=(col(f"d.{col1_name}") == col("dd.column_1_value"))
-                & (col(f"d.{col2_name}") == col("dd.column_2_value")),
-                how="left",
+            # Add rounding condition based on display_decimals
+            # Only apply rounding if current decimals >= target decimals
+            if display_decimals > 6:
+                # For large targets no scale shift is needed:
+                # cast to DECIMAL with precision 38 and scale display_decimals
+                precision = 38
+                decimals = display_decimals
+                rounded_value = col(value_column).cast(DecimalType(precision, decimals))
+            else:
+                # Cast to DECIMAL with precision 38 and decimals as display_decimals + 2
+                precision = 38
+                decimals = display_decimals + 2
+                decimal_value = col(value_column).cast(DecimalType(precision, decimals))
+                scale = pow(lit(10), lit(display_decimals)).cast(
+                    DecimalType(precision, decimals)
+                )
+                rounded_value = F.round(decimal_value * scale) / scale
+
+            # Only round if actual decimals >= target decimals, otherwise keep original
+            rounded_value = F.when(
+                actual_decimals >= lit(display_decimals), rounded_value
+            ).otherwise(col(value_column))
+
+            when_rounded = (
+                F.when(condition, rounded_value)
+                if when_rounded is None
+                else when_rounded.when(condition, rounded_value)
            )
-            .select("d.*", "dd.display_decimals")
-        )
 
-        df.filter(col("display_decimals").isNull()).select(
-            col1_name, col2_name
-        ).distinct()
-        logging.warning(
-            f"The following combinations of {col1_name} and {col2_name} are not available in the table {DatasetDatatables.DISPLAY_DECIMALS.name} and will be assigned to 0"
+        # Add otherwise with default value for decimals
+        when_decimals = (
+            lit(default_decimals)
+            if when_decimals is None
+            else when_decimals.otherwise(lit(default_decimals))
        )
 
-        df = df.withColumn(
-            "display_decimals",
-            F.coalesce(col("display_decimals"), lit("0")).cast("INT"),
-        ).withColumn(
-            "value",
-            F.round(
-                F.col("value").cast("FLOAT") * F.pow(10, F.col("display_decimals")), 0
+        # Add otherwise with default rounding for value
+        if default_decimals > 6:
+            default_rounded = F.round(
+                col(value_column).cast(FloatType()), default_decimals
+            )
+        else:
+            precision = 38
+            decimals = default_decimals + 2
+            default_decimal_value = col(value_column).cast(
+                DecimalType(precision, decimals)
+            )
+            default_scale = pow(lit(10), lit(default_decimals)).cast(
+                DecimalType(precision, decimals)
+            )
+            default_rounded = (
+                F.round(default_decimal_value * default_scale) / default_scale
            )
-            / F.pow(10, F.col("display_decimals")).cast("STRING"),
+
+        # Only round if actual decimals >= target decimals, otherwise keep original
+        # Handle both regular decimals and scientific notation for default case
+        value_str_normalized_default = F.when(
+            F.col(value_column).cast("string").rlike("[eE]"),
+            F.format_number(F.col(value_column).cast("double"), 20),
+        ).otherwise(F.col(value_column).cast("string"))
+
+        actual_decimals_default = F.length(
+            F.regexp_extract(value_str_normalized_default, DECIMAL_PLACES_REGEX, 1)
+        )
+        default_rounded = F.when(
+            actual_decimals_default >= lit(default_decimals), default_rounded
+        ).otherwise(col(value_column))
+
+        when_rounded = (
+            default_rounded
+            if when_rounded is None
+            else when_rounded.otherwise(default_rounded)
        )
 
-        # F.round(
-        #     col("value").cast("FLOAT"), col("display_decimals").cast("INT")
-        # ).cast("STRING"),
+        df = df.withColumn("display_decimals", when_decimals)
+        df = df.withColumn(value_column, when_rounded)
 
         return df
 
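Note on the rounding strategy above: instead of rounding a FLOAT directly, the value is cast to DECIMAL(38, display_decimals + 2), shifted by 10^display_decimals, rounded to an integer, and divided back, which avoids binary floating-point artifacts; values that already have fewer decimal places than the target are left untouched. A self-contained sketch of the same round-at-scale trick, using made-up data and a hard-coded target (illustrative only, not part of the package):

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, lit
    from pyspark.sql.types import DecimalType

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("2.34567",), ("1.2",)], ["value"])

    display_decimals = 2  # hypothetical target taken from the datatable
    # Round at a shifted scale on DECIMAL(38, display_decimals + 2)
    decimal_value = col("value").cast(DecimalType(38, display_decimals + 2))
    scale = F.pow(lit(10), lit(display_decimals)).cast(
        DecimalType(38, display_decimals + 2)
    )
    rounded = F.round(decimal_value * scale) / scale

    # Count actual decimal places from the string representation
    actual_decimals = F.length(F.regexp_extract(col("value"), r"\.(\d+)$", 1))
    df.withColumn(
        "value_rounded",
        F.when(actual_decimals >= lit(display_decimals), rounded)
        .otherwise(col("value")),
    ).show()
    # "2.34567" rounds to 2.35 (Spark may render extra trailing zeros);
    # "1.2" already has fewer decimals than the target and is kept as-is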
--- sws_spark_dissemination_helper-0.0.185/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py
+++ sws_spark_dissemination_helper-0.0.194/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py
@@ -468,7 +468,7 @@ class SWSPostgresSparkReader:
                 correct_domain_filter, domain=domain_code, unique_columns=["code"]
             )
             for col_type in mapping_dim_col_name_type.values()
-            if col_type != "other"
+            if col_type not in ("year", "other")
         }
 
     def import_diss_exceptions_datatable(
@@ -497,45 +497,3 @@ class SWSPostgresSparkReader:
                 "aggregation",
             ],
         )
-
-    def get_display_decimals_datatable(
-        self,
-        domain_code: str,
-    ) -> DataFrame:
-        df = self.read_pg_table(
-            pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
-            custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
-        ).filter(col("domain") == lit(domain_code))
-
-        pairs = df.select("column_1_name", "column_2_name").distinct().collect()
-
-        # If no config exists for this domain, fail early
-        if not pairs:
-            msg = (
-                f'No display-decimals configuration found for domain "{domain_code}". '
-                f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
-            )
-            logging.error(msg)
-            # raise ValueError(msg)
-
-        # If more than one mapping exists, it's invalid
-        if len(pairs) > 1:
-            formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
-
-            msg = (
-                f'Invalid configuration for domain "{domain_code}". '
-                f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
-                f"{formatted_pairs}. "
-                f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
-            )
-
-            logging.error(
-                "Multiple display-decimals column pairs detected",
-                extra={
-                    "domain": domain_code,
-                    "pairs_found": formatted_pairs,
-                },
-            )
-            raise ValueError(msg)
-
-        return df
--- sws_spark_dissemination_helper-0.0.185/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py
+++ sws_spark_dissemination_helper-0.0.194/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py
@@ -444,7 +444,7 @@ class SWSSilverIcebergSparkHelper:
         logging.info("Checking the dissemination flag for each dimension (except year)")
 
         for col_name, col_type in self.mapping_dim_col_name_type.items():
-            if col_type != "other":
+            if col_type not in ("other", "year"):
                 df = self._check_diss_dim_list(
                     df,
                     self.dfs_diss_flags[col_type],
--- sws_spark_dissemination_helper-0.0.185/src/sws_spark_dissemination_helper/constants.py
+++ sws_spark_dissemination_helper-0.0.194/src/sws_spark_dissemination_helper/constants.py
@@ -168,7 +168,7 @@ class DatasetTables:
         self.OBSERVATION = self.__SWSTable(
             postgres_id=f"{self.__dataset_id}.observation",
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_observation",
-            schema="id BIGINT, observation_coordinates BIGINT, version INT, value FLOAT, flag_obs_status STRING, flag_method STRING, created_on TIMESTAMP, created_by INT, replaced_on TIMESTAMP",
+            schema="id BIGINT, observation_coordinates BIGINT, version INT, value STRING, flag_obs_status STRING, flag_method STRING, created_on TIMESTAMP, created_by INT, replaced_on TIMESTAMP",
         )
         self.OBSERVATION_COORDINATE = self.__SWSTable(
             postgres_id=f"{self.__dataset_id}.observation_coordinate",
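The constants.py change (value FLOAT to STRING in the staging observation schema) complements the rounding rework above: only the raw string form preserves the exact number of decimal places that DECIMAL_PLACES_REGEX counts, because a float round-trip drops trailing zeros and can perturb the digits. A plain-Python illustration (not part of the package):

    raw = "0.10"                   # value as stored in the STRING column
    print(len(raw.split(".")[1]))  # 2 decimal places, recoverable from the string
    print(str(float(raw)))         # '0.1': the trailing zero is lost as a float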