sws-spark-dissemination-helper 0.0.187__py3-none-any.whl → 0.0.191__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,13 +5,19 @@ from typing import List, Tuple
5
5
  import pyspark.sql.functions as F
6
6
  from pyspark.sql import DataFrame, SparkSession
7
7
  from pyspark.sql.functions import col, lit
8
+ from pyspark.sql.types import DecimalType, FloatType
8
9
  from sws_api_client import Tags
9
10
  from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
10
11
 
11
- from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
12
+ from .constants import DatasetDatatables, IcebergDatabases, IcebergTables
12
13
  from .SWSPostgresSparkReader import SWSPostgresSparkReader
13
14
  from .utils import get_or_create_tag, save_cache_csv, upsert_disseminated_table
14
15
 
16
+ SIMPLE_NUMERIC_REGEX = r"^[+-]?\d*(\.\d+)?$"
17
+ NUMERIC_REGEX = r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$"
18
+ # Regex to extract decimal places: matches the decimal part and counts its length
19
+ DECIMAL_PLACES_REGEX = r"\.(\d+)$"
20
+
15
21
 
16
22
  class SWSGoldIcebergSparkHelper:
17
23
  def __init__(
@@ -62,10 +68,9 @@ class SWSGoldIcebergSparkHelper:
62
68
  if col_name in self.dim_columns
63
69
  }
64
70
 
65
- self.display_decimals = (
66
- self.sws_postgres_spark_reader.get_display_decimals_datatable(
67
- domain_code=domain_code
68
- )
71
+ self.display_decimals_df = self.sws_postgres_spark_reader.read_pg_table(
72
+ pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
73
+ custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
69
74
  )
70
75
 
71
76
  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
@@ -99,53 +104,153 @@ class SWSGoldIcebergSparkHelper:
99
104
  cols_to_keep_sws = cols_to_keep_sws + ["unit_of_measure_symbol"]
100
105
  return df.select(*cols_to_keep_sws)
101
106
 
102
- def round_to_display_decimals(self, df: DataFrame):
103
- col1_name, col2_name = (
104
- self.display_decimals.select("column_1_name", "column_2_name")
105
- .distinct()
106
- .collect()[0]
107
- )
108
- if col1_name.lower() not in [column.lower() for column in df.columns]:
109
- raise ValueError(
110
- f"{col1_name} is not part of the columns available for this dataset ({df.columns})"
107
+ def round_to_display_decimals(
108
+ self,
109
+ df: DataFrame,
110
+ value_column: str = "value",
111
+ ) -> DataFrame:
112
+
113
+ df = df.withColumn("unrounded_value", col(value_column).cast("string"))
114
+
115
+ general_default_decimals = (
116
+ self.display_decimals_df.filter(col("domain") == lit("DEFAULT"))
117
+ .select("display_decimals")
118
+ .collect()[0][0]
119
+ )
120
+ domain_default_decimals = self.display_decimals_df.filter(
121
+ (col("domain") == lit(self.domain_code))
122
+ & col("column_1_name").isNull()
123
+ & col("column_2_name").isNull()
124
+ ).select("display_decimals")
125
+
126
+ default_decimals = int(
127
+ general_default_decimals
128
+ if domain_default_decimals.count() == 0
129
+ else domain_default_decimals.collect()[0][0]
130
+ )
131
+
132
+ domain_specific_rules = self.display_decimals_df.filter(
133
+ (col("domain") == lit(self.domain_code))
134
+ & (col("column_1_name").isNotNull() & col("column_1_value").isNotNull())
135
+ | (col("column_2_name").isNotNull() & col("column_2_value").isNotNull())
136
+ )
137
+
138
+ when_decimals = None
139
+ when_rounded = None
140
+
141
+ for rule in domain_specific_rules.collect():
142
+ condition = lit(True)
143
+ if rule["column_1_name"] != "" and rule["column_1_value"] != "":
144
+ column_1_name = rule["column_1_name"]
145
+ column_1_value_str = rule["column_1_value"]
146
+
147
+ column_1_value_list = [
148
+ v.strip() for v in str(column_1_value_str).split(",")
149
+ ]
150
+ condition &= col(column_1_name).isin(column_1_value_list)
151
+
152
+ if (
153
+ rule["column_2_name"] is not None
154
+ and rule["column_2_name"] != ""
155
+ and rule["column_2_value"] is not None
156
+ and rule["column_2_value"] != ""
157
+ ):
158
+ column_2_name = rule["column_2_name"]
159
+ column_2_value_str = rule["column_2_value"]
160
+
161
+ column_2_value_list = [
162
+ v.strip() for v in str(column_2_value_str).split(",")
163
+ ]
164
+ condition &= col(column_2_name).isin(column_2_value_list)
165
+
166
+ display_decimals = int(rule["display_decimals"])
167
+
168
+ # Count actual decimal places in the current value
169
+ # If the value already has fewer decimals than target, skip rounding
170
+ actual_decimals = F.length(
171
+ F.regexp_extract(
172
+ F.col(value_column).cast("string"), DECIMAL_PLACES_REGEX, 1
173
+ )
111
174
  )
112
- if col2_name.lower() not in [column.lower() for column in df.columns]:
113
- raise ValueError(
114
- f"{col2_name} is not part of the columns available for this dataset ({df.columns})"
175
+
176
+ # Add decimals condition
177
+ when_decimals = (
178
+ F.when(condition, lit(display_decimals))
179
+ if when_decimals is None
180
+ else when_decimals.when(condition, lit(display_decimals))
115
181
  )
116
182
 
117
- df = (
118
- df.alias("d")
119
- .join(
120
- self.display_decimals.alias("dd"),
121
- on=(col(f"d.{col1_name}") == col("dd.column_1_value"))
122
- & (col(f"d.{col2_name}") == col("dd.column_2_value")),
123
- how="left",
183
+ # Add rounding condition based on display_decimals
184
+ # Only apply rounding if current decimals >= target decimals
185
+ if display_decimals > 6:
186
+ # Cast to float and round
187
+ rounded_value = F.round(
188
+ col(value_column).cast(FloatType()), display_decimals
189
+ )
190
+ else:
191
+ # Cast to DECIMAL with precision 38 and decimals as display_decimals + 2
192
+ precision = 38
193
+ decimals = display_decimals + 2
194
+ decimal_value = col(value_column).cast(DecimalType(precision, decimals))
195
+ scale = pow(lit(10), lit(display_decimals)).cast(
196
+ DecimalType(precision, decimals)
197
+ )
198
+ rounded_value = F.round(decimal_value * scale) / scale
199
+
200
+ # Only round if actual decimals >= target decimals, otherwise keep original
201
+ rounded_value = F.when(
202
+ actual_decimals >= lit(display_decimals), rounded_value
203
+ ).otherwise(col(value_column))
204
+
205
+ when_rounded = (
206
+ F.when(condition, rounded_value)
207
+ if when_rounded is None
208
+ else when_rounded.when(condition, rounded_value)
124
209
  )
125
- .select("d.*", "dd.display_decimals")
126
- )
127
210
 
128
- df.filter(col("display_decimals").isNull()).select(
129
- col1_name, col2_name
130
- ).distinct()
131
- logging.warning(
132
- f"The following combinations of {col1_name} and {col2_name} are not available in the table {DatasetDatatables.DISPLAY_DECIMALS.name} and will be assigned to 0"
211
+ # Add otherwise with default value for decimals
212
+ when_decimals = (
213
+ lit(default_decimals)
214
+ if when_decimals is None
215
+ else when_decimals.otherwise(lit(default_decimals))
133
216
  )
134
217
 
135
- df = df.withColumn(
136
- "display_decimals",
137
- F.coalesce(col("display_decimals"), lit("0")).cast("INT"),
138
- ).withColumn(
139
- "value",
140
- F.round(
141
- F.col("value").cast("FLOAT") * F.pow(10, F.col("display_decimals")), 0
218
+ # Add otherwise with default rounding for value
219
+ if default_decimals > 6:
220
+ default_rounded = F.round(
221
+ col(value_column).cast(FloatType()), default_decimals
222
+ )
223
+ else:
224
+ precision = 38
225
+ decimals = default_decimals + 2
226
+ default_decimal_value = col(value_column).cast(
227
+ DecimalType(precision, decimals)
142
228
  )
143
- / F.pow(10, F.col("display_decimals")).cast("STRING"),
229
+ default_scale = pow(lit(10), lit(default_decimals)).cast(
230
+ DecimalType(precision, decimals)
231
+ )
232
+ default_rounded = (
233
+ F.round(default_decimal_value * default_scale) / default_scale
234
+ )
235
+
236
+ # Only round if actual decimals >= target decimals, otherwise keep original
237
+ actual_decimals_default = F.length(
238
+ F.regexp_extract(
239
+ F.col(value_column).cast("string"), DECIMAL_PLACES_REGEX, 1
240
+ )
241
+ )
242
+ default_rounded = F.when(
243
+ actual_decimals_default >= lit(default_decimals), default_rounded
244
+ ).otherwise(col(value_column))
245
+
246
+ when_rounded = (
247
+ default_rounded
248
+ if when_rounded is None
249
+ else when_rounded.otherwise(default_rounded)
144
250
  )
145
251
 
146
- # F.round(
147
- # col("value").cast("FLOAT"), col("display_decimals").cast("INT")
148
- # ).cast("STRING"),
252
+ df = df.withColumn("display_decimals", when_decimals)
253
+ df = df.withColumn(value_column, when_rounded)
149
254
 
150
255
  return df
151
256
 
@@ -735,3 +840,29 @@ class SWSGoldIcebergSparkHelper:
735
840
  logging.debug(f"Tag with Added csv Table: {tag}")
736
841
 
737
842
  return df
843
+
844
+
845
+ 1
846
+ frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
847
+ 1
848
+ 1
849
+ 2
850
+ frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
851
+ 2
852
+ 1
853
+ 1
854
+ frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
855
+ 1
856
+ 1
857
+ 2
858
+ frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
859
+ 2
860
+ 1
861
+ 1
862
+ frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
863
+ 1
864
+ 1
865
+ 1
866
+ frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
867
+ 1
868
+ 1
@@ -497,45 +497,3 @@ class SWSPostgresSparkReader:
497
497
  "aggregation",
498
498
  ],
499
499
  )
500
-
501
- def get_display_decimals_datatable(
502
- self,
503
- domain_code: str,
504
- ) -> DataFrame:
505
- df = self.read_pg_table(
506
- pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
507
- custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
508
- ).filter(col("domain") == lit(domain_code))
509
-
510
- pairs = df.select("column_1_name", "column_2_name").distinct().collect()
511
-
512
- # If no config exists for this domain, fail early
513
- if not pairs:
514
- msg = (
515
- f'No display-decimals configuration found for domain "{domain_code}". '
516
- f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
517
- )
518
- logging.error(msg)
519
- # raise ValueError(msg)
520
-
521
- # If more than one mapping exists, it's invalid
522
- if len(pairs) > 1:
523
- formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
524
-
525
- msg = (
526
- f'Invalid configuration for domain "{domain_code}". '
527
- f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
528
- f"{formatted_pairs}. "
529
- f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
530
- )
531
-
532
- logging.error(
533
- "Multiple display-decimals column pairs detected",
534
- extra={
535
- "domain": domain_code,
536
- "pairs_found": formatted_pairs,
537
- },
538
- )
539
- raise ValueError(msg)
540
-
541
- return df
@@ -444,7 +444,7 @@ class SWSSilverIcebergSparkHelper:
444
444
  logging.info("Checking the dissemination flag for each dimension (except year)")
445
445
 
446
446
  for col_name, col_type in self.mapping_dim_col_name_type.items():
447
- if col_type != "other":
447
+ if col_type not in ("other", "year"):
448
448
  df = self._check_diss_dim_list(
449
449
  df,
450
450
  self.dfs_diss_flags[col_type],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sws-spark-dissemination-helper
3
- Version: 0.0.187
3
+ Version: 0.0.191
4
4
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
5
5
  Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
6
6
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -1,13 +1,13 @@
1
1
  sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
2
2
  sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
4
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=atQFiY5Mmo-rzHY7WVWg-Guvg8i1ZcaaoKE4ymTaKdE,27750
5
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=qoO___xL1g1iH_KkJ0opLvtNJGU2Dm6bUn-jWem5v2U,20030
6
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=3l5zkEWksnEC-R4mJi8JEHL3ylCMbkMD9a0qbdZQU5E,26345
4
+ sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=3JkM3u7fLG-ZTtuReR4y3q5JVbhv_XG7-faRYQaktc0,32378
5
+ sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=5do-Cz2_GAEwNxPWRnnITjADMX8Wgi3aj_ynpQCUNmI,18467
6
+ sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=PGZPq_oNKRGOseYGuNujbcS8y-WuLmoDMN95faq0Css,26359
7
7
  sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
8
8
  sws_spark_dissemination_helper/constants.py,sha256=MzuC7pqsXF89r-FK7hhmWaZSk5x3GB_YPVSfuK3NYVY,14056
9
9
  sws_spark_dissemination_helper/utils.py,sha256=Ge8zXsUIcvFihALDNLF5kCu_tAdRQUE04xE6Yn9xQF4,22008
10
- sws_spark_dissemination_helper-0.0.187.dist-info/METADATA,sha256=PPrDi-8X1HkcAjYs92VJRaAvcf27I3Aw0wljIs1UMO8,2822
11
- sws_spark_dissemination_helper-0.0.187.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
- sws_spark_dissemination_helper-0.0.187.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
13
- sws_spark_dissemination_helper-0.0.187.dist-info/RECORD,,
10
+ sws_spark_dissemination_helper-0.0.191.dist-info/METADATA,sha256=GCVYkvlKzxgFc22jEYBEc2_Hj7PN1RJAoamlnXdM4nA,2822
11
+ sws_spark_dissemination_helper-0.0.191.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
+ sws_spark_dissemination_helper-0.0.191.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
13
+ sws_spark_dissemination_helper-0.0.191.dist-info/RECORD,,