sws-spark-dissemination-helper 0.0.183__py3-none-any.whl → 0.0.191__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +173 -42
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +1 -43
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +1 -1
- sws_spark_dissemination_helper/constants.py +1 -1
- {sws_spark_dissemination_helper-0.0.183.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/METADATA +2 -2
- {sws_spark_dissemination_helper-0.0.183.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/RECORD +8 -8
- {sws_spark_dissemination_helper-0.0.183.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.183.dist-info → sws_spark_dissemination_helper-0.0.191.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,13 +5,19 @@ from typing import List, Tuple
|
|
|
5
5
|
import pyspark.sql.functions as F
|
|
6
6
|
from pyspark.sql import DataFrame, SparkSession
|
|
7
7
|
from pyspark.sql.functions import col, lit
|
|
8
|
+
from pyspark.sql.types import DecimalType, FloatType
|
|
8
9
|
from sws_api_client import Tags
|
|
9
10
|
from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
|
|
10
11
|
|
|
11
|
-
from .constants import IcebergDatabases, IcebergTables
|
|
12
|
+
from .constants import DatasetDatatables, IcebergDatabases, IcebergTables
|
|
12
13
|
from .SWSPostgresSparkReader import SWSPostgresSparkReader
|
|
13
14
|
from .utils import get_or_create_tag, save_cache_csv, upsert_disseminated_table
|
|
14
15
|
|
|
16
|
+
SIMPLE_NUMERIC_REGEX = r"^[+-]?\d*(\.\d+)?$"
|
|
17
|
+
NUMERIC_REGEX = r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$"
|
|
18
|
+
# Regex capturing the digits after the decimal point (group 1); the number of decimal places is the length of that captured group
|
|
19
|
+
DECIMAL_PLACES_REGEX = r"\.(\d+)$"
|
|
20
|
+
|
|
15
21
|
|
|
16
22
|
class SWSGoldIcebergSparkHelper:
|
|
17
23
|
def __init__(
|
|
@@ -62,10 +68,9 @@ class SWSGoldIcebergSparkHelper:
|
|
|
62
68
|
if col_name in self.dim_columns
|
|
63
69
|
}
|
|
64
70
|
|
|
65
|
-
self.
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
)
|
|
71
|
+
self.display_decimals_df = self.sws_postgres_spark_reader.read_pg_table(
|
|
72
|
+
pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
|
|
73
|
+
custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
|
|
69
74
|
)
|
|
70
75
|
|
|
71
76
|
def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
|
|
@@ -99,53 +104,153 @@ class SWSGoldIcebergSparkHelper:
|
|
|
99
104
|
cols_to_keep_sws = cols_to_keep_sws + ["unit_of_measure_symbol"]
|
|
100
105
|
return df.select(*cols_to_keep_sws)
|
|
101
106
|
|
|
102
|
-
def round_to_display_decimals(
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
107
|
+
def round_to_display_decimals(
    self,
    df: DataFrame,
    value_column: str = "value",
) -> DataFrame:
    """Round ``value_column`` using the display-decimals configuration.

    Three columns are written on the returned DataFrame:

    * ``unrounded_value``: the original value cast to string;
    * ``display_decimals``: the number of decimals selected for the row;
    * ``value_column``: overwritten with the rounded value.

    The number of decimals is selected per row as follows:

    1. the first rule of ``self.domain_code`` whose ``column_1`` /
       ``column_2`` constraints match the row (constraint values are
       comma-separated lists; rules are tried in the order returned by
       ``collect()``);
    2. otherwise the domain default, i.e. the row of the domain whose
       ``column_1_name`` and ``column_2_name`` are both null;
    3. otherwise the row whose domain is ``DEFAULT``.

    A value is rounded only when its string form has at least as many
    decimal digits as the target; otherwise it is kept unchanged.

    Args:
        df: DataFrame containing ``value_column`` and every column
            referenced by the rules of the domain.
        value_column: Name of the column to round.

    Returns:
        ``df`` with the three columns described above.

    Raises:
        IndexError: If the domain has no default row and the table has no
            ``DEFAULT`` row either.
    """

    def _is_set(value) -> bool:
        # A rule field is usable only when it is neither null nor empty.
        return value is not None and value != ""

    def _split_values(raw) -> List[str]:
        # Rule values are stored as a comma-separated list.
        return [v.strip() for v in str(raw).split(",")]

    # Number of decimal digits currently present in the value's string form.
    # Yields 0 when the regex does not match (e.g. integers).
    decimals_in_value = F.length(
        F.regexp_extract(
            F.col(value_column).cast("string"), DECIMAL_PLACES_REGEX, 1
        )
    )

    def _rounded_expr(target_decimals: int):
        # Build the rounding expression for a given number of decimals.
        if target_decimals > 6:
            # NOTE(review): FloatType is single precision; rounding to more
            # than 6 decimals on it may not be meaningful. Left unchanged to
            # keep the resulting column type - confirm whether double was
            # intended.
            rounded = F.round(
                col(value_column).cast(FloatType()), target_decimals
            )
        else:
            # DECIMAL(38, target + 2) keeps two guard digits so the
            # scale -> round -> unscale sequence is done in exact arithmetic.
            decimal_type = DecimalType(38, target_decimals + 2)
            as_decimal = col(value_column).cast(decimal_type)
            scale = F.pow(lit(10), lit(target_decimals)).cast(decimal_type)
            rounded = F.round(as_decimal * scale) / scale

        # Only round if actual decimals >= target decimals, otherwise keep
        # the original value.
        return F.when(
            decimals_in_value >= lit(target_decimals), rounded
        ).otherwise(col(value_column))

    df = df.withColumn("unrounded_value", col(value_column).cast("string"))

    is_current_domain = col("domain") == lit(self.domain_code)

    # Domain default: collected once instead of count() followed by collect().
    domain_default_rows = (
        self.display_decimals_df.filter(
            is_current_domain
            & col("column_1_name").isNull()
            & col("column_2_name").isNull()
        )
        .select("display_decimals")
        .collect()
    )
    if domain_default_rows:
        default_decimals = int(domain_default_rows[0][0])
    else:
        # The general default is only needed (and only required to exist)
        # when the domain does not define its own default.
        general_default_rows = (
            self.display_decimals_df.filter(col("domain") == lit("DEFAULT"))
            .select("display_decimals")
            .collect()
        )
        if not general_default_rows:
            # IndexError is kept as the exception type for backward
            # compatibility with the previous `.collect()[0][0]` lookup.
            raise IndexError(
                "No display-decimals default found: the table has neither a "
                f'default row for domain "{self.domain_code}" nor a '
                '"DEFAULT" row.'
            )
        default_decimals = int(general_default_rows[0][0])

    # The OR of the two column constraints must be parenthesised: `&` binds
    # tighter than `|`, so without the parentheses the domain filter would
    # not apply to rules that only set column_2.
    domain_specific_rules = self.display_decimals_df.filter(
        is_current_domain
        & (
            (col("column_1_name").isNotNull() & col("column_1_value").isNotNull())
            | (col("column_2_name").isNotNull() & col("column_2_value").isNotNull())
        )
    )

    when_decimals = None
    when_rounded = None

    for rule in domain_specific_rules.collect():
        condition = lit(True)
        for name_key, value_key in (
            ("column_1_name", "column_1_value"),
            ("column_2_name", "column_2_value"),
        ):
            # A rule may constrain only one of the two columns; skip the
            # pair that is null/empty instead of calling col(None).
            if _is_set(rule[name_key]) and _is_set(rule[value_key]):
                condition &= col(rule[name_key]).isin(
                    _split_values(rule[value_key])
                )

        display_decimals = int(rule["display_decimals"])
        rounded_value = _rounded_expr(display_decimals)

        # Chain the rule onto the CASE WHEN expressions; the first matching
        # rule wins.
        when_decimals = (
            F.when(condition, lit(display_decimals))
            if when_decimals is None
            else when_decimals.when(condition, lit(display_decimals))
        )
        when_rounded = (
            F.when(condition, rounded_value)
            if when_rounded is None
            else when_rounded.when(condition, rounded_value)
        )

    # Rows matched by no rule fall back to the default number of decimals.
    default_rounded = _rounded_expr(default_decimals)
    when_decimals = (
        lit(default_decimals)
        if when_decimals is None
        else when_decimals.otherwise(lit(default_decimals))
    )
    when_rounded = (
        default_rounded
        if when_rounded is None
        else when_rounded.otherwise(default_rounded)
    )

    df = df.withColumn("display_decimals", when_decimals)
    df = df.withColumn(value_column, when_rounded)

    return df
|
|
151
256
|
|
|
@@ -735,3 +840,29 @@ class SWSGoldIcebergSparkHelper:
|
|
|
735
840
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
736
841
|
|
|
737
842
|
return df
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
1
|
|
846
|
+
frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
|
|
847
|
+
1
|
|
848
|
+
1
|
|
849
|
+
2
|
|
850
|
+
frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
|
|
851
|
+
2
|
|
852
|
+
1
|
|
853
|
+
1
|
|
854
|
+
frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
|
|
855
|
+
1
|
|
856
|
+
1
|
|
857
|
+
2
|
|
858
|
+
frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
|
|
859
|
+
2
|
|
860
|
+
1
|
|
861
|
+
1
|
|
862
|
+
frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
|
|
863
|
+
1
|
|
864
|
+
1
|
|
865
|
+
1
|
|
866
|
+
frozenset({"1", "2", "6", "7", "5", "8", "0", "4", "3", "9"})
|
|
867
|
+
1
|
|
868
|
+
1
|
|
@@ -468,7 +468,7 @@ class SWSPostgresSparkReader:
|
|
|
468
468
|
correct_domain_filter, domain=domain_code, unique_columns=["code"]
|
|
469
469
|
)
|
|
470
470
|
for col_type in mapping_dim_col_name_type.values()
|
|
471
|
-
if col_type
|
|
471
|
+
if col_type not in ("year", "other")
|
|
472
472
|
}
|
|
473
473
|
|
|
474
474
|
def import_diss_exceptions_datatable(
|
|
@@ -497,45 +497,3 @@ class SWSPostgresSparkReader:
|
|
|
497
497
|
"aggregation",
|
|
498
498
|
],
|
|
499
499
|
)
|
|
500
|
-
|
|
501
|
-
def get_display_decimals_datatable(
|
|
502
|
-
self,
|
|
503
|
-
domain_code: str,
|
|
504
|
-
) -> DataFrame:
|
|
505
|
-
df = self.read_pg_table(
|
|
506
|
-
pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
|
|
507
|
-
custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
|
|
508
|
-
).filter(col("domain") == lit(domain_code))
|
|
509
|
-
|
|
510
|
-
pairs = df.select("column_1_name", "column_2_name").distinct().collect()
|
|
511
|
-
|
|
512
|
-
# If no config exists for this domain, fail early
|
|
513
|
-
if not pairs:
|
|
514
|
-
msg = (
|
|
515
|
-
f'No display-decimals configuration found for domain "{domain_code}". '
|
|
516
|
-
f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
|
|
517
|
-
)
|
|
518
|
-
logging.error(msg)
|
|
519
|
-
# raise ValueError(msg)
|
|
520
|
-
|
|
521
|
-
# If more than one mapping exists, it's invalid
|
|
522
|
-
if len(pairs) > 1:
|
|
523
|
-
formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
|
|
524
|
-
|
|
525
|
-
msg = (
|
|
526
|
-
f'Invalid configuration for domain "{domain_code}". '
|
|
527
|
-
f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
|
|
528
|
-
f"{formatted_pairs}. "
|
|
529
|
-
f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
|
|
530
|
-
)
|
|
531
|
-
|
|
532
|
-
logging.error(
|
|
533
|
-
"Multiple display-decimals column pairs detected",
|
|
534
|
-
extra={
|
|
535
|
-
"domain": domain_code,
|
|
536
|
-
"pairs_found": formatted_pairs,
|
|
537
|
-
},
|
|
538
|
-
)
|
|
539
|
-
raise ValueError(msg)
|
|
540
|
-
|
|
541
|
-
return df
|
|
@@ -444,7 +444,7 @@ class SWSSilverIcebergSparkHelper:
|
|
|
444
444
|
logging.info("Checking the dissemination flag for each dimension (except year)")
|
|
445
445
|
|
|
446
446
|
for col_name, col_type in self.mapping_dim_col_name_type.items():
|
|
447
|
-
if col_type
|
|
447
|
+
if col_type not in ("other", "year"):
|
|
448
448
|
df = self._check_diss_dim_list(
|
|
449
449
|
df,
|
|
450
450
|
self.dfs_diss_flags[col_type],
|
|
@@ -168,7 +168,7 @@ class DatasetTables:
|
|
|
168
168
|
self.OBSERVATION = self.__SWSTable(
|
|
169
169
|
postgres_id=f"{self.__dataset_id}.observation",
|
|
170
170
|
iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_observation",
|
|
171
|
-
schema="id BIGINT, observation_coordinates BIGINT, version INT, value
|
|
171
|
+
schema="id BIGINT, observation_coordinates BIGINT, version INT, value STRING, flag_obs_status STRING, flag_method STRING, created_on TIMESTAMP, created_by INT, replaced_on TIMESTAMP",
|
|
172
172
|
)
|
|
173
173
|
self.OBSERVATION_COORDINATE = self.__SWSTable(
|
|
174
174
|
postgres_id=f"{self.__dataset_id}.observation_coordinate",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sws-spark-dissemination-helper
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.191
|
|
4
4
|
Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
|
|
5
5
|
Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
|
|
6
6
|
Author-email: Daniele Mansillo <danielemansillo@gmail.com>
|
|
@@ -49,7 +49,7 @@ Requires-Dist: pytz==2025.2
|
|
|
49
49
|
Requires-Dist: requests==2.32.3
|
|
50
50
|
Requires-Dist: s3transfer>=0.11.2
|
|
51
51
|
Requires-Dist: six==1.17.0
|
|
52
|
-
Requires-Dist: sws-api-client==2.3
|
|
52
|
+
Requires-Dist: sws-api-client==2.7.3
|
|
53
53
|
Requires-Dist: typing-extensions>=4.12.2
|
|
54
54
|
Requires-Dist: tzdata==2025.2
|
|
55
55
|
Requires-Dist: urllib3==1.26.20
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
|
|
2
2
|
sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
|
|
4
|
-
sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=
|
|
5
|
-
sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=
|
|
6
|
-
sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=
|
|
4
|
+
sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=3JkM3u7fLG-ZTtuReR4y3q5JVbhv_XG7-faRYQaktc0,32378
|
|
5
|
+
sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=5do-Cz2_GAEwNxPWRnnITjADMX8Wgi3aj_ynpQCUNmI,18467
|
|
6
|
+
sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=PGZPq_oNKRGOseYGuNujbcS8y-WuLmoDMN95faq0Css,26359
|
|
7
7
|
sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
|
|
8
|
-
sws_spark_dissemination_helper/constants.py,sha256=
|
|
8
|
+
sws_spark_dissemination_helper/constants.py,sha256=MzuC7pqsXF89r-FK7hhmWaZSk5x3GB_YPVSfuK3NYVY,14056
|
|
9
9
|
sws_spark_dissemination_helper/utils.py,sha256=Ge8zXsUIcvFihALDNLF5kCu_tAdRQUE04xE6Yn9xQF4,22008
|
|
10
|
-
sws_spark_dissemination_helper-0.0.
|
|
11
|
-
sws_spark_dissemination_helper-0.0.
|
|
12
|
-
sws_spark_dissemination_helper-0.0.
|
|
13
|
-
sws_spark_dissemination_helper-0.0.
|
|
10
|
+
sws_spark_dissemination_helper-0.0.191.dist-info/METADATA,sha256=GCVYkvlKzxgFc22jEYBEc2_Hj7PN1RJAoamlnXdM4nA,2822
|
|
11
|
+
sws_spark_dissemination_helper-0.0.191.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
12
|
+
sws_spark_dissemination_helper-0.0.191.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
|
|
13
|
+
sws_spark_dissemination_helper-0.0.191.dist-info/RECORD,,
|
|
File without changes
|