sws-spark-dissemination-helper 0.0.93 (py3-none-any.whl) → 0.0.183 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,13 +8,9 @@ from pyspark.sql.functions import col, lit
 from sws_api_client import Tags
 from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
 
-from .constants import IcebergDatabases, IcebergTables
+from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
-from .utils import (
-    get_or_create_tag,
-    save_cache_csv,
-    upsert_disseminated_table,
-)
+from .utils import get_or_create_tag, save_cache_csv, upsert_disseminated_table
 
 
 class SWSGoldIcebergSparkHelper:
@@ -66,6 +62,12 @@ class SWSGoldIcebergSparkHelper:
             if col_name in self.dim_columns
         }
 
+        self.display_decimals = (
+            self.sws_postgres_spark_reader.get_display_decimals_datatable(
+                domain_code=domain_code
+            )
+        )
+
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
         dim_columns_w_time = [
@@ -86,23 +88,97 @@ class SWSGoldIcebergSparkHelper:
     def apply_diss_flag_filter(self, df: DataFrame) -> DataFrame:
         return df.filter(col("diss_flag"))
 
-    def keep_dim_val_attr_columns(self, df: DataFrame):
-        return df.select(*self.cols_to_keep_sws)
+    def keep_dim_val_attr_columns(
+        self, df: DataFrame, additional_columns: List[str] = []
+    ):
+        cols_to_keep_sws = self.cols_to_keep_sws
+        for additional_column in additional_columns:
+            if additional_column in df.columns:
+                cols_to_keep_sws = cols_to_keep_sws + [additional_column]
+        if "unit_of_measure_symbol" in df.columns:
+            cols_to_keep_sws = cols_to_keep_sws + ["unit_of_measure_symbol"]
+        return df.select(*cols_to_keep_sws)
+
+    def round_to_display_decimals(self, df: DataFrame):
+        col1_name, col2_name = (
+            self.display_decimals.select("column_1_name", "column_2_name")
+            .distinct()
+            .collect()[0]
+        )
+        if col1_name.lower() not in [column.lower() for column in df.columns]:
+            raise ValueError(
+                f"{col1_name} is not part of the columns available for this dataset ({df.columns})"
+            )
+        if col2_name.lower() not in [column.lower() for column in df.columns]:
+            raise ValueError(
+                f"{col2_name} is not part of the columns available for this dataset ({df.columns})"
+            )
+
+        df = (
+            df.alias("d")
+            .join(
+                self.display_decimals.alias("dd"),
+                on=(col(f"d.{col1_name}") == col("dd.column_1_value"))
+                & (col(f"d.{col2_name}") == col("dd.column_2_value")),
+                how="left",
+            )
+            .select("d.*", "dd.display_decimals")
+        )
+
+        df.filter(col("display_decimals").isNull()).select(
+            col1_name, col2_name
+        ).distinct()
+        logging.warning(
+            f"The following combinations of {col1_name} and {col2_name} are not available in the table {DatasetDatatables.DISPLAY_DECIMALS.name} and will be assigned to 0"
+        )
+
+        df = df.withColumn(
+            "display_decimals",
+            F.coalesce(col("display_decimals"), lit("0")).cast("INT"),
+        ).withColumn(
+            "value",
+            F.round(
+                F.col("value").cast("FLOAT") * F.pow(10, F.col("display_decimals")), 0
+            )
+            / F.pow(10, F.col("display_decimals")).cast("STRING"),
+        )
+
+        # F.round(
+        #     col("value").cast("FLOAT"), col("display_decimals").cast("INT")
+        # ).cast("STRING"),
+
+        return df
+
+    def read_bronze_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+        )
 
     def read_silver_data(self) -> DataFrame:
         return self.spark.read.option("tag", self.tag_name).table(
             self.iceberg_tables.SILVER.iceberg_id
         )
 
-    def gen_gold_sws_disseminated_data(self) -> DataFrame:
+    def gen_gold_sws_disseminated_data(
+        self, additional_columns: List[str] = []
+    ) -> DataFrame:
         return (
             self.read_silver_data()
             .transform(self.apply_diss_flag_filter)
-            .transform(self.keep_dim_val_attr_columns)
+            .transform(self.keep_dim_val_attr_columns, additional_columns)
         )
 
-    def gen_gold_sws_validated_data(self) -> DataFrame:
-        return self.read_silver_data().transform(self.keep_dim_val_attr_columns)
+    def gen_gold_sws_data(self, additional_columns: List[str] = []) -> DataFrame:
+        return self.read_bronze_data().transform(
+            self.keep_dim_val_attr_columns, additional_columns
+        )
+
+    def gen_gold_sws_validated_data(
+        self, additional_columns: List[str] = []
+    ) -> DataFrame:
+        return self.read_silver_data().transform(
+            self.keep_dim_val_attr_columns, additional_columns
+        )
 
     def write_gold_sws_validated_data_to_iceberg_and_csv(
         self, df: DataFrame
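The new round_to_display_decimals step rounds each value to a per-row number of decimals by scaling it up by 10^display_decimals, rounding to the nearest integer, and scaling back down. A minimal standalone sketch of that pattern, using a local SparkSession and made-up values rather than the package's data:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # Made-up (value, display_decimals) rows; the column names mirror the helper's.
    df = spark.createDataFrame(
        [("3.14159", 2), ("2.71828", 3), ("123.456", 0)],
        ["value", "display_decimals"],
    )

    # round(v * 10^d) / 10^d: 3.14159 -> 3.14, 2.71828 -> 2.718, 123.456 -> 123.0
    df.withColumn(
        "value",
        F.round(F.col("value").cast("float") * F.pow(10, F.col("display_decimals")), 0)
        / F.pow(10, F.col("display_decimals")),
    ).show()

Scaling before a fixed-scale round is a common way to apply a rounding precision that varies per row; the helper's own version additionally joins the precision in from the display-decimals datatable and defaults missing combinations to 0.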
@@ -130,6 +206,37 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_sws_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        df.writeTo(self.iceberg_tables.GOLD_SWS.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold SWS table written to {self.iceberg_tables.GOLD_SWS.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_SWS.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold SWS tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_SWS.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
+        self.df_gold_sws = self.gen_gold_sws_data()
+
+        self.write_gold_sws_data_to_iceberg_and_csv(self.df_gold_sws)
+
+        return self.df_gold_sws
+
     def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
         self.df_gold_sws_validated = self.gen_gold_sws_validated_data()
 
@@ -232,6 +339,60 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(
+            self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
     def write_gold_sws_validated_sws_dissemination_tag(
         self, df: DataFrame, tags: Tags
     ) -> DataFrame:
@@ -242,7 +403,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
             name=f"{self.domain_code} gold SWS validated Iceberg",
-            description="Gold table containing all the data unmapped and unfiltered in SWS compatible format",
+            description="Gold table containing all the unfiltered tag data, with code correction applied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -263,7 +424,7 @@ class SWSGoldIcebergSparkHelper:
         new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
             name=f"{self.domain_code} gold SWS validated csv",
-            description="Gold table containing all the data unmapped and unfiltered in SWS compatible format cached in csv",
+            description="Gold table containing all the unfiltered tag data, with code correction applied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -291,7 +452,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
             name=f"{self.domain_code} gold SWS disseminated Iceberg",
-            description="Gold table containing all the data mapped and filtered in SWS compatible format",
+            description="Gold table containing only the filtered tag data, with code correction applied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -312,7 +473,7 @@ class SWSGoldIcebergSparkHelper:
         new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
             name=f"{self.domain_code} gold SWS disseminated csv",
-            description="Gold table containing all the data mapped and filtered in SWS compatible format format cached in csv",
+            description="Gold table containing only the filtered tag data, with code correction applied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -392,6 +553,7 @@ class SWSGoldIcebergSparkHelper:
             description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR",
             layer=TableLayer.GOLD,
             private=True,
+            debug=True,
             type=TableType.ICEBERG,
             database=IcebergDatabases.GOLD_DATABASE,
             table=self.iceberg_tables.GOLD_PRE_SDMX.table,
@@ -413,6 +575,7 @@ class SWSGoldIcebergSparkHelper:
             description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR and cached in csv",
             layer=TableLayer.GOLD,
             private=True,
+            debug=True,
             type=TableType.CSV,
             path=self.iceberg_tables.GOLD_PRE_SDMX.csv_path,
             structure={"columns": df.schema.jsonValue()["fields"]},
@@ -427,3 +590,148 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         return df
+
+    def write_gold_sws_dissemination_tag(self, df: DataFrame, tags: Tags) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_iceberg",
+            name=f"{self.domain_code} gold SWS Iceberg",
+            description="Gold table containing the tag data without any processing",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_SWS.table,
+            path=self.iceberg_tables.GOLD_SWS.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_csv",
+            name=f"{self.domain_code} gold SWS csv",
+            description="Gold table containing the tag data without any processing cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_SWS.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT Iceberg",
+            description="Gold table containing the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_csv",
+            name=f"{self.domain_code} gold FAOSTAT csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_unfiltered_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+            description="Gold table containing all the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
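Taken together, the additions above introduce a third gold flow alongside the validated and disseminated ones: the raw bronze tag data is written to the GOLD_SWS Iceberg table and CSV cache, then registered on the dissemination tag. A hypothetical call sequence, assuming helper is an already-configured SWSGoldIcebergSparkHelper and tags an sws_api_client Tags client (both constructed elsewhere; the variable names and the "note" column are illustrative only, not part of the package API):

    # Sketch only: `helper` and `tags` are assumed to exist and be configured.
    df_gold_sws = helper.gen_and_write_gold_sws_data_to_iceberg_and_csv()
    helper.write_gold_sws_dissemination_tag(df_gold_sws, tags)

    # The validated/disseminated generators now accept optional extra columns,
    # which are kept only if they exist in the source data.
    df_validated = helper.gen_gold_sws_validated_data(additional_columns=["note"])
    helper.write_gold_sws_validated_data_to_iceberg_and_csv(df_validated)
    helper.write_gold_sws_validated_sws_dissemination_tag(df_validated, tags)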
@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:
 
         logging.info(f"{pg_table} read start")
 
-            # Read observations from the PostgreSQL table into a DataFrame
-            df = (
-                self.spark.read.format("jdbc")
-                .option("customSchema", custom_schema)
-                .option("dbtable", pg_table)
-                .option("partitionColumn", partition_column)
-                .option("lowerBound", min_id)
-                .option("upperBound", max_id)
-                .option("numPartitions", num_partitions)
-                .option("fetchsize", "1000")
-                .option("url", self.jdbc_url)
-                .option("user", self.jdbc_conn_properties["user"])
-                .option("password", self.jdbc_conn_properties["password"])
-                .option("driver", SPARK_POSTGRES_DRIVER)
-                .load()
-                # .repartition(1024, partition_column)
-                # .sortWithinPartitions(partition_column)
-                # .cache()
-            )
+            if min_id is None or max_id is None:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                )
+            else:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("partitionColumn", partition_column)
+                    .option("lowerBound", min_id)
+                    .option("upperBound", max_id)
+                    .option("numPartitions", num_partitions)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                    # .repartition(1024, partition_column)
+                    # .sortWithinPartitions(partition_column)
+                    # .cache()
+                )
         else:
             df = (
                 self.spark.read.format("jdbc")
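The reader change above adds a fallback: when no lowerBound/upperBound values are available the table is pulled through a single JDBC partition, otherwise Spark splits the read into range-bounded queries over the partition column. A minimal sketch of the two paths with placeholder connection details, table and column names (none of them taken from the package's configuration):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Placeholder connection details, for illustration only.
    reader = (
        spark.read.format("jdbc")
        .option("url", "jdbc:postgresql://localhost:5432/sws")
        .option("dbtable", "observation")
        .option("user", "reader")
        .option("password", "secret")
        .option("driver", "org.postgresql.Driver")
        .option("fetchsize", "1000")
    )

    min_id, max_id = 1, 1_000_000  # normally derived from SELECT min(id), max(id)

    if min_id is None or max_id is None:
        # Single partition: one cursor streams the whole table.
        df = reader.load()
    else:
        # numPartitions parallel, range-bounded queries over the partition column.
        df = (
            reader.option("partitionColumn", "id")
            .option("lowerBound", str(min_id))
            .option("upperBound", str(max_id))
            .option("numPartitions", "10")
            .load()
        )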
@@ -195,6 +207,7 @@ class SWSPostgresSparkReader:
             (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
             (dataset_tables.METADATA, "id", 10),
             (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+            (dataset_tables.TAG_OBSERVATION, "tag", 10),
         ]
         return self._import_tables(data_tables)
 
@@ -209,25 +222,30 @@ class SWSPostgresSparkReader:
             dataset_tables.METADATA_ELEMENT_TYPE,
             dataset_tables.LANGUAGE,
             dataset_tables.UNIT_OF_MEASURE,
+            dataset_tables.DATASET,
             *dataset_tables.CODELISTS,
         ]
+        logging.info(
+            f"Importing reference data tables: {[(table.postgres_id, table.iceberg_id) for table in reference_data_tables]}"
+        )
         return self._import_tables(
             [(table, None, 1) for table in reference_data_tables]
         )
 
     def import_operational_data_tables(
         self, dataset_tables: DatasetTables
-    ) -> DataFrame:
+    ) -> List[DataFrame]:
         # Define and import operational data table without partitioning
         operational_data_tables = [
             (dataset_tables.USER, None, 1),
+            (dataset_tables.TAG, None, 1),
         ]
-        return self._import_tables(operational_data_tables)[0]
+        return self._import_tables(operational_data_tables)
 
     def import_data_reference_data_operational_data(
         self, dataset_tables: DatasetTables
     ) -> Tuple[
-        Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+        Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
         Tuple[
             DataFrame,
             DataFrame,
@@ -235,22 +253,23 @@ class SWSPostgresSparkReader:
             DataFrame,
             DataFrame,
             DataFrame,
+            DataFrame,
             List[DataFrame],
         ],
-        DataFrame,
+        Tuple[DataFrame, DataFrame],
     ]:
         # Import and organize DataFrames into the desired output structure
         data_dfs = self.import_data_tables(dataset_tables)
         reference_data_dfs = self.import_reference_data_tables(dataset_tables)
-        operational_data_df = self.import_operational_data_tables(dataset_tables)
+        operational_data_dfs = self.import_operational_data_tables(dataset_tables)
 
         return (
             tuple(data_dfs),
             (
-                *reference_data_dfs[:6],
-                reference_data_dfs[6:],
+                *reference_data_dfs[:7],
+                reference_data_dfs[7:],
             ),
-            operational_data_df,
+            tuple(operational_data_dfs),
         )
 
     def get_codelist_type_mapping(
@@ -291,13 +310,17 @@ class SWSPostgresSparkReader:
         self,
         domain_code: str,
     ) -> DataFrame:
-        return self.read_pg_table_and_check_duplicates_for_domain(
+        df = self.read_pg_table(
             pg_table=DatasetDatatables.MAPPING_CODE_CORRECTION.id,
-            table_name=DatasetDatatables.MAPPING_CODE_CORRECTION.name,
             custom_schema=DatasetDatatables.MAPPING_CODE_CORRECTION.schema,
-            domain_code=domain_code,
-            unique_columns=["old_code"],
         )
+        df.filter(
+            col("mapping_type").isNull() | (col("mapping_type") == lit(""))
+        ).transform(
+            correct_domain_filter, domain=domain_code, unique_columns=["old_code"]
+        )
+
+        return df
 
     def get_domain_code_source_datasets_ids_dest_dataset_id(
         self, dataset_id: str, domain_code: str = None
@@ -474,3 +497,45 @@ class SWSPostgresSparkReader:
                 "aggregation",
             ],
         )
+
+    def get_display_decimals_datatable(
+        self,
+        domain_code: str,
+    ) -> DataFrame:
+        df = self.read_pg_table(
+            pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
+            custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
+        ).filter(col("domain") == lit(domain_code))
+
+        pairs = df.select("column_1_name", "column_2_name").distinct().collect()
+
+        # If no config exists for this domain, fail early
+        if not pairs:
+            msg = (
+                f'No display-decimals configuration found for domain "{domain_code}". '
+                f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
+            )
+            logging.error(msg)
+            # raise ValueError(msg)
+
+        # If more than one mapping exists, it's invalid
+        if len(pairs) > 1:
+            formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
+
+            msg = (
+                f'Invalid configuration for domain "{domain_code}". '
+                f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
+                f"{formatted_pairs}. "
+                f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
+            )
+
+            logging.error(
+                "Multiple display-decimals column pairs detected",
+                extra={
+                    "domain": domain_code,
+                    "pairs_found": formatted_pairs,
+                },
+            )
+            raise ValueError(msg)
+
+        return df