sws-spark-dissemination-helper 0.0.60__py3-none-any.whl → 0.0.171__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,6 @@ from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
 from .constants import IcebergDatabases, IcebergTables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .utils import (
-    col_is_null_or_empty,
     get_or_create_tag,
     save_cache_csv,
     upsert_disseminated_table,
@@ -47,11 +46,6 @@ class SWSGoldIcebergSparkHelper:
             self.flag_columns,
         ) = self._get_dim_time_flag_columns()

-        self.cols_to_keep_sdmx = (
-            self.dim_columns_w_time
-            + ["unit_of_measure", "unit_of_measure_multiplier", "value"]
-            + self.flag_columns
-        )
         self.cols_to_keep_sws = (
             self.dim_columns_w_time + ["value"] + self.flag_columns
         )
@@ -72,40 +66,6 @@ class SWSGoldIcebergSparkHelper:
             if col_name in self.dim_columns
         }

-        (
-            self.df_mapping_sdmx_codes,
-            self.df_mapping_sdmx_uom,
-            self.df_mapping_sdmx_col_names,
-        ) = sws_postgres_spark_reader.import_sdmx_mapping_datatables(self.domain_code)
-
-        self._check_column_mappings(self.df_mapping_sdmx_col_names)
-
-    def _check_column_mappings(
-        self,
-        df_mapping_sdmx_col_names: DataFrame,
-    ) -> DataFrame:
-        cols_to_keep_set = set(self.cols_to_keep_sdmx)
-        mapping_sdmx_col_names_internal_set = {
-            row[0]
-            for row in df_mapping_sdmx_col_names.filter(
-                col("internal_name").isNotNull() & (col("internal_name") != lit(""))
-            )
-            .select("internal_name")
-            .collect()
-        }
-
-        if not (cols_to_keep_set <= mapping_sdmx_col_names_internal_set):
-            missing_mappings = cols_to_keep_set - mapping_sdmx_col_names_internal_set
-
-            message = 'The mappings in the table "Mapping - SDMX columns names" are not correct'
-
-            if len(missing_mappings) > 0:
-                message += (
-                    f"\nThe following column mappings are missing: {missing_mappings}"
-                )
-
-            raise ValueError(message)
-
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
         dim_columns_w_time = [
@@ -126,319 +86,31 @@ class SWSGoldIcebergSparkHelper:
     def apply_diss_flag_filter(self, df: DataFrame) -> DataFrame:
         return df.filter(col("diss_flag"))

-    # TODO implement the delete flag
-    def apply_uom_mapping(
-        self,
-        df: DataFrame,
-    ) -> DataFrame:
-        logging.info("mapping unit of measure for dissemination")
-
-        df = df.withColumn(
-            "official_sws_uom",
-            F.when(
-                col_is_null_or_empty("unit_of_measure_base_unit"),
-                col("unit_of_measure"),
-            ).otherwise(col("unit_of_measure_base_unit")),
-        ).withColumn(
-            "official_sws_multiplier",
-            F.coalesce(F.log10(col("unit_of_measure_multiplier")), lit(0)).cast("int"),
-        )
-
-        delete_df_uom_mapping = self.df_mapping_sdmx_uom.filter(
-            col("delete")
-            & col_is_null_or_empty("sdmx_code")
-            & col("sdmx_multiplier").isNull()
-            & col("value_multiplier").isNull()
-        )
-
-        generic_df_uom_mapping = self.df_mapping_sdmx_uom.filter(
-            ~col("delete")
-            & col("sws_multiplier").isNull()
-            & col("sdmx_multiplier").isNull()
-            & (col("value_multiplier") == lit(0))
-        )
-
-        specific_df_uom_mapping = self.df_mapping_sdmx_uom.filter(
-            ~col("delete")
-            & col("sws_multiplier").isNotNull()
-            & col("sdmx_multiplier").isNotNull()
-        )
-
-        # Apply generic uom mapping
-        df = (
-            df.alias("d")
-            .join(
-                generic_df_uom_mapping.alias("m"),
-                col("d.official_sws_uom") == col("m.sws_code"),
-                "left",
-            )
-            .select("d.*", col("sdmx_code").alias("generic_sdmx_uom"))
-        )
-
-        # Apply specific uom mapping
-        df = (
-            df.alias("d")
-            .join(
-                specific_df_uom_mapping.alias("m"),
-                (col("d.official_sws_uom") == col("m.sws_code"))
-                & (col("d.official_sws_multiplier") == col("m.sws_multiplier")),
-                "left",
-            )
-            .select(
-                "d.*",
-                col("sdmx_code").alias("specific_sdmx_uom"),
-                col("sdmx_multiplier").alias("specific_sdmx_multiplier"),
-                (col("value") * F.pow(lit(10), col("value_multiplier"))).alias(
-                    "specific_sdmx_value"
-                ),
-            )
-        )
-
-        # Select the official values according to descending specificity
-        df = (
-            df.withColumn(
-                "unit_of_measure",
-                F.coalesce(
-                    col("specific_sdmx_uom"),
-                    col("generic_sdmx_uom"),
-                    col("official_sws_uom"),
-                ),
-            )
-            .withColumn(
-                "unit_of_measure_multiplier",
-                F.coalesce(
-                    col("specific_sdmx_multiplier"), col("official_sws_multiplier")
-                ),
-            )
-            .withColumn(
-                "value",
-                F.coalesce(col("specific_sdmx_value"), col("value")),
-            )
-            # Remove the columns that were not in the original dataset
-            .drop(
-                col("specific_sdmx_uom"),
-                col("specific_sdmx_multiplier"),
-                col("specific_sdmx_value"),
-                col("generic_sdmx_uom"),
-                col("official_sws_uom"),
-                col("official_sws_multiplier"),
-            )
-        )
-
-        return df
-
-    def keep_dim_uom_val_attr_columns(self, df: DataFrame):
-        return df.select(*self.cols_to_keep_sdmx)
-
     def keep_dim_val_attr_columns(self, df: DataFrame):
         return df.select(*self.cols_to_keep_sws)

-    def _apply_sdmx_dimension_codes_mapping_single(
-        self,
-        df: DataFrame,
-        dimension_name: str,
-        dimension_type: str,
-    ) -> DataFrame:
-        logging.info(
-            f"mapping column {dimension_name} of type {dimension_type} for dissemination"
-        )
-        return (
-            df.alias("d")
-            # Join the data with the standard mapping for the specific dimension
-            .join(
-                F.broadcast(
-                    self.df_mapping_sdmx_codes.filter(
-                        (col("domain").isNull() | (col("domain") == lit("")))
-                        & (col("var_type") == lit(dimension_type))
-                        & (
-                            col("mapping_type").isNull()
-                            | (col("mapping_type") == lit(""))
-                        )
-                    )
-                ).alias("m_standard"),
-                col(f"d.{dimension_name}") == col("m_standard.internal_code"),
-                "left",
-            )
-            # Join the data with the domain specific mapping for the specific dimension
-            .join(
-                F.broadcast(
-                    self.df_mapping_sdmx_codes.filter(
-                        (col("domain") == lit(self.domain_code))
-                        & (col("var_type") == lit(dimension_type))
-                        & (
-                            col("mapping_type").isNull()
-                            | (col("mapping_type") == lit(""))
-                        )
-                    )
-                ).alias("m_domain"),
-                col(f"d.{dimension_name}") == col("m_domain.internal_code"),
-                "left",
-            )
-            # Select only the columns we are interested in (this step is optional but recommended for debugging)
-            .select(
-                "d.*",
-                col("m_standard.external_code").alias("standard_external_code"),
-                col("m_standard.delete").alias("standard_delete"),
-                col("m_standard.multiplier").alias("standard_multiplier"),
-                col("m_domain.external_code").alias("domain_specific_external_code"),
-                col("m_domain.delete").alias("domain_specific_delete"),
-                col("m_domain.multiplier").alias("domain_specific_multiplier"),
-            )
-            # Filter out records to delete
-            .filter(
-                # Evaluate first the domain specific flag
-                F.when(
-                    col("domain_specific_delete").isNotNull(),
-                    ~col("domain_specific_delete"),
-                )
-                # Then evaluate the general flag
-                .when(
-                    col("standard_delete").isNotNull(), ~col("standard_delete")
-                ).otherwise(lit(True))
-            )
-            .withColumn(
-                dimension_name,
-                # Evaluate first the domain specific mapping
-                F.when(
-                    col("domain_specific_external_code").isNotNull(),
-                    col("domain_specific_external_code"),
-                )
-                # Then evaluate the general mapping
-                .when(
-                    col("standard_external_code").isNotNull(),
-                    col("standard_external_code"),
-                ).otherwise(col(dimension_name)),
-            )
-            .withColumn(
-                "value",
-                # Multiply first by the domain specific multiplier
-                F.when(
-                    col("domain_specific_multiplier").isNotNull(),
-                    col("value") * col("domain_specific_multiplier"),
-                )
-                # Then multiply by the general multiplier
-                .when(
-                    col("standard_external_code").isNotNull(),
-                    col("value") * col("standard_multiplier"),
-                ).otherwise(col("value")),
-            )
-            # Remove the columns that were not in the original dataset
-            .drop(
-                "standard_external_code",
-                "standard_delete",
-                "standard_multiplier",
-                "domain_specific_external_code",
-                "domain_specific_delete",
-                "domain_specific_multiplier",
-            )
+    def read_bronze_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
         )

-    def apply_sdmx_dimension_codes_mapping(self, df: DataFrame) -> DataFrame:
-        logging.info("Mapping codes to comply with SDMX standard")
-        for dimension_name, dimension_type in self.codelist_type_mapping.items():
-            df = df.transform(
-                self._apply_sdmx_dimension_codes_mapping_single,
-                dimension_name=dimension_name,
-                dimension_type=dimension_type,
-            )
-
-        return df
-
-    def drop_non_sdmx_columns(self, df: DataFrame) -> DataFrame:
-        cols_to_drop = [
-            row["internal_name"]
-            for row in self.df_mapping_sdmx_col_names.collect()
-            if row["delete"] is True
-        ]
-        logging.info(f"Dropping non-SDMX columns: {cols_to_drop}")
-        return df.drop(*cols_to_drop)
-
-    def apply_sdmx_column_names_mapping(self, df: DataFrame) -> DataFrame:
-        logging.info("Renaming columns to comply with SDMX standard")
-
-        mapping_sws_col_sdmx_col = {
-            row["internal_name"]: row["external_name"]
-            for row in self.df_mapping_sdmx_col_names.filter(
-                col("internal_name").isNotNull()
-                & (col("internal_name") != lit(""))
-                & ~col("delete")
-            ).collect()
-        }
-
-        logging.info(f"Column names mappings: {mapping_sws_col_sdmx_col}")
-
-        return df.withColumnsRenamed(mapping_sws_col_sdmx_col)
-
-    def add_sdmx_default_columns(self, df: DataFrame) -> DataFrame:
-        col_w_default_value = {
-            row["external_name"]: row["default_value"]
-            for row in self.df_mapping_sdmx_col_names.collect()
-            if row["add"] is True
-        }
-
-        logging.info("Adding SDMX columns with default values")
-
-        for name, default_value in col_w_default_value.items():
-            logging.info(
-                f"Adding SDMX column {name} with default value {default_value}"
-            )
-            df = df.withColumn(name, lit(default_value))
-
-        return df
-
-    def rearrange_sdmx_columns(self, df: DataFrame) -> DataFrame:
-        logging.info(
-            "Rearranging the columns to have the following order: Dimensions, TimeDimension, PrimaryMeasure, Attributes"
+    def read_silver_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.SILVER.iceberg_id
         )

-        get_columns_for_type = lambda df, type: [
-            row[0]
-            for row in df.filter(col("type") == lit(type))
-            .select("external_name")
-            .collect()
-        ]
-
-        df_mapping_sdmx_no_del = self.df_mapping_sdmx_col_names.filter(~col("delete"))
-
-        dimensions = get_columns_for_type(df_mapping_sdmx_no_del, "Dimension")
-        time_dimensions = get_columns_for_type(df_mapping_sdmx_no_del, "TimeDimension")
-        primary_measure = get_columns_for_type(df_mapping_sdmx_no_del, "PrimaryMeasure")
-        attributes = get_columns_for_type(df_mapping_sdmx_no_del, "Attribute")
-
-        logging.info(f"Dimensions: {dimensions}")
-        logging.info(f"Time Dimensions: {time_dimensions}")
-        logging.info(f"Primary Measure: {primary_measure}")
-        logging.info(f"Attributes: {attributes}")
-
-        return df.select(*dimensions, *time_dimensions, *primary_measure, *attributes)
-
     def gen_gold_sws_disseminated_data(self) -> DataFrame:
         return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.SILVER.iceberg_id)
+            self.read_silver_data()
            .transform(self.apply_diss_flag_filter)
            .transform(self.keep_dim_val_attr_columns)
         )

-    def gen_gold_sws_validated_data(self) -> DataFrame:
-        return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.BRONZE.iceberg_id)
-            .transform(self.keep_dim_val_attr_columns)
-        )
+    def gen_gold_sws_data(self) -> DataFrame:
+        return self.read_bronze_data().transform(self.keep_dim_val_attr_columns)

-    def gen_gold_sdmx_data(self) -> DataFrame:
-        return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.SILVER.iceberg_id)
-            .transform(self.apply_diss_flag_filter)
-            .transform(self.apply_uom_mapping)
-            .transform(self.keep_dim_uom_val_attr_columns)
-            .transform(self.apply_sdmx_dimension_codes_mapping)
-            .transform(self.apply_sdmx_column_names_mapping)
-            .transform(self.add_sdmx_default_columns)
-            .transform(self.rearrange_sdmx_columns)
-        )
+    def gen_gold_sws_validated_data(self) -> DataFrame:
+        return self.read_silver_data().transform(self.keep_dim_val_attr_columns)

     def write_gold_sws_validated_data_to_iceberg_and_csv(
         self, df: DataFrame
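
The new read_bronze_data/read_silver_data helpers pin every read to the Iceberg tag named by self.tag_name, so all gold outputs derived from them come from the same table snapshot. A minimal standalone sketch of that read pattern, assuming an Iceberg-enabled SparkSession; the catalog and table names below are illustrative, not from the package:

    # Sketch only: assumes a SparkSession already configured with an Iceberg catalog.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # .option("tag", ...) resolves the table as of the named Iceberg tag, so
    # repeated runs read the same snapshot even while the table keeps evolving.
    df = (
        spark.read.option("tag", "2024_release")
        .table("my_catalog.bronze.my_domain_diss_tag")
    )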
@@ -466,6 +138,37 @@ class SWSGoldIcebergSparkHelper:

         return df

+    def write_gold_sws_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        df.writeTo(self.iceberg_tables.GOLD_SWS.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold SWS table written to {self.iceberg_tables.GOLD_SWS.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_SWS.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold SWS tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_SWS.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
+        self.df_gold_sws = self.gen_gold_sws_data()
+
+        self.write_gold_sws_data_to_iceberg_and_csv(self.df_gold_sws)
+
+        return self.df_gold_sws
+
     def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
         self.df_gold_sws_validated = self.gen_gold_sws_validated_data()

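
Each write_gold_*_data_to_iceberg_and_csv method added in this version follows the same three-step pattern: replace the gold Iceberg table, re-point an Iceberg tag at the fresh snapshot, and cache a single-file CSV copy. A condensed sketch of the pattern, with a plain CSV write standing in for the package's save_cache_csv helper and placeholder table/bucket names:

    # Sketch only: "gold.my_table" and the S3 path are placeholders.
    def write_gold(spark, df, tag_name):
        # Replace the gold table contents with the new dataframe
        df.writeTo("gold.my_table").createOrReplace()

        # Create the Iceberg tag, or move it to the snapshot just written
        spark.sql(f"ALTER TABLE gold.my_table CREATE OR REPLACE TAG `{tag_name}`")

        # coalesce(1) yields a single CSV part file for the cached copy;
        # a plain df.write stands in for save_cache_csv here.
        df.coalesce(1).write.mode("overwrite").csv(f"s3://my-bucket/cache/{tag_name}")

        return df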
@@ -536,12 +239,91 @@ class SWSGoldIcebergSparkHelper:

         return df

-    def gen_and_write_gold_sdmx_data_to_iceberg_and_csv(self) -> DataFrame:
-        self.df_gold_sdmx = self.gen_gold_sdmx_data()
+    def write_gold_pre_sdmx_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        """The expected input to this function is the output of the SWS disseminated function"""
+        for column in self.dim_columns:
+            df = df.withColumn(
+                column, F.regexp_replace(col(column), lit(r"\."), lit("_"))
+            )
+        df = df.withColumnRenamed("value", "OBS_VALUE").withColumnsRenamed(
+            {column: column.upper() for column in df.columns}
+        )
+        df.writeTo(self.iceberg_tables.GOLD_PRE_SDMX.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold pre-SDMX table written to {self.iceberg_tables.GOLD_PRE_SDMX.iceberg_id}"
+        )

-        self.write_gold_sdmx_data_to_iceberg_and_csv(self.df_gold_sdmx)
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_PRE_SDMX.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )

-        return self.df_gold_sdmx
+        logging.info(f"gold pre-SDMX tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_PRE_SDMX.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        """The expected input to this function is the output of the SWS disseminated function"""
+        df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """The expected input to this function is the output of the SWS disseminated function"""
+        df.writeTo(
+            self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df

     def write_gold_sws_validated_sws_dissemination_tag(
         self, df: DataFrame, tags: Tags
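
The pre-SDMX writer normalises column contents and names before writing: dots in dimension codes become underscores, "value" is renamed to the SDMX primary-measure column OBS_VALUE, and every column name is uppercased. A toy, self-contained illustration of those three steps (the input columns are invented; withColumnsRenamed requires PySpark 3.4+):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("01.1", 2020, 42.0)], ["item_code", "year", "value"])

    # Dots in codes become underscores: "01.1" -> "01_1"
    df = df.withColumn("item_code", F.regexp_replace(F.col("item_code"), r"\.", "_"))

    # "value" -> OBS_VALUE, then uppercase every column name
    df = df.withColumnRenamed("value", "OBS_VALUE")
    df = df.withColumnsRenamed({c: c.upper() for c in df.columns})

    df.show()  # columns: ITEM_CODE, YEAR, OBS_VALUE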
@@ -553,7 +335,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
             name=f"{self.domain_code} gold SWS validated Iceberg",
-            description="Gold table containing all the data unmapped and unfiltered in SWS compatible format",
+            description="Gold table containing all the unfiltered tag data, with code correction applied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -571,10 +353,10 @@ class SWSGoldIcebergSparkHelper:
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")

-        new_sdmx_table = BaseDisseminatedTagTable(
+        new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
             name=f"{self.domain_code} gold SWS validated csv",
-            description="Gold table containing all the data unmapped and unfiltered in SWS compatible format cached in csv",
+            description="Gold table containing all the unfiltered tag data, with code correction applied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -586,7 +368,7 @@ class SWSGoldIcebergSparkHelper:
             tag=tag,
             dataset_id=self.dataset_id,
             tag_name=self.tag_name,
-            table=new_sdmx_table,
+            table=new_diss_table,
         )
         logging.debug(f"Tag with Added csv Table: {tag}")

@@ -602,7 +384,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
             name=f"{self.domain_code} gold SWS disseminated Iceberg",
-            description="Gold table containing all the data mapped and filtered in SWS compatible format",
+            description="Gold table containing only the filtered tag data, with code correction applied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -620,10 +402,10 @@ class SWSGoldIcebergSparkHelper:
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")

-        new_sdmx_table = BaseDisseminatedTagTable(
+        new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
             name=f"{self.domain_code} gold SWS disseminated csv",
-            description="Gold table containing all the data mapped and filtered in SWS compatible format format cached in csv",
+            description="Gold table containing only the filtered tag data, with code correction applied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -635,7 +417,7 @@ class SWSGoldIcebergSparkHelper:
             tag=tag,
             dataset_id=self.dataset_id,
             tag_name=self.tag_name,
-            table=new_sdmx_table,
+            table=new_diss_table,
         )
         logging.debug(f"Tag with Added csv Table: {tag}")

@@ -669,7 +451,7 @@ class SWSGoldIcebergSparkHelper:
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")

-        new_sdmx_table = BaseDisseminatedTagTable(
+        new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sdmx_csv",
             name=f"{self.domain_code} gold SDMX csv",
             description="Gold table containing all the cleaned data in SDMX compatible format cached in csv",
@@ -684,7 +466,203 @@ class SWSGoldIcebergSparkHelper:
             tag=tag,
             dataset_id=self.dataset_id,
             tag_name=self.tag_name,
-            table=new_sdmx_table,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_pre_sdmx_sws_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_pre_sdmx_iceberg",
+            name=f"{self.domain_code} gold pre-SDMX Iceberg",
+            description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR",
+            layer=TableLayer.GOLD,
+            private=True,
+            debug=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_PRE_SDMX.table,
+            path=self.iceberg_tables.GOLD_PRE_SDMX.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_pre_sdmx_csv",
+            name=f"{self.domain_code} gold pre-SDMX csv",
+            description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR and cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            debug=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_PRE_SDMX.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_sws_dissemination_tag(self, df: DataFrame, tags: Tags) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_iceberg",
+            name=f"{self.domain_code} gold SWS Iceberg",
+            description="Gold table containing the tag data without any processing",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_SWS.table,
+            path=self.iceberg_tables.GOLD_SWS.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_csv",
+            name=f"{self.domain_code} gold SWS csv",
+            description="Gold table containing the tag data without any processing, cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_SWS.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT Iceberg",
+            description="Gold table containing the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_csv",
+            name=f"{self.domain_code} gold FAOSTAT csv",
+            description="Gold table containing the tag data in FAOSTAT format, cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_unfiltered_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+            description="Gold table containing all the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+            description="Gold table containing all the tag data in FAOSTAT format, cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
         )
         logging.debug(f"Tag with Added csv Table: {tag}")
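
All four new *_dissemination_tag methods register a pair of tables (Iceberg plus cached CSV) against the tag, passing the Spark schema along as the table structure. A small runnable illustration of the structure payload they build from the dataframe schema:

    # Sketch only: shows what {"columns": df.schema.jsonValue()["fields"]} contains.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a")], ["id", "code"])

    structure = {"columns": df.schema.jsonValue()["fields"]}
    print(structure)
    # Roughly: {'columns': [{'name': 'id', 'type': 'long', 'nullable': True,
    #           'metadata': {}}, {'name': 'code', 'type': 'string', ...}]}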