sws-spark-dissemination-helper 0.0.141-py3-none-any.whl → 0.0.171-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
+++ b/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -1,7 +1,7 @@
  import logging
  import time
  from copy import copy
- from typing import Dict, List, Tuple
+ from typing import Dict, List, Tuple, Union

  import pyspark.sql.functions as F
  from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
  domain_code: str,
  dataset_details: dict = None,
  dataset_tables: DatasetTables = None,
+ keep_history: bool = False,
+ write_csv: bool = True,
+ source_tag: Union[str, None] = None,
  ) -> None:
  self.spark: SparkSession = spark
  self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
  self.dataset_tables: DatasetTables = dataset_tables
  self.iceberg_tables: IcebergTables = iceberg_tables
  self.domain_code = domain_code
+ self.keep_history: bool = keep_history
+ self.write_csv: bool = write_csv
+ self.source_tag: Union[str, None] = source_tag

  if dataset_details is not None:
  (
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
  self.df_obs_coord,
  self.df_metadata,
  self.df_meta_elem,
+ self.df_tag_observation,
  ) = self.raw_data

  (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
  self.df_meta_elem_type,
  self.df_language,
  self.df_unit_of_measure,
+ self.df_dataset,
  self.dfs_dimension,
  ) = self.raw_reference_data

- self.df_user = self.raw_operational_data
+ (self.df_user, self.df_tag) = self.raw_operational_data

  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
  """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -150,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:

  return dfs_dimension

- def _prepare_element_uom(self) -> DataFrame:
+ def _prepare_element_uom(self) -> Union[DataFrame, None]:
  """Prepare the element and unit of measure join."""

  # Get the element DataFrame
@@ -162,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
  if dimension_column == self.element_column
  )

- # Join the element and the unit_of_measure
- df_element_uom = (
- df_element.alias("e")
- .join(
- self.df_unit_of_measure.alias("u"),
- col("e.unit_of_measure") == col("u.id"),
- )
- .select(
- col("e.code").alias("element_code"),
- col("u.code").alias("unit_of_measure"),
- col("u.symbol").alias("unit_of_measure_symbol"),
- col("u.base_unit").alias("unit_of_measure_base_unit"),
- col("u.multiplier").alias("unit_of_measure_multiplier"),
+ if any("unit_of_measure" == column.lower() for column in df_element.columns):
+ # Join the element and the unit_of_measure
+ df_element_uom = (
+ df_element.alias("e")
+ .join(
+ self.df_unit_of_measure.alias("u"),
+ col("e.unit_of_measure") == col("u.id"),
+ )
+ .select(
+ col("e.code").alias("element_code"),
+ col("u.code").alias("unit_of_measure"),
+ col("u.symbol").alias("unit_of_measure_symbol"),
+ col("u.base_unit").alias("unit_of_measure_base_unit"),
+ col("u.multiplier").alias("unit_of_measure_multiplier"),
+ )
  )
- )

- return df_element_uom
+ return df_element_uom

  def _gen_denormalized_observation(self) -> DataFrame:
  """Original query upon which the below computation is based
@@ -270,20 +279,170 @@ class SWSBronzeIcebergSparkHelper:
  .withColumnRenamed("code", dimension_column)
  )

- df_intermediate = (
- df_intermediate.alias("d")
- .join(
- F.broadcast(df_element_uom).alias("e"),
- col(f"d.{self.element_column}") == col("e.element_code"),
- "left",
+ if df_element_uom is not None:
+ df_intermediate = (
+ df_intermediate.alias("d")
+ .join(
+ F.broadcast(df_element_uom).alias("e"),
+ col(f"d.{self.element_column}") == col("e.element_code"),
+ "left",
+ )
+ .drop("element_code")
  )
- .drop("element_code")
- )

  df_obs_denorm = df_intermediate

  return df_obs_denorm

+ def _gen_denormalized_observation_sql(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version"""
+
+ from_statement = f"""
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+ JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+ hint_statement = ""
+
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+ for flag_col in self.flag_columns:
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+ for i, (dim_col, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code AS {dim_col}"
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+ hint_statement = (
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
+ if hint_statement
+ else f"BROADCAST({cl.iceberg_id})"
+ )
+
+ hint_statement = "/*+ " + hint_statement + " */"
+
+ final_query = "SELECT " + hint_statement + select_statement + from_statement
+ if not self.keep_history:
+ final_query += "\nWHERE o.replaced_on IS NULL"
+
+ logging.info("Final query for merging observation and observation_coordinares")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ df_element_uom = self._prepare_element_uom()
+
+ dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+ # Join all the dimension codelists
+ for dimension_column, df_dimension in zip(
+ self.dim_columns_w_time, dfs_dimension_w_validity
+ ):
+ logging.debug(f"Joining dimension column: {dimension_column}")
+ logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
+ logging.debug(
+ f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
+ )
+ df_obs_denorm = (
+ df_obs_denorm.alias("o")
+ .join(
+ F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+ "d"
+ ),
+ col(f"{dimension_column}") == col("d.code"),
+ )
+ .drop("code", "join_id")
+ )
+ logging.debug(f"After join count: {df_obs_denorm.count()}")
+
+ if df_element_uom is not None:
+ df_obs_denorm = (
+ df_obs_denorm.alias("d")
+ .join(
+ F.broadcast(df_element_uom).alias("e"),
+ col(f"d.{self.element_column}") == col("e.element_code"),
+ "left",
+ )
+ .drop("element_code")
+ )
+ logging.debug(f"After uom count: {df_obs_denorm.count()}")
+
+ return df_obs_denorm
+
+ def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version"""
+
+ from_statement = f"""
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+ INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+ INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+ INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+ LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+ hint_statement = ""
+
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+ for flag_col in self.flag_columns:
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+ for i, (dim_col, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code AS {dim_col}"
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+ hint_statement = (
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
+ if hint_statement
+ else f"BROADCAST({cl.iceberg_id})"
+ )
+
+ hint_statement = "/*+ " + hint_statement + " */"
+
+ # TODO Add tag name as a parameter
+ where_statement = (
+ f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+ )
+
+ final_query = (
+ "SELECT "
+ + hint_statement
+ + select_statement
+ + from_statement
+ + where_statement
+ )
+ if not self.keep_history:
+ final_query += "\n AND o.replaced_on IS NULL"
+
+ logging.info("Final query for merging observation and observation_coordinares")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ return df_obs_denorm
+
  def _gen_denormalized_metadata(self) -> DataFrame:
  """Original query upon which the below computation is based

@@ -347,6 +506,32 @@ class SWSBronzeIcebergSparkHelper:

  return df_meta_denorm

+ def _gen_denormalized_metadata_sql(self) -> DataFrame:
+ # ----------------
+ # Generate denormalized observation table
+ # ----------------
+
+ logging.info("meta_denorm start")
+
+ df_meta_denorm = self.spark.sql(
+ f"""
+ select m.observation as observation_id,
+ mt.code as type,
+ met.code as element_type,
+ l.country_code as language,
+ me.value
+ from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+ left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+ left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+ left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+ left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+ """
+ )
+
+ logging.info("meta_denorm write")
+
+ return df_meta_denorm
+
  def _gen_grouped_metadata(self) -> DataFrame:
  return (
  self._gen_denormalized_metadata()
@@ -367,6 +552,26 @@ class SWSBronzeIcebergSparkHelper:
  .agg(F.collect_list("metadata").alias("metadata"))
  )

+ def _gen_grouped_metadata_sql(self) -> DataFrame:
+ return (
+ self._gen_denormalized_metadata_sql()
+ .select(
+ col("observation_id"),
+ F.create_map(
+ lit("type"),
+ col("type"),
+ lit("element_type"),
+ col("element_type"),
+ lit("language"),
+ col("language"),
+ lit("value"),
+ col("value"),
+ ).alias("metadata"),
+ )
+ .groupby("observation_id")
+ .agg(F.collect_list("metadata").alias("metadata"))
+ )
+
  def _gen_bronze_data(self) -> DataFrame:
  return (
  self._gen_denormalized_observation()
@@ -379,9 +584,37 @@ class SWSBronzeIcebergSparkHelper:
  .drop("m.observation_id")
  )

+ def _gen_bronze_data_sql(self) -> DataFrame:
+ return (
+ self._gen_denormalized_observation_sql()
+ .alias("o")
+ .join(
+ self._gen_grouped_metadata_sql().alias("m"),
+ col("o.id") == col("m.observation_id"),
+ "left",
+ )
+ .drop("m.observation_id")
+ )
+
+ def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+ return (
+ self._gen_denormalized_observation_sql_from_tag()
+ .alias("o")
+ .join(
+ self._gen_grouped_metadata_sql().alias("m"),
+ col("o.id") == col("m.observation_id"),
+ "left",
+ )
+ .drop("m.observation_id")
+ )
+
  # TODO decouple data generation and data writing
- def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
- self.df_bronze = self._gen_bronze_data()
+ def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+ if sql:
+ self.df_bronze = self._gen_bronze_data_sql()
+ else:
+ self.df_bronze = self._gen_bronze_data()

  self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()

@@ -449,12 +682,15 @@ class SWSBronzeIcebergSparkHelper:
  logging.info("Bronze Dissemination tags successfully written")

  def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
- self, dimensions: Dict[str, List[str]]
+ self, dimensions: Dict[str, List[str]] = {}, from_tag=False
  ) -> DataFrame:

- self.disseminated_tag_df = self.df_bronze
+ if from_tag:
+ self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+ else:
+ self.disseminated_tag_df = self.df_bronze

- if isinstance(dimensions, dict):
+ if not from_tag and dimensions is not None and len(dimensions) != 0:
  for dimension_name, codes in dimensions.items():
  logging.info(f"dimension_name: {dimension_name}")
  logging.info(f"codes: {codes}")
@@ -533,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
  logging.debug(f"Tag with Added csv Table: {tag}")

  logging.info("Bronze Disseminated tag with selection successfully written")
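
Note: the following is a minimal usage sketch of the bronze-layer additions above (the keep_history, write_csv and source_tag constructor parameters, the sql switch on write_bronze_data_to_iceberg_and_csv, and the from_tag path on write_bronze_disseminated_tag_data_to_iceberg_and_csv). It is not taken from the package documentation: the import path, the domain code, the dimension names and the dataset_details / dataset_tables objects are assumptions, and constructor arguments not visible in this diff (at least iceberg_tables, which appears in the hunks above) are omitted.

    # Sketch only: objects marked "assumed" are built elsewhere, as in earlier releases.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    helper = SWSBronzeIcebergSparkHelper(
        spark=spark,
        domain_code="QCL",                # illustrative domain code
        dataset_details=dataset_details,  # assumed to exist
        dataset_tables=dataset_tables,    # assumed to exist
        keep_history=False,               # new: drop rows whose replaced_on is set
        write_csv=True,                   # new: also cache a csv copy
        source_tag="my_release_tag",      # new: SWS tag used by the *_from_tag path
    )

    # New default: the bronze table is assembled through the Spark SQL path
    # (sql=True) instead of the original DataFrame-API path (sql=False).
    helper.write_bronze_data_to_iceberg_and_csv(sql=True)

    # Either filter the bronze data by dimension codes, or take only the
    # observations attached to the tag named in source_tag (from_tag=True
    # bypasses the dimension filter).
    helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
        dimensions={"geographicAreaM49": ["4", "8"]},  # illustrative dimension/codes
        from_tag=False,
    )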
--- a/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
+++ b/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -1,6 +1,6 @@
  import logging
  from copy import copy
- from typing import Dict, List, Tuple
+ from typing import Dict, List, Tuple, Union

  import pyspark.sql.functions as F
  from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,7 @@ class SWSEasyIcebergSparkHelper:
  dataset_tables: DatasetTables = None,
  keep_history: bool = False,
  write_csv: bool = True,
+ source_tag: Union[str, None] = None,
  ) -> None:
  self.spark: SparkSession = spark
  self.dataset_details: dict = dataset_details
@@ -37,6 +38,7 @@ class SWSEasyIcebergSparkHelper:
  self.iceberg_tables: IcebergTables = iceberg_tables
  self.keep_history: bool = keep_history
  self.write_csv: bool = write_csv
+ self.source_tag: Union[str, None] = source_tag

  if dataset_details is not None:
  (
@@ -69,8 +71,10 @@ class SWSEasyIcebergSparkHelper:
  self.df_obs_coord,
  self.df_metadata,
  self.df_meta_elem,
+ self.df_tag_observation,
  ) = self.raw_data

+ logging.info(self.raw_reference_data)
  (
  self.df_flag_method,
  self.df_flag_obs_status,
@@ -78,10 +82,11 @@ class SWSEasyIcebergSparkHelper:
  self.df_meta_elem_type,
  self.df_language,
  self.df_unit_of_measure,
+ self.df_dataset,
  self.dfs_dimension,
  ) = self.raw_reference_data

- self.df_user = self.raw_operational_data
+ (self.df_user, self.df_tag) = self.raw_operational_data

  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
  """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -271,6 +276,75 @@ class SWSEasyIcebergSparkHelper:
  if not self.keep_history:
  final_query += "\nWHERE o.replaced_on IS NULL"

+ logging.info("Final query for merging observation and observation_coordinates")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ df_obs_denorm.writeTo(
+ self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
+ ).createOrReplace()
+
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
+
+ return df_obs_denorm
+
+ def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version"""
+
+ from_statement = f"""
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+ INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+ INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+ INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+ LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+ hint_statement = ""
+
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+ for flag_col in self.flag_columns:
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+ for i, (dim_col, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code AS {dim_col}"
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+ hint_statement = (
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
+ if hint_statement
+ else f"BROADCAST({cl.iceberg_id})"
+ )
+
+ hint_statement = "/*+ " + hint_statement + " */"
+
+ # TODO Add tag name as a parameter
+ where_statement = (
+ f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+ )
+
+ final_query = (
+ "SELECT "
+ + hint_statement
+ + select_statement
+ + from_statement
+ + where_statement
+ )
+ if not self.keep_history:
+ final_query += "\n AND o.replaced_on IS NULL"
+
  logging.info("Final query for merging observation and observation_coordinares")
  logging.info(final_query)

@@ -350,7 +424,13 @@ class SWSEasyIcebergSparkHelper:

  df_meta_denorm = self.spark.sql(
  f"""
- select m.observation as observation_id,
+ select
+ /*+
+ BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
+ BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
+ BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id})
+ */
+ m.observation as observation_id,
  mt.code as type,
  met.code as element_type,
  l.country_code as language,
@@ -363,7 +443,11 @@ class SWSEasyIcebergSparkHelper:
  """
  )

- logging.info("meta_denorm write")
+ df_meta_denorm.writeTo(
+ self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
+ ).createOrReplace()
+
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")

  return df_meta_denorm

@@ -388,25 +472,31 @@ class SWSEasyIcebergSparkHelper:
  )

  def _gen_grouped_metadata_sql(self) -> DataFrame:
- return (
- self._gen_denormalized_metadata_sql()
- .select(
- col("observation_id"),
- F.create_map(
- lit("type"),
- col("type"),
- lit("element_type"),
- col("element_type"),
- lit("language"),
- col("language"),
- lit("value"),
- col("value"),
- ).alias("metadata"),
- )
- .groupby("observation_id")
- .agg(F.collect_list("metadata").alias("metadata"))
+ df_meta_grouped = self.spark.sql(
+ f"""
+ SELECT
+ observation_id,
+ collect_list(
+ map(
+ 'type', type,
+ 'element_type', element_type,
+ 'language', language,
+ 'value', value
+ )
+ ) AS metadata
+ FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
+ GROUP BY observation_id
+ """
  )

+ df_meta_grouped.writeTo(
+ self.iceberg_tables.GROUPED_METADATA.iceberg_id
+ ).createOrReplace()
+
+ logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
+
+ return df_meta_grouped
+
  def _gen_denormalied_data(self) -> DataFrame:
  return (
  self._gen_denormalized_observation()
@@ -420,8 +510,24 @@ class SWSEasyIcebergSparkHelper:
  )

  def _gen_denormalied_data_sql(self) -> DataFrame:
+ self._gen_denormalized_observation_sql()
+ self._gen_denormalized_metadata_sql()
+ self._gen_grouped_metadata_sql()
+
+ return self.spark.sql(
+ f"""
+ SELECT
+ o.*,
+ m.metadata
+ FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
+ LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
+ ON o.id = m.observation_id
+ """
+ )
+
+ def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:
  return (
- self._gen_denormalized_observation_sql()
+ self._gen_denormalized_observation_sql_from_tag()
  .alias("o")
  .join(
  self._gen_grouped_metadata_sql().alias("m"),
@@ -431,7 +537,7 @@ class SWSEasyIcebergSparkHelper:
  .drop("m.observation_id")
  )

- def write_data_to_iceberg_and_csv(self, sql=False) -> DataFrame:
+ def write_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
  if sql:
  self.df_denorm = self._gen_denormalied_data_sql()
  else:
@@ -504,18 +610,21 @@ class SWSEasyIcebergSparkHelper:
  logging.info("Unfiltered data tags successfully written")

  def write_filtered_data_to_iceberg_and_csv(
- self, dimensions: Dict[str, List[str]]
+ self, dimensions: Dict[str, List[str]] = None, from_tag=False
  ) -> DataFrame:

- self.filtered_df = self.df_denorm
+ if from_tag:
+ self.filtered_df = self._gen_denormalied_data_sql_from_tag()
+ else:
+ self.filtered_df = self.df_denorm

- for dimension_name, codes in dimensions.items():
- logging.info(f"dimension_name: {dimension_name}")
- logging.info(f"codes: {codes}")
- if len(codes) != 0:
- self.filtered_df = self.filtered_df.filter(
- col(dimension_name).isin(codes)
- )
+ for dimension_name, codes in dimensions.items():
+ logging.info(f"dimension_name: {dimension_name}")
+ logging.info(f"codes: {codes}")
+ if len(codes) != 0:
+ self.filtered_df = self.filtered_df.filter(
+ col(dimension_name).isin(codes)
+ )

  self.filtered_df.writeTo(
  self.iceberg_tables.TABLE_FILTERED.iceberg_id
@@ -586,3 +695,29 @@ class SWSEasyIcebergSparkHelper:
  logging.debug(f"Tag with Added csv Table: {tag}")

  logging.info("Filtered data tags successfully written")
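
Note: a hedged sketch of how the reworked SWSEasyIcebergSparkHelper pipeline above might be driven. The SQL path now materializes the denormalized observation, denormalized metadata and grouped metadata as intermediate Iceberg tables (DENORMALIZED_OBSERVATION, DENORMALIZED_METADATA, GROUPED_METADATA) and joins them in SQL, and sql=True is the new default of write_data_to_iceberg_and_csv. As with the previous sketch, constructor arguments outside this diff are omitted and all placeholder values are assumptions.

    # Sketch only: dataset_details / dataset_tables are assumed to be prepared elsewhere.
    easy = SWSEasyIcebergSparkHelper(
        spark=spark,
        dataset_details=dataset_details,  # assumed to exist
        dataset_tables=dataset_tables,    # assumed to exist
        keep_history=False,
        write_csv=True,
        source_tag="my_release_tag",      # new: tag read by the *_from_tag methods
    )

    # sql=True (now the default) writes the intermediate denormalized/grouped
    # tables to Iceberg and produces the final result with a SQL join over them.
    easy.write_data_to_iceberg_and_csv(sql=True)

    # Filter by dimension codes, or switch to the tag-driven selection.
    easy.write_filtered_data_to_iceberg_and_csv(
        dimensions={"timePointYears": ["2022", "2023"]},  # illustrative
        from_tag=False,
    )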
--- a/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
+++ b/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
@@ -271,6 +271,60 @@ class SWSGoldIcebergSparkHelper:

  return df

+ def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+ """The expected input to this function is the output of the sws disseminated function"""
+ df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
+
+ logging.info(
+ f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+ )
+
+ self.spark.sql(
+ f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+ )
+
+ logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
+
+ df_1 = df.coalesce(1)
+
+ save_cache_csv(
+ df=df_1,
+ bucket=self.bucket,
+ prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
+ tag_name=self.tag_name,
+ )
+
+ return df
+
+ def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+ self, df: DataFrame
+ ) -> DataFrame:
+ """The expected input to this function is the output of the sws disseminated function"""
+ df.writeTo(
+ self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+ ).createOrReplace()
+
+ logging.info(
+ f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+ )
+
+ self.spark.sql(
+ f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+ )
+
+ logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+ df_1 = df.coalesce(1)
+
+ save_cache_csv(
+ df=df_1,
+ bucket=self.bucket,
+ prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+ tag_name=self.tag_name,
+ )
+
+ return df
+
  def write_gold_sws_validated_sws_dissemination_tag(
  self, df: DataFrame, tags: Tags
  ) -> DataFrame:
@@ -496,8 +550,8 @@ class SWSGoldIcebergSparkHelper:
  logging.debug(f"Tag with Added Iceberg Table: {tag}")

  new_diss_table = BaseDisseminatedTagTable(
- id=f"{self.domain_code.lower()}_gold_pre_sdmx_csv",
- name=f"{self.domain_code} gold pre-SDMX csv",
+ id=f"{self.domain_code.lower()}_gold_sws_csv",
+ name=f"{self.domain_code} gold SWS csv",
  description="Gold table containing the tag data without any processing cached in csv",
  layer=TableLayer.GOLD,
  private=True,
@@ -515,3 +569,101 @@ class SWSGoldIcebergSparkHelper:
  logging.debug(f"Tag with Added csv Table: {tag}")

  return df
+
+ def write_gold_faostat_dissemination_tag(
+ self, df: DataFrame, tags: Tags
+ ) -> DataFrame:
+ # Get or create a new tag
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+ logging.debug(f"Tag: {tag}")
+
+ new_iceberg_table = BaseDisseminatedTagTable(
+ id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
+ name=f"{self.domain_code} gold FAOSTAT Iceberg",
+ description="Gold table containing the tag data in FAOSTAT format",
+ layer=TableLayer.GOLD,
+ private=True,
+ type=TableType.ICEBERG,
+ database=IcebergDatabases.GOLD_DATABASE,
+ table=self.iceberg_tables.GOLD_FAOSTAT.table,
+ path=self.iceberg_tables.GOLD_FAOSTAT.path,
+ structure={"columns": df.schema.jsonValue()["fields"]},
+ )
+ tag = upsert_disseminated_table(
+ sws_tags=tags,
+ tag=tag,
+ dataset_id=self.dataset_id,
+ tag_name=self.tag_name,
+ table=new_iceberg_table,
+ )
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+ new_diss_table = BaseDisseminatedTagTable(
+ id=f"{self.domain_code.lower()}_gold_faostat_csv",
+ name=f"{self.domain_code} gold FAOSTAT csv",
+ description="Gold table containing the tag data in FAOSTAT format in csv",
+ layer=TableLayer.GOLD,
+ private=True,
+ type=TableType.CSV,
+ path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
+ structure={"columns": df.schema.jsonValue()["fields"]},
+ )
+ tag = upsert_disseminated_table(
+ sws_tags=tags,
+ tag=tag,
+ dataset_id=self.dataset_id,
+ tag_name=self.tag_name,
+ table=new_diss_table,
+ )
+ logging.debug(f"Tag with Added csv Table: {tag}")
+
+ return df
+
+ def write_gold_faostat_unfiltered_dissemination_tag(
+ self, df: DataFrame, tags: Tags
+ ) -> DataFrame:
+ # Get or create a new tag
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+ logging.debug(f"Tag: {tag}")
+
+ new_iceberg_table = BaseDisseminatedTagTable(
+ id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+ name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+ description="Gold table containing all the tag data in FAOSTAT format",
+ layer=TableLayer.GOLD,
+ private=True,
+ type=TableType.ICEBERG,
+ database=IcebergDatabases.GOLD_DATABASE,
+ table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+ path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+ structure={"columns": df.schema.jsonValue()["fields"]},
+ )
+ tag = upsert_disseminated_table(
+ sws_tags=tags,
+ tag=tag,
+ dataset_id=self.dataset_id,
+ tag_name=self.tag_name,
+ table=new_iceberg_table,
+ )
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+ new_diss_table = BaseDisseminatedTagTable(
+ id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+ name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+ description="Gold table containing the tag data in FAOSTAT format in csv",
+ layer=TableLayer.GOLD,
+ private=True,
+ type=TableType.CSV,
+ path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+ structure={"columns": df.schema.jsonValue()["fields"]},
+ )
+ tag = upsert_disseminated_table(
+ sws_tags=tags,
+ tag=tag,
+ dataset_id=self.dataset_id,
+ tag_name=self.tag_name,
+ table=new_diss_table,
+ )
+ logging.debug(f"Tag with Added csv Table: {tag}")
+
+ return df
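
Note: a hedged sketch of the new gold-layer FAOSTAT flow above. gold_helper, tags and the input DataFrames are assumed to come from the existing gold pipeline (the docstrings state the input is the output of the sws disseminated step); none of these names are defined in this diff.

    # Sketch only: write the FAOSTAT-format table to Iceberg plus csv, then
    # register it on the dissemination tag.
    df_faostat = gold_helper.write_gold_faostat_data_to_iceberg_and_csv(df_faostat)
    gold_helper.write_gold_faostat_dissemination_tag(df_faostat, tags)

    # Unfiltered variant: same format, written to the *_faostat_unfiltered tables.
    df_faostat_all = gold_helper.write_gold_faostat_unfiltered_data_to_iceberg_and_csv(df_faostat_all)
    gold_helper.write_gold_faostat_unfiltered_dissemination_tag(df_faostat_all, tags)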
--- a/sws_spark_dissemination_helper/SWSPostgresSparkReader.py
+++ b/sws_spark_dissemination_helper/SWSPostgresSparkReader.py
@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:

  logging.info(f"{pg_table} read start")

- # Read observations from the PostgreSQL table into a DataFrame
- df = (
- self.spark.read.format("jdbc")
- .option("customSchema", custom_schema)
- .option("dbtable", pg_table)
- .option("partitionColumn", partition_column)
- .option("lowerBound", min_id)
- .option("upperBound", max_id)
- .option("numPartitions", num_partitions)
- .option("fetchsize", "1000")
- .option("url", self.jdbc_url)
- .option("user", self.jdbc_conn_properties["user"])
- .option("password", self.jdbc_conn_properties["password"])
- .option("driver", SPARK_POSTGRES_DRIVER)
- .load()
- # .repartition(1024, partition_column)
- # .sortWithinPartitions(partition_column)
- # .cache()
- )
+ if min_id is None or max_id is None:
+ df = (
+ self.spark.read.format("jdbc")
+ .option("customSchema", custom_schema)
+ .option("dbtable", pg_table)
+ .option("fetchsize", "1000")
+ .option("url", self.jdbc_url)
+ .option("user", self.jdbc_conn_properties["user"])
+ .option("password", self.jdbc_conn_properties["password"])
+ .option("driver", SPARK_POSTGRES_DRIVER)
+ .load()
+ )
+ else:
+ df = (
+ self.spark.read.format("jdbc")
+ .option("customSchema", custom_schema)
+ .option("dbtable", pg_table)
+ .option("partitionColumn", partition_column)
+ .option("lowerBound", min_id)
+ .option("upperBound", max_id)
+ .option("numPartitions", num_partitions)
+ .option("fetchsize", "1000")
+ .option("url", self.jdbc_url)
+ .option("user", self.jdbc_conn_properties["user"])
+ .option("password", self.jdbc_conn_properties["password"])
+ .option("driver", SPARK_POSTGRES_DRIVER)
+ .load()
+ # .repartition(1024, partition_column)
+ # .sortWithinPartitions(partition_column)
+ # .cache()
+ )
  else:
  df = (
  self.spark.read.format("jdbc")
@@ -195,6 +207,7 @@ class SWSPostgresSparkReader:
  (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
  (dataset_tables.METADATA, "id", 10),
  (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+ (dataset_tables.TAG_OBSERVATION, "tag", 10),
  ]
  return self._import_tables(data_tables)

@@ -209,25 +222,30 @@ class SWSPostgresSparkReader:
  dataset_tables.METADATA_ELEMENT_TYPE,
  dataset_tables.LANGUAGE,
  dataset_tables.UNIT_OF_MEASURE,
+ dataset_tables.DATASET,
  *dataset_tables.CODELISTS,
  ]
+ logging.info(
+ f"Importing reference data tables: {[(table.postgres_id, table.iceberg_id) for table in reference_data_tables]}"
+ )
  return self._import_tables(
  [(table, None, 1) for table in reference_data_tables]
  )

  def import_operational_data_tables(
  self, dataset_tables: DatasetTables
- ) -> DataFrame:
+ ) -> List[DataFrame]:
  # Define and import operational data table without partitioning
  operational_data_tables = [
  (dataset_tables.USER, None, 1),
+ (dataset_tables.TAG, None, 1),
  ]
- return self._import_tables(operational_data_tables)[0]
+ return self._import_tables(operational_data_tables)

  def import_data_reference_data_operational_data(
  self, dataset_tables: DatasetTables
  ) -> Tuple[
- Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+ Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
  Tuple[
  DataFrame,
  DataFrame,
@@ -235,22 +253,23 @@ class SWSPostgresSparkReader:
  DataFrame,
  DataFrame,
  DataFrame,
+ DataFrame,
  List[DataFrame],
  ],
- DataFrame,
+ Tuple[DataFrame, DataFrame],
  ]:
  # Import and organize DataFrames into the desired output structure
  data_dfs = self.import_data_tables(dataset_tables)
  reference_data_dfs = self.import_reference_data_tables(dataset_tables)
- operational_data_df = self.import_operational_data_tables(dataset_tables)
+ operational_data_dfs = self.import_operational_data_tables(dataset_tables)

  return (
  tuple(data_dfs),
  (
- *reference_data_dfs[:6],
- reference_data_dfs[6:],
+ *reference_data_dfs[:7],
+ reference_data_dfs[7:],
  ),
- operational_data_df,
+ tuple(operational_data_dfs),
  )

  def get_codelist_type_mapping(
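
Note: the first hunk above makes the JDBC import fall back to a plain, unpartitioned read when min_id or max_id is unknown, and keep the partitioned read otherwise. The following is a standalone PySpark sketch of the same pattern, using only standard Spark JDBC options; the connection details, table name and bounds are placeholders, not values from the package.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Placeholder connection details; the helper takes these from its own
    # jdbc_url / jdbc_conn_properties.
    jdbc_url = "jdbc:postgresql://localhost:5432/sws"
    user, password = "sws", "secret"

    base_reader = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", "my_schema.observation")  # illustrative table
        .option("fetchsize", "1000")
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
    )

    min_id, max_id = None, None  # bounds unknown, e.g. the bounds query was skipped

    if min_id is None or max_id is None:
        # No bounds available: single-partition read, as in the new branch above.
        df = base_reader.load()
    else:
        # Bounds available: split the read into parallel partitions over the id column.
        df = (
            base_reader.option("partitionColumn", "id")
            .option("lowerBound", str(min_id))
            .option("upperBound", str(max_id))
            .option("numPartitions", "10")
            .load()
        )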
--- a/sws_spark_dissemination_helper/constants.py
+++ b/sws_spark_dissemination_helper/constants.py
@@ -1,3 +1,5 @@
+ from typing import List
+
  from pyspark.sql.functions import col, lit

  SPARK_POSTGRES_DRIVER = "org.postgresql.Driver"
@@ -34,10 +36,14 @@ class DomainFilters:
  class DatasetDatatables:

  class __SWSDatatable:
- def __init__(self, id: str, name: str, schema: str):
+ def __init__(
+ self, id: str, name: str, schema: str, join_columns: List[str] = []
+ ):
  self.id = id
+ self.iceberg_id = f"{IcebergDatabases.BRONZE_DATABASE}.{id.split('.')[1]}"
  self.name = name
  self.schema = schema
+ self.join_columns = join_columns

  # Aggregation Tables
  AGGREGATES_COMPOSITION = __SWSDatatable(
@@ -50,22 +56,37 @@ class DatasetDatatables:
  name="Aggregation - Aggregates per elements",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, element STRING, aggregation_type STRING, code STRING",
  )
-
+
  # Dissemination Tables
  DISSEMINATION_TYPE_LIST = __SWSDatatable(
  id="datatables.dissemination_{type}_list",
  name="Dissemination - {type} list",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
+ join_columns=["domain", "code"],
  )
  DISSEMINATION_EXCEPTIONS = __SWSDatatable(
  id="datatables.dissemination_exception",
  name="Dissemination - Exceptions",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, dim1_code STRING, dim2_code STRING, dim3_code STRING, dim4_code STRING, dim5_code STRING, dim6_code STRING, dim7_code STRING, status_flag STRING, method_flag STRING, dissemination BOOLEAN, aggregation BOOLEAN, note STRING",
+ join_columns=[
+ "domain",
+ " dim1_code",
+ " dim2_code",
+ " dim3_code",
+ " dim4_code",
+ " dim5_code",
+ " dim6_code",
+ " dim7_code",
+ " status_flag",
+ " method_flag",
+ ],
  )
+ # TODO Deprecate
  DISSEMINATION_ITEM_LIST_FAOSTAT = __SWSDatatable(
  id="datatables.dissemination_item_list_faostat",
  name="Dissemination - Item list - FAOSTAT",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
+ join_columns=["domain", "code"],
  )

  # Mapping Tables
@@ -73,19 +94,23 @@ class DatasetDatatables:
  id="datatables.aggregates_mapping_domains_id",
  name="Mapping - Domains ID",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, domain_name STRING, sws_source_id STRING, sws_destination_id STRING",
+ join_columns=["domain", "sws_source_id"],
  )
  MAPPING_CODELIST_TYPE = __SWSDatatable(
  id="datatables.mapping_codelist_type",
  name="Mapping Codelist type",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, col_name STRING, col_type STRING",
+ join_columns=["domain", "col_name"],
  )
  MAPPING_CODE_CORRECTION = __SWSDatatable(
  id="datatables.aggregates_mapping_code_correction",
  name="Mapping - Code correction",
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, old_code STRING, new_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
+ join_columns=["domain", "old_code", "var_type", "mapping_type"],
  )

  # Non-SWS Sources Tables
+ # TODO To deprecate
  FAOSTAT_CODE_MAPPING = __SWSDatatable(
  id="datatables.faostat_code_mapping",
  name="FAOSTAT Code Mapping",
@@ -147,6 +172,11 @@ class DatasetTables:
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
  schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
  )
+ self.TAG_OBSERVATION = self.__SWSTable(
+ postgres_id=f"{self.__dataset_id}.tag_observation",
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_tag_observation",
+ schema="tag BIGINT, observation INT",
+ )

  # Reference data
  self.CODELISTS = [
@@ -178,18 +208,21 @@ class DatasetTables:
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
  schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
  )
-
  LANGUAGE = __SWSTable(
  postgres_id="reference_data.language",
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
  schema="id INT, country_code STRING, description STRING",
  )
-
  UNIT_OF_MEASURE = __SWSTable(
  postgres_id="reference_data.unit_of_measure",
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
  schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
  )
+ DATASET = __SWSTable(
+ postgres_id="reference_data.dataset",
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.dataset",
+ schema="id INT, xml_name STRING",
+ )

  # Operational data
  USER = __SWSTable(
@@ -197,6 +230,11 @@ class DatasetTables:
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
  schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
  )
+ TAG = __SWSTable(
+ postgres_id="operational_data.tag",
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.tag",
+ schema="id INT, name STRING, reference_date DATE, dataset INT, type STRING, released_ON DATE, released_by INT, properties STRING",
+ )


  class IcebergTable:
@@ -216,30 +254,37 @@ class IcebergTables:
  self.__tag_name = tag_name

  # TODO Fix later with a more appropriate DATABASE
- self.TABLE = self._create_iceberg_table("BRONZE")
- self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
- self.BRONZE = self._create_iceberg_table("BRONZE")
- self.BRONZE_DISS_TAG = self._create_iceberg_table("BRONZE", suffix="diss_tag")
- self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
+ self.DENORMALIZED_OBSERVATION = self.create_iceberg_table("BRONZE", suffix="denormalized_observation")
+ self.DENORMALIZED_METADATA = self.create_iceberg_table("BRONZE", suffix="denormalized_metadata")
+ self.GROUPED_METADATA = self.create_iceberg_table("BRONZE", suffix="grouped_metadata")
+ self.TABLE = self.create_iceberg_table("BRONZE")
+ self.TABLE_FILTERED = self.create_iceberg_table("BRONZE", suffix="filtered")
+ self.BRONZE = self.create_iceberg_table("BRONZE")
+ self.BRONZE_DISS_TAG = self.create_iceberg_table("BRONZE", suffix="diss_tag")
+ self.SILVER = self.create_iceberg_table("SILVER", prefix=domain)

  # GOLD tables with specific suffixes
- self.GOLD_SWS = self._create_iceberg_table(
- "GOLD", prefix=domain, suffix="sws"
- )
- self.GOLD_SDMX = self._create_iceberg_table(
+ self.GOLD_SWS = self.create_iceberg_table("GOLD", prefix=domain, suffix="sws")
+ self.GOLD_SDMX = self.create_iceberg_table(
  "GOLD", prefix=domain, suffix="sdmx_disseminated"
  )
- self.GOLD_SWS_VALIDATED = self._create_iceberg_table(
+ self.GOLD_SWS_VALIDATED = self.create_iceberg_table(
  "GOLD", prefix=domain, suffix="sws_validated"
  )
- self.GOLD_SWS_DISSEMINATED = self._create_iceberg_table(
+ self.GOLD_SWS_DISSEMINATED = self.create_iceberg_table(
  "GOLD", prefix=domain, suffix="sws_disseminated"
  )
- self.GOLD_PRE_SDMX = self._create_iceberg_table(
+ self.GOLD_PRE_SDMX = self.create_iceberg_table(
  "GOLD", prefix=domain, suffix="pre_sdmx"
  )
+ self.GOLD_FAOSTAT = self.create_iceberg_table(
+ "GOLD", prefix=domain, suffix="faostat"
+ )
+ self.GOLD_FAOSTAT_UNFILTERED = self.create_iceberg_table(
+ "GOLD", prefix=domain, suffix="faostat_unfiltered"
+ )

- def _create_iceberg_table(
+ def create_iceberg_table(
  self, level: str, prefix: str = "", suffix: str = ""
  ) -> IcebergTable:
  database = getattr(IcebergDatabases, f"{level}_DATABASE")
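
Note: the constants changes above add tag-related staging tables (TAG, TAG_OBSERVATION, DATASET), give each datatable definition a join_columns list and a derived iceberg_id, and introduce the GOLD_FAOSTAT tables. The following is an illustrative-only sketch of how the new identifiers are composed; the actual database names come from IcebergDatabases in the package, and "staging", "bronze" and the dataset id "aproduction" are assumptions.

    # Assumed database names and dataset id, for illustration only.
    staging_db, bronze_db, dataset_id = "staging", "bronze", "aproduction"

    # DatasetTables.TAG_OBSERVATION is a per-dataset staging table
    # (postgres: "<dataset>.tag_observation"):
    tag_observation_iceberg_id = f"{staging_db}.{dataset_id}_tag_observation"

    # DatasetTables.TAG and DatasetTables.DATASET are shared staging tables
    # (postgres: "operational_data.tag" and "reference_data.dataset"):
    tag_iceberg_id = f"{staging_db}.tag"
    dataset_iceberg_id = f"{staging_db}.dataset"

    # __SWSDatatable now derives an iceberg_id from its datatable id and records
    # a join_columns list for each datatable (its use is not shown in this diff):
    datatable_id = "datatables.mapping_codelist_type"
    datatable_iceberg_id = f"{bronze_db}.{datatable_id.split('.')[1]}"  # "bronze.mapping_codelist_type"
    join_columns = ["domain", "col_name"]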
--- sws_spark_dissemination_helper-0.0.141.dist-info/METADATA
+++ sws_spark_dissemination_helper-0.0.171.dist-info/METADATA
@@ -1,8 +1,8 @@
  Metadata-Version: 2.4
  Name: sws-spark-dissemination-helper
- Version: 0.0.141
+ Version: 0.0.171
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
- Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
+ Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
  License: MIT License

@@ -31,27 +31,27 @@ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.9
  Requires-Dist: annotated-types==0.7.0
- Requires-Dist: boto3==1.36.18
- Requires-Dist: botocore==1.36.18
+ Requires-Dist: boto3>=1.40.75
+ Requires-Dist: botocore>=1.40.75
  Requires-Dist: certifi==2025.1.31
  Requires-Dist: charset-normalizer==3.4.1
- Requires-Dist: idna==3.10
+ Requires-Dist: idna>=3.10
  Requires-Dist: jmespath==1.0.1
  Requires-Dist: numpy==2.0.2
- Requires-Dist: pandas==2.2.3
+ Requires-Dist: pandas==2.3.3
  Requires-Dist: py4j==0.10.9.7
  Requires-Dist: pydantic-core==2.27.2
  Requires-Dist: pydantic==2.10.6
  Requires-Dist: pyspark==3.5.4
  Requires-Dist: python-dateutil==2.9.0.post0
  Requires-Dist: python-dotenv==0.19.2
- Requires-Dist: pytz==2025.1
+ Requires-Dist: pytz==2025.2
  Requires-Dist: requests==2.32.3
- Requires-Dist: s3transfer==0.11.2
+ Requires-Dist: s3transfer>=0.11.2
  Requires-Dist: six==1.17.0
- Requires-Dist: sws-api-client>=1.5.1
- Requires-Dist: typing-extensions==4.12.2
- Requires-Dist: tzdata==2025.1
+ Requires-Dist: sws-api-client==2.3.0
+ Requires-Dist: typing-extensions>=4.12.2
+ Requires-Dist: tzdata==2025.2
  Requires-Dist: urllib3==1.26.20
  Description-Content-Type: text/markdown

--- sws_spark_dissemination_helper-0.0.141.dist-info/RECORD
+++ sws_spark_dissemination_helper-0.0.171.dist-info/RECORD
@@ -1,13 +1,13 @@
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=rHTPkMU8_DqP-MrHz7CzeVmrfyU2s7eJfk9gIDyTwio,20076
+ sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
  sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=6Sgn2anJedjrzwUA6kfzO37QyrV2Q0ufcvgFE--DtVo,21823
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=KZy6v4V3ugzKq_0L8JLmTPClN0hx-9uWpAwNFcs37Og,19339
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=cRLa6KYv7tbGB2_Rpcyvb3ksGu0lX9gpLhnF9nxlxEs,17494
+ sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
+ sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=0dxbVkrhdaASapEffF5PFcgKwAMyJoWBxzgymjZ4JyY,25049
+ sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
  sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
  sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
- sws_spark_dissemination_helper/constants.py,sha256=T-yTDS9spd0D9zv8Y2zIAU7Ie5MEt-IW0A1oeLm30mA,11413
+ sws_spark_dissemination_helper/constants.py,sha256=vQmalAqInwPAybgJOfYx99jn47KsKp8jeD8eqmjw-Rs,13471
  sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
- sws_spark_dissemination_helper-0.0.141.dist-info/METADATA,sha256=44O84wqYsAZ-pj5HdXkyuRVUEuBfRPZtzYsCAFwQgL0,2824
- sws_spark_dissemination_helper-0.0.141.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- sws_spark_dissemination_helper-0.0.141.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
- sws_spark_dissemination_helper-0.0.141.dist-info/RECORD,,
+ sws_spark_dissemination_helper-0.0.171.dist-info/METADATA,sha256=W4qkQISSzekzXhpmNhlNMfJEmaQlscu3hQTs4Vavawg,2824
+ sws_spark_dissemination_helper-0.0.171.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ sws_spark_dissemination_helper-0.0.171.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+ sws_spark_dissemination_helper-0.0.171.dist-info/RECORD,,