sws-spark-dissemination-helper 0.0.152__tar.gz → 0.0.153__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +204 -7
  4. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/.gitignore +0 -0
  5. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/LICENSE +0 -0
  6. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/README.md +0 -0
  7. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  8. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +0 -0
  9. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  10. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  11. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/constants.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/utils.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/tests/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.153}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.152
+Version: 0.0.153
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.152"
+version = "0.0.153"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3>=1.36.18",
src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
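The three new constructor options drive the behaviour introduced in this release: `keep_history` keeps superseded observation versions (rows whose `replaced_on` is set), `write_csv` toggles the CSV export, and `source_tag` names the tag used by the new tag-driven read path. A minimal instantiation sketch follows; the import path is inferred from the package layout, and every argument value is illustrative rather than taken from this diff:

    from pyspark.sql import SparkSession

    # Import path inferred from src/sws_spark_dissemination_helper/; adjust if needed.
    from sws_spark_dissemination_helper.SWSBronzeIcebergSparkHelper import (
        SWSBronzeIcebergSparkHelper,
    )

    spark = SparkSession.builder.appName("bronze-dissemination").getOrCreate()

    helper = SWSBronzeIcebergSparkHelper(
        spark=spark,
        domain_code="agriculture",        # illustrative value
        dataset_tables=dataset_tables,    # assumed to be prepared elsewhere, as in 0.0.152
        keep_history=False,               # new: True also keeps replaced observation versions
        write_csv=True,                   # new: toggles the CSV output
        source_tag="DISS_2024",           # new: tag name used by the from_tag read path
        # ...remaining constructor arguments unchanged from 0.0.152
    )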
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
                 self.df_obs_coord,
                 self.df_metadata,
                 self.df_meta_elem,
+                self.df_tag_observation,
             ) = self.raw_data
 
             (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
                 self.df_meta_elem_type,
                 self.df_language,
                 self.df_unit_of_measure,
+                self.df_dataset,
                 self.dfs_dimension,
             ) = self.raw_reference_data
 
-            self.df_user = self.raw_operational_data
+            (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -284,6 +292,118 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Build the SQL statements for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Build the SQL statements for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
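For reference, the string assembled by `_gen_denormalized_observation_sql` has the shape below. This is a hand-expanded illustration for a dataset with one flag column and two dimensions; every table and column identifier is invented for the example, not captured from the package:

    # Illustrative rendering of final_query (all identifiers invented):
    example_query = """\
    SELECT /*+ BROADCAST(catalog.bronze.cl_area), BROADCAST(catalog.bronze.cl_year) */
        o.id,
        o.value,
        u.email,
        o.created_on,
        o.replaced_on,
        o.version,
    o.flag_1 AS flag_obs_status,
    d0.code AS geographic_area,
    d1.code AS time
    FROM catalog.bronze.observation o
    JOIN catalog.bronze.user u ON u.id = o.created_by
    LEFT JOIN catalog.bronze.observation_coordinate AS oc ON oc.id = o.observation_coordinates
    LEFT JOIN catalog.bronze.cl_area d0 ON d0.id = oc.dim_1
    LEFT JOIN catalog.bronze.cl_year d1 ON d1.id = oc.dim_2
    WHERE o.replaced_on IS NULL  -- appended only when keep_history is False
    """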
@@ -347,6 +467,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized metadata table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -367,6 +513,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
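Both grouped-metadata builders produce the same shape: one row per `observation_id` with a `metadata` column of type `ARRAY<MAP<STRING, STRING>>`, one map per metadata element. An illustrative row, with invented values:

    # observation_id | metadata
    # 42             | [{"type": "GENERAL", "element_type": "COMMENT",
    #                    "language": "EN",  "value": "Estimated figure"}]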
@@ -379,9 +545,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-        self.df_bronze = self._gen_bronze_data()
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
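With this change, `write_bronze_data_to_iceberg_and_csv` defaults to the SQL-based builder and keeps the 0.0.152 DataFrame-API path behind `sql=False`. A usage sketch, reusing the hypothetical `helper` object from the constructor example above:

    # New default: assemble the bronze frame with the SQL builder.
    helper.write_bronze_data_to_iceberg_and_csv()

    # Previous behaviour: the DataFrame-API builder from 0.0.152.
    helper.write_bronze_data_to_iceberg_and_csv(sql=False)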
@@ -449,12 +643,15 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-        self.disseminated_tag_df = self.df_bronze
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
 
-        if isinstance(dimensions, dict):
+        if not from_tag and len(dimensions) != 0:
             for dimension_name, codes in dimensions.items():
                 logging.info(f"dimension_name: {dimension_name}")
                 logging.info(f"codes: {codes}")