sws-spark-dissemination-helper 0.0.152__tar.gz → 0.0.154__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +234 -7
  4. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/.gitignore +0 -0
  5. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/LICENSE +0 -0
  6. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/README.md +0 -0
  7. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  8. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +0 -0
  9. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  10. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  11. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/constants.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/utils.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/tests/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.152
+Version: 0.0.154
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.152"
+version = "0.0.154"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3>=1.36.18",
src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union

 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag

         if dataset_details is not None:
             (
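
These new arguments drive the SQL and tag-based code paths introduced further down in this diff. A minimal construction sketch follows; only `keep_history`, `write_csv`, and `source_tag` are taken from the diff, while the remaining argument names and all values are assumptions for illustration.

```python
# Hypothetical construction of the helper with the new 0.0.154 flags.
# Every value here is illustrative, not taken from the package.
helper = SWSBronzeIcebergSparkHelper(
    spark=spark,                      # existing SparkSession (assumed)
    iceberg_tables=iceberg_tables,    # IcebergTables built elsewhere (assumed)
    dataset_id="aproduction",         # assumed dataset identifier
    domain_code="AGR",                # assumed domain code
    dataset_tables=dataset_tables,    # DatasetTables built elsewhere (assumed)
    keep_history=False,               # keep only rows with replaced_on IS NULL
    write_csv=True,                   # also export CSV next to the Iceberg table
    source_tag="2024_dissemination",  # tag name used by the *_from_tag methods
)
```
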
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data

         (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data

-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data

     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -284,6 +292,148 @@ class SWSBronzeIcebergSparkHelper:

         return df_obs_denorm

+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_element_uom = self._prepare_element_uom()
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            df_obs_denorm = (
+                df_obs_denorm.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.join_id"),
+                )
+                .drop(f"{dimension_column}", "join_id")
+                .withColumnRenamed("code", dimension_column)
+            )
+
+        df_obs_denorm = (
+            df_obs_denorm.alias("d")
+            .join(
+                F.broadcast(df_element_uom).alias("e"),
+                col(f"d.{self.element_column}") == col("e.element_code"),
+                "left",
+            )
+            .drop("element_code")
+        )
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
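The two new methods above share the same query-assembly pattern: build a SELECT list, a BROADCAST hint for every codelist, and a FROM clause with one LEFT JOIN per dimension, then optionally append the history filter. The standalone sketch below reproduces that pattern with invented table names and mappings, purely to show what the generated SQL looks like; none of these identifiers come from the package.

```python
# Illustration of the query-assembly pattern used by the new *_sql methods.
# All identifiers are made up; the real values come from self.dataset_tables,
# self.flag_col_to_id_mapping, self.dim_col_to_id_mapping, etc.
flag_cols = {"flag_obs_status": "flag_1"}                     # column name -> observation column
dim_cols = {"geographic_area": "coord_1", "time": "coord_2"}  # column name -> coordinate column
codelists = ["bronze.cl_area", "bronze.cl_time"]

select_statement = "\no.id,\no.value"
from_statement = (
    "\nFROM bronze.observation o"
    "\nLEFT JOIN bronze.observation_coordinate oc ON oc.id = o.observation_coordinates"
)
hint_statement = ""

for flag_col, obs_col in flag_cols.items():
    select_statement += f",\no.{obs_col} AS {flag_col}"

for i, ((dim_col, coord_col), cl) in enumerate(zip(dim_cols.items(), codelists)):
    select_statement += f",\nd{i}.code AS {dim_col}"
    from_statement += f"\nLEFT JOIN {cl} d{i} ON d{i}.id = oc.{coord_col}"
    hint_statement = (
        f"{hint_statement}, BROADCAST({cl})" if hint_statement else f"BROADCAST({cl})"
    )

query = (
    "SELECT /*+ " + hint_statement + " */"
    + select_statement
    + from_statement
    + "\nWHERE o.replaced_on IS NULL"
)
print(query)
# SELECT /*+ BROADCAST(bronze.cl_area), BROADCAST(bronze.cl_time) */
# o.id,
# o.value,
# o.flag_1 AS flag_obs_status,
# d0.code AS geographic_area,
# d1.code AS time
# FROM bronze.observation o
# LEFT JOIN bronze.observation_coordinate oc ON oc.id = o.observation_coordinates
# LEFT JOIN bronze.cl_area d0 ON d0.id = oc.coord_1
# LEFT JOIN bronze.cl_time d1 ON d1.id = oc.coord_2
# WHERE o.replaced_on IS NULL
```
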
@@ -347,6 +497,32 @@

         return df_meta_denorm

+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -367,6 +543,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )

+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
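
The new `_gen_grouped_metadata_sql` mirrors the existing `_gen_grouped_metadata`, only fed from the SQL-based denormalization. As a self-contained illustration of what `create_map` plus `collect_list` produce, the toy example below (all rows and values invented) collapses per-observation metadata rows into a list of maps:

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy rows shaped like the output of the denormalized-metadata query.
df = spark.createDataFrame(
    [
        (1, "GENERAL", "COMMENT", "en", "Estimated value"),
        (1, "GENERAL", "SOURCE", "en", "National survey"),
        (2, "GENERAL", "COMMENT", "en", "Official figure"),
    ],
    ["observation_id", "type", "element_type", "language", "value"],
)

grouped = (
    df.select(
        col("observation_id"),
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)
grouped.show(truncate=False)
# Observation 1 ends up with a list of two maps, observation 2 with one.
```
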
@@ -379,9 +575,37 @@
             .drop("m.observation_id")
         )

+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-        self.df_bronze = self._gen_bronze_data()
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()

         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
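The Iceberg and CSV writes themselves are unchanged; the new `sql` flag only selects how `df_bronze` is assembled. A hedged usage sketch, assuming a `helper` instance like the one sketched after the constructor hunk above:

```python
# New default: assemble the bronze dataframe through the Spark SQL path.
df_bronze = helper.write_bronze_data_to_iceberg_and_csv()

# Previous behaviour: the original DataFrame-API path.
df_bronze_df_api = helper.write_bronze_data_to_iceberg_and_csv(sql=False)
```
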
@@ -449,12 +673,15 @@
         logging.info("Bronze Dissemination tags successfully written")

     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:

-        self.disseminated_tag_df = self.df_bronze
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze

-        if isinstance(dimensions, dict):
+        if not from_tag and len(dimensions) != 0:
             for dimension_name, codes in dimensions.items():
                 logging.info(f"dimension_name: {dimension_name}")
                 logging.info(f"codes: {codes}")
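
With these changes the dissemination-tag writer has two modes: filter the already-built bronze dataframe by explicit dimension codes, or rebuild the data directly from the tag named in `source_tag`. A sketch of both calls follows; the dimension names and codes are invented for illustration.

```python
# Mode 1: keep only selected codes per dimension from the in-memory bronze data.
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
    dimensions={"geographic_area": ["004", "008"], "time_period": ["2023"]}
)

# Mode 2: rebuild the bronze data from the tag passed as source_tag to the constructor.
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(from_tag=True)
```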