sws-spark-dissemination-helper 0.0.160__py3-none-any.whl → 0.0.161__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
+++ b/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -276,11 +276,17 @@ class SWSEasyIcebergSparkHelper:
         if not self.keep_history:
             final_query += "\nWHERE o.replaced_on IS NULL"
 
-        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info("Final query for merging observation and observation_coordinates")
         logging.info(final_query)
 
         df_obs_denorm = self.spark.sql(final_query)
 
+        df_obs_denorm.writeTo(
+            self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
+
         return df_obs_denorm
 
     def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
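
For context, writeTo(...).createOrReplace() is Spark's DataFrameWriterV2 API (Spark 3.0+): it creates the target table from the DataFrame's schema, or atomically replaces its contents if the table already exists, so the denormalized result above is persisted as a named, snapshotted Iceberg table rather than left as a lazy query. A minimal sketch of the pattern, assuming a Spark session with an Iceberg catalog already configured; the demo.bronze namespace is illustrative, not the package's:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    df = spark.createDataFrame([(1, "obs-a"), (2, "obs-b")], ["id", "value"])

    # Create the Iceberg table from the DataFrame schema, or atomically
    # replace its contents if it already exists ("demo" catalog assumed).
    df.writeTo("demo.bronze.denormalized_observation").createOrReplace()

    # The persisted result can then be read back by name.
    spark.table("demo.bronze.denormalized_observation").show()
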
@@ -418,7 +424,13 @@ class SWSEasyIcebergSparkHelper:
 
         df_meta_denorm = self.spark.sql(
             f"""
-            select m.observation as observation_id,
+            select
+            /*+
+                BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
+                BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
+                BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id}),
+            */
+            m.observation as observation_id,
                 mt.code as type,
                 met.code as element_type,
                 l.country_code as language,
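
The /*+ BROADCAST(...) */ block added here is a Spark SQL join hint: it asks the optimizer to replicate the named small tables to every executor so the joins against them skip the shuffle, a good fit for lookup tables such as metadata types and languages. A minimal, runnable sketch with made-up tables; hints may name a table or its alias, and functions.broadcast is the DataFrame-API equivalent:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Tiny stand-ins for a large fact table and a small dimension table.
    spark.createDataFrame(
        [(1, 10), (2, 20)], ["observation", "metadata_type"]
    ).createOrReplaceTempView("metadata")
    spark.createDataFrame(
        [(10, "GENERAL")], ["id", "code"]
    ).createOrReplaceTempView("metadata_type")

    # SQL form: hint that the table aliased "mt" is small enough to
    # replicate to every executor, skipping the shuffle for this join.
    spark.sql("""
        SELECT /*+ BROADCAST(mt) */
            m.observation AS observation_id, mt.code AS type
        FROM metadata AS m
        JOIN metadata_type AS mt ON m.metadata_type = mt.id
    """).show()

    # DataFrame form: the same request via an explicit broadcast marker.
    meta = spark.table("metadata")
    mtype = F.broadcast(spark.table("metadata_type"))
    meta.join(mtype, meta.metadata_type == mtype.id).show()
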
@@ -431,7 +443,11 @@ class SWSEasyIcebergSparkHelper:
             """
         )
 
-        logging.info("meta_denorm write")
+        df_meta_denorm.writeTo(
+            self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")
 
         return df_meta_denorm
 
@@ -456,25 +472,31 @@ class SWSEasyIcebergSparkHelper:
         )
 
     def _gen_grouped_metadata_sql(self) -> DataFrame:
-        return (
-            self._gen_denormalized_metadata_sql()
-            .select(
-                col("observation_id"),
-                F.create_map(
-                    lit("type"),
-                    col("type"),
-                    lit("element_type"),
-                    col("element_type"),
-                    lit("language"),
-                    col("language"),
-                    lit("value"),
-                    col("value"),
-                ).alias("metadata"),
-            )
-            .groupby("observation_id")
-            .agg(F.collect_list("metadata").alias("metadata"))
+        df_meta_grouped = self.spark.sql(
+            f"""
+            SELECT
+                observation_id,
+                collect_list(
+                    map(
+                        'type', type,
+                        'element_type', element_type,
+                        'language', language,
+                        'value', value
+                    )
+                ) AS metadata
+            FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
+            GROUP BY observation_id
+            """
         )
 
+        df_meta_grouped.writeTo(
+            self.iceberg_tables.GROUPED_METADATA.iceberg_id
+        ).createOrReplace()
+
+        logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
+
+        return df_meta_grouped
+
     def _gen_denormalied_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
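
The rewritten grouping query folds each observation's metadata rows into a single array-of-maps column: map(...) builds one string-keyed map per row, and collect_list(...) gathers those maps per observation_id group. A self-contained sketch of the same shape on in-memory data:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    spark.createDataFrame(
        [(1, "GENERAL", "COMMENT", "en", "checked"),
         (1, "GENERAL", "SOURCE", "en", "survey")],
        ["observation_id", "type", "element_type", "language", "value"],
    ).createOrReplaceTempView("denormalized_metadata")

    # One map per metadata row, collected into a list per observation.
    spark.sql("""
        SELECT
            observation_id,
            collect_list(
                map('type', type,
                    'element_type', element_type,
                    'language', language,
                    'value', value)
            ) AS metadata
        FROM denormalized_metadata
        GROUP BY observation_id
    """).show(truncate=False)
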
@@ -488,15 +510,15 @@ class SWSEasyIcebergSparkHelper:
         )
 
     def _gen_denormalied_data_sql(self) -> DataFrame:
-        return (
-            self._gen_denormalized_observation_sql()
-            .alias("o")
-            .join(
-                self._gen_grouped_metadata_sql().alias("m"),
-                col("o.id") == col("m.observation_id"),
-                "left",
-            )
-            .drop("m.observation_id")
+        return self.spark.sql(
+            f"""
+            SELECT
+                o.*,
+                m.metadata
+            FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
+            LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
+                ON o.id = m.observation_id
+            """
         )
 
     def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:
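
As in the DataFrame version it replaces (a left join of the denormalized observations against the grouped metadata on o.id = m.observation_id), every observation row survives the join and metadata is NULL where none exists; the GROUP BY behind the grouped-metadata table guarantees at most one match per observation, so the join cannot fan out. A quick sanity check along those lines, assuming the staging tables live under the illustrative demo.bronze namespace used in the earlier sketch:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Placeholder names standing in for the resolved iceberg_id values.
    obs = spark.table("demo.bronze.denormalized_observation")
    joined = spark.sql("""
        SELECT o.*, m.metadata
        FROM demo.bronze.denormalized_observation AS o
        LEFT JOIN demo.bronze.grouped_metadata AS m
            ON o.id = m.observation_id
    """)

    # GROUP BY observation_id makes the right side unique, so the left
    # join preserves the observation count exactly.
    assert joined.count() == obs.count()
    joined.filter("metadata IS NULL").show()  # observations without metadata
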
@@ -669,3 +691,29 @@ class SWSEasyIcebergSparkHelper:
             logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Filtered data tags successfully written")
+
+
+1
+frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
+1
+1
+2
+frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
+2
+1
+1
+frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
+1
+1
+2
+frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
+2
+1
+1
+frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
+1
+1
+1
+frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
+1
+1
--- a/sws_spark_dissemination_helper/constants.py
+++ b/sws_spark_dissemination_helper/constants.py
@@ -254,6 +254,9 @@ class IcebergTables:
         self.__tag_name = tag_name
 
         # TODO Fix later with a more appropriate DATABASE
+        self.DENORMALIZED_OBSERVATION = self._create_iceberg_table("BRONZE", suffix="denormalized_observation")
+        self.DENORMALIZED_METADATA = self._create_iceberg_table("BRONZE", suffix="denormalized_metadata")
+        self.GROUPED_METADATA = self._create_iceberg_table("BRONZE", suffix="grouped_metadata")
         self.TABLE = self._create_iceberg_table("BRONZE")
         self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
         self.BRONZE = self._create_iceberg_table("BRONZE")
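
Nothing in this diff shows _create_iceberg_table itself; judging from how suffix separates the three new staging tables from TABLE and BRONZE, it plausibly builds a qualified Iceberg identifier from a layer database, the dataset or tag name, and an optional suffix. A purely hypothetical sketch of that shape; every name and field below is an assumption, not the package's actual implementation:

    from dataclasses import dataclass

    @dataclass
    class IcebergTable:
        # Hypothetical shape: a bare table name plus its qualified id.
        table: str
        iceberg_id: str

    def create_iceberg_table(layer: str, dataset: str, suffix: str = "") -> IcebergTable:
        # Hypothetical scheme: <dataset>[_<suffix>] inside a layer database.
        name = f"{dataset}_{suffix}" if suffix else dataset
        return IcebergTable(table=name, iceberg_id=f"{layer.lower()}.{name}")

    # e.g. create_iceberg_table("BRONZE", "trade", suffix="grouped_metadata")
    #      -> IcebergTable(table="trade_grouped_metadata",
    #                      iceberg_id="bronze.trade_grouped_metadata")
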
--- a/sws_spark_dissemination_helper-0.0.160.dist-info/METADATA
+++ b/sws_spark_dissemination_helper-0.0.161.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.160
+Version: 0.0.161
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
--- a/sws_spark_dissemination_helper-0.0.160.dist-info/RECORD
+++ b/sws_spark_dissemination_helper-0.0.161.dist-info/RECORD
@@ -1,13 +1,13 @@
 sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=ocuau0WtpyRwui0qwdQ_Rxh4nYPOyZoHpGKaWRa6B3Q,28868
 sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=u1fFUagXvVPJirSIesuXrCbFMuZ2jQnbz1yjk1MqRwg,25061
+sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=Ko_gUE4ar2iR1zvIXiyq_bvfLVVjDrHwMJpef8CWipk,26235
 sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=RcpU8znoyNOfJMLFwrgi7S-BKkzgqh25o2wPLNIxLYc,21246
 sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
 sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
 sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
-sws_spark_dissemination_helper/constants.py,sha256=XWbXdsgar4ZCsuhmyFB2VDLL3QMTEkXFyfYa9cYK3I8,13173
+sws_spark_dissemination_helper/constants.py,sha256=zviO6huxWTWonHv4v2M8zKr7HXCDMBGqjHx-eTfGT2A,13487
 sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
-sws_spark_dissemination_helper-0.0.160.dist-info/METADATA,sha256=5_Ju2WgV0yAErjDi0spS6aF5-hYDHIxfXsR_lrJRl7k,2824
-sws_spark_dissemination_helper-0.0.160.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-sws_spark_dissemination_helper-0.0.160.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
-sws_spark_dissemination_helper-0.0.160.dist-info/RECORD,,
+sws_spark_dissemination_helper-0.0.161.dist-info/METADATA,sha256=62_Bd5pfNPWI4hWnflkj1HbyL33bdG_c8GbGp7SLY3U,2824
+sws_spark_dissemination_helper-0.0.161.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+sws_spark_dissemination_helper-0.0.161.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+sws_spark_dissemination_helper-0.0.161.dist-info/RECORD,,
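
Each RECORD line follows the wheel conventions (PEP 376 / PEP 427): a path, then sha256= plus the file's SHA-256 digest in unpadded URL-safe base64, then its size in bytes. A small sketch that recomputes such an entry, which is how the hashes above can be checked against the published wheel:

    import base64
    import hashlib
    from pathlib import Path

    def record_entry(path: str) -> str:
        """Build a wheel RECORD line: path,sha256=<urlsafe-b64 digest>,size."""
        data = Path(path).read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
        return f"{path},sha256={digest.rstrip(b'=').decode()},{len(data)}"

    # e.g. record_entry("sws_spark_dissemination_helper/constants.py")
    # should reproduce the corresponding RECORD line for version 0.0.161.
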