sws-spark-dissemination-helper 0.0.159__py3-none-any.whl → 0.0.167__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -276,11 +276,17 @@ class SWSEasyIcebergSparkHelper:
276
276
  if not self.keep_history:
277
277
  final_query += "\nWHERE o.replaced_on IS NULL"
278
278
 
279
- logging.info("Final query for merging observation and observation_coordinares")
279
+ logging.info("Final query for merging observation and observation_coordinates")
280
280
  logging.info(final_query)
281
281
 
282
282
  df_obs_denorm = self.spark.sql(final_query)
283
283
 
284
+ df_obs_denorm.writeTo(
285
+ self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
286
+ ).createOrReplace()
287
+
288
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
289
+
284
290
  return df_obs_denorm
285
291
 
286
292
  def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
@@ -418,7 +424,13 @@ class SWSEasyIcebergSparkHelper:
418
424
 
419
425
  df_meta_denorm = self.spark.sql(
420
426
  f"""
421
- select m.observation as observation_id,
427
+ select
428
+ /*+
429
+ BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
430
+ BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
431
+ BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id})
432
+ */
433
+ m.observation as observation_id,
422
434
  mt.code as type,
423
435
  met.code as element_type,
424
436
  l.country_code as language,
@@ -431,7 +443,11 @@ class SWSEasyIcebergSparkHelper:
431
443
  """
432
444
  )
433
445
 
434
- logging.info("meta_denorm write")
446
+ df_meta_denorm.writeTo(
447
+ self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
448
+ ).createOrReplace()
449
+
450
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")
435
451
 
436
452
  return df_meta_denorm
437
453
 
@@ -456,25 +472,31 @@ class SWSEasyIcebergSparkHelper:
456
472
  )
457
473
 
458
474
  def _gen_grouped_metadata_sql(self) -> DataFrame:
459
- return (
460
- self._gen_denormalized_metadata_sql()
461
- .select(
462
- col("observation_id"),
463
- F.create_map(
464
- lit("type"),
465
- col("type"),
466
- lit("element_type"),
467
- col("element_type"),
468
- lit("language"),
469
- col("language"),
470
- lit("value"),
471
- col("value"),
472
- ).alias("metadata"),
473
- )
474
- .groupby("observation_id")
475
- .agg(F.collect_list("metadata").alias("metadata"))
475
+ df_meta_grouped = self.spark.sql(
476
+ f"""
477
+ SELECT
478
+ observation_id,
479
+ collect_list(
480
+ map(
481
+ 'type', type,
482
+ 'element_type', element_type,
483
+ 'language', language,
484
+ 'value', value
485
+ )
486
+ ) AS metadata
487
+ FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
488
+ GROUP BY observation_id
489
+ """
476
490
  )
477
491
 
492
+ df_meta_grouped.writeTo(
493
+ self.iceberg_tables.GROUPED_METADATA.iceberg_id
494
+ ).createOrReplace()
495
+
496
+ logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
497
+
498
+ return df_meta_grouped
499
+
478
500
  def _gen_denormalied_data(self) -> DataFrame:
479
501
  return (
480
502
  self._gen_denormalized_observation()
@@ -488,15 +510,19 @@ class SWSEasyIcebergSparkHelper:
488
510
  )
489
511
 
490
512
  def _gen_denormalied_data_sql(self) -> DataFrame:
491
- return (
492
- self._gen_denormalized_observation_sql()
493
- .alias("o")
494
- .join(
495
- self._gen_grouped_metadata_sql().alias("m"),
496
- col("o.id") == col("m.observation_id"),
497
- "left",
498
- )
499
- .drop("m.observation_id")
513
+ self._gen_denormalized_observation_sql()
514
+ self._gen_denormalized_metadata_sql()
515
+ self._gen_grouped_metadata_sql()
516
+
517
+ return self.spark.sql(
518
+ f"""
519
+ SELECT
520
+ o.*,
521
+ m.metadata
522
+ FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
523
+ LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
524
+ ON o.id = m.observation_id
525
+ """
500
526
  )
501
527
 
502
528
  def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:
@@ -669,3 +695,29 @@ class SWSEasyIcebergSparkHelper:
669
695
  logging.debug(f"Tag with Added csv Table: {tag}")
670
696
 
671
697
  logging.info("Filtered data tags successfully written")
698
+
699
+
700
+ 1
701
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
702
+ 1
703
+ 1
704
+ 2
705
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
706
+ 2
707
+ 1
708
+ 1
709
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
710
+ 1
711
+ 1
712
+ 2
713
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
714
+ 2
715
+ 1
716
+ 1
717
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
718
+ 1
719
+ 1
720
+ 1
721
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
722
+ 1
723
+ 1
@@ -271,6 +271,60 @@ class SWSGoldIcebergSparkHelper:
271
271
 
272
272
  return df
273
273
 
274
+ def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
275
+ """The expected input to this function is the output of the sws disseminated function"""
276
+ df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
277
+
278
+ logging.info(
279
+ f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
280
+ )
281
+
282
+ self.spark.sql(
283
+ f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
284
+ )
285
+
286
+ logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
287
+
288
+ df_1 = df.coalesce(1)
289
+
290
+ save_cache_csv(
291
+ df=df_1,
292
+ bucket=self.bucket,
293
+ prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
294
+ tag_name=self.tag_name,
295
+ )
296
+
297
+ return df
298
+
299
+ def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
300
+ self, df: DataFrame
301
+ ) -> DataFrame:
302
+ """The expected input to this function is the output of the sws disseminated function"""
303
+ df.writeTo(
304
+ self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
305
+ ).createOrReplace()
306
+
307
+ logging.info(
308
+ f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id}"
309
+ )
310
+
311
+ self.spark.sql(
312
+ f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
313
+ )
314
+
315
+ logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
316
+
317
+ df_1 = df.coalesce(1)
318
+
319
+ save_cache_csv(
320
+ df=df_1,
321
+ bucket=self.bucket,
322
+ prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
323
+ tag_name=self.tag_name,
324
+ )
325
+
326
+ return df
327
+
274
328
  def write_gold_sws_validated_sws_dissemination_tag(
275
329
  self, df: DataFrame, tags: Tags
276
330
  ) -> DataFrame:
@@ -496,8 +550,8 @@ class SWSGoldIcebergSparkHelper:
496
550
  logging.debug(f"Tag with Added Iceberg Table: {tag}")
497
551
 
498
552
  new_diss_table = BaseDisseminatedTagTable(
499
- id=f"{self.domain_code.lower()}_gold_pre_sdmx_csv",
500
- name=f"{self.domain_code} gold pre-SDMX csv",
553
+ id=f"{self.domain_code.lower()}_gold_sws_csv",
554
+ name=f"{self.domain_code} gold SWS csv",
501
555
  description="Gold table containing the tag data without any processing cached in csv",
502
556
  layer=TableLayer.GOLD,
503
557
  private=True,
@@ -515,3 +569,101 @@ class SWSGoldIcebergSparkHelper:
515
569
  logging.debug(f"Tag with Added csv Table: {tag}")
516
570
 
517
571
  return df
572
+
573
+ def write_gold_faostat_dissemination_tag(
574
+ self, df: DataFrame, tags: Tags
575
+ ) -> DataFrame:
576
+ # Get or create a new tag
577
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
578
+ logging.debug(f"Tag: {tag}")
579
+
580
+ new_iceberg_table = BaseDisseminatedTagTable(
581
+ id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
582
+ name=f"{self.domain_code} gold FAOSTAT Iceberg",
583
+ description="Gold table containing the tag data in FAOSTAT format",
584
+ layer=TableLayer.GOLD,
585
+ private=True,
586
+ type=TableType.ICEBERG,
587
+ database=IcebergDatabases.GOLD_DATABASE,
588
+ table=self.iceberg_tables.GOLD_FAOSTAT.table,
589
+ path=self.iceberg_tables.GOLD_FAOSTAT.path,
590
+ structure={"columns": df.schema.jsonValue()["fields"]},
591
+ )
592
+ tag = upsert_disseminated_table(
593
+ sws_tags=tags,
594
+ tag=tag,
595
+ dataset_id=self.dataset_id,
596
+ tag_name=self.tag_name,
597
+ table=new_iceberg_table,
598
+ )
599
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
600
+
601
+ new_diss_table = BaseDisseminatedTagTable(
602
+ id=f"{self.domain_code.lower()}_gold_faostat_csv",
603
+ name=f"{self.domain_code} gold FAOSTAT csv",
604
+ description="Gold table containing the tag data in FAOSTAT format in csv",
605
+ layer=TableLayer.GOLD,
606
+ private=True,
607
+ type=TableType.CSV,
608
+ path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
609
+ structure={"columns": df.schema.jsonValue()["fields"]},
610
+ )
611
+ tag = upsert_disseminated_table(
612
+ sws_tags=tags,
613
+ tag=tag,
614
+ dataset_id=self.dataset_id,
615
+ tag_name=self.tag_name,
616
+ table=new_diss_table,
617
+ )
618
+ logging.debug(f"Tag with Added csv Table: {tag}")
619
+
620
+ return df
621
+
622
+ def write_gold_faostat_unfiltered_dissemination_tag(
623
+ self, df: DataFrame, tags: Tags
624
+ ) -> DataFrame:
625
+ # Get or create a new tag
626
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
627
+ logging.debug(f"Tag: {tag}")
628
+
629
+ new_iceberg_table = BaseDisseminatedTagTable(
630
+ id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
631
+ name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
632
+ description="Gold table containing all the tag data in FAOSTAT format",
633
+ layer=TableLayer.GOLD,
634
+ private=True,
635
+ type=TableType.ICEBERG,
636
+ database=IcebergDatabases.GOLD_DATABASE,
637
+ table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
638
+ path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
639
+ structure={"columns": df.schema.jsonValue()["fields"]},
640
+ )
641
+ tag = upsert_disseminated_table(
642
+ sws_tags=tags,
643
+ tag=tag,
644
+ dataset_id=self.dataset_id,
645
+ tag_name=self.tag_name,
646
+ table=new_iceberg_table,
647
+ )
648
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
649
+
650
+ new_diss_table = BaseDisseminatedTagTable(
651
+ id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
652
+ name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
653
+ description="Gold table containing all the tag data in FAOSTAT format in csv",
654
+ layer=TableLayer.GOLD,
655
+ private=True,
656
+ type=TableType.CSV,
657
+ path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
658
+ structure={"columns": df.schema.jsonValue()["fields"]},
659
+ )
660
+ tag = upsert_disseminated_table(
661
+ sws_tags=tags,
662
+ tag=tag,
663
+ dataset_id=self.dataset_id,
664
+ tag_name=self.tag_name,
665
+ table=new_diss_table,
666
+ )
667
+ logging.debug(f"Tag with Added csv Table: {tag}")
668
+
669
+ return df
@@ -254,6 +254,9 @@ class IcebergTables:
254
254
  self.__tag_name = tag_name
255
255
 
256
256
  # TODO Fix later with a more appropriate DATABASE
257
+ self.DENORMALIZED_OBSERVATION = self._create_iceberg_table("BRONZE", suffix="denormalized_observation")
258
+ self.DENORMALIZED_METADATA = self._create_iceberg_table("BRONZE", suffix="denormalized_metadata")
259
+ self.GROUPED_METADATA = self._create_iceberg_table("BRONZE", suffix="grouped_metadata")
257
260
  self.TABLE = self._create_iceberg_table("BRONZE")
258
261
  self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
259
262
  self.BRONZE = self._create_iceberg_table("BRONZE")
@@ -274,6 +277,12 @@ class IcebergTables:
274
277
  self.GOLD_PRE_SDMX = self._create_iceberg_table(
275
278
  "GOLD", prefix=domain, suffix="pre_sdmx"
276
279
  )
280
+ self.GOLD_FAOSTAT = self._create_iceberg_table(
281
+ "GOLD", prefix=domain, suffix="faostat"
282
+ )
283
+ self.GOLD_FAOSTAT_UNFILTERED = self._create_iceberg_table(
284
+ "GOLD", prefix=domain, suffix="faostat_unfiltered"
285
+ )
277
286
 
278
287
  def _create_iceberg_table(
279
288
  self, level: str, prefix: str = "", suffix: str = ""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sws-spark-dissemination-helper
3
- Version: 0.0.159
3
+ Version: 0.0.167
4
4
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
5
5
  Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
6
6
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -1,13 +1,13 @@
1
1
  sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=ocuau0WtpyRwui0qwdQ_Rxh4nYPOyZoHpGKaWRa6B3Q,28868
2
2
  sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=u1fFUagXvVPJirSIesuXrCbFMuZ2jQnbz1yjk1MqRwg,25061
4
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=KZy6v4V3ugzKq_0L8JLmTPClN0hx-9uWpAwNFcs37Og,19339
3
+ sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
4
+ sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=0dxbVkrhdaASapEffF5PFcgKwAMyJoWBxzgymjZ4JyY,25049
5
5
  sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
6
6
  sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
7
7
  sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
8
- sws_spark_dissemination_helper/constants.py,sha256=_VZXdX5ARnlNzqQEJoFqY_g5r5KyF3cez7SZKV6bYrs,12915
8
+ sws_spark_dissemination_helper/constants.py,sha256=zviO6huxWTWonHv4v2M8zKr7HXCDMBGqjHx-eTfGT2A,13487
9
9
  sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
10
- sws_spark_dissemination_helper-0.0.159.dist-info/METADATA,sha256=JMSC-t_4LKJZNufyFNAe9rShRBSlQUHge-06z4slJ4c,2824
11
- sws_spark_dissemination_helper-0.0.159.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
- sws_spark_dissemination_helper-0.0.159.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
13
- sws_spark_dissemination_helper-0.0.159.dist-info/RECORD,,
10
+ sws_spark_dissemination_helper-0.0.167.dist-info/METADATA,sha256=h27GpuoB4elORRQybwqiKoKkF59JOPk6HzkQ4uDsSjo,2824
11
+ sws_spark_dissemination_helper-0.0.167.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
+ sws_spark_dissemination_helper-0.0.167.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
13
+ sws_spark_dissemination_helper-0.0.167.dist-info/RECORD,,