sws-spark-dissemination-helper 0.0.86__py3-none-any.whl → 0.0.88__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -443,3 +443,78 @@ class SWSBronzeIcebergSparkHelper:
443
443
  logging.debug(f"Tag with Added csv Table: {tag}")
444
444
 
445
445
  logging.info("Bronze Dissemination tags successfully written")
446
+
447
def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
    self, dimensions: Dict[str, List[str]]
) -> DataFrame:
    """Build the filtered dissemination branch of the bronze tag and cache it as CSV.

    A branch named ``diss_tag_<tag_name>`` is created (or replaced) on the
    bronze Iceberg table. For every dimension mapped to a non-empty list of
    codes, rows whose value for that dimension is not among the codes are
    deleted from the branch. The branch is then read back, its ``metadata``
    column is converted to a JSON string, and the frame is handed to
    ``save_cache_csv``.

    Args:
        dimensions: Mapping from a dimension column name to the codes to
            keep. Dimensions mapped to an empty list are left unfiltered.

    Returns:
        The DataFrame read from the dissemination branch, with ``metadata``
        serialised to JSON and coalesced to a single partition.
    """
    bronze_id = self.iceberg_tables.BRONZE.iceberg_id
    branch_name = f"diss_tag_{self.tag_name}"

    # NOTE(review): no explicit AS OF VERSION clause is given; confirm that
    # addressing the table through the `tag_<tag_name>` suffix makes the new
    # branch start from the tag's snapshot.
    self.spark.sql(
        f"ALTER TABLE {bronze_id}.`tag_{self.tag_name}` CREATE OR REPLACE BRANCH `{branch_name}`"
    )

    for dimension_name, codes in dimensions.items():
        if not codes:
            # Nothing selected for this dimension: keep every row.
            continue
        # NOTE(review): the dimension name and the codes are interpolated
        # as-is; a code containing a single quote would break the statement.
        quoted_codes = ",".join(f"'{code}'" for code in codes)
        self.spark.sql(
            f"DELETE FROM {bronze_id}.`branch_{branch_name}` WHERE {dimension_name} NOT IN ({quoted_codes})"
        )

    # Read the branch that was just created and filtered. Reading a branch
    # called `self.tag_name` would not return the rows filtered above.
    disseminated_tag_df = self.spark.read.option("branch", branch_name).table(
        bronze_id
    )

    disseminated_tag_df = disseminated_tag_df.withColumn(
        "metadata", F.to_json(col("metadata"))
    ).coalesce(1)

    save_cache_csv(
        df=disseminated_tag_df,
        bucket=self.bucket,
        prefix=f"{self.iceberg_tables.BRONZE.csv_prefix}_disseminated_tag",
        tag_name=self.tag_name,
    )

    return disseminated_tag_df
476
+
477
def write_bronze_sws_filtered_disseminated_tag(self, tags: Tags):
    """Attach the bronze disseminated-tag Iceberg and CSV table entries to the tag.

    The tag for ``self.dataset_id`` / ``self.tag_name`` is fetched or created
    through ``get_or_create_tag``. An Iceberg table descriptor and then a CSV
    table descriptor are added to it with ``tags.add_dissemination_table``.

    Args:
        tags: Object passed to ``get_or_create_tag`` and used to add the
            dissemination tables to the tag.
    """

    # NOTE(review): relies on self.disseminated_tag_df already being set on
    # the instance - confirm where it is assigned.
    def current_structure():
        # Built on every call so each table descriptor gets its own dict.
        return {"columns": self.disseminated_tag_df.schema.jsonValue()["fields"]}

    # Get or create a new tag
    tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
    logging.debug(f"Tag: {tag}")

    bronze_tables = self.iceberg_tables.BRONZE

    iceberg_entry = BaseDisseminatedTagTable(
        id=f"{self.domain_code.lower()}_bronze_disseminated_tag_iceberg",
        name=f"{self.domain_code} bronze disseminated tag Iceberg",
        description="Bronze table containing the raw data imported from the SWS, denormalized and filtered per dimension",
        layer=TableLayer.BRONZE,
        private=True,
        type=TableType.ICEBERG,
        database=IcebergDatabases.BRONZE_DATABASE,
        table=bronze_tables.table,
        path=bronze_tables.path,
        structure=current_structure(),
    )
    tag = tags.add_dissemination_table(self.dataset_id, self.tag_name, iceberg_entry)
    logging.debug(f"Tag with Added Iceberg Table: {tag}")

    csv_entry = BaseDisseminatedTagTable(
        id=f"{self.domain_code.lower()}_bronze_disseminated_tag_csv",
        name=f"{self.domain_code} bronze disseminated tag csv",
        description="Bronze table containing the raw data imported from the SWS, denormalized and filtered per dimension cached in csv",
        layer=TableLayer.BRONZE,
        private=True,
        type=TableType.CSV,
        # TODO Correct the path in the origin library
        path=bronze_tables.csv_path,
        structure=current_structure(),
    )
    tag = tags.add_dissemination_table(self.dataset_id, self.tag_name, csv_entry)
    logging.debug(f"Tag with Added csv Table: {tag}")

    logging.info("Bronze Disseminated tag with selection successfully written")
@@ -89,20 +89,20 @@ class SWSGoldIcebergSparkHelper:
89
89
  def keep_dim_val_attr_columns(self, df: DataFrame):
90
90
  return df.select(*self.cols_to_keep_sws)
91
91
 
92
def read_silver_data(self) -> DataFrame:
    """Read the silver Iceberg table using the ``tag`` read option set to ``self.tag_name``."""
    tagged_reader = self.spark.read.option("tag", self.tag_name)
    return tagged_reader.table(self.iceberg_tables.SILVER.iceberg_id)
96
+
92
97
  def gen_gold_sws_disseminated_data(self) -> DataFrame:
93
98
  return (
94
- self.spark.read.option("tag", self.tag_name)
95
- .table(self.iceberg_tables.SILVER.iceberg_id)
99
+ self.read_silver_data()
96
100
  .transform(self.apply_diss_flag_filter)
97
101
  .transform(self.keep_dim_val_attr_columns)
98
102
  )
99
103
 
100
104
  def gen_gold_sws_validated_data(self) -> DataFrame:
101
- return (
102
- self.spark.read.option("tag", self.tag_name)
103
- .table(self.iceberg_tables.BRONZE.iceberg_id)
104
- .transform(self.keep_dim_val_attr_columns)
105
- )
105
+ return self.read_silver_data().transform(self.keep_dim_val_attr_columns)
106
106
 
107
107
  def write_gold_sws_validated_data_to_iceberg_and_csv(
108
108
  self, df: DataFrame
@@ -110,6 +110,10 @@ class SWSSilverIcebergSparkHelper:
110
110
  return self.spark.read.option("tag", self.tag_name).table(
111
111
  self.iceberg_tables.BRONZE.iceberg_id
112
112
  )
113
def read_bronze_diss_tag_data(self) -> DataFrame:
    """Read the bronze Iceberg table from the branch ``diss_tag_<tag_name>``."""
    branch_name = f"diss_tag_{self.tag_name}"
    branch_reader = self.spark.read.option("branch", branch_name)
    return branch_reader.table(self.iceberg_tables.BRONZE.iceberg_id)
113
117
 
114
118
  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
115
119
  """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sws-spark-dissemination-helper
3
- Version: 0.0.86
3
+ Version: 0.0.88
4
4
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
5
5
  Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
6
6
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -1,11 +1,11 @@
1
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=tyC3e2LNBes9J2UFR-j7bDlvEffeI0YsiYlMvk0wPxA,16382
2
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=o8THI9uVKkNdtZVqs4Rsa9666eOjcSxAD0H_tKs9v4w,16059
1
+ sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=5z3uaVgRjtvZFO8C8LG9k3GO2dO21Ht5l7MXPY5Hb5M,19673
2
+ sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=ZC7hxkppo6qmfCc2z5vm2Y2iH1901F-rx9Er9cxuzP4,16037
3
3
  sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=ja7AbOfbmC_EXHCJk7UMDzzbA-LRxzPkaaUmuvcihJ8,17449
4
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=F0g4N95QIApVNvPFWuQfHphGE320LKoimBRisln7Luk,22033
4
+ sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=zEppNq5shiHZH2yt5faWGsb5QEmpAQS0ToIrG6fmv6o,22231
5
5
  sws_spark_dissemination_helper/__init__.py,sha256=Efjoe9V4vGXWVp-DY5P6NbRwIUr_zkZJkDmMi-lf5Bc,262
6
6
  sws_spark_dissemination_helper/constants.py,sha256=hpHHlbojShMWRfyIelXz6c5BqFzO48Oap1zmztlMMrs,11349
7
7
  sws_spark_dissemination_helper/utils.py,sha256=6SzrXX0xhvynRyv-vRFDbc6V4UNe_RzKKETZAtefnhg,21341
8
- sws_spark_dissemination_helper-0.0.86.dist-info/METADATA,sha256=vACLr-NqneuRqAeZOkE0ZGAhWJozhBprO-zNJzniLgk,2823
9
- sws_spark_dissemination_helper-0.0.86.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- sws_spark_dissemination_helper-0.0.86.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
11
- sws_spark_dissemination_helper-0.0.86.dist-info/RECORD,,
8
+ sws_spark_dissemination_helper-0.0.88.dist-info/METADATA,sha256=ta-N8JQzmir7jdw5Sm6k5dHKg1gkHkt-yCu25z0HbUY,2823
9
+ sws_spark_dissemination_helper-0.0.88.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
+ sws_spark_dissemination_helper-0.0.88.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
11
+ sws_spark_dissemination_helper-0.0.88.dist-info/RECORD,,