sws-spark-dissemination-helper 0.0.126__tar.gz → 0.0.128__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +53 -46
  4. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/.gitignore +0 -0
  5. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/LICENSE +0 -0
  6. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/README.md +0 -0
  7. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/old_requirements.txt +0 -0
  8. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/requirements.txt +0 -0
  9. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
  10. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  11. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/constants.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/src/sws_spark_dissemination_helper/utils.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/tests/__init__.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.126 → sws_spark_dissemination_helper-0.0.128}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.126
+Version: 0.0.128
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.126"
+version = "0.0.128"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3==1.36.18",
src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -25,6 +25,7 @@ class SWSEasyIcebergSparkHelper:
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
         keep_history: bool = False,
+        write_csv: bool = True,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -35,6 +36,7 @@ class SWSEasyIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
 
         if dataset_details is not None:
             (
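
Together, these two hunks are the substance of the release: 0.0.128 adds a write_csv flag, defaulting to True so existing callers keep the old behavior, that lets callers skip the CSV cache files the helper writes alongside its Iceberg tables. A minimal caller-side sketch follows; the constructor's full signature is not visible in this diff, so every argument other than write_csv is illustrative:

    from pyspark.sql import SparkSession
    from sws_spark_dissemination_helper.SWSEasyIcebergSparkHelper import (
        SWSEasyIcebergSparkHelper,
    )

    spark = SparkSession.builder.getOrCreate()

    # Placeholders: real values come from the caller's SWS configuration
    # and are not shown in this diff.
    dataset_details: dict = {}
    dataset_tables = None

    helper = SWSEasyIcebergSparkHelper(
        spark=spark,
        dataset_details=dataset_details,
        dataset_tables=dataset_tables,
        keep_history=False,
        write_csv=False,  # new in 0.0.128: skip CSV cache generation
    )
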
@@ -335,16 +337,16 @@ class SWSEasyIcebergSparkHelper:
 
         logging.info(f"Iceberg tag '{self.tag_name}' created")
 
-        df_denorm = self.df_denorm.withColumn(
-            "metadata", F.to_json(col("metadata"))
-        ).coalesce(1)
+        df_denorm = self.df_denorm.withColumn("metadata", F.to_json(col("metadata")))
+        if self.write_csv:
+            df_denorm = df_denorm.coalesce(1)
 
-        save_cache_csv(
-            df=df_denorm,
-            bucket=self.bucket,
-            prefix=self.iceberg_tables.TABLE.csv_prefix,
-            tag_name=self.tag_name,
-        )
+            save_cache_csv(
+                df=df_denorm,
+                bucket=self.bucket,
+                prefix=self.iceberg_tables.TABLE.csv_prefix,
+                tag_name=self.tag_name,
+            )
 
         return df_denorm
 
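Note what moved inside the new if self.write_csv block: not just the save_cache_csv call but also the coalesce(1) before it. coalesce(1) collapses the DataFrame to a single partition so the cache CSV comes out as one file, at the cost of serializing downstream work on the returned df_denorm; gating it means callers that opt out of the CSV keep the original partitioning. The body of save_cache_csv is not shown in this diff; the following is only a rough sketch of the single-file CSV pattern it presumably wraps (the function name, path handling, and writer options are assumptions):

    from pyspark.sql import DataFrame

    def write_single_csv(df: DataFrame, path: str) -> None:
        # One partition means Spark emits exactly one part-*.csv file under path.
        df.coalesce(1).write.mode("overwrite").option("header", True).csv(path)
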
@@ -372,21 +374,22 @@
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")
 
-        new_csv_table = BaseDisseminatedTagTable(
-            id="unfiltered_csv",
-            name="Unfiltered csv",
-            description="Csv table containing all the raw data imported from the SWS and denormalized",
-            layer=TableLayer.CACHE,
-            private=True,
-            debug=True,
-            type=TableType.CSV,
-            path=self.iceberg_tables.TABLE.csv_path,
-            structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
-        )
-        tag = tags.add_dissemination_table(
-            self.dataset_id, self.tag_name, new_csv_table
-        )
-        logging.debug(f"Tag with Added csv Table: {tag}")
+        if self.write_csv:
+            new_csv_table = BaseDisseminatedTagTable(
+                id="unfiltered_csv",
+                name="Unfiltered csv",
+                description="Csv table containing all the raw data imported from the SWS and denormalized",
+                layer=TableLayer.CACHE,
+                private=True,
+                debug=True,
+                type=TableType.CSV,
+                path=self.iceberg_tables.TABLE.csv_path,
+                structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
+            )
+            tag = tags.add_dissemination_table(
+                self.dataset_id, self.tag_name, new_csv_table
+            )
+            logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Unfiltered data tags successfully written")
 
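Registration of the CSV table on the tag is now gated by the same flag, so with write_csv=False the tag only ever references Iceberg tables. The CSV entry's structure field is built directly from the Spark schema: schema.jsonValue()["fields"] yields one JSON-style dict per column. This is standard PySpark behavior, independent of the package:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a")], ["id", "label"])

    print(df.schema.jsonValue()["fields"])
    # [{'name': 'id', 'type': 'long', 'nullable': True, 'metadata': {}},
    #  {'name': 'label', 'type': 'string', 'nullable': True, 'metadata': {}}]
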
@@ -418,18 +421,21 @@
 
         disseminated_tag_df = self.filtered_df.withColumn(
             "metadata", F.to_json(col("metadata"))
-        ).coalesce(1)
-
-        save_cache_csv(
-            df=disseminated_tag_df,
-            bucket=self.bucket,
-            prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
-            tag_name=self.tag_name,
         )
 
+        if self.write_csv:
+            disseminated_tag_df = disseminated_tag_df.coalesce(1)
+
+            save_cache_csv(
+                df=disseminated_tag_df,
+                bucket=self.bucket,
+                prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
+                tag_name=self.tag_name,
+            )
+
         return disseminated_tag_df
 
-    def write_bronze_sws_filtered_disseminated_tag(self, tags: Tags):
+    def write_sws_filtered_dissemination_tag(self, tags: Tags):
         # Get or create a new tag
         tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
         logging.debug(f"Tag: {tag}")
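
Besides applying the same CSV gating to the filtered path, this hunk renames write_bronze_sws_filtered_disseminated_tag to write_sws_filtered_dissemination_tag, a breaking change for any caller using the old name. A caller-side shim that runs against both 0.0.126 and 0.0.128 (a hypothetical pattern, not part of the package; helper and tags are the helper instance and Tags client from the caller's own code):

    # Prefer the 0.0.128 name, fall back to the 0.0.126 one.
    write_filtered_tag = getattr(
        helper,
        "write_sws_filtered_dissemination_tag",
        getattr(helper, "write_bronze_sws_filtered_disseminated_tag", None),
    )
    if write_filtered_tag is not None:
        write_filtered_tag(tags)
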
@@ -452,20 +458,21 @@
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")
 
-        new_csv_table = BaseDisseminatedTagTable(
-            id="filtered_csv",
-            name="Filtered csv",
-            description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
-            layer=TableLayer.CACHE,
-            private=True,
-            type=TableType.CSV,
-            path=self.iceberg_tables.TABLE_FILTERED.csv_path,
-            structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
-        )
-        tag = tags.add_dissemination_table(
-            self.dataset_id, self.tag_name, new_csv_table
-        )
+        if self.write_csv:
+            new_csv_table = BaseDisseminatedTagTable(
+                id="filtered_csv",
+                name="Filtered csv",
+                description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
+                layer=TableLayer.CACHE,
+                private=True,
+                type=TableType.CSV,
+                path=self.iceberg_tables.TABLE_FILTERED.csv_path,
+                structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
+            )
+            tag = tags.add_dissemination_table(
+                self.dataset_id, self.tag_name, new_csv_table
+            )
 
-        logging.debug(f"Tag with Added csv Table: {tag}")
+            logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Filtered data tags successfully written")
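
Net effect of the release: with write_csv=False, both write paths still create and tag the Iceberg tables, but no cache CSV is written and no unfiltered_csv or filtered_csv table is registered on the tag, so downstream consumers must not assume those entries exist. A hedged end-to-end sketch (construction of the Tags client is not shown in this diff, and the helper's remaining constructor arguments are left at their defaults):

    # Iceberg-only dissemination: no CSV cache, no CSV tag entries.
    helper = SWSEasyIcebergSparkHelper(spark=spark, write_csv=False)
    helper.write_sws_filtered_dissemination_tag(tags)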