sws-spark-dissemination-helper 0.0.99__tar.gz → 0.0.168__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sws-spark-dissemination-helper might be problematic.

Files changed (18)
  1. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/.gitignore +1 -1
  2. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/PKG-INFO +5 -5
  3. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/pyproject.toml +5 -5
  4. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +317 -95
  5. sws_spark_dissemination_helper-0.0.168/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
  6. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +244 -4
  7. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +54 -31
  8. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +104 -3
  9. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/__init__.py +1 -0
  10. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/constants.py +67 -18
  11. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/utils.py +18 -0
  12. sws_spark_dissemination_helper-0.0.168/tests/__init__.py +0 -0
  13. sws_spark_dissemination_helper-0.0.99/old_requirements.txt +0 -23
  14. sws_spark_dissemination_helper-0.0.99/requirements.txt +0 -23
  15. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/LICENSE +0 -0
  16. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/README.md +0 -0
  17. /sws_spark_dissemination_helper-0.0.99/tests/__init__.py → /sws_spark_dissemination_helper-0.0.168/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  18. {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/tests/test.py +0 -0
--- sws_spark_dissemination_helper-0.0.99/.gitignore
+++ sws_spark_dissemination_helper-0.0.168/.gitignore
@@ -2,7 +2,6 @@
  # You should customize this list as applicable to your project.
  # Learn more about .gitignore:
  # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
- .*

  # Node artifact files
  node_modules/
@@ -49,3 +48,4 @@ Thumbs.db
  *.mov
  *.wmv

+ .venv/
--- sws_spark_dissemination_helper-0.0.99/PKG-INFO
+++ sws_spark_dissemination_helper-0.0.168/PKG-INFO
@@ -1,8 +1,8 @@
  Metadata-Version: 2.4
  Name: sws-spark-dissemination-helper
- Version: 0.0.99
+ Version: 0.0.168
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
- Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
+ Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
  License: MIT License

@@ -31,8 +31,8 @@ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.9
  Requires-Dist: annotated-types==0.7.0
- Requires-Dist: boto3==1.36.18
- Requires-Dist: botocore==1.36.18
+ Requires-Dist: boto3>=1.36.18
+ Requires-Dist: botocore>=1.36.18
  Requires-Dist: certifi==2025.1.31
  Requires-Dist: charset-normalizer==3.4.1
  Requires-Dist: idna==3.10
@@ -49,7 +49,7 @@ Requires-Dist: pytz==2025.1
  Requires-Dist: requests==2.32.3
  Requires-Dist: s3transfer==0.11.2
  Requires-Dist: six==1.17.0
- Requires-Dist: sws-api-client==1.4.5
+ Requires-Dist: sws-api-client==1.5.3
  Requires-Dist: typing-extensions==4.12.2
  Requires-Dist: tzdata==2025.1
  Requires-Dist: urllib3==1.26.20
--- sws_spark_dissemination_helper-0.0.99/pyproject.toml
+++ sws_spark_dissemination_helper-0.0.168/pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "hatchling.build"

  [project]
  name = "sws-spark-dissemination-helper"
- version = "0.0.99"
+ version = "0.0.168"
  dependencies = [
  "annotated-types==0.7.0",
- "boto3==1.36.18",
- "botocore==1.36.18",
+ "boto3>=1.36.18",
+ "botocore>=1.36.18",
  "certifi==2025.1.31",
  "charset-normalizer==3.4.1",
  "idna==3.10",
@@ -25,7 +25,7 @@ dependencies = [
  "requests==2.32.3",
  "s3transfer==0.11.2",
  "six==1.17.0",
- "sws_api_client==1.4.5",
+ "sws_api_client==1.5.3",
  "typing_extensions==4.12.2",
  "tzdata==2025.1",
  "urllib3==1.26.20"
@@ -42,4 +42,4 @@ classifiers = [
  ]

  [project.urls]
- Repository = "https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper"
+ Repository = "https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper"
--- sws_spark_dissemination_helper-0.0.99/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
+++ sws_spark_dissemination_helper-0.0.168/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -1,7 +1,7 @@
  import logging
  import time
  from copy import copy
- from typing import Dict, List, Tuple
+ from typing import Dict, List, Tuple, Union

  import pyspark.sql.functions as F
  from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
  domain_code: str,
  dataset_details: dict = None,
  dataset_tables: DatasetTables = None,
+ keep_history: bool = False,
+ write_csv: bool = True,
+ source_tag: Union[str, None] = None,
  ) -> None:
  self.spark: SparkSession = spark
  self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
  self.dataset_tables: DatasetTables = dataset_tables
  self.iceberg_tables: IcebergTables = iceberg_tables
  self.domain_code = domain_code
+ self.keep_history: bool = keep_history
+ self.write_csv: bool = write_csv
+ self.source_tag: Union[str, None] = source_tag

  if dataset_details is not None:
  (
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
  self.df_obs_coord,
  self.df_metadata,
  self.df_meta_elem,
+ self.df_tag_observation,
  ) = self.raw_data

  (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
  self.df_meta_elem_type,
  self.df_language,
  self.df_unit_of_measure,
+ self.df_dataset,
  self.dfs_dimension,
  ) = self.raw_reference_data

- self.df_user = self.raw_operational_data
+ (self.df_user, self.df_tag) = self.raw_operational_data

  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
  """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -150,7 +158,7 @@

  return dfs_dimension

- def _prepare_element_uom(self) -> DataFrame:
+ def _prepare_element_uom(self) -> Union[DataFrame, None]:
  """Prepare the element and unit of measure join."""

  # Get the element DataFrame
@@ -162,23 +170,24 @@
  if dimension_column == self.element_column
  )

- # Join the element and the unit_of_measure
- df_element_uom = (
- df_element.alias("e")
- .join(
- self.df_unit_of_measure.alias("u"),
- col("e.unit_of_measure") == col("u.id"),
- )
- .select(
- col("e.code").alias("element_code"),
- col("u.code").alias("unit_of_measure"),
- col("u.symbol").alias("unit_of_measure_symbol"),
- col("u.base_unit").alias("unit_of_measure_base_unit"),
- col("u.multiplier").alias("unit_of_measure_multiplier"),
+ if any("unit_of_measure" == column.lower() for column in df_element.columns):
+ # Join the element and the unit_of_measure
+ df_element_uom = (
+ df_element.alias("e")
+ .join(
+ self.df_unit_of_measure.alias("u"),
+ col("e.unit_of_measure") == col("u.id"),
+ )
+ .select(
+ col("e.code").alias("element_code"),
+ col("u.code").alias("unit_of_measure"),
+ col("u.symbol").alias("unit_of_measure_symbol"),
+ col("u.base_unit").alias("unit_of_measure_base_unit"),
+ col("u.multiplier").alias("unit_of_measure_multiplier"),
+ )
  )
- )

- return df_element_uom
+ return df_element_uom

  def _gen_denormalized_observation(self) -> DataFrame:
  """Original query upon which the below computation is based
@@ -270,20 +279,170 @@
  .withColumnRenamed("code", dimension_column)
  )

- df_intermediate = (
- df_intermediate.alias("d")
- .join(
- F.broadcast(df_element_uom).alias("e"),
- col(f"d.{self.element_column}") == col("e.element_code"),
- "left",
+ if df_element_uom is not None:
+ df_intermediate = (
+ df_intermediate.alias("d")
+ .join(
+ F.broadcast(df_element_uom).alias("e"),
+ col(f"d.{self.element_column}") == col("e.element_code"),
+ "left",
+ )
+ .drop("element_code")
  )
- .drop("element_code")
- )

  df_obs_denorm = df_intermediate

  return df_obs_denorm

+ def _gen_denormalized_observation_sql(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version"""
+
+ from_statement = f"""
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+ JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+ hint_statement = ""
+
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+ for flag_col in self.flag_columns:
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+ for i, (dim_col, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code AS {dim_col}"
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+ hint_statement = (
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
+ if hint_statement
+ else f"BROADCAST({cl.iceberg_id})"
+ )
+
+ hint_statement = "/*+ " + hint_statement + " */"
+
+ final_query = "SELECT " + hint_statement + select_statement + from_statement
+ if not self.keep_history:
+ final_query += "\nWHERE o.replaced_on IS NULL"
+
+ logging.info("Final query for merging observation and observation_coordinares")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ df_element_uom = self._prepare_element_uom()
+
+ dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+ # Join all the dimension codelists
+ for dimension_column, df_dimension in zip(
+ self.dim_columns_w_time, dfs_dimension_w_validity
+ ):
+ logging.debug(f"Joining dimension column: {dimension_column}")
+ logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
+ logging.debug(
+ f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
+ )
+ df_obs_denorm = (
+ df_obs_denorm.alias("o")
+ .join(
+ F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+ "d"
+ ),
+ col(f"{dimension_column}") == col("d.code"),
+ )
+ .drop("code", "join_id")
+ )
+ logging.debug(f"After join count: {df_obs_denorm.count()}")
+
+ if df_element_uom is not None:
+ df_obs_denorm = (
+ df_obs_denorm.alias("d")
+ .join(
+ F.broadcast(df_element_uom).alias("e"),
+ col(f"d.{self.element_column}") == col("e.element_code"),
+ "left",
+ )
+ .drop("element_code")
+ )
+ logging.debug(f"After uom count: {df_obs_denorm.count()}")
+
+ return df_obs_denorm
+
+ def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version"""
+
+ from_statement = f"""
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+ INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+ INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+ INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+ LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+ hint_statement = ""
+
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+ for flag_col in self.flag_columns:
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+ for i, (dim_col, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code AS {dim_col}"
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+ hint_statement = (
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
+ if hint_statement
+ else f"BROADCAST({cl.iceberg_id})"
+ )
+
+ hint_statement = "/*+ " + hint_statement + " */"
+
+ # TODO Add tag name as a parameter
+ where_statement = (
+ f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+ )
+
+ final_query = (
+ "SELECT "
+ + hint_statement
+ + select_statement
+ + from_statement
+ + where_statement
+ )
+ if not self.keep_history:
+ final_query += "\n AND o.replaced_on IS NULL"
+
+ logging.info("Final query for merging observation and observation_coordinares")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ return df_obs_denorm
+
  def _gen_denormalized_metadata(self) -> DataFrame:
  """Original query upon which the below computation is based

@@ -347,6 +506,32 @@

  return df_meta_denorm

+ def _gen_denormalized_metadata_sql(self) -> DataFrame:
+ # ----------------
+ # Generate denormalized observation table
+ # ----------------
+
+ logging.info("meta_denorm start")
+
+ df_meta_denorm = self.spark.sql(
+ f"""
+ select m.observation as observation_id,
+ mt.code as type,
+ met.code as element_type,
+ l.country_code as language,
+ me.value
+ from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+ left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+ left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+ left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+ left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+ """
+ )
+
+ logging.info("meta_denorm write")
+
+ return df_meta_denorm
+
  def _gen_grouped_metadata(self) -> DataFrame:
  return (
  self._gen_denormalized_metadata()
@@ -367,6 +552,26 @@
  .agg(F.collect_list("metadata").alias("metadata"))
  )

+ def _gen_grouped_metadata_sql(self) -> DataFrame:
+ return (
+ self._gen_denormalized_metadata_sql()
+ .select(
+ col("observation_id"),
+ F.create_map(
+ lit("type"),
+ col("type"),
+ lit("element_type"),
+ col("element_type"),
+ lit("language"),
+ col("language"),
+ lit("value"),
+ col("value"),
+ ).alias("metadata"),
+ )
+ .groupby("observation_id")
+ .agg(F.collect_list("metadata").alias("metadata"))
+ )
+
  def _gen_bronze_data(self) -> DataFrame:
  return (
  self._gen_denormalized_observation()
@@ -379,9 +584,37 @@
  .drop("m.observation_id")
  )

+ def _gen_bronze_data_sql(self) -> DataFrame:
+ return (
+ self._gen_denormalized_observation_sql()
+ .alias("o")
+ .join(
+ self._gen_grouped_metadata_sql().alias("m"),
+ col("o.id") == col("m.observation_id"),
+ "left",
+ )
+ .drop("m.observation_id")
+ )
+
+ def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+ return (
+ self._gen_denormalized_observation_sql_from_tag()
+ .alias("o")
+ .join(
+ self._gen_grouped_metadata_sql().alias("m"),
+ col("o.id") == col("m.observation_id"),
+ "left",
+ )
+ .drop("m.observation_id")
+ )
+
  # TODO decouple data generation and data writing
- def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
- self.df_bronze = self._gen_bronze_data()
+ def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+ if sql:
+ self.df_bronze = self._gen_bronze_data_sql()
+ else:
+ self.df_bronze = self._gen_bronze_data()

  self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()

@@ -390,15 +623,6 @@
  self.spark.sql(
  f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id} CREATE TAG `{self.tag_name}`"
  )
- while (
- self.spark.sql(
- f"SELECT * FROM {self.iceberg_tables.BRONZE.iceberg_id}.refs"
- )
- .filter((col("type") == lit("TAG")) & (col("name") == lit(self.tag_name)))
- .count()
- ) == 0:
- logging.info(f"Waiting for the tag {self.tag_name} to be created")
- time.sleep(2)

  logging.info(f"bronze tag '{self.tag_name}' created")

@@ -426,11 +650,13 @@
  description="Bronze table containing all the raw data imported from the SWS and denormalized",
  layer=TableLayer.BRONZE,
  private=True,
+ debug=True,
  type=TableType.ICEBERG,
  database=IcebergDatabases.BRONZE_DATABASE,
  table=self.iceberg_tables.BRONZE.table,
  path=self.iceberg_tables.BRONZE.path,
  structure={"columns": self.df_bronze.schema.jsonValue()["fields"]},
+ pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
  )
  tag = tags.add_dissemination_table(
  self.dataset_id, self.tag_name, new_iceberg_table
@@ -443,6 +669,7 @@
  description="Bronze table containing all the raw data imported from the SWS and denormalized cached in csv",
  layer=TableLayer.BRONZE,
  private=True,
+ debug=True,
  type=TableType.CSV,
  path=self.iceberg_tables.BRONZE.csv_path,
  structure={"columns": self.df_bronze.schema.jsonValue()["fields"]},
@@ -455,66 +682,34 @@
  logging.info("Bronze Dissemination tags successfully written")

  def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
- self, dimensions: Dict[str, List[str]]
+ self, dimensions: Dict[str, List[str]] = {}, from_tag=False
  ) -> DataFrame:

- refs = self.spark.sql(
- f"SELECT * FROM {self.iceberg_tables.BRONZE.iceberg_id}.refs"
- ).collect()
- logging.info(f"bronze refs: {refs}")
+ if from_tag:
+ self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+ else:
+ self.disseminated_tag_df = self.df_bronze

- create_branch_query = f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id}.`tag_{self.tag_name}` CREATE OR REPLACE BRANCH `diss_tag_{self.tag_name}`" # AS OF VERSION `{tag_name}`
- create_branch_query = f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id} CREATE OR REPLACE BRANCH `diss_tag_{self.tag_name}`"
- logging.info(f"create_branch_query: {create_branch_query}")
- create_branch_query_result = self.spark.sql(create_branch_query).collect()
+ if not from_tag and dimensions is not None and len(dimensions) != 0:
+ for dimension_name, codes in dimensions.items():
+ logging.info(f"dimension_name: {dimension_name}")
+ logging.info(f"codes: {codes}")
+ if len(codes) != 0:
+ self.disseminated_tag_df = self.disseminated_tag_df.filter(
+ col(dimension_name).isin(codes)
+ )

- while (
- self.spark.sql(
- f"SELECT * FROM {self.iceberg_tables.BRONZE.iceberg_id}.refs"
- )
- .filter(
- (col("type") == lit("BRANCH"))
- & (col("name") == lit(f"diss_tag_{self.tag_name}"))
- )
- .count()
- ) == 0:
- logging.info(
- f"Waiting for the branch {self.tag_name} diss_tag_{self.tag_name} to be created"
- )
- time.sleep(2)
-
- logging.info(f"result of create_branch_query: {create_branch_query_result}")
-
- self.disseminated_tag_df = self.spark.read.option(
- "branch", f"`diss_tag_{self.tag_name}`"
- ).table(self.iceberg_tables.BRONZE.iceberg_id)
-
- logging.info(f"dimensions: {dimensions}")
- for dimension_name, codes in dimensions.items():
- logging.info(f"dimension_name: {dimension_name}")
- logging.info(f"codes: {codes}")
- if len(codes) != 0:
- # not_in_codes = ",".join([f"'{code}'" for code in codes])
- # delete_from_branch_query = f"DELETE FROM {self.iceberg_tables.BRONZE.iceberg_id}.`branch_diss_tag_{self.tag_name}` WHERE {dimension_name} NOT IN ({not_in_codes})"
- # logging.info(f"delete_from_branch_query: {delete_from_branch_query}")
- # delete_from_branch_query_result = self.spark.sql(
- # delete_from_branch_query
- # ).collect()
-
- # logging.info(
- # f"result of delete_from_branch_query: {delete_from_branch_query_result}"
- # )
- self.disseminated_tag_df = self.disseminated_tag_df.filter(
- col(dimension_name).isin(codes)
- )
+ self.disseminated_tag_df.writeTo(
+ self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+ ).createOrReplace()

- # self.disseminated_tag_df = self.spark.read.option(
- # "branch", f"`diss_tag_{self.tag_name}`"
- # ).table(self.iceberg_tables.BRONZE.iceberg_id)
+ logging.info(
+ f"Bronze disseminated tag table written to {self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id}"
+ )

- self.disseminated_tag_df.writeTo(
- f"{self.iceberg_tables.BRONZE.iceberg_id}.`branch_diss_tag_{self.tag_name}`"
- ).overwritePartitions()
+ self.spark.sql(
+ f"ALTER TABLE {self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id} CREATE TAG `{self.tag_name}`"
+ )

  disseminated_tag_df = self.disseminated_tag_df.withColumn(
  "metadata", F.to_json(col("metadata"))
@@ -523,7 +718,7 @@
  save_cache_csv(
  df=disseminated_tag_df,
  bucket=self.bucket,
- prefix=f"{self.iceberg_tables.BRONZE.csv_prefix}_disseminated_tag",
+ prefix=f"{self.iceberg_tables.BRONZE_DISS_TAG.csv_prefix}",
  tag_name=self.tag_name,
  )

@@ -542,11 +737,12 @@
  private=True,
  type=TableType.ICEBERG,
  database=IcebergDatabases.BRONZE_DATABASE,
- table=self.iceberg_tables.BRONZE.table,
- path=self.iceberg_tables.BRONZE.path,
+ table=self.iceberg_tables.BRONZE_DISS_TAG.table,
+ path=self.iceberg_tables.BRONZE_DISS_TAG.path,
  structure={
  "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
  },
+ pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
  )
  tag = tags.add_dissemination_table(
  self.dataset_id, self.tag_name, new_iceberg_table
@@ -561,7 +757,7 @@
  private=True,
  type=TableType.CSV,
  # TODO Correct the path in the origin library
- path=self.iceberg_tables.BRONZE.csv_path,
+ path=self.iceberg_tables.BRONZE_DISS_TAG.csv_path,
  structure={
  "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
  },
@@ -573,3 +769,29 @@
  logging.debug(f"Tag with Added csv Table: {tag}")

  logging.info("Bronze Disseminated tag with selection successfully written")
+
+
+ 1
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+ 1
+ 1
+ 2
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+ 2
+ 1
+ 1
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+ 1
+ 1
+ 2
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+ 2
+ 1
+ 1
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+ 1
+ 1
+ 1
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+ 1
+ 1
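
Note: the hunks above change the public surface of SWSBronzeIcebergSparkHelper (new keep_history, write_csv and source_tag constructor arguments, an SQL-based bronze build selected by the sql=True default, and a dedicated BRONZE_DISS_TAG table for the dissemination selection). Below is a minimal usage sketch; the import path, the Spark session and all argument values are assumptions for illustration rather than taken from the package's documentation, and the real constructor takes further arguments (for example the Iceberg table descriptors and tag name) that fall outside the hunks shown.

    # Illustrative sketch only: placeholder values, assumed import path.
    from pyspark.sql import SparkSession
    from sws_spark_dissemination_helper.SWSBronzeIcebergSparkHelper import (
        SWSBronzeIcebergSparkHelper,
    )

    spark = SparkSession.builder.getOrCreate()

    helper = SWSBronzeIcebergSparkHelper(
        spark=spark,
        domain_code="agriculture",        # placeholder domain code
        dataset_details=dataset_details,  # dict from the SWS API client, defined elsewhere
        dataset_tables=dataset_tables,    # DatasetTables descriptor, defined elsewhere
        keep_history=False,               # new in 0.0.168: True also keeps replaced observations
        write_csv=True,                   # new in 0.0.168
        source_tag=None,                  # new in 0.0.168: only consulted when reading from an SWS tag
        # ... further required arguments omitted (outside the hunks shown above)
    )

    # The bronze table is now built through the Spark SQL path by default.
    helper.write_bronze_data_to_iceberg_and_csv(sql=True)

    # The dissemination selection is written to the dedicated BRONZE_DISS_TAG table.
    helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
        dimensions={"geographicAreaM49": ["4", "8"]},  # hypothetical dimension filter
        from_tag=False,
    )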