sws-spark-dissemination-helper 0.0.93__py3-none-any.whl → 0.0.183__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
 import logging
+import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -25,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -35,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
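Note: the constructor gains three opt-in switches. `keep_history` keeps superseded observations (rows with a non-null `replaced_on`) instead of filtering them out, `write_csv` toggles the CSV cache alongside the Iceberg write (only the assignment is visible in these hunks), and `source_tag` names the SWS tag consumed by the new tag-based extraction path. A minimal sketch of a call site, with illustrative argument values and the remaining arguments unchanged from 0.0.93:

    helper = SWSBronzeIcebergSparkHelper(
        spark=spark,
        domain_code="agriculture",      # illustrative
        dataset_tables=dataset_tables,
        keep_history=False,             # drop rows where replaced_on is set
        write_csv=True,
        source_tag="2024_release",      # hypothetical SWS tag name
        # ... other constructor arguments as in 0.0.93
    )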
@@ -82,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
                 self.df_obs_coord,
                 self.df_metadata,
                 self.df_meta_elem,
+                self.df_tag_observation,
             ) = self.raw_data
 
             (
@@ -91,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
                 self.df_meta_elem_type,
                 self.df_language,
                 self.df_unit_of_measure,
+                self.df_dataset,
                 self.dfs_dimension,
             ) = self.raw_reference_data
 
-            self.df_user = self.raw_operational_data
+            (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -149,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
 
         return dfs_dimension
 
-    def _prepare_element_uom(self) -> DataFrame:
+    def _prepare_element_uom(self) -> Union[DataFrame, None]:
         """Prepare the element and unit of measure join."""
 
         # Get the element DataFrame
@@ -161,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
             if dimension_column == self.element_column
         )
 
-        # Join the element and the unit_of_measure
-        df_element_uom = (
-            df_element.alias("e")
-            .join(
-                self.df_unit_of_measure.alias("u"),
-                col("e.unit_of_measure") == col("u.id"),
-            )
-            .select(
-                col("e.code").alias("element_code"),
-                col("u.code").alias("unit_of_measure"),
-                col("u.symbol").alias("unit_of_measure_symbol"),
-                col("u.base_unit").alias("unit_of_measure_base_unit"),
-                col("u.multiplier").alias("unit_of_measure_multiplier"),
+        if any("unit_of_measure" == column.lower() for column in df_element.columns):
+            # Join the element and the unit_of_measure
+            df_element_uom = (
+                df_element.alias("e")
+                .join(
+                    self.df_unit_of_measure.alias("u"),
+                    col("e.unit_of_measure") == col("u.id"),
+                )
+                .select(
+                    col("e.code").alias("element_code"),
+                    col("u.code").alias("unit_of_measure"),
+                    col("u.symbol").alias("unit_of_measure_symbol"),
+                    col("u.base_unit").alias("unit_of_measure_base_unit"),
+                    col("u.multiplier").alias("unit_of_measure_multiplier"),
+                )
             )
-        )
 
-        return df_element_uom
+            return df_element_uom
 
     def _gen_denormalized_observation(self) -> DataFrame:
         """Original query upon which the below computation is based
@@ -269,20 +279,170 @@ class SWSBronzeIcebergSparkHelper:
                 .withColumnRenamed("code", dimension_column)
             )
 
-        df_intermediate = (
-            df_intermediate.alias("d")
-            .join(
-                F.broadcast(df_element_uom).alias("e"),
-                col(f"d.{self.element_column}") == col("e.element_code"),
-                "left",
+        if df_element_uom is not None:
+            df_intermediate = (
+                df_intermediate.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
             )
-            .drop("element_code")
-        )
 
         df_obs_denorm = df_intermediate
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_element_uom = self._prepare_element_uom()
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            logging.debug(f"Joining dimension column: {dimension_column}")
+            logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
+            logging.debug(
+                f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
+            )
+            df_obs_denorm = (
+                df_obs_denorm.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.code"),
+                )
+                .drop("code", "join_id")
+            )
+            logging.debug(f"After join count: {df_obs_denorm.count()}")
+
+        if df_element_uom is not None:
+            df_obs_denorm = (
+                df_obs_denorm.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
+            )
+            logging.debug(f"After uom count: {df_obs_denorm.count()}")
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
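Note: `_gen_denormalized_observation_sql` replaces the chained DataFrame joins with a single generated Spark SQL statement: flag and dimension columns are appended to the SELECT list via the inverted `flag_col_to_id_mapping` and `dim_col_to_id_mapping` dictionaries, each codelist is LEFT JOINed under a `d{i}` alias, every codelist is collected into one `/*+ BROADCAST(...) */` hint, and `WHERE o.replaced_on IS NULL` is appended unless `keep_history` is set. For a hypothetical dataset with dimensions `geographic_area` and `time` and a single flag `flag_obs_status`, the logged `final_query` would look roughly like this (table identifiers invented for illustration):

    SELECT /*+ BROADCAST(bronze.cl_geographic_area), BROADCAST(bronze.cl_time) */
        o.id, o.value, u.email, o.created_on, o.replaced_on, o.version,
    o.flag_1 AS flag_obs_status,
    d0.code AS geographic_area,
    d1.code AS time
    FROM bronze.observation o
    JOIN bronze.user u ON u.id = o.created_by
    LEFT JOIN bronze.observation_coordinate AS oc ON oc.id = o.observation_coordinates
    LEFT JOIN bronze.cl_geographic_area d0 ON d0.id = oc.dim_1
    LEFT JOIN bronze.cl_time d1 ON d1.id = oc.dim_2
    WHERE o.replaced_on IS NULL

The `_from_tag` variant builds the same query but routes it through the tag tables and narrows it with `t.name = source_tag` and `d.xml_name = dataset_id`, so only observations attached to the given SWS tag survive.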
@@ -346,6 +506,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized metadata table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
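Note: `_gen_denormalized_metadata_sql` flattens the metadata star schema (metadata_element → metadata → metadata type, element type, language) into one row per metadata element. A quick way to sanity-check the output shape (sketch; `helper` as above, column types follow the source tables):

    df_meta = helper._gen_denormalized_metadata_sql()
    df_meta.printSchema()
    # Expected columns: observation_id, type, element_type, language, value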
@@ -366,6 +552,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
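Note: `_gen_grouped_metadata_sql` mirrors the existing `_gen_grouped_metadata`: each flattened metadata row becomes a string-to-string map, and `collect_list` gathers all maps for an observation into a single `metadata` array, later serialized with `F.to_json` in the write path. For two metadata rows on one observation the serialized column would look like this (values illustrative):

    [{"type": "GENERAL", "element_type": "COMMENT", "language": "en", "value": "Estimate"},
     {"type": "GENERAL", "element_type": "SOURCE", "language": "en", "value": "Survey 2021"}]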
@@ -378,9 +584,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-        self.df_bronze = self._gen_bronze_data()
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
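Note: the writer now defaults to the SQL-generated pipeline; pass `sql=False` to fall back to the 0.0.93 DataFrame-join path. Usage sketch:

    helper.write_bronze_data_to_iceberg_and_csv()           # new SQL-generated path (default)
    helper.write_bronze_data_to_iceberg_and_csv(sql=False)  # legacy DataFrame joins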
@@ -416,11 +650,13 @@ class SWSBronzeIcebergSparkHelper:
             description="Bronze table containing all the raw data imported from the SWS and denormalized",
             layer=TableLayer.BRONZE,
             private=True,
+            debug=True,
             type=TableType.ICEBERG,
             database=IcebergDatabases.BRONZE_DATABASE,
             table=self.iceberg_tables.BRONZE.table,
             path=self.iceberg_tables.BRONZE.path,
             structure={"columns": self.df_bronze.schema.jsonValue()["fields"]},
+            pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
         )
         tag = tags.add_dissemination_table(
             self.dataset_id, self.tag_name, new_iceberg_table
@@ -433,6 +669,7 @@ class SWSBronzeIcebergSparkHelper:
             description="Bronze table containing all the raw data imported from the SWS and denormalized cached in csv",
             layer=TableLayer.BRONZE,
             private=True,
+            debug=True,
             type=TableType.CSV,
             path=self.iceberg_tables.BRONZE.csv_path,
             structure={"columns": self.df_bronze.schema.jsonValue()["fields"]},
@@ -445,34 +682,34 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-        create_branch_query = f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id}.`tag_{self.tag_name}` CREATE OR REPLACE BRANCH `diss_tag_{self.tag_name}`"  # AS OF VERSION `{tag_name}`
-        logging.info(f"create_branch_query: {create_branch_query}")
-        create_branch_query_result = self.spark.sql(create_branch_query).collect()
-
-        logging.info(f"result of create_branch_query: {create_branch_query_result}")
-
-        logging.info(f"dimensions: {dimensions}")
-        for dimension_name, codes in dimensions.items():
-            logging.info(f"dimension_name: {dimension_name}")
-            logging.info(f"codes: {codes}")
-            if len(codes) != 0:
-                not_in_codes = ",".join([f"'{code}'" for code in codes])
-                delete_from_branch_query = f"DELETE FROM {self.iceberg_tables.BRONZE.iceberg_id}.`branch_diss_tag_{self.tag_name}` WHERE {dimension_name} NOT IN ({not_in_codes})"
-                logging.info(f"delete_from_branch_query: {delete_from_branch_query}")
-                delete_from_branch_query_result = self.spark.sql(
-                    delete_from_branch_query
-                ).collect()
-
-                logging.info(
-                    f"result of delete_from_branch_query: {delete_from_branch_query_result}"
-                )
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
+
+        if not from_tag and dimensions is not None and len(dimensions) != 0:
+            for dimension_name, codes in dimensions.items():
+                logging.info(f"dimension_name: {dimension_name}")
+                logging.info(f"codes: {codes}")
+                if len(codes) != 0:
+                    self.disseminated_tag_df = self.disseminated_tag_df.filter(
+                        col(dimension_name).isin(codes)
+                    )
+
+        self.disseminated_tag_df.writeTo(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Bronze disseminated tag table written to {self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id}"
+        )
 
-        self.disseminated_tag_df = self.spark.read.option(
-            "branch", f"`diss_tag_{self.tag_name}`"
-        ).table(self.iceberg_tables.BRONZE.iceberg_id)
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id} CREATE TAG `{self.tag_name}`"
+        )
 
         disseminated_tag_df = self.disseminated_tag_df.withColumn(
             "metadata", F.to_json(col("metadata"))
@@ -481,7 +718,7 @@ class SWSBronzeIcebergSparkHelper:
         save_cache_csv(
             df=disseminated_tag_df,
             bucket=self.bucket,
-            prefix=f"{self.iceberg_tables.BRONZE.csv_prefix}_disseminated_tag",
+            prefix=f"{self.iceberg_tables.BRONZE_DISS_TAG.csv_prefix}",
             tag_name=self.tag_name,
         )
 
@@ -500,11 +737,12 @@ class SWSBronzeIcebergSparkHelper:
             private=True,
             type=TableType.ICEBERG,
             database=IcebergDatabases.BRONZE_DATABASE,
-            table=self.iceberg_tables.BRONZE.table,
-            path=self.iceberg_tables.BRONZE.path,
+            table=self.iceberg_tables.BRONZE_DISS_TAG.table,
+            path=self.iceberg_tables.BRONZE_DISS_TAG.path,
             structure={
                 "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
             },
+            pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
         )
         tag = tags.add_dissemination_table(
             self.dataset_id, self.tag_name, new_iceberg_table
@@ -519,7 +757,7 @@ class SWSBronzeIcebergSparkHelper:
             private=True,
             type=TableType.CSV,
             # TODO Correct the path in the origin library
-            path=self.iceberg_tables.BRONZE.csv_path,
+            path=self.iceberg_tables.BRONZE_DISS_TAG.csv_path,
             structure={
                 "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
             },
@@ -531,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Disseminated tag with selection successfully written")