sws-spark-dissemination-helper 0.0.149__tar.gz → 0.0.168__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sws-spark-dissemination-helper might be problematic.

Files changed (16)
  1. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +293 -31
  4. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +95 -41
  5. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +154 -2
  6. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +33 -21
  7. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/constants.py +9 -0
  8. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/.gitignore +0 -0
  9. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/LICENSE +0 -0
  10. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/README.md +0 -0
  11. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/utils.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/tests/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.149 → sws_spark_dissemination_helper-0.0.168}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sws-spark-dissemination-helper
3
- Version: 0.0.149
3
+ Version: 0.0.168
4
4
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
5
5
  Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
6
6
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "sws-spark-dissemination-helper"
7
- version = "0.0.149"
7
+ version = "0.0.168"
8
8
  dependencies = [
9
9
  "annotated-types==0.7.0",
10
10
  "boto3>=1.36.18",
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  import time
3
3
  from copy import copy
4
- from typing import Dict, List, Tuple
4
+ from typing import Dict, List, Tuple, Union
5
5
 
6
6
  import pyspark.sql.functions as F
7
7
  from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
26
26
  domain_code: str,
27
27
  dataset_details: dict = None,
28
28
  dataset_tables: DatasetTables = None,
29
+ keep_history: bool = False,
30
+ write_csv: bool = True,
31
+ source_tag: Union[str, None] = None,
29
32
  ) -> None:
30
33
  self.spark: SparkSession = spark
31
34
  self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
36
39
  self.dataset_tables: DatasetTables = dataset_tables
37
40
  self.iceberg_tables: IcebergTables = iceberg_tables
38
41
  self.domain_code = domain_code
42
+ self.keep_history: bool = keep_history
43
+ self.write_csv: bool = write_csv
44
+ self.source_tag: Union[str, None] = source_tag
39
45
 
40
46
  if dataset_details is not None:
41
47
  (
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
83
89
  self.df_obs_coord,
84
90
  self.df_metadata,
85
91
  self.df_meta_elem,
92
+ self.df_tag_observation,
86
93
  ) = self.raw_data
87
94
 
88
95
  (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
92
99
  self.df_meta_elem_type,
93
100
  self.df_language,
94
101
  self.df_unit_of_measure,
102
+ self.df_dataset,
95
103
  self.dfs_dimension,
96
104
  ) = self.raw_reference_data
97
105
 
98
- self.df_user = self.raw_operational_data
106
+ (self.df_user, self.df_tag) = self.raw_operational_data
99
107
 
100
108
  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
101
109
  """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -150,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
150
158
 
151
159
  return dfs_dimension
152
160
 
153
- def _prepare_element_uom(self) -> DataFrame:
161
+ def _prepare_element_uom(self) -> Union[DataFrame, None]:
154
162
  """Prepare the element and unit of measure join."""
155
163
 
156
164
  # Get the element DataFrame
@@ -162,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
162
170
  if dimension_column == self.element_column
163
171
  )
164
172
 
165
- # Join the element and the unit_of_measure
166
- df_element_uom = (
167
- df_element.alias("e")
168
- .join(
169
- self.df_unit_of_measure.alias("u"),
170
- col("e.unit_of_measure") == col("u.id"),
171
- )
172
- .select(
173
- col("e.code").alias("element_code"),
174
- col("u.code").alias("unit_of_measure"),
175
- col("u.symbol").alias("unit_of_measure_symbol"),
176
- col("u.base_unit").alias("unit_of_measure_base_unit"),
177
- col("u.multiplier").alias("unit_of_measure_multiplier"),
173
+ if any("unit_of_measure" == column.lower() for column in df_element.columns):
174
+ # Join the element and the unit_of_measure
175
+ df_element_uom = (
176
+ df_element.alias("e")
177
+ .join(
178
+ self.df_unit_of_measure.alias("u"),
179
+ col("e.unit_of_measure") == col("u.id"),
180
+ )
181
+ .select(
182
+ col("e.code").alias("element_code"),
183
+ col("u.code").alias("unit_of_measure"),
184
+ col("u.symbol").alias("unit_of_measure_symbol"),
185
+ col("u.base_unit").alias("unit_of_measure_base_unit"),
186
+ col("u.multiplier").alias("unit_of_measure_multiplier"),
187
+ )
178
188
  )
179
- )
180
189
 
181
- return df_element_uom
190
+ return df_element_uom
182
191
 
183
192
  def _gen_denormalized_observation(self) -> DataFrame:
184
193
  """Original query upon which the below computation is based
@@ -270,20 +279,170 @@ class SWSBronzeIcebergSparkHelper:
270
279
  .withColumnRenamed("code", dimension_column)
271
280
  )
272
281
 
273
- df_intermediate = (
274
- df_intermediate.alias("d")
275
- .join(
276
- F.broadcast(df_element_uom).alias("e"),
277
- col(f"d.{self.element_column}") == col("e.element_code"),
278
- "left",
282
+ if df_element_uom is not None:
283
+ df_intermediate = (
284
+ df_intermediate.alias("d")
285
+ .join(
286
+ F.broadcast(df_element_uom).alias("e"),
287
+ col(f"d.{self.element_column}") == col("e.element_code"),
288
+ "left",
289
+ )
290
+ .drop("element_code")
279
291
  )
280
- .drop("element_code")
281
- )
282
292
 
283
293
  df_obs_denorm = df_intermediate
284
294
 
285
295
  return df_obs_denorm
286
296
 
297
+ def _gen_denormalized_observation_sql(self) -> DataFrame:
298
+ # ----------------
299
+ # Prepare dataframes for the joins
300
+ # ----------------
301
+
302
+ select_statement = """
303
+ o.id,
304
+ o.value,
305
+ u.email,
306
+ o.created_on,
307
+ o.replaced_on,
308
+ o.version"""
309
+
310
+ from_statement = f"""
311
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
312
+ JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
313
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
314
+
315
+ hint_statement = ""
316
+
317
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
318
+ for flag_col in self.flag_columns:
319
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
320
+
321
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
322
+ for i, (dim_col, cl) in enumerate(
323
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
324
+ ):
325
+ select_statement += f",\nd{i}.code AS {dim_col}"
326
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
327
+ hint_statement = (
328
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
329
+ if hint_statement
330
+ else f"BROADCAST({cl.iceberg_id})"
331
+ )
332
+
333
+ hint_statement = "/*+ " + hint_statement + " */"
334
+
335
+ final_query = "SELECT " + hint_statement + select_statement + from_statement
336
+ if not self.keep_history:
337
+ final_query += "\nWHERE o.replaced_on IS NULL"
338
+
339
+ logging.info("Final query for merging observation and observation_coordinares")
340
+ logging.info(final_query)
341
+
342
+ df_obs_denorm = self.spark.sql(final_query)
343
+
344
+ df_element_uom = self._prepare_element_uom()
345
+
346
+ dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
347
+
348
+ # Join all the dimension codelists
349
+ for dimension_column, df_dimension in zip(
350
+ self.dim_columns_w_time, dfs_dimension_w_validity
351
+ ):
352
+ logging.debug(f"Joining dimension column: {dimension_column}")
353
+ logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
354
+ logging.debug(
355
+ f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
356
+ )
357
+ df_obs_denorm = (
358
+ df_obs_denorm.alias("o")
359
+ .join(
360
+ F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
361
+ "d"
362
+ ),
363
+ col(f"{dimension_column}") == col("d.code"),
364
+ )
365
+ .drop("code", "join_id")
366
+ )
367
+ logging.debug(f"After join count: {df_obs_denorm.count()}")
368
+
369
+ if df_element_uom is not None:
370
+ df_obs_denorm = (
371
+ df_obs_denorm.alias("d")
372
+ .join(
373
+ F.broadcast(df_element_uom).alias("e"),
374
+ col(f"d.{self.element_column}") == col("e.element_code"),
375
+ "left",
376
+ )
377
+ .drop("element_code")
378
+ )
379
+ logging.debug(f"After uom count: {df_obs_denorm.count()}")
380
+
381
+ return df_obs_denorm
382
+
383
+ def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
384
+ # ----------------
385
+ # Prepare dataframes for the joins
386
+ # ----------------
387
+
388
+ select_statement = """
389
+ o.id,
390
+ o.value,
391
+ u.email,
392
+ o.created_on,
393
+ o.replaced_on,
394
+ o.version"""
395
+
396
+ from_statement = f"""
397
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
398
+ INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
399
+ INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
400
+ INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
401
+ LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
402
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
403
+
404
+ hint_statement = ""
405
+
406
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
407
+ for flag_col in self.flag_columns:
408
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
409
+
410
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
411
+ for i, (dim_col, cl) in enumerate(
412
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
413
+ ):
414
+ select_statement += f",\nd{i}.code AS {dim_col}"
415
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
416
+ hint_statement = (
417
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
418
+ if hint_statement
419
+ else f"BROADCAST({cl.iceberg_id})"
420
+ )
421
+
422
+ hint_statement = "/*+ " + hint_statement + " */"
423
+
424
+ # TODO Add tag name as a parameter
425
+ where_statement = (
426
+ f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
427
+ )
428
+
429
+ final_query = (
430
+ "SELECT "
431
+ + hint_statement
432
+ + select_statement
433
+ + from_statement
434
+ + where_statement
435
+ )
436
+ if not self.keep_history:
437
+ final_query += "\n AND o.replaced_on IS NULL"
438
+
439
+ logging.info("Final query for merging observation and observation_coordinares")
440
+ logging.info(final_query)
441
+
442
+ df_obs_denorm = self.spark.sql(final_query)
443
+
444
+ return df_obs_denorm
445
+
287
446
  def _gen_denormalized_metadata(self) -> DataFrame:
288
447
  """Original query upon which the below computation is based
289
448
 
@@ -347,6 +506,32 @@ class SWSBronzeIcebergSparkHelper:
347
506
 
348
507
  return df_meta_denorm
349
508
 
509
+ def _gen_denormalized_metadata_sql(self) -> DataFrame:
510
+ # ----------------
511
+ # Generate denormalized observation table
512
+ # ----------------
513
+
514
+ logging.info("meta_denorm start")
515
+
516
+ df_meta_denorm = self.spark.sql(
517
+ f"""
518
+ select m.observation as observation_id,
519
+ mt.code as type,
520
+ met.code as element_type,
521
+ l.country_code as language,
522
+ me.value
523
+ from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
524
+ left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
525
+ left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
526
+ left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
527
+ left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
528
+ """
529
+ )
530
+
531
+ logging.info("meta_denorm write")
532
+
533
+ return df_meta_denorm
534
+
350
535
  def _gen_grouped_metadata(self) -> DataFrame:
351
536
  return (
352
537
  self._gen_denormalized_metadata()
@@ -367,6 +552,26 @@ class SWSBronzeIcebergSparkHelper:
367
552
  .agg(F.collect_list("metadata").alias("metadata"))
368
553
  )
369
554
 
555
+ def _gen_grouped_metadata_sql(self) -> DataFrame:
556
+ return (
557
+ self._gen_denormalized_metadata_sql()
558
+ .select(
559
+ col("observation_id"),
560
+ F.create_map(
561
+ lit("type"),
562
+ col("type"),
563
+ lit("element_type"),
564
+ col("element_type"),
565
+ lit("language"),
566
+ col("language"),
567
+ lit("value"),
568
+ col("value"),
569
+ ).alias("metadata"),
570
+ )
571
+ .groupby("observation_id")
572
+ .agg(F.collect_list("metadata").alias("metadata"))
573
+ )
574
+
370
575
  def _gen_bronze_data(self) -> DataFrame:
371
576
  return (
372
577
  self._gen_denormalized_observation()
@@ -379,9 +584,37 @@ class SWSBronzeIcebergSparkHelper:
379
584
  .drop("m.observation_id")
380
585
  )
381
586
 
587
+ def _gen_bronze_data_sql(self) -> DataFrame:
588
+ return (
589
+ self._gen_denormalized_observation_sql()
590
+ .alias("o")
591
+ .join(
592
+ self._gen_grouped_metadata_sql().alias("m"),
593
+ col("o.id") == col("m.observation_id"),
594
+ "left",
595
+ )
596
+ .drop("m.observation_id")
597
+ )
598
+
599
+ def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
600
+ return (
601
+ self._gen_denormalized_observation_sql_from_tag()
602
+ .alias("o")
603
+ .join(
604
+ self._gen_grouped_metadata_sql().alias("m"),
605
+ col("o.id") == col("m.observation_id"),
606
+ "left",
607
+ )
608
+ .drop("m.observation_id")
609
+ )
610
+
382
611
  # TODO decouple data generation and data writing
383
- def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
384
- self.df_bronze = self._gen_bronze_data()
612
+ def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
613
+
614
+ if sql:
615
+ self.df_bronze = self._gen_bronze_data_sql()
616
+ else:
617
+ self.df_bronze = self._gen_bronze_data()
385
618
 
386
619
  self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
387
620
 
@@ -449,12 +682,15 @@ class SWSBronzeIcebergSparkHelper:
449
682
  logging.info("Bronze Dissemination tags successfully written")
450
683
 
451
684
  def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
452
- self, dimensions: Dict[str, List[str]]
685
+ self, dimensions: Dict[str, List[str]] = {}, from_tag=False
453
686
  ) -> DataFrame:
454
687
 
455
- self.disseminated_tag_df = self.df_bronze
688
+ if from_tag:
689
+ self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
690
+ else:
691
+ self.disseminated_tag_df = self.df_bronze
456
692
 
457
- if isinstance(dimensions, dict):
693
+ if not from_tag and dimensions is not None and len(dimensions) != 0:
458
694
  for dimension_name, codes in dimensions.items():
459
695
  logging.info(f"dimension_name: {dimension_name}")
460
696
  logging.info(f"codes: {codes}")
@@ -533,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
533
769
  logging.debug(f"Tag with Added csv Table: {tag}")
534
770
 
535
771
  logging.info("Bronze Disseminated tag with selection successfully written")
772
+
773
+
774
+ 1
775
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
776
+ 1
777
+ 1
778
+ 2
779
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
780
+ 2
781
+ 1
782
+ 1
783
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
784
+ 1
785
+ 1
786
+ 2
787
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
788
+ 2
789
+ 1
790
+ 1
791
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
792
+ 1
793
+ 1
794
+ 1
795
+ frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
796
+ 1
797
+ 1
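
Note on the SWSBronzeIcebergSparkHelper changes above: the constructor gains keep_history, write_csv and source_tag, the bronze build gains a Spark-SQL path (now the default via sql=True), and the disseminated-tag write can rebuild the data from an SWS tag via from_tag=True. A minimal, hypothetical usage sketch follows; the constructor arguments before domain_code are not visible in this diff and are assumed, and the dimension name used in the filter is invented for illustration.

    # Hypothetical sketch based only on the signatures visible in this diff.
    from sws_spark_dissemination_helper.SWSBronzeIcebergSparkHelper import (
        SWSBronzeIcebergSparkHelper,  # import path assumed from the package layout above
    )

    helper = SWSBronzeIcebergSparkHelper(
        spark=spark,                    # SparkSession, assumed as in 0.0.149
        iceberg_tables=iceberg_tables,  # IcebergTables, assumed as in 0.0.149
        domain_code=domain_code,
        dataset_details=dataset_details,
        dataset_tables=dataset_tables,
        keep_history=False,  # new: False adds "WHERE o.replaced_on IS NULL" to the SQL build
        write_csv=True,      # new: flag stored on the helper
        source_tag=None,     # new: SWS tag name used by the from_tag SQL path
    )

    # The SQL-based bronze build is now the default; pass sql=False for the
    # original DataFrame-API path.
    helper.write_bronze_data_to_iceberg_and_csv(sql=True)

    # dimensions is now optional and only applied when from_tag is False.
    helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
        dimensions={"geographic_area": ["4", "8"]},  # hypothetical dimension filter
        from_tag=False,
    )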
@@ -74,6 +74,7 @@ class SWSEasyIcebergSparkHelper:
74
74
  self.df_tag_observation,
75
75
  ) = self.raw_data
76
76
 
77
+ logging.info(self.raw_reference_data)
77
78
  (
78
79
  self.df_flag_method,
79
80
  self.df_flag_obs_status,
@@ -275,11 +276,17 @@ class SWSEasyIcebergSparkHelper:
275
276
  if not self.keep_history:
276
277
  final_query += "\nWHERE o.replaced_on IS NULL"
277
278
 
278
- logging.info("Final query for merging observation and observation_coordinares")
279
+ logging.info("Final query for merging observation and observation_coordinates")
279
280
  logging.info(final_query)
280
281
 
281
282
  df_obs_denorm = self.spark.sql(final_query)
282
283
 
284
+ df_obs_denorm.writeTo(
285
+ self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
286
+ ).createOrReplace()
287
+
288
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
289
+
283
290
  return df_obs_denorm
284
291
 
285
292
  def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
@@ -417,7 +424,13 @@ class SWSEasyIcebergSparkHelper:
417
424
 
418
425
  df_meta_denorm = self.spark.sql(
419
426
  f"""
420
- select m.observation as observation_id,
427
+ select
428
+ /*+
429
+ BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
430
+ BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
431
+ BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id})
432
+ */
433
+ m.observation as observation_id,
421
434
  mt.code as type,
422
435
  met.code as element_type,
423
436
  l.country_code as language,
@@ -430,7 +443,11 @@ class SWSEasyIcebergSparkHelper:
430
443
  """
431
444
  )
432
445
 
433
- logging.info("meta_denorm write")
446
+ df_meta_denorm.writeTo(
447
+ self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
448
+ ).createOrReplace()
449
+
450
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")
434
451
 
435
452
  return df_meta_denorm
436
453
 
@@ -455,25 +472,31 @@ class SWSEasyIcebergSparkHelper:
455
472
  )
456
473
 
457
474
  def _gen_grouped_metadata_sql(self) -> DataFrame:
458
- return (
459
- self._gen_denormalized_metadata_sql()
460
- .select(
461
- col("observation_id"),
462
- F.create_map(
463
- lit("type"),
464
- col("type"),
465
- lit("element_type"),
466
- col("element_type"),
467
- lit("language"),
468
- col("language"),
469
- lit("value"),
470
- col("value"),
471
- ).alias("metadata"),
472
- )
473
- .groupby("observation_id")
474
- .agg(F.collect_list("metadata").alias("metadata"))
475
+ df_meta_grouped = self.spark.sql(
476
+ f"""
477
+ SELECT
478
+ observation_id,
479
+ collect_list(
480
+ map(
481
+ 'type', type,
482
+ 'element_type', element_type,
483
+ 'language', language,
484
+ 'value', value
485
+ )
486
+ ) AS metadata
487
+ FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
488
+ GROUP BY observation_id
489
+ """
475
490
  )
476
491
 
492
+ df_meta_grouped.writeTo(
493
+ self.iceberg_tables.GROUPED_METADATA.iceberg_id
494
+ ).createOrReplace()
495
+
496
+ logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
497
+
498
+ return df_meta_grouped
499
+
477
500
  def _gen_denormalied_data(self) -> DataFrame:
478
501
  return (
479
502
  self._gen_denormalized_observation()
@@ -487,15 +510,19 @@ class SWSEasyIcebergSparkHelper:
487
510
  )
488
511
 
489
512
  def _gen_denormalied_data_sql(self) -> DataFrame:
490
- return (
491
- self._gen_denormalized_observation_sql()
492
- .alias("o")
493
- .join(
494
- self._gen_grouped_metadata_sql().alias("m"),
495
- col("o.id") == col("m.observation_id"),
496
- "left",
497
- )
498
- .drop("m.observation_id")
513
+ self._gen_denormalized_observation_sql()
514
+ self._gen_denormalized_metadata_sql()
515
+ self._gen_grouped_metadata_sql()
516
+
517
+ return self.spark.sql(
518
+ f"""
519
+ SELECT
520
+ o.*,
521
+ m.metadata
522
+ FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
523
+ LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
524
+ ON o.id = m.observation_id
525
+ """
499
526
  )
500
527
 
501
528
  def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:
@@ -510,11 +537,9 @@ class SWSEasyIcebergSparkHelper:
510
537
  .drop("m.observation_id")
511
538
  )
512
539
 
513
- def write_data_to_iceberg_and_csv(self, sql=False, from_tag=False) -> DataFrame:
540
+ def write_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
514
541
  if sql:
515
542
  self.df_denorm = self._gen_denormalied_data_sql()
516
- elif from_tag:
517
- self.df_denorm = self._gen_denormalied_data_sql_from_tag()
518
543
  else:
519
544
  self.df_denorm = self._gen_denormalied_data()
520
545
 
@@ -585,18 +610,21 @@ class SWSEasyIcebergSparkHelper:
585
610
  logging.info("Unfiltered data tags successfully written")
586
611
 
587
612
  def write_filtered_data_to_iceberg_and_csv(
588
- self, dimensions: Dict[str, List[str]]
613
+ self, dimensions: Dict[str, List[str]] = None, from_tag=False
589
614
  ) -> DataFrame:
590
615
 
591
- self.filtered_df = self.df_denorm
616
+ if from_tag:
617
+ self.filtered_df = self._gen_denormalied_data_sql_from_tag()
618
+ else:
619
+ self.filtered_df = self.df_denorm
592
620
 
593
- for dimension_name, codes in dimensions.items():
594
- logging.info(f"dimension_name: {dimension_name}")
595
- logging.info(f"codes: {codes}")
596
- if len(codes) != 0:
597
- self.filtered_df = self.filtered_df.filter(
598
- col(dimension_name).isin(codes)
599
- )
621
+ for dimension_name, codes in dimensions.items():
622
+ logging.info(f"dimension_name: {dimension_name}")
623
+ logging.info(f"codes: {codes}")
624
+ if len(codes) != 0:
625
+ self.filtered_df = self.filtered_df.filter(
626
+ col(dimension_name).isin(codes)
627
+ )
600
628
 
601
629
  self.filtered_df.writeTo(
602
630
  self.iceberg_tables.TABLE_FILTERED.iceberg_id
@@ -667,3 +695,29 @@ class SWSEasyIcebergSparkHelper:
667
695
  logging.debug(f"Tag with Added csv Table: {tag}")
668
696
 
669
697
  logging.info("Filtered data tags successfully written")
698
+
699
+
700
+ 1
701
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
702
+ 1
703
+ 1
704
+ 2
705
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
706
+ 2
707
+ 1
708
+ 1
709
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
710
+ 1
711
+ 1
712
+ 2
713
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
714
+ 2
715
+ 1
716
+ 1
717
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
718
+ 1
719
+ 1
720
+ 1
721
+ frozenset({"1", "0", "7", "9", "4", "8", "6", "3", "2", "5"})
722
+ 1
723
+ 1
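
Note on the SWSEasyIcebergSparkHelper changes above: the SQL path now persists its intermediate results (DENORMALIZED_OBSERVATION, DENORMALIZED_METADATA, GROUPED_METADATA) as Iceberg tables and joins them back with plain SQL, write_data_to_iceberg_and_csv defaults to sql=True, and the tag-driven rebuild moved into write_filtered_data_to_iceberg_and_csv via from_tag. A hypothetical call sequence, assuming the helper is constructed as in 0.0.149:

    # Builds and writes the denormalized data through the SQL path (new default).
    helper.write_data_to_iceberg_and_csv(sql=True)

    # Either filter the denormalized data by dimension codes, or rebuild it from a tag.
    # Note: the dimensions loop runs unconditionally in the new code, so pass a dict
    # (possibly empty) rather than relying on the None default.
    helper.write_filtered_data_to_iceberg_and_csv(
        dimensions={"geographic_area": ["4", "8"]},  # hypothetical dimension codes
        from_tag=False,
    )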
@@ -271,6 +271,60 @@ class SWSGoldIcebergSparkHelper:
271
271
 
272
272
  return df
273
273
 
274
+ def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
275
+ """The expected input to this function is the output of the sws disseminated function"""
276
+ df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
277
+
278
+ logging.info(
279
+ f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
280
+ )
281
+
282
+ self.spark.sql(
283
+ f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
284
+ )
285
+
286
+ logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
287
+
288
+ df_1 = df.coalesce(1)
289
+
290
+ save_cache_csv(
291
+ df=df_1,
292
+ bucket=self.bucket,
293
+ prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
294
+ tag_name=self.tag_name,
295
+ )
296
+
297
+ return df
298
+
299
+ def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
300
+ self, df: DataFrame
301
+ ) -> DataFrame:
302
+ """The expected input to this function is the output of the sws disseminated function"""
303
+ df.writeTo(
304
+ self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
305
+ ).createOrReplace()
306
+
307
+ logging.info(
308
+ f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
309
+ )
310
+
311
+ self.spark.sql(
312
+ f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
313
+ )
314
+
315
+ logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
316
+
317
+ df_1 = df.coalesce(1)
318
+
319
+ save_cache_csv(
320
+ df=df_1,
321
+ bucket=self.bucket,
322
+ prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
323
+ tag_name=self.tag_name,
324
+ )
325
+
326
+ return df
327
+
274
328
  def write_gold_sws_validated_sws_dissemination_tag(
275
329
  self, df: DataFrame, tags: Tags
276
330
  ) -> DataFrame:
@@ -496,8 +550,8 @@ class SWSGoldIcebergSparkHelper:
496
550
  logging.debug(f"Tag with Added Iceberg Table: {tag}")
497
551
 
498
552
  new_diss_table = BaseDisseminatedTagTable(
499
- id=f"{self.domain_code.lower()}_gold_pre_sdmx_csv",
500
- name=f"{self.domain_code} gold pre-SDMX csv",
553
+ id=f"{self.domain_code.lower()}_gold_sws_csv",
554
+ name=f"{self.domain_code} gold SWS csv",
501
555
  description="Gold table containing the tag data without any processing cached in csv",
502
556
  layer=TableLayer.GOLD,
503
557
  private=True,
@@ -515,3 +569,101 @@ class SWSGoldIcebergSparkHelper:
515
569
  logging.debug(f"Tag with Added csv Table: {tag}")
516
570
 
517
571
  return df
572
+
573
+ def write_gold_faostat_dissemination_tag(
574
+ self, df: DataFrame, tags: Tags
575
+ ) -> DataFrame:
576
+ # Get or create a new tag
577
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
578
+ logging.debug(f"Tag: {tag}")
579
+
580
+ new_iceberg_table = BaseDisseminatedTagTable(
581
+ id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
582
+ name=f"{self.domain_code} gold FAOSTAT Iceberg",
583
+ description="Gold table containing the tag data in FAOSTAT format",
584
+ layer=TableLayer.GOLD,
585
+ private=True,
586
+ type=TableType.ICEBERG,
587
+ database=IcebergDatabases.GOLD_DATABASE,
588
+ table=self.iceberg_tables.GOLD_FAOSTAT.table,
589
+ path=self.iceberg_tables.GOLD_FAOSTAT.path,
590
+ structure={"columns": df.schema.jsonValue()["fields"]},
591
+ )
592
+ tag = upsert_disseminated_table(
593
+ sws_tags=tags,
594
+ tag=tag,
595
+ dataset_id=self.dataset_id,
596
+ tag_name=self.tag_name,
597
+ table=new_iceberg_table,
598
+ )
599
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
600
+
601
+ new_diss_table = BaseDisseminatedTagTable(
602
+ id=f"{self.domain_code.lower()}_gold_faostat_csv",
603
+ name=f"{self.domain_code} gold FAOSTAT csv",
604
+ description="Gold table containing the tag data in FAOSTAT format in csv",
605
+ layer=TableLayer.GOLD,
606
+ private=True,
607
+ type=TableType.CSV,
608
+ path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
609
+ structure={"columns": df.schema.jsonValue()["fields"]},
610
+ )
611
+ tag = upsert_disseminated_table(
612
+ sws_tags=tags,
613
+ tag=tag,
614
+ dataset_id=self.dataset_id,
615
+ tag_name=self.tag_name,
616
+ table=new_diss_table,
617
+ )
618
+ logging.debug(f"Tag with Added csv Table: {tag}")
619
+
620
+ return df
621
+
622
+ def write_gold_faostat_unfiltered_dissemination_tag(
623
+ self, df: DataFrame, tags: Tags
624
+ ) -> DataFrame:
625
+ # Get or create a new tag
626
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
627
+ logging.debug(f"Tag: {tag}")
628
+
629
+ new_iceberg_table = BaseDisseminatedTagTable(
630
+ id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
631
+ name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
632
+ description="Gold table containing all the tag data in FAOSTAT format",
633
+ layer=TableLayer.GOLD,
634
+ private=True,
635
+ type=TableType.ICEBERG,
636
+ database=IcebergDatabases.GOLD_DATABASE,
637
+ table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
638
+ path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
639
+ structure={"columns": df.schema.jsonValue()["fields"]},
640
+ )
641
+ tag = upsert_disseminated_table(
642
+ sws_tags=tags,
643
+ tag=tag,
644
+ dataset_id=self.dataset_id,
645
+ tag_name=self.tag_name,
646
+ table=new_iceberg_table,
647
+ )
648
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
649
+
650
+ new_diss_table = BaseDisseminatedTagTable(
651
+ id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
652
+ name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
653
+ description="Gold table containing the tag data in FAOSTAT format in csv",
654
+ layer=TableLayer.GOLD,
655
+ private=True,
656
+ type=TableType.CSV,
657
+ path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
658
+ structure={"columns": df.schema.jsonValue()["fields"]},
659
+ )
660
+ tag = upsert_disseminated_table(
661
+ sws_tags=tags,
662
+ tag=tag,
663
+ dataset_id=self.dataset_id,
664
+ tag_name=self.tag_name,
665
+ table=new_diss_table,
666
+ )
667
+ logging.debug(f"Tag with Added csv Table: {tag}")
668
+
669
+ return df
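
Note on the SWSGoldIcebergSparkHelper changes above: two new gold outputs in FAOSTAT format (filtered and unfiltered) get their own Iceberg tables, csv caches and dissemination-tag entries. A hypothetical sketch, assuming gold_helper and tags are set up as for the existing gold SWS outputs and that df_faostat and df_all are the DataFrames the docstrings describe as the output of the sws disseminated function:

    # Write the FAOSTAT-format data, create the Iceberg tag and cache the csv ...
    df_faostat = gold_helper.write_gold_faostat_data_to_iceberg_and_csv(df_faostat)
    # ... then register both artifacts on the dissemination tag.
    gold_helper.write_gold_faostat_dissemination_tag(df_faostat, tags)

    # Same pattern for the unfiltered variant.
    df_all = gold_helper.write_gold_faostat_unfiltered_data_to_iceberg_and_csv(df_all)
    gold_helper.write_gold_faostat_unfiltered_dissemination_tag(df_all, tags)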
@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:
94
94
 
95
95
  logging.info(f"{pg_table} read start")
96
96
 
97
- # Read observations from the PostgreSQL table into a DataFrame
98
- df = (
99
- self.spark.read.format("jdbc")
100
- .option("customSchema", custom_schema)
101
- .option("dbtable", pg_table)
102
- .option("partitionColumn", partition_column)
103
- .option("lowerBound", min_id)
104
- .option("upperBound", max_id)
105
- .option("numPartitions", num_partitions)
106
- .option("fetchsize", "1000")
107
- .option("url", self.jdbc_url)
108
- .option("user", self.jdbc_conn_properties["user"])
109
- .option("password", self.jdbc_conn_properties["password"])
110
- .option("driver", SPARK_POSTGRES_DRIVER)
111
- .load()
112
- # .repartition(1024, partition_column)
113
- # .sortWithinPartitions(partition_column)
114
- # .cache()
115
- )
97
+ if min_id is None or max_id is None:
98
+ df = (
99
+ self.spark.read.format("jdbc")
100
+ .option("customSchema", custom_schema)
101
+ .option("dbtable", pg_table)
102
+ .option("fetchsize", "1000")
103
+ .option("url", self.jdbc_url)
104
+ .option("user", self.jdbc_conn_properties["user"])
105
+ .option("password", self.jdbc_conn_properties["password"])
106
+ .option("driver", SPARK_POSTGRES_DRIVER)
107
+ .load()
108
+ )
109
+ else:
110
+ df = (
111
+ self.spark.read.format("jdbc")
112
+ .option("customSchema", custom_schema)
113
+ .option("dbtable", pg_table)
114
+ .option("partitionColumn", partition_column)
115
+ .option("lowerBound", min_id)
116
+ .option("upperBound", max_id)
117
+ .option("numPartitions", num_partitions)
118
+ .option("fetchsize", "1000")
119
+ .option("url", self.jdbc_url)
120
+ .option("user", self.jdbc_conn_properties["user"])
121
+ .option("password", self.jdbc_conn_properties["password"])
122
+ .option("driver", SPARK_POSTGRES_DRIVER)
123
+ .load()
124
+ # .repartition(1024, partition_column)
125
+ # .sortWithinPartitions(partition_column)
126
+ # .cache()
127
+ )
116
128
  else:
117
129
  df = (
118
130
  self.spark.read.format("jdbc")
@@ -254,8 +266,8 @@ class SWSPostgresSparkReader:
254
266
  return (
255
267
  tuple(data_dfs),
256
268
  (
257
- *reference_data_dfs[:6],
258
- reference_data_dfs[6:],
269
+ *reference_data_dfs[:7],
270
+ reference_data_dfs[7:],
259
271
  ),
260
272
  tuple(operational_data_dfs),
261
273
  )
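
Note on the SWSPostgresSparkReader change above: the JDBC read now falls back to a single, unpartitioned read when min_id or max_id is unknown, instead of always passing partition bounds. A standalone sketch of the same pattern, assuming a SparkSession named spark and standard Spark JDBC options (this is not the helper's actual code):

    # Common JDBC options; "org.postgresql.Driver" stands in for SPARK_POSTGRES_DRIVER.
    reader = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", pg_table)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .option("fetchsize", "1000")
    )

    if min_id is None or max_id is None:
        # Bounds unknown: read the whole table in one JDBC partition.
        df = reader.load()
    else:
        # Bounds known: split the read into numPartitions ranges of partition_column.
        df = (
            reader.option("partitionColumn", partition_column)
            .option("lowerBound", min_id)
            .option("upperBound", max_id)
            .option("numPartitions", num_partitions)
            .load()
        )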
@@ -254,6 +254,9 @@ class IcebergTables:
254
254
  self.__tag_name = tag_name
255
255
 
256
256
  # TODO Fix later with a more appropriate DATABASE
257
+ self.DENORMALIZED_OBSERVATION = self._create_iceberg_table("BRONZE", suffix="denormalized_observation")
258
+ self.DENORMALIZED_METADATA = self._create_iceberg_table("BRONZE", suffix="denormalized_metadata")
259
+ self.GROUPED_METADATA = self._create_iceberg_table("BRONZE", suffix="grouped_metadata")
257
260
  self.TABLE = self._create_iceberg_table("BRONZE")
258
261
  self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
259
262
  self.BRONZE = self._create_iceberg_table("BRONZE")
@@ -274,6 +277,12 @@ class IcebergTables:
274
277
  self.GOLD_PRE_SDMX = self._create_iceberg_table(
275
278
  "GOLD", prefix=domain, suffix="pre_sdmx"
276
279
  )
280
+ self.GOLD_FAOSTAT = self._create_iceberg_table(
281
+ "GOLD", prefix=domain, suffix="faostat"
282
+ )
283
+ self.GOLD_FAOSTAT_UNFILTERED = self._create_iceberg_table(
284
+ "GOLD", prefix=domain, suffix="faostat_unfiltered"
285
+ )
277
286
 
278
287
  def _create_iceberg_table(
279
288
  self, level: str, prefix: str = "", suffix: str = ""
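
Note on the constants.py changes above: IcebergTables now also exposes the intermediate bronze tables (DENORMALIZED_OBSERVATION, DENORMALIZED_METADATA, GROUPED_METADATA) and the two FAOSTAT gold tables, created through the same _create_iceberg_table helper as the existing entries. A hypothetical sketch of how downstream code can address them, assuming an IcebergTables instance built as before:

    # Each attribute carries the usual iceberg_id / table / path fields.
    spark.table(iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id)
    spark.table(iceberg_tables.GROUPED_METADATA.iceberg_id)
    spark.table(iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id)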