sws-spark-dissemination-helper 0.0.151__tar.gz → 0.0.153__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +204 -7
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +14 -13
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/.gitignore +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/constants.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.151 → sws_spark_dissemination_helper-0.0.153}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.151
+Version: 0.0.153
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
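The three new constructor arguments introduced above (keep_history, write_csv, source_tag) drive the rest of this release. A minimal construction sketch, assuming the class is exported from the package root and that spark, dataset_tables, iceberg_tables and domain_code keep their existing meaning; every concrete value shown is hypothetical:

from pyspark.sql import SparkSession

from sws_spark_dissemination_helper import SWSBronzeIcebergSparkHelper


def build_bronze_helper(spark: SparkSession, dataset_tables, iceberg_tables, domain_code: str):
    # Values below are illustrative only, not defaults recommended by the package.
    return SWSBronzeIcebergSparkHelper(
        spark=spark,
        dataset_tables=dataset_tables,
        iceberg_tables=iceberg_tables,
        domain_code=domain_code,
        keep_history=False,         # new: False filters out replaced observation versions
        write_csv=True,             # new: keep the CSV export alongside the Iceberg table
        source_tag="2024_release",  # new: hypothetical dissemination tag name
    )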
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data
 
         (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data
 
-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -284,6 +292,118 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n  AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
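Both methods above assemble the query as plain strings: one BROADCAST hint and one LEFT JOIN per codelist table, flag and dimension columns renamed through the reversed id mappings, and a replaced_on filter appended when keep_history is false. A standalone sketch of that assembly pattern, with hypothetical table and column names and ", ".join in place of the conditional concatenation:

# Build a SELECT with per-codelist broadcast hints, mirroring the loop above.
codelists = ["iceberg.ref.cl_area", "iceberg.ref.cl_item", "iceberg.ref.cl_year"]
dim_cols = ["geographic_area", "item", "year"]

select_stmt = "\no.id,\no.value"
from_stmt = (
    "\nFROM iceberg.bronze.observation o"
    "\nLEFT JOIN iceberg.bronze.observation_coordinate oc"
    " ON oc.id = o.observation_coordinates"
)
hints = []

for i, (dim_col, cl) in enumerate(zip(dim_cols, codelists)):
    select_stmt += f",\nd{i}.code AS {dim_col}"
    from_stmt += f"\nLEFT JOIN {cl} d{i} ON d{i}.id = oc.{dim_col}_id"
    hints.append(f"BROADCAST({cl})")  # codelists are small: broadcast each join

hint_stmt = "/*+ " + ", ".join(hints) + " */"
final_query = "SELECT " + hint_stmt + select_stmt + from_stmt
print(final_query)  # the helper passes the equivalent string to spark.sql(...)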
@@ -347,6 +467,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized metadata table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -367,6 +513,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
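_gen_grouped_metadata_sql packs each metadata row into a map and collects the maps into one list per observation. The same transformation run locally on a tiny in-memory frame (data are hypothetical):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.master("local[1]").appName("metadata-grouping").getOrCreate()

df = spark.createDataFrame(
    [
        (1, "GENERAL", "COMMENT", "en", "estimated"),
        (1, "GENERAL", "SOURCE", "en", "census"),
        (2, "GENERAL", "COMMENT", "en", "official"),
    ],
    ["observation_id", "type", "element_type", "language", "value"],
)

grouped = (
    df.select(
        col("observation_id"),
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)
grouped.show(truncate=False)  # one row per observation_id, metadata as a list of maps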
@@ -379,9 +545,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
@@ -449,12 +643,15 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
 
-        if len(dimensions) != 0:
+        if not from_tag and len(dimensions) != 0:
             for dimension_name, codes in dimensions.items():
                 logging.info(f"dimension_name: {dimension_name}")
                 logging.info(f"codes: {codes}")
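Taken together, the bronze helper now has three write paths: the original DataFrame pipeline (sql=False), the string-built SQL pipeline (sql=True), and the tag-driven variant (from_tag=True). A hedged usage sketch, assuming helper is an already configured SWSBronzeIcebergSparkHelper (construction omitted) and using hypothetical dimension codes:

# SQL-built denormalization (default); pass sql=False for the DataFrame path.
helper.write_bronze_data_to_iceberg_and_csv(sql=True)

# Disseminate a subset selected by explicit dimension codes...
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
    dimensions={"geographic_area": ["004", "008"]},
)

# ...or the subset selected by the source_tag given at construction time.
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(from_tag=True)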
src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -511,11 +511,9 @@ class SWSEasyIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
-    def write_data_to_iceberg_and_csv(self, sql=True, from_tag=False) -> DataFrame:
+    def write_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
         if sql:
             self.df_denorm = self._gen_denormalied_data_sql()
-        elif from_tag:
-            self.df_denorm = self._gen_denormalied_data_sql_from_tag()
         else:
             self.df_denorm = self._gen_denormalied_data()
 
@@ -586,18 +584,21 @@ class SWSEasyIcebergSparkHelper:
         logging.info("Unfiltered data tags successfully written")
 
     def write_filtered_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = None, from_tag=False
    ) -> DataFrame:
 
-
-
-
-
-
-
-
-
-        )
+        if from_tag:
+            self.filtered_df = self._gen_denormalied_data_sql_from_tag()
+        else:
+            self.filtered_df = self.df_denorm
+
+        for dimension_name, codes in dimensions.items():
+            logging.info(f"dimension_name: {dimension_name}")
+            logging.info(f"codes: {codes}")
+            if len(codes) != 0:
+                self.filtered_df = self.filtered_df.filter(
+                    col(dimension_name).isin(codes)
+                )
 
         self.filtered_df.writeTo(
             self.iceberg_tables.TABLE_FILTERED.iceberg_id
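The loop added above keeps only whitelisted codes per dimension and leaves a dimension unfiltered when its code list is empty. The same logic run locally (column names and data are hypothetical):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("dimension-filter").getOrCreate()

df = spark.createDataFrame(
    [("004", "2020", 1.0), ("008", "2021", 2.0), ("012", "2020", 3.0)],
    ["geographic_area", "year", "value"],
)

# An empty code list means "keep every code of that dimension".
dimensions = {"geographic_area": ["004", "008"], "year": []}

filtered = df
for dimension_name, codes in dimensions.items():
    if len(codes) != 0:
        filtered = filtered.filter(col(dimension_name).isin(codes))

filtered.show()  # only geographic_area 004 and 008 remain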