sws-spark-dissemination-helper 0.0.152__tar.gz → 0.0.154__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +234 -7
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/.gitignore +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/constants.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/tests/test.py +0 -0
{sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/PKG-INFO
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.152
+Version: 0.0.154
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
{sws_spark_dissemination_helper-0.0.152 → sws_spark_dissemination_helper-0.0.154}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
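All three new constructor parameters default to values that preserve the 0.0.152 behaviour. A minimal instantiation sketch, abridged to the parameters visible in this hunk; the import path, the domain code, the tag name, and the spark/dataset_details/dataset_tables objects are assumptions for illustration, not taken from this diff:

# Hypothetical usage sketch; only the three new keyword arguments are from this diff.
from sws_spark_dissemination_helper.SWSBronzeIcebergSparkHelper import (
    SWSBronzeIcebergSparkHelper,  # assumed import path
)

helper = SWSBronzeIcebergSparkHelper(
    spark=spark,                      # existing SparkSession
    domain_code="agriculture",        # example domain code
    dataset_details=dataset_details,  # as in 0.0.152
    dataset_tables=dataset_tables,    # as in 0.0.152
    keep_history=False,         # new: False filters out replaced observations
    write_csv=True,             # new: also emit CSV alongside the Iceberg write
    source_tag="2024_release",  # new: tag name used by the tag-based read path
)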
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data
 
         (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data
 
-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -284,6 +292,148 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+        o.id,
+        o.value,
+        u.email,
+        o.created_on,
+        o.replaced_on,
+        o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_element_uom = self._prepare_element_uom()
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            df_obs_denorm = (
+                df_obs_denorm.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.join_id"),
+                )
+                .drop(f"{dimension_column}", "join_id")
+                .withColumnRenamed("code", dimension_column)
+            )
+
+        df_obs_denorm = (
+            df_obs_denorm.alias("d")
+            .join(
+                F.broadcast(df_element_uom).alias("e"),
+                col(f"d.{self.element_column}") == col("e.element_code"),
+                "left",
+            )
+            .drop("element_code")
+        )
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+        o.id,
+        o.value,
+        u.email,
+        o.created_on,
+        o.replaced_on,
+        o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
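For orientation, for a dataset with two dimension codelists the string building above yields a query of roughly the shape below; every table and column name here is a hypothetical stand-in for the real iceberg_id values and mapped flag/dimension ids, and the WHERE clause is appended only when keep_history is False:

# Illustrative shape of final_query; all identifiers are invented examples.
example_query = """
SELECT /*+ BROADCAST(ref.cl_area), BROADCAST(ref.cl_year) */
    o.id,
    o.value,
    u.email,
    o.created_on,
    o.replaced_on,
    o.version,
    o.flag_1 AS flagObservationStatus,
    d0.code AS geographicAreaM49,
    d1.code AS timePointYears
FROM obs.observation o
JOIN ops.user u ON u.id = o.created_by
LEFT JOIN obs.observation_coordinate AS oc ON oc.id = o.observation_coordinates
LEFT JOIN ref.cl_area d0 ON d0.id = oc.dim_1
LEFT JOIN ref.cl_year d1 ON d1.id = oc.dim_2
WHERE o.replaced_on IS NULL
"""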
@@ -347,6 +497,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized metadata table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -367,6 +543,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
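_gen_grouped_metadata_sql packs each metadata row into a map<string,string> and collects the maps per observation, mirroring the existing _gen_grouped_metadata. A self-contained sketch of that create_map/collect_list pattern on toy data (not the real tables):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(1, "GENERAL", "COMMENT", "en", "estimate"),
     (1, "GENERAL", "SOURCE", "en", "survey")],
    ["observation_id", "type", "element_type", "language", "value"],
)

grouped = (
    df.select(
        col("observation_id"),
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)
# grouped has one row per observation_id with an
# array<map<string,string>> "metadata" column.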
@@ -379,9 +575,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
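The new sql flag defaults to True, so 0.0.154 switches bronze generation to the SQL path by default; sql=False keeps the previous DataFrame-API path. Usage sketch, continuing the hypothetical helper instance from above:

# New SQL-based path (default in 0.0.154)
helper.write_bronze_data_to_iceberg_and_csv()

# Previous DataFrame-API path, retained as a fallback
helper.write_bronze_data_to_iceberg_and_csv(sql=False)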
@@ -449,12 +673,15 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
 
-        if len(dimensions) != 0:
+        if not from_tag and len(dimensions) != 0:
             for dimension_name, codes in dimensions.items():
                 logging.info(f"dimension_name: {dimension_name}")
                 logging.info(f"codes: {codes}")
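With from_tag=False (the default) the method reuses the previously generated df_bronze and applies the optional dimensions filter; with from_tag=True it rebuilds the data from the tag tables via _gen_bronze_data_sql_from_tag(), which relies on the source_tag passed to the constructor. Sketch, with a hypothetical dimension name and codes:

# Filter previously generated bronze data by dimension codes
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
    dimensions={"geographicAreaM49": ["4", "8"]}
)

# Rebuild directly from the tag named by source_tag
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(from_tag=True)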