sws-spark-dissemination-helper 0.0.166__py3-none-any.whl → 0.0.169__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +62 -33
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +78 -0
- sws_spark_dissemination_helper/constants.py +16 -16
- {sws_spark_dissemination_helper-0.0.166.dist-info → sws_spark_dissemination_helper-0.0.169.dist-info}/METADATA +1 -1
- {sws_spark_dissemination_helper-0.0.166.dist-info → sws_spark_dissemination_helper-0.0.169.dist-info}/RECORD +7 -7
- {sws_spark_dissemination_helper-0.0.166.dist-info → sws_spark_dissemination_helper-0.0.169.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.166.dist-info → sws_spark_dissemination_helper-0.0.169.dist-info}/licenses/LICENSE +0 -0
|
@@ -158,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
|
|
|
158
158
|
|
|
159
159
|
return dfs_dimension
|
|
160
160
|
|
|
161
|
-
def _prepare_element_uom(self) -> DataFrame:
|
|
161
|
+
def _prepare_element_uom(self) -> Union[DataFrame, None]:
|
|
162
162
|
"""Prepare the element and unit of measure join."""
|
|
163
163
|
|
|
164
164
|
# Get the element DataFrame
|
|
@@ -170,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
|
|
|
170
170
|
if dimension_column == self.element_column
|
|
171
171
|
)
|
|
172
172
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
173
|
+
if any("unit_of_measure" == column.lower() for column in df_element.columns):
|
|
174
|
+
# Join the element and the unit_of_measure
|
|
175
|
+
df_element_uom = (
|
|
176
|
+
df_element.alias("e")
|
|
177
|
+
.join(
|
|
178
|
+
self.df_unit_of_measure.alias("u"),
|
|
179
|
+
col("e.unit_of_measure") == col("u.id"),
|
|
180
|
+
)
|
|
181
|
+
.select(
|
|
182
|
+
col("e.code").alias("element_code"),
|
|
183
|
+
col("u.code").alias("unit_of_measure"),
|
|
184
|
+
col("u.symbol").alias("unit_of_measure_symbol"),
|
|
185
|
+
col("u.base_unit").alias("unit_of_measure_base_unit"),
|
|
186
|
+
col("u.multiplier").alias("unit_of_measure_multiplier"),
|
|
187
|
+
)
|
|
186
188
|
)
|
|
187
|
-
)
|
|
188
189
|
|
|
189
|
-
|
|
190
|
+
return df_element_uom
|
|
190
191
|
|
|
191
192
|
def _gen_denormalized_observation(self) -> DataFrame:
|
|
192
193
|
"""Original query upon which the below computation is based
|
|
@@ -278,15 +279,16 @@ class SWSBronzeIcebergSparkHelper:
|
|
|
278
279
|
.withColumnRenamed("code", dimension_column)
|
|
279
280
|
)
|
|
280
281
|
|
|
281
|
-
|
|
282
|
-
df_intermediate
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
282
|
+
if df_element_uom is not None:
|
|
283
|
+
df_intermediate = (
|
|
284
|
+
df_intermediate.alias("d")
|
|
285
|
+
.join(
|
|
286
|
+
F.broadcast(df_element_uom).alias("e"),
|
|
287
|
+
col(f"d.{self.element_column}") == col("e.element_code"),
|
|
288
|
+
"left",
|
|
289
|
+
)
|
|
290
|
+
.drop("element_code")
|
|
287
291
|
)
|
|
288
|
-
.drop("element_code")
|
|
289
|
-
)
|
|
290
292
|
|
|
291
293
|
df_obs_denorm = df_intermediate
|
|
292
294
|
|
|
@@ -364,16 +366,17 @@ class SWSBronzeIcebergSparkHelper:
|
|
|
364
366
|
)
|
|
365
367
|
logging.debug(f"After join count: {df_obs_denorm.count()}")
|
|
366
368
|
|
|
367
|
-
|
|
368
|
-
df_obs_denorm
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
369
|
+
if df_element_uom is not None:
|
|
370
|
+
df_obs_denorm = (
|
|
371
|
+
df_obs_denorm.alias("d")
|
|
372
|
+
.join(
|
|
373
|
+
F.broadcast(df_element_uom).alias("e"),
|
|
374
|
+
col(f"d.{self.element_column}") == col("e.element_code"),
|
|
375
|
+
"left",
|
|
376
|
+
)
|
|
377
|
+
.drop("element_code")
|
|
373
378
|
)
|
|
374
|
-
.
|
|
375
|
-
)
|
|
376
|
-
logging.debug(f"After uom count: {df_obs_denorm.count()}")
|
|
379
|
+
logging.debug(f"After uom count: {df_obs_denorm.count()}")
|
|
377
380
|
|
|
378
381
|
return df_obs_denorm
|
|
379
382
|
|
|
@@ -766,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
|
|
|
766
769
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
767
770
|
|
|
768
771
|
logging.info("Bronze Disseminated tag with selection successfully written")
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
1
|
|
775
|
+
frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
|
|
776
|
+
1
|
|
777
|
+
1
|
|
778
|
+
2
|
|
779
|
+
frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
|
|
780
|
+
2
|
|
781
|
+
1
|
|
782
|
+
1
|
|
783
|
+
frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
|
|
784
|
+
1
|
|
785
|
+
1
|
|
786
|
+
2
|
|
787
|
+
frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
|
|
788
|
+
2
|
|
789
|
+
1
|
|
790
|
+
1
|
|
791
|
+
frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
|
|
792
|
+
1
|
|
793
|
+
1
|
|
794
|
+
1
|
|
795
|
+
frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
|
|
796
|
+
1
|
|
797
|
+
1
|
|
@@ -296,6 +296,35 @@ class SWSGoldIcebergSparkHelper:
|
|
|
296
296
|
|
|
297
297
|
return df
|
|
298
298
|
|
|
299
|
+
def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
    self, df: DataFrame
) -> DataFrame:
    """Write the unfiltered FAOSTAT gold data to Iceberg, tag it, and cache it as CSV.

    The expected input to this function is the output of the sws
    disseminated function.

    Steps, in order:
      1. Create or replace the GOLD_FAOSTAT_UNFILTERED Iceberg table from ``df``.
      2. Create or replace the tag ``self.tag_name`` on that table.
      3. Pass a single-partition version of ``df`` to ``save_cache_csv``.

    Args:
        df: DataFrame to persist.

    Returns:
        The same ``df`` that was passed in.
    """
    # Bind the table descriptor once so the write, the log message and the
    # tag statement all refer to the same (unfiltered) table.
    unfiltered_table = self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED

    df.writeTo(unfiltered_table.iceberg_id).createOrReplace()

    # Report the table that was actually written; the message previously
    # printed the id of the filtered GOLD_FAOSTAT table.
    logging.info(
        f"Gold FAOSTAT unfiltered table written to {unfiltered_table.iceberg_id}"
    )

    self.spark.sql(
        f"ALTER TABLE {unfiltered_table.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
    )

    logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")

    # coalesce(1) reduces the DataFrame to one partition before the export
    # NOTE(review): presumably so the CSV lands in a single file - confirm
    # against save_cache_csv.
    df_1 = df.coalesce(1)

    save_cache_csv(
        df=df_1,
        bucket=self.bucket,
        prefix=unfiltered_table.csv_prefix,
        tag_name=self.tag_name,
    )

    return df
|
|
327
|
+
|
|
299
328
|
def write_gold_sws_validated_sws_dissemination_tag(
|
|
300
329
|
self, df: DataFrame, tags: Tags
|
|
301
330
|
) -> DataFrame:
|
|
@@ -589,3 +618,52 @@ class SWSGoldIcebergSparkHelper:
|
|
|
589
618
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
590
619
|
|
|
591
620
|
return df
|
|
621
|
+
|
|
622
|
+
def write_gold_faostat_unfiltered_dissemination_tag(
    self, df: DataFrame, tags: Tags
) -> DataFrame:
    """Attach the unfiltered FAOSTAT gold tables (Iceberg and CSV) to the tag.

    Two table descriptors are upserted on the tag named ``self.tag_name``:
    one of type ICEBERG and one of type CSV. Both carry the column
    structure taken from ``df.schema``.

    Args:
        df: DataFrame whose schema describes the table columns.
        tags: Tags client forwarded to the tag helper functions.

    Returns:
        The same ``df`` that was passed in.
    """
    # Get the tag, or create it
    current_tag = get_or_create_tag(
        tags, self.dataset_id, self.tag_name, self.tag_name
    )
    logging.debug(f"Tag: {current_tag}")

    # Both descriptors share these id/name stems
    id_stem = f"{self.domain_code.lower()}_gold_faostat_unfiltered"
    name_stem = f"{self.domain_code} gold FAOSTAT unfiltered"

    # Descriptor for the Iceberg table
    iceberg_descriptor = BaseDisseminatedTagTable(
        id=f"{id_stem}_iceberg",
        name=f"{name_stem} Iceberg",
        description="Gold table containing all the tag data in FAOSTAT format",
        layer=TableLayer.GOLD,
        private=True,
        type=TableType.ICEBERG,
        database=IcebergDatabases.GOLD_DATABASE,
        table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
        path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    current_tag = upsert_disseminated_table(
        sws_tags=tags,
        tag=current_tag,
        dataset_id=self.dataset_id,
        tag_name=self.tag_name,
        table=iceberg_descriptor,
    )
    logging.debug(f"Tag with Added Iceberg Table: {current_tag}")

    # Descriptor for the CSV export of the same data
    csv_descriptor = BaseDisseminatedTagTable(
        id=f"{id_stem}_csv",
        name=f"{name_stem} csv",
        description="Gold table containing the tag data in FAOSTAT format in csv",
        layer=TableLayer.GOLD,
        private=True,
        type=TableType.CSV,
        path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    current_tag = upsert_disseminated_table(
        sws_tags=tags,
        tag=current_tag,
        dataset_id=self.dataset_id,
        tag_name=self.tag_name,
        table=csv_descriptor,
    )
    logging.debug(f"Tag with Added csv Table: {current_tag}")

    return df
|
|
@@ -254,37 +254,37 @@ class IcebergTables:
|
|
|
254
254
|
self.__tag_name = tag_name
|
|
255
255
|
|
|
256
256
|
# TODO Fix later with a more appropriate DATABASE
|
|
257
|
-
self.DENORMALIZED_OBSERVATION = self.
|
|
258
|
-
self.DENORMALIZED_METADATA = self.
|
|
259
|
-
self.GROUPED_METADATA = self.
|
|
260
|
-
self.TABLE = self.
|
|
261
|
-
self.TABLE_FILTERED = self.
|
|
262
|
-
self.BRONZE = self.
|
|
263
|
-
self.BRONZE_DISS_TAG = self.
|
|
264
|
-
self.SILVER = self.
|
|
257
|
+
self.DENORMALIZED_OBSERVATION = self.create_iceberg_table("BRONZE", suffix="denormalized_observation")
|
|
258
|
+
self.DENORMALIZED_METADATA = self.create_iceberg_table("BRONZE", suffix="denormalized_metadata")
|
|
259
|
+
self.GROUPED_METADATA = self.create_iceberg_table("BRONZE", suffix="grouped_metadata")
|
|
260
|
+
self.TABLE = self.create_iceberg_table("BRONZE")
|
|
261
|
+
self.TABLE_FILTERED = self.create_iceberg_table("BRONZE", suffix="filtered")
|
|
262
|
+
self.BRONZE = self.create_iceberg_table("BRONZE")
|
|
263
|
+
self.BRONZE_DISS_TAG = self.create_iceberg_table("BRONZE", suffix="diss_tag")
|
|
264
|
+
self.SILVER = self.create_iceberg_table("SILVER", prefix=domain)
|
|
265
265
|
|
|
266
266
|
# GOLD tables with specific suffixes
|
|
267
|
-
self.GOLD_SWS = self.
|
|
268
|
-
self.GOLD_SDMX = self.
|
|
267
|
+
self.GOLD_SWS = self.create_iceberg_table("GOLD", prefix=domain, suffix="sws")
|
|
268
|
+
self.GOLD_SDMX = self.create_iceberg_table(
|
|
269
269
|
"GOLD", prefix=domain, suffix="sdmx_disseminated"
|
|
270
270
|
)
|
|
271
|
-
self.GOLD_SWS_VALIDATED = self.
|
|
271
|
+
self.GOLD_SWS_VALIDATED = self.create_iceberg_table(
|
|
272
272
|
"GOLD", prefix=domain, suffix="sws_validated"
|
|
273
273
|
)
|
|
274
|
-
self.GOLD_SWS_DISSEMINATED = self.
|
|
274
|
+
self.GOLD_SWS_DISSEMINATED = self.create_iceberg_table(
|
|
275
275
|
"GOLD", prefix=domain, suffix="sws_disseminated"
|
|
276
276
|
)
|
|
277
|
-
self.GOLD_PRE_SDMX = self.
|
|
277
|
+
self.GOLD_PRE_SDMX = self.create_iceberg_table(
|
|
278
278
|
"GOLD", prefix=domain, suffix="pre_sdmx"
|
|
279
279
|
)
|
|
280
|
-
self.GOLD_FAOSTAT = self.
|
|
280
|
+
self.GOLD_FAOSTAT = self.create_iceberg_table(
|
|
281
281
|
"GOLD", prefix=domain, suffix="faostat"
|
|
282
282
|
)
|
|
283
|
-
self.GOLD_FAOSTAT_UNFILTERED = self.
|
|
283
|
+
self.GOLD_FAOSTAT_UNFILTERED = self.create_iceberg_table(
|
|
284
284
|
"GOLD", prefix=domain, suffix="faostat_unfiltered"
|
|
285
285
|
)
|
|
286
286
|
|
|
287
|
-
def
|
|
287
|
+
def create_iceberg_table(
|
|
288
288
|
self, level: str, prefix: str = "", suffix: str = ""
|
|
289
289
|
) -> IcebergTable:
|
|
290
290
|
database = getattr(IcebergDatabases, f"{level}_DATABASE")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sws-spark-dissemination-helper
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.169
|
|
4
4
|
Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
|
|
5
5
|
Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
|
|
6
6
|
Author-email: Daniele Mansillo <danielemansillo@gmail.com>
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=
|
|
1
|
+
sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
|
|
2
2
|
sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
|
|
4
|
-
sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=
|
|
4
|
+
sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=0dxbVkrhdaASapEffF5PFcgKwAMyJoWBxzgymjZ4JyY,25049
|
|
5
5
|
sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
|
|
6
6
|
sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
|
|
7
7
|
sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
|
|
8
|
-
sws_spark_dissemination_helper/constants.py,sha256=
|
|
8
|
+
sws_spark_dissemination_helper/constants.py,sha256=vQmalAqInwPAybgJOfYx99jn47KsKp8jeD8eqmjw-Rs,13471
|
|
9
9
|
sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
|
|
10
|
-
sws_spark_dissemination_helper-0.0.
|
|
11
|
-
sws_spark_dissemination_helper-0.0.
|
|
12
|
-
sws_spark_dissemination_helper-0.0.
|
|
13
|
-
sws_spark_dissemination_helper-0.0.
|
|
10
|
+
sws_spark_dissemination_helper-0.0.169.dist-info/METADATA,sha256=1gUttuBzd1iSvoe2zEdyGFjk2ZuXPS5IXqQjuv0dAWQ,2824
|
|
11
|
+
sws_spark_dissemination_helper-0.0.169.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
12
|
+
sws_spark_dissemination_helper-0.0.169.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
|
|
13
|
+
sws_spark_dissemination_helper-0.0.169.dist-info/RECORD,,
|
|
File without changes
|