sws-spark-dissemination-helper 0.0.127__tar.gz → 0.0.129__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +155 -48
- sws_spark_dissemination_helper-0.0.129/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/.gitignore +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/old_requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
- /sws_spark_dissemination_helper-0.0.127/tests/__init__.py → /sws_spark_dissemination_helper-0.0.129/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/constants.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/tests/test.py +0 -0
{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.127
+Version: 0.0.129
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py

@@ -25,6 +25,7 @@ class SWSEasyIcebergSparkHelper:
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
         keep_history: bool = False,
+        write_csv: bool = True,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
@@ -35,6 +36,7 @@ class SWSEasyIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv

         if dataset_details is not None:
             (
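
The new write_csv flag defaults to True, so existing callers keep producing the CSV caches; passing False skips the CSV output and the CSV tag registration further down in this diff. A minimal construction sketch (hypothetical, for illustration only: the spark, dataset_details and dataset_tables objects are placeholders, and other constructor arguments such as iceberg_tables are omitted):

# Hypothetical usage sketch, not part of the package diff.
helper = SWSEasyIcebergSparkHelper(
    spark=spark,                      # an existing SparkSession
    dataset_details=dataset_details,  # dict describing the SWS dataset
    dataset_tables=dataset_tables,    # DatasetTables with the Iceberg table ids
    keep_history=False,
    write_csv=False,                  # new in 0.0.129: skip the CSV cache output
)
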
@@ -157,7 +159,7 @@ class SWSEasyIcebergSparkHelper:
             left join reference_data.dim_element_fao d1 on d1.id = oc.dim_element_fao
             left join reference_data.dim_item_cpc d2 on d2.id = oc.dim_item_cpc
             left join reference_data.dim_time_series_years d3 on d3.id = oc.dim_time_series_years
-            where o.replaced_on is null
+            where o.replaced_on is null
         """

         # ----------------
@@ -227,6 +229,45 @@ class SWSEasyIcebergSparkHelper:

         return df_obs_denorm

+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            select o.id,
+                o.value,
+                u.email,
+                o.created_on,
+                o.replaced_on,
+                o.version,
+                o.flag_obs_status,
+                o.flag_method
+        """
+
+        from_statement = f"""
+            from {self.dataset_tables.OBSERVATION.iceberg_id} o
+            join {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+            left join {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} as oc on oc.id = o.observation_coordinates
+        """
+
+        for i, (dim, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code as '{dim}'"
+            from_statement += f"\nleft join {cl.iceberg_id} d{i} on d{i}.id = oc.{dim}"
+
+        final_query = select_statement + from_statement
+        if not self.keep_history:
+            final_query += "where o.replaced_on is null"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based

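
The loop in the method above assembles the final query as two growing strings: every dimension column contributes one projected codelist code to the SELECT list and one left join to the FROM clause. A standalone sketch of that assembly pattern (the dimension names follow the reference_data joins seen earlier in this diff; the obs/obs_coord table names are invented for illustration):

# Illustration of the dynamic query assembly; table names are placeholders.
dim_columns_w_time = ["dim_element_fao", "dim_item_cpc", "dim_time_series_years"]
codelist_ids = [
    "reference_data.dim_element_fao",
    "reference_data.dim_item_cpc",
    "reference_data.dim_time_series_years",
]

select_statement = "select o.id,\n    o.value"
from_statement = "\nfrom obs o\nleft join obs_coord oc on oc.id = o.observation_coordinates"

# Each dimension adds one projected code column and one codelist join.
for i, (dim, cl) in enumerate(zip(dim_columns_w_time, codelist_ids)):
    select_statement += f",\nd{i}.code as '{dim}'"
    from_statement += f"\nleft join {cl} d{i} on d{i}.id = oc.{dim}"

print(select_statement + from_statement)
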
@@ -290,6 +331,32 @@ class SWSEasyIcebergSparkHelper:

         return df_meta_denorm

+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} reference_data.language l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -310,6 +377,26 @@ class SWSEasyIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )

+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_denormalied_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
@@ -322,8 +409,23 @@ class SWSEasyIcebergSparkHelper:
             .drop("m.observation_id")
         )

-    def write_data_to_iceberg_and_csv(self) -> DataFrame:
-        self.df_denorm = self._gen_denormalied_data()
+    def _gen_denormalied_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def write_data_to_iceberg_and_csv(self, sql=False) -> DataFrame:
+        if sql:
+            self.df_denorm = self._gen_denormalied_data_sql()
+        else:
+            self.df_denorm = self._gen_denormalied_data()

         self.df_denorm.writeTo(self.iceberg_tables.TABLE.iceberg_id).createOrReplace()

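
A hypothetical call sketch for the reworked entry point (helper is an already-configured SWSEasyIcebergSparkHelper instance, as in the earlier sketch): sql=True routes the denormalization through the SQL-based methods added above, while the default keeps the original DataFrame-based path; either way the result is written to the Iceberg table and returned.

# Hypothetical usage; 'helper' is assumed to be configured elsewhere.
df_denorm = helper.write_data_to_iceberg_and_csv(sql=True)   # new SQL path
df_denorm.show(5)                                             # inspect a few rows
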
@@ -335,16 +437,16 @@ class SWSEasyIcebergSparkHelper:

         logging.info(f"Iceberg tag '{self.tag_name}' created")

-        df_denorm = self.df_denorm.withColumn(
-            "metadata", F.to_json(col("metadata"))
-        ).coalesce(1)
+        df_denorm = self.df_denorm.withColumn("metadata", F.to_json(col("metadata")))
+        if self.write_csv:
+            df_denorm = df_denorm.coalesce(1)

-        save_cache_csv(
-            df=df_denorm,
-            bucket=self.bucket,
-            prefix=self.iceberg_tables.TABLE.csv_prefix,
-            tag_name=self.tag_name,
-        )
+            save_cache_csv(
+                df=df_denorm,
+                bucket=self.bucket,
+                prefix=self.iceberg_tables.TABLE.csv_prefix,
+                tag_name=self.tag_name,
+            )

         return df_denorm

@@ -372,21 +474,22 @@ class SWSEasyIcebergSparkHelper:
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")

-        new_csv_table = BaseDisseminatedTagTable(
-            id="unfiltered_csv",
-            name="Unfiltered csv",
-            description="Csv table containing all the raw data imported from the SWS and denormalized",
-            layer=TableLayer.CACHE,
-            private=True,
-            debug=True,
-            type=TableType.CSV,
-            path=self.iceberg_tables.TABLE.csv_path,
-            structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
-        )
-        tag = tags.add_dissemination_table(
-            self.dataset_id, self.tag_name, new_csv_table
-        )
-        logging.debug(f"Tag with Added csv Table: {tag}")
+        if self.write_csv:
+            new_csv_table = BaseDisseminatedTagTable(
+                id="unfiltered_csv",
+                name="Unfiltered csv",
+                description="Csv table containing all the raw data imported from the SWS and denormalized",
+                layer=TableLayer.CACHE,
+                private=True,
+                debug=True,
+                type=TableType.CSV,
+                path=self.iceberg_tables.TABLE.csv_path,
+                structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
+            )
+            tag = tags.add_dissemination_table(
+                self.dataset_id, self.tag_name, new_csv_table
+            )
+            logging.debug(f"Tag with Added csv Table: {tag}")

         logging.info("Unfiltered data tags successfully written")

@@ -418,15 +521,18 @@ class SWSEasyIcebergSparkHelper:

         disseminated_tag_df = self.filtered_df.withColumn(
             "metadata", F.to_json(col("metadata"))
-        ).coalesce(1)
-
-        save_cache_csv(
-            df=disseminated_tag_df,
-            bucket=self.bucket,
-            prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
-            tag_name=self.tag_name,
         )

+        if self.write_csv:
+            disseminated_tag_df = disseminated_tag_df.coalesce(1)
+
+            save_cache_csv(
+                df=disseminated_tag_df,
+                bucket=self.bucket,
+                prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
+                tag_name=self.tag_name,
+            )
+
         return disseminated_tag_df

     def write_sws_filtered_dissemination_tag(self, tags: Tags):
@@ -452,20 +558,21 @@ class SWSEasyIcebergSparkHelper:
         )
         logging.debug(f"Tag with Added Iceberg Table: {tag}")

-        new_csv_table = BaseDisseminatedTagTable(
-            id="filtered_csv",
-            name="Filtered csv",
-            description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
-            layer=TableLayer.CACHE,
-            private=True,
-            type=TableType.CSV,
-            path=self.iceberg_tables.TABLE_FILTERED.csv_path,
-            structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
-        )
-        tag = tags.add_dissemination_table(
-            self.dataset_id, self.tag_name, new_csv_table
-        )
+        if self.write_csv:
+            new_csv_table = BaseDisseminatedTagTable(
+                id="filtered_csv",
+                name="Filtered csv",
+                description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
+                layer=TableLayer.CACHE,
+                private=True,
+                type=TableType.CSV,
+                path=self.iceberg_tables.TABLE_FILTERED.csv_path,
+                structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
+            )
+            tag = tags.add_dissemination_table(
+                self.dataset_id, self.tag_name, new_csv_table
+            )

-        logging.debug(f"Tag with Added csv Table: {tag}")
+            logging.debug(f"Tag with Added csv Table: {tag}")

         logging.info("Filtered data tags successfully written")

sws_spark_dissemination_helper-0.0.129/tests/__init__.py
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/.gitignore
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/LICENSE
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/README.md
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/old_requirements.txt
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/requirements.txt
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
RENAMED
File without changes

/sws_spark_dissemination_helper-0.0.127/tests/__init__.py → /sws_spark_dissemination_helper-0.0.129/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/__init__.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/constants.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/utils.py
RENAMED
File without changes

{sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/tests/test.py
RENAMED
File without changes