sws-spark-dissemination-helper 0.0.127.tar.gz → 0.0.129.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +155 -48
  4. sws_spark_dissemination_helper-0.0.129/tests/__init__.py +0 -0
  5. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/.gitignore +0 -0
  6. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/LICENSE +0 -0
  7. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/README.md +0 -0
  8. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/old_requirements.txt +0 -0
  9. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/requirements.txt +0 -0
  10. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
  11. /sws_spark_dissemination_helper-0.0.127/tests/__init__.py → /sws_spark_dissemination_helper-0.0.129/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/constants.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/src/sws_spark_dissemination_helper/utils.py +0 -0
  18. {sws_spark_dissemination_helper-0.0.127 → sws_spark_dissemination_helper-0.0.129}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sws-spark-dissemination-helper
- Version: 0.0.127
+ Version: 0.0.129
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
  Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "sws-spark-dissemination-helper"
- version = "0.0.127"
+ version = "0.0.129"
  dependencies = [
  "annotated-types==0.7.0",
  "boto3==1.36.18",
@@ -25,6 +25,7 @@ class SWSEasyIcebergSparkHelper:
  dataset_details: dict = None,
  dataset_tables: DatasetTables = None,
  keep_history: bool = False,
+ write_csv: bool = True,
  ) -> None:
  self.spark: SparkSession = spark
  self.dataset_details: dict = dataset_details
@@ -35,6 +36,7 @@ class SWSEasyIcebergSparkHelper:
  self.dataset_tables: DatasetTables = dataset_tables
  self.iceberg_tables: IcebergTables = iceberg_tables
  self.keep_history: bool = keep_history
+ self.write_csv: bool = write_csv

  if dataset_details is not None:
  (
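The new `write_csv` flag defaults to `True`, so existing callers keep the previous behaviour; passing `False` skips the cached CSV output and materializes only the Iceberg tables. A minimal sketch of how the flag might be passed at construction time, assuming the class is imported from its module and that the SparkSession and dataset/iceberg table objects are prepared elsewhere; arguments not visible in this diff are omitted.

    # Sketch only: inputs (spark, dataset_details, dataset_tables, iceberg_tables)
    # are assumed to be built by the caller; write_csv is the new parameter.
    from sws_spark_dissemination_helper.SWSEasyIcebergSparkHelper import (
        SWSEasyIcebergSparkHelper,
    )

    helper = SWSEasyIcebergSparkHelper(
        spark=spark,                      # existing SparkSession
        dataset_details=dataset_details,  # dataset description from the SWS
        dataset_tables=dataset_tables,    # DatasetTables with the source table ids
        iceberg_tables=iceberg_tables,    # IcebergTables with the target table ids
        keep_history=False,
        write_csv=False,  # new in 0.0.129: keep only the Iceberg output, no CSV cache
        # ... other constructor arguments omitted for brevity
    )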
@@ -157,7 +159,7 @@ class SWSEasyIcebergSparkHelper:
  left join reference_data.dim_element_fao d1 on d1.id = oc.dim_element_fao
  left join reference_data.dim_item_cpc d2 on d2.id = oc.dim_item_cpc
  left join reference_data.dim_time_series_years d3 on d3.id = oc.dim_time_series_years
- where o.replaced_on is null,
+ where o.replaced_on is null
  """

  # ----------------
@@ -227,6 +229,45 @@ class SWSEasyIcebergSparkHelper:

  return df_obs_denorm

+ def _gen_denormalized_observation_sql(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ select o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version,
+ o.flag_obs_status,
+ o.flag_method
+ """
+
+ from_statement = f"""
+ from {self.dataset_tables.OBSERVATION.iceberg_id} o
+ join {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ left join {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} as oc on oc.id = o.observation_coordinates
+ """
+
+ for i, (dim, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code as '{dim}'"
+ from_statement += f"\nleft join {cl.iceberg_id} d{i} on d{i}.id = oc.{dim}"
+
+ final_query = select_statement + from_statement
+ if not self.keep_history:
+ final_query += "where o.replaced_on is null"
+
+ logging.info("Final query for merging observation and observation_coordinares")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ return df_obs_denorm
+
  def _gen_denormalized_metadata(self) -> DataFrame:
  """Original query upon which the below computation is based

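The new `_gen_denormalized_observation_sql` assembles the denormalization query as a single SQL string, zipping the dataset's dimension columns with their code-list tables instead of building intermediate DataFrames. The sketch below reproduces that string-building loop outside the class to show roughly what it would generate for a hypothetical two-dimension dataset; all table identifiers and dimension names are invented for illustration.

    # Illustrative only: hypothetical iceberg ids and dimension names.
    dim_columns_w_time = ["dim_geographic_area_m49", "dim_time_series_years"]
    codelist_ids = [
        "catalog.reference_data.dim_geographic_area_m49",
        "catalog.reference_data.dim_time_series_years",
    ]

    select_statement = """
    select o.id,
           o.value,
           u.email,
           o.created_on,
           o.replaced_on,
           o.version,
           o.flag_obs_status,
           o.flag_method
    """
    from_statement = """
    from catalog.domain.observation o
    join catalog.domain.user u on u.id = o.created_by
    left join catalog.domain.observation_coordinate oc on oc.id = o.observation_coordinates
    """

    # One extra projected code column and one extra join per dimension,
    # mirroring the loop in _gen_denormalized_observation_sql.
    for i, (dim, cl_id) in enumerate(zip(dim_columns_w_time, codelist_ids)):
        select_statement += f",\nd{i}.code as '{dim}'"
        from_statement += f"\nleft join {cl_id} d{i} on d{i}.id = oc.{dim}"

    final_query = select_statement + from_statement
    # The helper appends this filter only when keep_history is False; the
    # sketch prepends a newline so the concatenated statement stays valid SQL.
    final_query += "\nwhere o.replaced_on is null"

    print(final_query)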
@@ -290,6 +331,32 @@ class SWSEasyIcebergSparkHelper:

  return df_meta_denorm

+ def _gen_denormalized_metadata_sql(self) -> DataFrame:
+ # ----------------
+ # Generate denormalized observation table
+ # ----------------
+
+ logging.info("meta_denorm start")
+
+ df_meta_denorm = self.spark.sql(
+ f"""
+ select m.observation as observation_id,
+ mt.code as type,
+ met.code as element_type,
+ l.country_code as language,
+ me.value
+ from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+ left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+ left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+ left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+ left join {self.dataset_tables.LANGUAGE.iceberg_id} reference_data.language l on l.id = m.language
+ """
+ )
+
+ logging.info("meta_denorm write")
+
+ return df_meta_denorm
+
  def _gen_grouped_metadata(self) -> DataFrame:
  return (
  self._gen_denormalized_metadata()
@@ -310,6 +377,26 @@ class SWSEasyIcebergSparkHelper:
  .agg(F.collect_list("metadata").alias("metadata"))
  )

+ def _gen_grouped_metadata_sql(self) -> DataFrame:
+ return (
+ self._gen_denormalized_metadata_sql()
+ .select(
+ col("observation_id"),
+ F.create_map(
+ lit("type"),
+ col("type"),
+ lit("element_type"),
+ col("element_type"),
+ lit("language"),
+ col("language"),
+ lit("value"),
+ col("value"),
+ ).alias("metadata"),
+ )
+ .groupby("observation_id")
+ .agg(F.collect_list("metadata").alias("metadata"))
+ )
+
  def _gen_denormalied_data(self) -> DataFrame:
  return (
  self._gen_denormalized_observation()
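`_gen_grouped_metadata_sql` mirrors the existing DataFrame variant: each metadata row becomes a `map<string,string>`, and `collect_list` folds all maps belonging to one observation into a single array, which is later serialized with `F.to_json` before the CSV export. A small self-contained sketch of the same pattern on toy data; the column names follow the diff, the data is invented.

    # Sketch of the create_map + collect_list grouping on toy data.
    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.functions import col, lit

    spark = (
        SparkSession.builder.master("local[1]")
        .appName("metadata-grouping-sketch")
        .getOrCreate()
    )

    meta = spark.createDataFrame(
        [
            (1, "GENERAL", "COMMENT", "en", "Estimated value"),
            (1, "GENERAL", "SOURCE", "en", "Country questionnaire"),
            (2, "GENERAL", "COMMENT", "en", "Official figure"),
        ],
        ["observation_id", "type", "element_type", "language", "value"],
    )

    grouped = (
        meta.select(
            col("observation_id"),
            F.create_map(
                lit("type"), col("type"),
                lit("element_type"), col("element_type"),
                lit("language"), col("language"),
                lit("value"), col("value"),
            ).alias("metadata"),
        )
        .groupby("observation_id")
        .agg(F.collect_list("metadata").alias("metadata"))
    )

    # metadata is an array<map<string,string>>; F.to_json turns it into the
    # JSON string that ends up in the cached CSV.
    grouped.withColumn("metadata_json", F.to_json(col("metadata"))).show(truncate=False)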
@@ -322,8 +409,23 @@ class SWSEasyIcebergSparkHelper:
  .drop("m.observation_id")
  )

- def write_data_to_iceberg_and_csv(self) -> DataFrame:
- self.df_denorm = self._gen_denormalied_data()
+ def _gen_denormalied_data_sql(self) -> DataFrame:
+ return (
+ self._gen_denormalized_observation_sql()
+ .alias("o")
+ .join(
+ self._gen_grouped_metadata_sql().alias("m"),
+ col("o.id") == col("m.observation_id"),
+ "left",
+ )
+ .drop("m.observation_id")
+ )
+
+ def write_data_to_iceberg_and_csv(self, sql=False) -> DataFrame:
+ if sql:
+ self.df_denorm = self._gen_denormalied_data_sql()
+ else:
+ self.df_denorm = self._gen_denormalied_data()

  self.df_denorm.writeTo(self.iceberg_tables.TABLE.iceberg_id).createOrReplace()

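`write_data_to_iceberg_and_csv` now takes an `sql` switch: the default `False` keeps the original DataFrame-join path, while `sql=True` routes through the new SQL-based builders. A short usage sketch, assuming `helper` is the instance configured in the earlier example.

    # Sketch: pick the SQL code path when materializing the denormalized table.
    df_denorm = helper.write_data_to_iceberg_and_csv(sql=True)

    # With write_csv=False the Iceberg table and tag are still written, but the
    # coalesce(1) + save_cache_csv step is skipped; the returned DataFrame has
    # the metadata column already rendered as a JSON string.
    print(df_denorm.columns)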
@@ -335,16 +437,16 @@ class SWSEasyIcebergSparkHelper:

  logging.info(f"Iceberg tag '{self.tag_name}' created")

- df_denorm = self.df_denorm.withColumn(
- "metadata", F.to_json(col("metadata"))
- ).coalesce(1)
+ df_denorm = self.df_denorm.withColumn("metadata", F.to_json(col("metadata")))
+ if self.write_csv:
+ df_denorm = df_denorm.coalesce(1)

- save_cache_csv(
- df=df_denorm,
- bucket=self.bucket,
- prefix=self.iceberg_tables.TABLE.csv_prefix,
- tag_name=self.tag_name,
- )
+ save_cache_csv(
+ df=df_denorm,
+ bucket=self.bucket,
+ prefix=self.iceberg_tables.TABLE.csv_prefix,
+ tag_name=self.tag_name,
+ )

  return df_denorm

@@ -372,21 +474,22 @@ class SWSEasyIcebergSparkHelper:
  )
  logging.debug(f"Tag with Added Iceberg Table: {tag}")

- new_csv_table = BaseDisseminatedTagTable(
- id="unfiltered_csv",
- name="Unfiltered csv",
- description="Csv table containing all the raw data imported from the SWS and denormalized",
- layer=TableLayer.CACHE,
- private=True,
- debug=True,
- type=TableType.CSV,
- path=self.iceberg_tables.TABLE.csv_path,
- structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
- )
- tag = tags.add_dissemination_table(
- self.dataset_id, self.tag_name, new_csv_table
- )
- logging.debug(f"Tag with Added csv Table: {tag}")
+ if self.write_csv:
+ new_csv_table = BaseDisseminatedTagTable(
+ id="unfiltered_csv",
+ name="Unfiltered csv",
+ description="Csv table containing all the raw data imported from the SWS and denormalized",
+ layer=TableLayer.CACHE,
+ private=True,
+ debug=True,
+ type=TableType.CSV,
+ path=self.iceberg_tables.TABLE.csv_path,
+ structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
+ )
+ tag = tags.add_dissemination_table(
+ self.dataset_id, self.tag_name, new_csv_table
+ )
+ logging.debug(f"Tag with Added csv Table: {tag}")

  logging.info("Unfiltered data tags successfully written")

@@ -418,15 +521,18 @@ class SWSEasyIcebergSparkHelper:

  disseminated_tag_df = self.filtered_df.withColumn(
  "metadata", F.to_json(col("metadata"))
- ).coalesce(1)
-
- save_cache_csv(
- df=disseminated_tag_df,
- bucket=self.bucket,
- prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
- tag_name=self.tag_name,
  )

+ if self.write_csv:
+ disseminated_tag_df = disseminated_tag_df.coalesce(1)
+
+ save_cache_csv(
+ df=disseminated_tag_df,
+ bucket=self.bucket,
+ prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
+ tag_name=self.tag_name,
+ )
+
  return disseminated_tag_df

  def write_sws_filtered_dissemination_tag(self, tags: Tags):
@@ -452,20 +558,21 @@ class SWSEasyIcebergSparkHelper:
  )
  logging.debug(f"Tag with Added Iceberg Table: {tag}")

- new_csv_table = BaseDisseminatedTagTable(
- id="filtered_csv",
- name="Filtered csv",
- description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
- layer=TableLayer.CACHE,
- private=True,
- type=TableType.CSV,
- path=self.iceberg_tables.TABLE_FILTERED.csv_path,
- structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
- )
- tag = tags.add_dissemination_table(
- self.dataset_id, self.tag_name, new_csv_table
- )
+ if self.write_csv:
+ new_csv_table = BaseDisseminatedTagTable(
+ id="filtered_csv",
+ name="Filtered csv",
+ description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
+ layer=TableLayer.CACHE,
+ private=True,
+ type=TableType.CSV,
+ path=self.iceberg_tables.TABLE_FILTERED.csv_path,
+ structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
+ )
+ tag = tags.add_dissemination_table(
+ self.dataset_id, self.tag_name, new_csv_table
+ )

- logging.debug(f"Tag with Added csv Table: {tag}")
+ logging.debug(f"Tag with Added csv Table: {tag}")

  logging.info("Filtered data tags successfully written")