sws-spark-dissemination-helper 0.0.128__tar.gz → 0.0.130__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +103 -3
  4. sws_spark_dissemination_helper-0.0.130/tests/__init__.py +0 -0
  5. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/.gitignore +0 -0
  6. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/LICENSE +0 -0
  7. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/README.md +0 -0
  8. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/old_requirements.txt +0 -0
  9. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/requirements.txt +0 -0
  10. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
  11. /sws_spark_dissemination_helper-0.0.128/tests/__init__.py → /sws_spark_dissemination_helper-0.0.130/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/constants.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/src/sws_spark_dissemination_helper/utils.py +0 -0
  18. {sws_spark_dissemination_helper-0.0.128 → sws_spark_dissemination_helper-0.0.130}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.128
+Version: 0.0.130
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.128"
+version = "0.0.130"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3==1.36.18",
src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -159,7 +159,7 @@ class SWSEasyIcebergSparkHelper:
         left join reference_data.dim_element_fao d1 on d1.id = oc.dim_element_fao
         left join reference_data.dim_item_cpc d2 on d2.id = oc.dim_item_cpc
         left join reference_data.dim_time_series_years d3 on d3.id = oc.dim_time_series_years
-        where o.replaced_on is null,
+        where o.replaced_on is null
         """

     # ----------------
@@ -229,6 +229,45 @@ class SWSEasyIcebergSparkHelper:

         return df_obs_denorm

+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Build the select and from clauses for the observation join query
+        # ----------------
+
+        select_statement = """
+            select o.id,
+                o.value,
+                u.email,
+                o.created_on,
+                o.replaced_on,
+                o.version,
+                o.flag_obs_status,
+                o.flag_method
+        """
+
+        from_statement = f"""
+            from {self.dataset_tables.OBSERVATION.iceberg_id} o
+            join {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+            left join {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} as oc on oc.id = o.observation_coordinates
+        """
+
+        for i, (dim, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f',\nd{i}.code as "{dim}"'
+            from_statement += f"\nleft join {cl.iceberg_id} d{i} on d{i}.id = oc.{dim}"
+
+        final_query = select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nwhere o.replaced_on is null"
+
+        logging.info("Final query for merging observation and observation_coordinates")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based

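A note on the pattern above: the method assembles the final SQL string by appending one projected code column and one left join per configured dimension. A minimal standalone sketch of that assembly, reusing the dimension and codelist names from the docstring query earlier in this file as placeholders:

```python
# Sketch of the dynamic query assembly; dimension names and table ids
# are illustrative placeholders, not real dataset configuration.
dims = ["dim_element_fao", "dim_item_cpc", "dim_time_series_years"]
codelists = [f"reference_data.{d}" for d in dims]
keep_history = False

select_statement = "select o.id,\n    o.value"
from_statement = (
    "\nfrom observation o"
    "\nleft join observation_coordinate oc on oc.id = o.observation_coordinates"
)

# One projected code column and one left join per dimension.
for i, (dim, cl) in enumerate(zip(dims, codelists)):
    select_statement += f',\n    d{i}.code as "{dim}"'
    from_statement += f"\nleft join {cl} d{i} on d{i}.id = oc.{dim}"

final_query = select_statement + from_statement
if not keep_history:
    final_query += "\nwhere o.replaced_on is null"

print(final_query)
```

Keeping the where clause conditional on keep_history lets the same builder serve both full-history and latest-only exports.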
@@ -292,6 +331,32 @@ class SWSEasyIcebergSparkHelper:

         return df_meta_denorm

+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized metadata table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
@@ -312,6 +377,26 @@ class SWSEasyIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )

+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_denormalied_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
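For context, the grouping step turns each metadata row into a string-to-string map and then collects the maps into one array per observation. A self-contained sketch of that PySpark pattern (session setup and sample rows are illustrative only):

```python
# Demonstrates the create_map + collect_list grouping used above;
# the sample data is made up for illustration.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()
rows = [
    (1, "NOTES", "COMMENT", "en", "first note"),
    (1, "NOTES", "COMMENT", "en", "second note"),
]
df = spark.createDataFrame(
    rows, ["observation_id", "type", "element_type", "language", "value"]
)

grouped = (
    df.select(
        col("observation_id"),
        # Each row becomes a map of its metadata attributes.
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    # All maps for one observation are collected into a single array.
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)
grouped.show(truncate=False)  # one row per observation, metadata as array<map>
```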
@@ -324,8 +409,23 @@
             .drop("m.observation_id")
         )

-    def write_data_to_iceberg_and_csv(self) -> DataFrame:
-        self.df_denorm = self._gen_denormalied_data()
+    def _gen_denormalied_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def write_data_to_iceberg_and_csv(self, sql=False) -> DataFrame:
+        if sql:
+            self.df_denorm = self._gen_denormalied_data_sql()
+        else:
+            self.df_denorm = self._gen_denormalied_data()

         self.df_denorm.writeTo(self.iceberg_tables.TABLE.iceberg_id).createOrReplace()
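Callers can opt into the new SQL-based code path via the sql flag added to write_data_to_iceberg_and_csv. A hypothetical usage sketch; the import path is inferred from the package layout and the constructor arguments are elided, since neither is shown in this diff:

```python
# Hypothetical caller of the 0.0.130 API; not taken from the package docs.
from sws_spark_dissemination_helper.SWSEasyIcebergSparkHelper import (
    SWSEasyIcebergSparkHelper,
)

helper = SWSEasyIcebergSparkHelper(...)  # dataset/table configuration elided

helper.write_data_to_iceberg_and_csv(sql=True)  # new SQL-string pipeline
helper.write_data_to_iceberg_and_csv()          # default DataFrame pipeline (unchanged)
```

Both paths produce the same denormalized dataset; the flag only selects whether the observation/metadata joins are expressed as generated SQL or as DataFrame operations.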