sws-spark-dissemination-helper 0.0.122__tar.gz → 0.0.124__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/PKG-INFO +1 -1
  2. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/pyproject.toml +1 -1
  3. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +8 -7
  4. sws_spark_dissemination_helper-0.0.124/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +471 -0
  5. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/constants.py +3 -0
  6. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/.gitignore +0 -0
  7. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/LICENSE +0 -0
  8. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/README.md +0 -0
  9. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/old_requirements.txt +0 -0
  10. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/requirements.txt +0 -0
  11. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/src/sws_spark_dissemination_helper/utils.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/tests/__init__.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.122 → sws_spark_dissemination_helper-0.0.124}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.122
+Version: 0.0.124
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.122"
+version = "0.0.124"
 dependencies = [
 "annotated-types==0.7.0",
 "boto3==1.36.18",
@@ -454,13 +454,14 @@ class SWSBronzeIcebergSparkHelper:
 
         self.disseminated_tag_df = self.df_bronze
 
-        for dimension_name, codes in dimensions.items():
-            logging.info(f"dimension_name: {dimension_name}")
-            logging.info(f"codes: {codes}")
-            if len(codes) != 0:
-                self.disseminated_tag_df = self.disseminated_tag_df.filter(
-                    col(dimension_name).isin(codes)
-                )
+        if isinstance(dimensions, dict):
+            for dimension_name, codes in dimensions.items():
+                logging.info(f"dimension_name: {dimension_name}")
+                logging.info(f"codes: {codes}")
+                if len(codes) != 0:
+                    self.disseminated_tag_df = self.disseminated_tag_df.filter(
+                        col(dimension_name).isin(codes)
+                    )
 
         self.disseminated_tag_df.writeTo(
             self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
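
For reference, a minimal runnable sketch of the dimension-filter pattern that this hunk now wraps in an isinstance(dimensions, dict) guard: non-dict inputs skip the loop and the unfiltered frame is kept, while empty code lists still leave their dimension unfiltered. The DataFrame, column names and codes below are invented for illustration and are not part of the package.

import logging

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy frame standing in for the bronze DataFrame; column names are examples only.
df = spark.createDataFrame(
    [("4", "2021"), ("8", "2022")], ["geographicAreaM49", "timePointYears"]
)

dimensions = {"geographicAreaM49": ["4"], "timePointYears": []}

filtered = df
if isinstance(dimensions, dict):  # non-dict inputs now skip the filtering loop entirely
    for dimension_name, codes in dimensions.items():
        logging.info(f"dimension_name: {dimension_name}")
        logging.info(f"codes: {codes}")
        if len(codes) != 0:  # an empty code list leaves that dimension unfiltered
            filtered = filtered.filter(col(dimension_name).isin(codes))

filtered.show()  # keeps only the rows whose geographicAreaM49 is in ["4"]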
@@ -0,0 +1,471 @@
+import logging
+from copy import copy
+from typing import Dict, List, Tuple
+
+import pyspark.sql.functions as F
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import col, lit
+from sws_api_client import Tags
+from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
+
+from .constants import DatasetTables, IcebergDatabases, IcebergTables
+from .SWSPostgresSparkReader import SWSPostgresSparkReader
+from .utils import get_or_create_tag, save_cache_csv
+
+
+class SWSEasyIcebergSparkHelper:
+    def __init__(
+        self,
+        spark: SparkSession,
+        bucket: str,
+        tag_name: str,
+        dataset_id: str,
+        sws_postgres_spark_reader: SWSPostgresSparkReader,
+        iceberg_tables: IcebergTables,
+        dataset_details: dict = None,
+        dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+    ) -> None:
+        self.spark: SparkSession = spark
+        self.dataset_details: dict = dataset_details
+        self.bucket: str = bucket
+        self.tag_name: str = tag_name
+        self.dataset_id: str = dataset_id
+        self.sws_postgres_spark_reader = sws_postgres_spark_reader
+        self.dataset_tables: DatasetTables = dataset_tables
+        self.iceberg_tables: IcebergTables = iceberg_tables
+        self.keep_history: bool = keep_history
+
+        if dataset_details is not None:
+            (
+                self.dim_columns_w_time,
+                self.dim_columns,
+                self.time_column,
+                self.flag_columns,
+            ) = self._get_dim_time_flag_columns()
+
+            # ----------------
+            # Get the codelist -> type mapping (e.g. geographicAreaM49 -> area )
+            # ----------------
+
+            self.dim_col_to_id_mapping: Dict[str, str] = (
+                self._get_column_names_to_idmappings("dimension")
+            )
+            self.flag_col_to_id_mapping: Dict[str, str] = (
+                self._get_column_names_to_idmappings("flag")
+            )
+
+        if dataset_tables is not None:
+            self.raw_data, self.raw_reference_data, self.raw_operational_data = (
+                self.sws_postgres_spark_reader.import_data_reference_data_operational_data(
+                    self.dataset_tables
+                )
+            )
+
+            (
+                self.df_observation,
+                self.df_obs_coord,
+                self.df_metadata,
+                self.df_meta_elem,
+            ) = self.raw_data
+
+            (
+                self.df_flag_method,
+                self.df_flag_obs_status,
+                self.df_metadata_type,
+                self.df_meta_elem_type,
+                self.df_language,
+                self.df_unit_of_measure,
+                self.dfs_dimension,
+            ) = self.raw_reference_data
+
+            self.df_user = self.raw_operational_data
+
+    def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
+        """Extract the dimension columns with time, without time, the time column and the flag columns names."""
+        dim_columns_w_time = [
+            dimension["id"] for dimension in self.dataset_details["dimensions"]
+        ]
+        time_column = next(
+            dimension["id"]
+            for dimension in self.dataset_details["dimensions"]
+            if dimension["codelist"]["type"] == "time"
+        )
+        dim_columns = copy(dim_columns_w_time)
+        dim_columns.remove(time_column)
+
+        flag_columns = [flag["id"] for flag in self.dataset_details["flags"]]
+
+        return dim_columns_w_time, dim_columns, time_column, flag_columns
+
+    def _get_column_names_to_idmappings(self, col_type: str) -> Dict[str, str]:
+        """Create a mapping from column names to dimension/flag ids."""
+        return {
+            dimension[f"{col_type}Column"]: dimension["id"]
+            for dimension in self.dataset_details[f"{col_type}s"]
+        }
+
+    def _convert_dim_start_end_date_to_data(self) -> List[DataFrame]:
+        """Prepare the dimension DataFrames for joining by adding the validity date time range."""
+
+        dfs_dimension = [
+            df_dimension.select(
+                "id",
+                "code",
+                F.to_date(F.coalesce("start_date", lit(None))).alias(
+                    f"{dimension_column}_start_date"
+                ),
+                F.to_date(F.coalesce("end_date", lit(None))).alias(
+                    f"{dimension_column}_end_date"
+                ),
+            )
+            for dimension_column, df_dimension in zip(
+                self.dim_columns_w_time, self.dfs_dimension
+            )
+        ]
+
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension
+        ):
+            logging.debug("dimension_column")
+            logging.debug(dimension_column)
+            logging.debug("df_dimension.columns")
+            logging.debug(df_dimension.columns)
+
+        return dfs_dimension
+
+    def _gen_denormalized_observation(self) -> DataFrame:
+        """Original query upon which the below computation is based
+
+        select o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on, // To remove (always null)
+            o.version,
+            o.flag_obs_status,
+            o.flag_method,
+            d0.code as "geographic_area_m49",
+            d1.code as "element_fao",
+            d2.code as "item_cpc ",
+            d3.code as "time_series_years",
+            ...
+        from <dataset_id>.observation o
+        join operational_data.user u ON u.id = o.created_by
+        left join <dataset_id>.observation_coordinate as oc on oc.id = o.observation_coordinates
+        left join reference_data.dim_geographic_area_m49 d0 on d0.id = oc.dim_geographic_area_m49
+        left join reference_data.dim_element_fao d1 on d1.id = oc.dim_element_fao
+        left join reference_data.dim_item_cpc d2 on d2.id = oc.dim_item_cpc
+        left join reference_data.dim_time_series_years d3 on d3.id = oc.dim_time_series_years
+        where o.replaced_on is null,
+        """
+
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        df_observation = self.df_observation.withColumnsRenamed(
+            self.flag_col_to_id_mapping
+        )
+
+        df_obs_coord = self.df_obs_coord.withColumnsRenamed(
+            self.dim_col_to_id_mapping
+        ).drop("approved_observation", "num_version")
+
+        logging.debug("df_observation.columns")
+        logging.debug(df_observation.columns)
+        logging.debug("df_obs_coord.columns")
+        logging.debug(df_obs_coord.columns)
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("obs_denorm start")
+
+        # Join observations with user and observation coordinate
+        if not self.keep_history:
+            df_observation = df_observation.where(col("replaced_on").isNull())
+
+        df_intermediate = (
+            # Keep only the latest version of an observation
+            df_observation.alias("o")
+            # Join the user with the observation
+            .join(
+                F.broadcast(self.df_user).alias("u"),
+                col("o.created_by") == col("u.id"),
+            )
+            .select("o.*", "u.email")
+            .alias("o")
+            .join(
+                df_obs_coord.withColumnRenamed("id", "join_id").alias("oc"),
+                col("o.observation_coordinates") == col("oc.join_id"),
+                "left",
+            )
+            .drop("join_id")
+        )
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            df_intermediate = (
+                df_intermediate.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.join_id"),
+                )
+                .drop(f"{dimension_column}", "join_id")
+                .withColumnRenamed("code", dimension_column)
+            )
+
+        df_obs_denorm = df_intermediate
+
+        return df_obs_denorm
+
+    def _gen_denormalized_metadata(self) -> DataFrame:
+        """Original query upon which the below computation is based
+
+        select m.observation as observation_id,
+            mt.code as type,
+            met.code as element_type,
+            l.country_code as language,
+            me.value
+        from <dataset_id>.metadata_element me
+        left join <dataset_id>.metadata m on m.id = me.metadata
+        left join reference_data.metadata_element_type met on met.id = me.metadata_element_type
+        left join reference_data.metadata_type mt on mt.id = m.metadata_type
+        left join reference_data.language l on l.id = m.language
+        """
+
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = (
+            self.df_meta_elem.select("metadata", "metadata_element_type", "value")
+            .alias("me")
+            .join(
+                self.df_metadata.alias("m"), col("me.metadata") == col("m.id"), "left"
+            )
+            .select("me.*", "m.id", "m.observation", "m.metadata_type", "m.language")
+            .alias("md")
+            .join(
+                self.df_meta_elem_type.alias("met"),
+                col("md.metadata_element_type") == col("met.id"),
+                "left",
+            )
+            .select("md.*", col("met.code").alias("element_type"))
+            .alias("md")
+            .join(
+                self.df_metadata_type.alias("mt"),
+                col("md.metadata_type") == col("mt.id"),
+                "left",
+            )
+            .select("md.*", col("mt.code").alias("type"))
+            .withColumnRenamed("language", "join_language")
+            .alias("md")
+            .join(
+                self.df_language.alias("l"),
+                col("md.join_language") == col("l.id"),
+                "left",
+            )
+            .select("md.*", col("l.country_code").alias("language"))
+            .select(
+                col("observation").alias("observation_id"),
+                "type",
+                "element_type",
+                "language",
+                "value",
+            )
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
+    def _gen_grouped_metadata(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
+    def _gen_denormalied_data(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def write_data_to_iceberg_and_csv(self) -> DataFrame:
+        self.df_denorm = self._gen_denormalied_data()
+
+        self.df_denorm.writeTo(self.iceberg_tables.TABLE.iceberg_id).createOrReplace()
+
+        logging.info(f"Iceberg table written to {self.iceberg_tables.TABLE.iceberg_id}")
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.TABLE.iceberg_id} CREATE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"Iceberg tag '{self.tag_name}' created")
+
+        df_denorm = self.df_denorm.withColumn(
+            "metadata", F.to_json(col("metadata"))
+        ).coalesce(1)
+
+        save_cache_csv(
+            df=df_denorm,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.TABLE.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df_denorm
+
+    def write_sws_dissemination_tag(self, tags: Tags):
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"unfiltered_iceberg",
+            name=f"Unfiltered Iceberg",
+            description="Iceberg table containing all the raw data imported from the SWS and denormalized",
+            layer=TableLayer.CACHE,
+            private=True,
+            debug=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.BRONZE_SCHEME,
+            table=self.iceberg_tables.TABLE.table,
+            path=self.iceberg_tables.TABLE.path,
+            structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
+            pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_iceberg_table
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_csv_table = BaseDisseminatedTagTable(
+            id="unfiltered_csv",
+            name="Unfiltered csv",
+            description="Csv table containing all the raw data imported from the SWS and denormalized",
+            layer=TableLayer.CACHE,
+            private=True,
+            debug=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.TABLE.csv_path,
+            structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_csv_table
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        logging.info("Unfiltered data tags successfully written")
+
+    def write_filtered_data_to_iceberg_and_csv(
+        self, dimensions: Dict[str, List[str]]
+    ) -> DataFrame:
+
+        self.filtered_df = self.df_denorm
+
+        for dimension_name, codes in dimensions.items():
+            logging.info(f"dimension_name: {dimension_name}")
+            logging.info(f"codes: {codes}")
+            if len(codes) != 0:
+                self.filtered_df = self.filtered_df.filter(
+                    col(dimension_name).isin(codes)
+                )
+
+        self.filtered_df.writeTo(
+            self.iceberg_tables.TABLE_FILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Filtered table written to {self.iceberg_tables.TABLE_FILTERED.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.TABLE_FILTERED.iceberg_id} CREATE TAG `{self.tag_name}`"
+        )
+
+        disseminated_tag_df = self.filtered_df.withColumn(
+            "metadata", F.to_json(col("metadata"))
+        ).coalesce(1)
+
+        save_cache_csv(
+            df=disseminated_tag_df,
+            bucket=self.bucket,
+            prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
+            tag_name=self.tag_name,
+        )
+
+        return disseminated_tag_df
+
+    def write_bronze_sws_filtered_disseminated_tag(self, tags: Tags):
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id="filtered_iceberg",
+            name="Filtered Iceberg",
+            description="Iceberg table containing the raw data imported from the SWS, denormalized and filtered per dimension",
+            layer=TableLayer.CACHE,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.BRONZE_DATABASE,
+            table=self.iceberg_tables.TABLE_FILTERED.table,
+            path=self.iceberg_tables.TABLE_FILTERED.path,
+            structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
+            pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_iceberg_table
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_csv_table = BaseDisseminatedTagTable(
+            id="filtered_csv",
+            name="Filtered csv",
+            description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
+            layer=TableLayer.CACHE,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.TABLE_FILTERED.csv_path,
+            structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_csv_table
+        )
+
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        logging.info("Filtered data tags successfully written")
@@ -218,6 +218,9 @@ class IcebergTables:
         self.__dataset_id = dataset_id
         self.__tag_name = tag_name
 
+        # TODO Fix later with a more appropriate DATABASE
+        self.TABLE = self._create_iceberg_table("BRONZE")
+        self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
         self.BRONZE = self._create_iceberg_table("BRONZE")
         self.BRONZE_DISS_TAG = self._create_iceberg_table("BRONZE", suffix="diss_tag")
         self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
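
For reference, a sketch of how the new SWSEasyIcebergSparkHelper added in 0.0.124 might be driven end to end. Only the import paths, constructor keywords and method names come from the diff above; the wrapper function run_easy_dissemination and the idea of receiving the collaborators ready-made are assumptions made for illustration.

from typing import Dict, List

from pyspark.sql import SparkSession
from sws_api_client import Tags

from sws_spark_dissemination_helper.constants import DatasetTables, IcebergTables
from sws_spark_dissemination_helper.SWSEasyIcebergSparkHelper import SWSEasyIcebergSparkHelper
from sws_spark_dissemination_helper.SWSPostgresSparkReader import SWSPostgresSparkReader


def run_easy_dissemination(
    spark: SparkSession,
    bucket: str,
    tag_name: str,
    dataset_id: str,
    postgres_reader: SWSPostgresSparkReader,
    iceberg_tables: IcebergTables,
    dataset_details: dict,
    dataset_tables: DatasetTables,
    tags: Tags,
    dimensions: Dict[str, List[str]],
) -> None:
    # Hypothetical driver: the reader, Iceberg table definitions, tags client and
    # dataset metadata are assumed to be constructed elsewhere.
    helper = SWSEasyIcebergSparkHelper(
        spark=spark,
        bucket=bucket,
        tag_name=tag_name,
        dataset_id=dataset_id,
        sws_postgres_spark_reader=postgres_reader,
        iceberg_tables=iceberg_tables,
        dataset_details=dataset_details,
        dataset_tables=dataset_tables,
    )

    # Denormalize observations plus metadata, write the unfiltered Iceberg table
    # and its CSV cache, then register both tables on the dissemination tag.
    helper.write_data_to_iceberg_and_csv()
    helper.write_sws_dissemination_tag(tags)

    # Apply the per-dimension code filters and publish the filtered outputs.
    helper.write_filtered_data_to_iceberg_and_csv(dimensions)
    helper.write_bronze_sws_filtered_disseminated_tag(tags)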