sws-spark-dissemination-helper 0.0.93__py3-none-any.whl → 0.0.183__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,723 @@
1
+ import logging
2
+ from copy import copy
3
+ from typing import Dict, List, Tuple, Union
4
+
5
+ import pyspark.sql.functions as F
6
+ from pyspark.sql import DataFrame, SparkSession
7
+ from pyspark.sql.functions import col, lit
8
+ from sws_api_client import Tags
9
+ from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
10
+
11
+ from .constants import DatasetTables, IcebergDatabases, IcebergTables
12
+ from .SWSPostgresSparkReader import SWSPostgresSparkReader
13
+ from .utils import get_or_create_tag, save_cache_csv
14
+
15
+
16
+ class SWSEasyIcebergSparkHelper:
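+ """Helper that denormalizes SWS observation and metadata tables with Spark and publishes them as Iceberg tables, cached CSVs and SWS dissemination tags."""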
17
+ def __init__(
18
+ self,
19
+ spark: SparkSession,
20
+ bucket: str,
21
+ tag_name: str,
22
+ dataset_id: str,
23
+ sws_postgres_spark_reader: SWSPostgresSparkReader,
24
+ iceberg_tables: IcebergTables,
25
+ dataset_details: dict = None,
26
+ dataset_tables: DatasetTables = None,
27
+ keep_history: bool = False,
28
+ write_csv: bool = True,
29
+ source_tag: Union[str, None] = None,
30
+ ) -> None:
31
+ self.spark: SparkSession = spark
32
+ self.dataset_details: dict = dataset_details
33
+ self.bucket: str = bucket
34
+ self.tag_name: str = tag_name
35
+ self.dataset_id: str = dataset_id
36
+ self.sws_postgres_spark_reader = sws_postgres_spark_reader
37
+ self.dataset_tables: DatasetTables = dataset_tables
38
+ self.iceberg_tables: IcebergTables = iceberg_tables
39
+ self.keep_history: bool = keep_history
40
+ self.write_csv: bool = write_csv
41
+ self.source_tag: Union[str, None] = source_tag
42
+
43
+ if dataset_details is not None:
44
+ (
45
+ self.dim_columns_w_time,
46
+ self.dim_columns,
47
+ self.time_column,
48
+ self.flag_columns,
49
+ ) = self._get_dim_time_flag_columns()
50
+
51
+ # ----------------
52
+ # Map the raw table column names to the dimension/flag ids used in the denormalized output
53
+ # ----------------
54
+
55
+ self.dim_col_to_id_mapping: Dict[str, str] = (
56
+ self._get_column_names_to_idmappings("dimension")
57
+ )
58
+ self.flag_col_to_id_mapping: Dict[str, str] = (
59
+ self._get_column_names_to_idmappings("flag")
60
+ )
61
+
62
+ if dataset_tables is not None:
63
+ self.raw_data, self.raw_reference_data, self.raw_operational_data = (
64
+ self.sws_postgres_spark_reader.import_data_reference_data_operational_data(
65
+ self.dataset_tables
66
+ )
67
+ )
68
+
69
+ (
70
+ self.df_observation,
71
+ self.df_obs_coord,
72
+ self.df_metadata,
73
+ self.df_meta_elem,
74
+ self.df_tag_observation,
75
+ ) = self.raw_data
76
+
77
+ logging.info(self.raw_reference_data)
78
+ (
79
+ self.df_flag_method,
80
+ self.df_flag_obs_status,
81
+ self.df_metadata_type,
82
+ self.df_meta_elem_type,
83
+ self.df_language,
84
+ self.df_unit_of_measure,
85
+ self.df_dataset,
86
+ self.dfs_dimension,
87
+ ) = self.raw_reference_data
88
+
89
+ (self.df_user, self.df_tag) = self.raw_operational_data
90
+
91
+ def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
92
+ """Extract the dimension columns with time, without time, the time column and the flag columns names."""
93
+ dim_columns_w_time = [
94
+ dimension["id"] for dimension in self.dataset_details["dimensions"]
95
+ ]
96
+ time_column = next(
97
+ dimension["id"]
98
+ for dimension in self.dataset_details["dimensions"]
99
+ if dimension["codelist"]["type"] == "time"
100
+ )
101
+ dim_columns = copy(dim_columns_w_time)
102
+ dim_columns.remove(time_column)
103
+
104
+ flag_columns = [flag["id"] for flag in self.dataset_details["flags"]]
105
+
106
+ return dim_columns_w_time, dim_columns, time_column, flag_columns
107
+
108
+ def _get_column_names_to_idmappings(self, col_type: str) -> Dict[str, str]:
109
+ """Create a mapping from column names to dimension/flag ids."""
110
+ return {
111
+ dimension[f"{col_type}Column"]: dimension["id"]
112
+ for dimension in self.dataset_details[f"{col_type}s"]
113
+ }
114
+
115
+ def _convert_dim_start_end_date_to_data(self) -> List[DataFrame]:
116
+ """Prepare the dimension DataFrames for joining by adding the validity date time range."""
117
+
118
+ dfs_dimension = [
119
+ df_dimension.select(
120
+ "id",
121
+ "code",
122
+ F.to_date(F.coalesce("start_date", lit(None))).alias(
123
+ f"{dimension_column}_start_date"
124
+ ),
125
+ F.to_date(F.coalesce("end_date", lit(None))).alias(
126
+ f"{dimension_column}_end_date"
127
+ ),
128
+ )
129
+ for dimension_column, df_dimension in zip(
130
+ self.dim_columns_w_time, self.dfs_dimension
131
+ )
132
+ ]
133
+
134
+ for dimension_column, df_dimension in zip(
135
+ self.dim_columns_w_time, dfs_dimension
136
+ ):
137
+ logging.debug("dimension_column")
138
+ logging.debug(dimension_column)
139
+ logging.debug("df_dimension.columns")
140
+ logging.debug(df_dimension.columns)
141
+
142
+ return dfs_dimension
143
+
144
+ def _gen_denormalized_observation(self) -> DataFrame:
145
+ """Original query upon which the below computation is based
146
+
147
+ select o.id,
148
+ o.value,
149
+ u.email,
150
+ o.created_on,
151
+ o.replaced_on, -- to remove (always null)
152
+ o.version,
153
+ o.flag_obs_status,
154
+ o.flag_method,
155
+ d0.code as "geographic_area_m49",
156
+ d1.code as "element_fao",
157
+ d2.code as "item_cpc ",
158
+ d3.code as "time_series_years",
159
+ ...
160
+ from <dataset_id>.observation o
161
+ join operational_data.user u ON u.id = o.created_by
162
+ left join <dataset_id>.observation_coordinate as oc on oc.id = o.observation_coordinates
163
+ left join reference_data.dim_geographic_area_m49 d0 on d0.id = oc.dim_geographic_area_m49
164
+ left join reference_data.dim_element_fao d1 on d1.id = oc.dim_element_fao
165
+ left join reference_data.dim_item_cpc d2 on d2.id = oc.dim_item_cpc
166
+ left join reference_data.dim_time_series_years d3 on d3.id = oc.dim_time_series_years
167
+ where o.replaced_on is null
168
+ """
169
+
170
+ # ----------------
171
+ # Prepare dataframes for the joins
172
+ # ----------------
173
+
174
+ df_observation = self.df_observation.withColumnsRenamed(
175
+ self.flag_col_to_id_mapping
176
+ )
177
+
178
+ df_obs_coord = self.df_obs_coord.withColumnsRenamed(
179
+ self.dim_col_to_id_mapping
180
+ ).drop("approved_observation", "num_version")
181
+
182
+ logging.debug("df_observation.columns")
183
+ logging.debug(df_observation.columns)
184
+ logging.debug("df_obs_coord.columns")
185
+ logging.debug(df_obs_coord.columns)
186
+
187
+ dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
188
+
189
+ # ----------------
190
+ # Generate denormalized observation table
191
+ # ----------------
192
+
193
+ logging.info("obs_denorm start")
194
+
195
+ # Join observations with user and observation coordinate
196
+ if not self.keep_history:
197
+ df_observation = df_observation.where(col("replaced_on").isNull())
198
+
199
+ df_intermediate = (
200
+ # Keep only the latest version of an observation
201
+ df_observation.alias("o")
202
+ # Join the user with the observation
203
+ .join(
204
+ F.broadcast(self.df_user).alias("u"),
205
+ col("o.created_by") == col("u.id"),
206
+ )
207
+ .select("o.*", "u.email")
208
+ .alias("o")
209
+ .join(
210
+ df_obs_coord.withColumnRenamed("id", "join_id").alias("oc"),
211
+ col("o.observation_coordinates") == col("oc.join_id"),
212
+ "left",
213
+ )
214
+ .drop("join_id")
215
+ )
216
+
217
+ # Join all the dimension codelists
218
+ for dimension_column, df_dimension in zip(
219
+ self.dim_columns_w_time, dfs_dimension_w_validity
220
+ ):
221
+ df_intermediate = (
222
+ df_intermediate.alias("o")
223
+ .join(
224
+ F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
225
+ "d"
226
+ ),
227
+ col(f"{dimension_column}") == col("d.join_id"),
228
+ )
229
+ .drop(f"{dimension_column}", "join_id")
230
+ .withColumnRenamed("code", dimension_column)
231
+ )
232
+
233
+ df_obs_denorm = df_intermediate
234
+
235
+ return df_obs_denorm
236
+
237
+ def _gen_denormalized_observation_sql(self) -> DataFrame:
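+ """Denormalize the observations via Spark SQL and write the result to the DENORMALIZED_OBSERVATION Iceberg table."""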
238
+ # ----------------
239
+ # Build the SELECT/FROM clauses for the joins
240
+ # ----------------
241
+
242
+ select_statement = """
243
+ o.id,
244
+ o.value,
245
+ u.email,
246
+ o.created_on,
247
+ o.replaced_on,
248
+ o.version"""
249
+
250
+ from_statement = f"""
251
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
252
+ JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
253
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
254
+
255
+ hint_statement = ""
256
+
257
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
258
+ for flag_col in self.flag_columns:
259
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
260
+
261
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
262
+ for i, (dim_col, cl) in enumerate(
263
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
264
+ ):
265
+ select_statement += f",\nd{i}.code AS {dim_col}"
266
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
267
+ hint_statement = (
268
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
269
+ if hint_statement
270
+ else f"BROADCAST({cl.iceberg_id})"
271
+ )
272
+
273
+ hint_statement = "/*+ " + hint_statement + " */"
274
+
275
+ final_query = "SELECT " + hint_statement + select_statement + from_statement
276
+ if not self.keep_history:
277
+ final_query += "\nWHERE o.replaced_on IS NULL"
278
+
279
+ logging.info("Final query for merging observation and observation_coordinates")
280
+ logging.info(final_query)
281
+
282
+ df_obs_denorm = self.spark.sql(final_query)
283
+
284
+ df_obs_denorm.writeTo(
285
+ self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id
286
+ ).createOrReplace()
287
+
288
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_OBSERVATION.table} write")
289
+
290
+ return df_obs_denorm
291
+
292
+ def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
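+ """Denormalize via Spark SQL only the observations belonging to the source tag (returns the DataFrame without writing it)."""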
293
+ # ----------------
294
+ # Build the SELECT/FROM clauses for the joins
295
+ # ----------------
296
+
297
+ select_statement = """
298
+ o.id,
299
+ o.value,
300
+ u.email,
301
+ o.created_on,
302
+ o.replaced_on,
303
+ o.version"""
304
+
305
+ from_statement = f"""
306
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
307
+ INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} tobs ON o.id = tobs.observation
308
+ INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON tobs.tag = t.id
309
+ INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
310
+ LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
311
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
312
+
313
+ hint_statement = ""
314
+
315
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
316
+ for flag_col in self.flag_columns:
317
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
318
+
319
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
320
+ for i, (dim_col, cl) in enumerate(
321
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
322
+ ):
323
+ select_statement += f",\nd{i}.code AS {dim_col}"
324
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
325
+ hint_statement = (
326
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
327
+ if hint_statement
328
+ else f"BROADCAST({cl.iceberg_id})"
329
+ )
330
+
331
+ hint_statement = "/*+ " + hint_statement + " */"
332
+
333
+ # TODO Add tag name as a parameter
334
+ where_statement = (
335
+ f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
336
+ )
337
+
338
+ final_query = (
339
+ "SELECT "
340
+ + hint_statement
341
+ + select_statement
342
+ + from_statement
343
+ + where_statement
344
+ )
345
+ if not self.keep_history:
346
+ final_query += "\n AND o.replaced_on IS NULL"
347
+
348
+ logging.info("Final query for merging observation and observation_coordinares")
349
+ logging.info(final_query)
350
+
351
+ df_obs_denorm = self.spark.sql(final_query)
352
+
353
+ return df_obs_denorm
354
+
355
+ def _gen_denormalized_metadata(self) -> DataFrame:
356
+ """Original query upon which the below computation is based
357
+
358
+ select m.observation as observation_id,
359
+ mt.code as type,
360
+ met.code as element_type,
361
+ l.country_code as language,
362
+ me.value
363
+ from <dataset_id>.metadata_element me
364
+ left join <dataset_id>.metadata m on m.id = me.metadata
365
+ left join reference_data.metadata_element_type met on met.id = me.metadata_element_type
366
+ left join reference_data.metadata_type mt on mt.id = m.metadata_type
367
+ left join reference_data.language l on l.id = m.language
368
+ """
369
+
370
+ # ----------------
371
+ # Generate denormalized metadata table
372
+ # ----------------
373
+
374
+ logging.info("meta_denorm start")
375
+
376
+ df_meta_denorm = (
377
+ self.df_meta_elem.select("metadata", "metadata_element_type", "value")
378
+ .alias("me")
379
+ .join(
380
+ self.df_metadata.alias("m"), col("me.metadata") == col("m.id"), "left"
381
+ )
382
+ .select("me.*", "m.id", "m.observation", "m.metadata_type", "m.language")
383
+ .alias("md")
384
+ .join(
385
+ self.df_meta_elem_type.alias("met"),
386
+ col("md.metadata_element_type") == col("met.id"),
387
+ "left",
388
+ )
389
+ .select("md.*", col("met.code").alias("element_type"))
390
+ .alias("md")
391
+ .join(
392
+ self.df_metadata_type.alias("mt"),
393
+ col("md.metadata_type") == col("mt.id"),
394
+ "left",
395
+ )
396
+ .select("md.*", col("mt.code").alias("type"))
397
+ .withColumnRenamed("language", "join_language")
398
+ .alias("md")
399
+ .join(
400
+ self.df_language.alias("l"),
401
+ col("md.join_language") == col("l.id"),
402
+ "left",
403
+ )
404
+ .select("md.*", col("l.country_code").alias("language"))
405
+ .select(
406
+ col("observation").alias("observation_id"),
407
+ "type",
408
+ "element_type",
409
+ "language",
410
+ "value",
411
+ )
412
+ )
413
+
414
+ logging.info("meta_denorm write")
415
+
416
+ return df_meta_denorm
417
+
418
+ def _gen_denormalized_metadata_sql(self) -> DataFrame:
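+ """Denormalize the metadata via Spark SQL and write the result to the DENORMALIZED_METADATA Iceberg table."""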
419
+ # ----------------
420
+ # Generate denormalized metadata table
421
+ # ----------------
422
+
423
+ logging.info("meta_denorm start")
424
+
425
+ df_meta_denorm = self.spark.sql(
426
+ f"""
427
+ select
428
+ /*+
429
+ BROADCAST({self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id}),
430
+ BROADCAST({self.dataset_tables.METADATA_TYPE.iceberg_id}),
431
+ BROADCAST({self.dataset_tables.LANGUAGE.iceberg_id})
432
+ */
433
+ m.observation as observation_id,
434
+ mt.code as type,
435
+ met.code as element_type,
436
+ l.country_code as language,
437
+ me.value
438
+ from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
439
+ left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
440
+ left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
441
+ left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
442
+ left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
443
+ """
444
+ )
445
+
446
+ df_meta_denorm.writeTo(
447
+ self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id
448
+ ).createOrReplace()
449
+
450
+ logging.info(f"{self.iceberg_tables.DENORMALIZED_METADATA.table} write")
451
+
452
+ return df_meta_denorm
453
+
454
+ def _gen_grouped_metadata(self) -> DataFrame:
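+ """Group the denormalized metadata into one list of maps per observation_id."""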
455
+ return (
456
+ self._gen_denormalized_metadata()
457
+ .select(
458
+ col("observation_id"),
459
+ F.create_map(
460
+ lit("type"),
461
+ col("type"),
462
+ lit("element_type"),
463
+ col("element_type"),
464
+ lit("language"),
465
+ col("language"),
466
+ lit("value"),
467
+ col("value"),
468
+ ).alias("metadata"),
469
+ )
470
+ .groupby("observation_id")
471
+ .agg(F.collect_list("metadata").alias("metadata"))
472
+ )
473
+
474
+ def _gen_grouped_metadata_sql(self) -> DataFrame:
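+ """Group the denormalized metadata via Spark SQL and write the result to the GROUPED_METADATA Iceberg table."""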
475
+ df_meta_grouped = self.spark.sql(
476
+ f"""
477
+ SELECT
478
+ observation_id,
479
+ collect_list(
480
+ map(
481
+ 'type', type,
482
+ 'element_type', element_type,
483
+ 'language', language,
484
+ 'value', value
485
+ )
486
+ ) AS metadata
487
+ FROM {self.iceberg_tables.DENORMALIZED_METADATA.iceberg_id}
488
+ GROUP BY observation_id
489
+ """
490
+ )
491
+
492
+ df_meta_grouped.writeTo(
493
+ self.iceberg_tables.GROUPED_METADATA.iceberg_id
494
+ ).createOrReplace()
495
+
496
+ logging.info(f"{self.iceberg_tables.GROUPED_METADATA.table} write")
497
+
498
+ return df_meta_grouped
499
+
500
+ def _gen_denormalized_data(self) -> DataFrame:
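+ """Join the denormalized observations with the grouped metadata using DataFrame operations."""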
501
+ return (
502
+ self._gen_denormalized_observation()
503
+ .alias("o")
504
+ .join(
505
+ self._gen_grouped_metadata().alias("m"),
506
+ col("o.id") == col("m.observation_id"),
507
+ "left",
508
+ )
509
+ .drop("m.observation_id")
510
+ )
511
+
512
+ def _gen_denormalized_data_sql(self) -> DataFrame:
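+ """Materialize the denormalized observation and metadata tables via Spark SQL and join them into the full dataset."""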
513
+ self._gen_denormalized_observation_sql()
514
+ self._gen_denormalized_metadata_sql()
515
+ self._gen_grouped_metadata_sql()
516
+
517
+ return self.spark.sql(
518
+ f"""
519
+ SELECT
520
+ o.*,
521
+ m.metadata
522
+ FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
523
+ LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
524
+ ON o.id = m.observation_id
525
+ """
526
+ )
527
+
528
+ def _gen_denormalized_data_sql_from_tag(self) -> DataFrame:
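+ """Join the tag-filtered denormalized observations with the grouped metadata."""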
529
+ return (
530
+ self._gen_denormalized_observation_sql_from_tag()
531
+ .alias("o")
532
+ .join(
533
+ self._gen_grouped_metadata_sql().alias("m"),
534
+ col("o.id") == col("m.observation_id"),
535
+ "left",
536
+ )
537
+ .drop("m.observation_id")
538
+ )
539
+
540
+ def write_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
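+ """Generate the denormalized dataset (SQL or DataFrame path), write it to the main Iceberg table, create the Iceberg tag and optionally save a CSV cache."""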
541
+ if sql:
542
+ self.df_denorm = self._gen_denormalized_data_sql()
543
+ else:
544
+ self.df_denorm = self._gen_denormalized_data()
545
+
546
+ self.df_denorm.writeTo(self.iceberg_tables.TABLE.iceberg_id).createOrReplace()
547
+
548
+ logging.info(f"Iceberg table written to {self.iceberg_tables.TABLE.iceberg_id}")
549
+
550
+ self.spark.sql(
551
+ f"ALTER TABLE {self.iceberg_tables.TABLE.iceberg_id} CREATE TAG `{self.tag_name}`"
552
+ )
553
+
554
+ logging.info(f"Iceberg tag '{self.tag_name}' created")
555
+
556
+ df_denorm = self.df_denorm.withColumn("metadata", F.to_json(col("metadata")))
557
+ if self.write_csv:
558
+ df_denorm = df_denorm.coalesce(1)
559
+
560
+ save_cache_csv(
561
+ df=df_denorm,
562
+ bucket=self.bucket,
563
+ prefix=self.iceberg_tables.TABLE.csv_prefix,
564
+ tag_name=self.tag_name,
565
+ )
566
+
567
+ return df_denorm
568
+
569
+ def write_sws_dissemination_tag(self, tags: Tags):
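+ """Register the unfiltered Iceberg table (and CSV cache, if written) on the SWS dissemination tag."""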
570
+ # Get or create a new tag
571
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
572
+ logging.debug(f"Tag: {tag}")
573
+
574
+ new_iceberg_table = BaseDisseminatedTagTable(
575
+ id=f"unfiltered_iceberg",
576
+ name=f"Unfiltered Iceberg",
577
+ description="Iceberg table containing all the raw data imported from the SWS and denormalized",
578
+ layer=TableLayer.CACHE,
579
+ private=True,
580
+ debug=True,
581
+ type=TableType.ICEBERG,
582
+ database=IcebergDatabases.BRONZE_SCHEME,
583
+ table=self.iceberg_tables.TABLE.table,
584
+ path=self.iceberg_tables.TABLE.path,
585
+ structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
586
+ pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
587
+ )
588
+ tag = tags.add_dissemination_table(
589
+ self.dataset_id, self.tag_name, new_iceberg_table
590
+ )
591
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
592
+
593
+ if self.write_csv:
594
+ new_csv_table = BaseDisseminatedTagTable(
595
+ id="unfiltered_csv",
596
+ name="Unfiltered csv",
597
+ description="Csv table containing all the raw data imported from the SWS and denormalized",
598
+ layer=TableLayer.CACHE,
599
+ private=True,
600
+ debug=True,
601
+ type=TableType.CSV,
602
+ path=self.iceberg_tables.TABLE.csv_path,
603
+ structure={"columns": self.df_denorm.schema.jsonValue()["fields"]},
604
+ )
605
+ tag = tags.add_dissemination_table(
606
+ self.dataset_id, self.tag_name, new_csv_table
607
+ )
608
+ logging.debug(f"Tag with Added csv Table: {tag}")
609
+
610
+ logging.info("Unfiltered data tags successfully written")
611
+
612
+ def write_filtered_data_to_iceberg_and_csv(
613
+ self, dimensions: Dict[str, List[str]] = None, from_tag=False
614
+ ) -> DataFrame:
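+ """Filter the denormalized data by the requested dimension codes, write it to the filtered Iceberg table, tag it and optionally save a CSV cache."""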
615
+
616
+ if from_tag:
617
+ self.filtered_df = self._gen_denormalized_data_sql_from_tag()
618
+ else:
619
+ self.filtered_df = self.df_denorm
620
+
621
+ for dimension_name, codes in (dimensions or {}).items():  # tolerate dimensions=None (apply no filter)
622
+ logging.info(f"dimension_name: {dimension_name}")
623
+ logging.info(f"codes: {codes}")
624
+ if len(codes) != 0:
625
+ self.filtered_df = self.filtered_df.filter(
626
+ col(dimension_name).isin(codes)
627
+ )
628
+
629
+ self.filtered_df.writeTo(
630
+ self.iceberg_tables.TABLE_FILTERED.iceberg_id
631
+ ).createOrReplace()
632
+
633
+ logging.info(
634
+ f"Filtered table written to {self.iceberg_tables.TABLE_FILTERED.iceberg_id}"
635
+ )
636
+
637
+ self.spark.sql(
638
+ f"ALTER TABLE {self.iceberg_tables.TABLE_FILTERED.iceberg_id} CREATE TAG `{self.tag_name}`"
639
+ )
640
+
641
+ disseminated_tag_df = self.filtered_df.withColumn(
642
+ "metadata", F.to_json(col("metadata"))
643
+ )
644
+
645
+ if self.write_csv:
646
+ disseminated_tag_df = disseminated_tag_df.coalesce(1)
647
+
648
+ save_cache_csv(
649
+ df=disseminated_tag_df,
650
+ bucket=self.bucket,
651
+ prefix=f"{self.iceberg_tables.TABLE_FILTERED.csv_prefix}",
652
+ tag_name=self.tag_name,
653
+ )
654
+
655
+ return disseminated_tag_df
656
+
657
+ def write_sws_filtered_dissemination_tag(self, tags: Tags):
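+ """Register the filtered Iceberg table (and CSV cache, if written) on the SWS dissemination tag."""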
658
+ # Get or create a new tag
659
+ tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
660
+ logging.debug(f"Tag: {tag}")
661
+
662
+ new_iceberg_table = BaseDisseminatedTagTable(
663
+ id="filtered_iceberg",
664
+ name="Filtered Iceberg",
665
+ description="Iceberg table containing the raw data imported from the SWS, denormalized and filtered per dimension",
666
+ layer=TableLayer.CACHE,
667
+ private=True,
668
+ type=TableType.ICEBERG,
669
+ database=IcebergDatabases.BRONZE_DATABASE,
670
+ table=self.iceberg_tables.TABLE_FILTERED.table,
671
+ path=self.iceberg_tables.TABLE_FILTERED.path,
672
+ structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
673
+ pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
674
+ )
675
+ tag = tags.add_dissemination_table(
676
+ self.dataset_id, self.tag_name, new_iceberg_table
677
+ )
678
+ logging.debug(f"Tag with Added Iceberg Table: {tag}")
679
+
680
+ if self.write_csv:
681
+ new_csv_table = BaseDisseminatedTagTable(
682
+ id="filtered_csv",
683
+ name="Filtered csv",
684
+ description="Csv table containing the raw data imported from the SWS, denormalized and filtered per dimension cached",
685
+ layer=TableLayer.CACHE,
686
+ private=True,
687
+ type=TableType.CSV,
688
+ path=self.iceberg_tables.TABLE_FILTERED.csv_path,
689
+ structure={"columns": self.filtered_df.schema.jsonValue()["fields"]},
690
+ )
691
+ tag = tags.add_dissemination_table(
692
+ self.dataset_id, self.tag_name, new_csv_table
693
+ )
694
+
695
+ logging.debug(f"Tag with Added csv Table: {tag}")
696
+
697
+ logging.info("Filtered data tags successfully written")
698
+
699
+
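A minimal usage sketch of the unfiltered flow, assuming a SparkSession, an SWSPostgresSparkReader, the IcebergTables and DatasetTables descriptors, the dataset details dictionary and a sws_api_client Tags client have already been built for the target dataset (spark, reader, iceberg_tables, dataset_tables, dataset_details and tags are placeholders; the bucket, tag and dataset names are hypothetical):

    helper = SWSEasyIcebergSparkHelper(
        spark=spark,
        bucket="my-dissemination-bucket",      # hypothetical bucket
        tag_name="2024_release",               # hypothetical dissemination tag
        dataset_id="my_dataset",               # hypothetical dataset id
        sws_postgres_spark_reader=reader,
        iceberg_tables=iceberg_tables,
        dataset_details=dataset_details,
        dataset_tables=dataset_tables,
        keep_history=False,
        write_csv=True,
    )

    # Denormalize, write the unfiltered Iceberg table (plus CSV cache) and register it on the SWS tag.
    helper.write_data_to_iceberg_and_csv(sql=True)
    helper.write_sws_dissemination_tag(tags)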
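The filtered flow follows the same pattern. With from_tag=False the helper filters the df_denorm built by write_data_to_iceberg_and_csv; with from_tag=True it rebuilds the data from the source_tag passed to the constructor. A sketch with a hypothetical dimensions filter (the dimension ids and code lists depend on the dataset):

    # Keep only the listed codes per dimension; an empty list leaves that dimension unfiltered.
    dimensions = {
        "geographicAreaM49": ["004", "008"],   # hypothetical dimension id and codes
        "timePointYears": [],                  # hypothetical time dimension, left unfiltered
    }

    helper.write_filtered_data_to_iceberg_and_csv(dimensions=dimensions, from_tag=False)
    helper.write_sws_filtered_dissemination_tag(tags)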