sws-spark-dissemination-helper 0.0.99__tar.gz → 0.0.168__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of sws-spark-dissemination-helper might be problematic.
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/.gitignore +1 -1
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/PKG-INFO +5 -5
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/pyproject.toml +5 -5
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +317 -95
- sws_spark_dissemination_helper-0.0.168/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +244 -4
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +54 -31
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +104 -3
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/__init__.py +1 -0
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/constants.py +67 -18
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/utils.py +18 -0
- sws_spark_dissemination_helper-0.0.168/tests/__init__.py +0 -0
- sws_spark_dissemination_helper-0.0.99/old_requirements.txt +0 -23
- sws_spark_dissemination_helper-0.0.99/requirements.txt +0 -23
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/README.md +0 -0
- /sws_spark_dissemination_helper-0.0.99/tests/__init__.py → /sws_spark_dissemination_helper-0.0.168/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- {sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/tests/test.py +0 -0
{sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/.gitignore
RENAMED
@@ -2,7 +2,6 @@
 # You should customize this list as applicable to your project.
 # Learn more about .gitignore:
 # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
-.*
 
 # Node artifact files
 node_modules/
@@ -49,3 +48,4 @@ Thumbs.db
 *.mov
 *.wmv
 
+.venv/
{sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/PKG-INFO
RENAMED
@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.99
+Version: 0.0.168
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
-Project-URL: Repository, https://
+Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
 License: MIT License
 
@@ -31,8 +31,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.9
 Requires-Dist: annotated-types==0.7.0
-Requires-Dist: boto3
-Requires-Dist: botocore
+Requires-Dist: boto3>=1.36.18
+Requires-Dist: botocore>=1.36.18
 Requires-Dist: certifi==2025.1.31
 Requires-Dist: charset-normalizer==3.4.1
 Requires-Dist: idna==3.10
@@ -49,7 +49,7 @@ Requires-Dist: pytz==2025.1
 Requires-Dist: requests==2.32.3
 Requires-Dist: s3transfer==0.11.2
 Requires-Dist: six==1.17.0
-Requires-Dist: sws-api-client==1.
+Requires-Dist: sws-api-client==1.5.3
 Requires-Dist: typing-extensions==4.12.2
 Requires-Dist: tzdata==2025.1
 Requires-Dist: urllib3==1.26.20
{sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/pyproject.toml
RENAMED
@@ -4,11 +4,11 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.99"
+version = "0.0.168"
 dependencies = [
     "annotated-types==0.7.0",
-    "boto3",
-    "botocore",
+    "boto3>=1.36.18",
+    "botocore>=1.36.18",
     "certifi==2025.1.31",
     "charset-normalizer==3.4.1",
     "idna==3.10",
@@ -25,7 +25,7 @@ dependencies = [
     "requests==2.32.3",
     "s3transfer==0.11.2",
     "six==1.17.0",
-    "sws_api_client==1.
+    "sws_api_client==1.5.3",
     "typing_extensions==4.12.2",
     "tzdata==2025.1",
     "urllib3==1.26.20"
@@ -42,4 +42,4 @@ classifiers = [
 ]
 
 [project.urls]
-Repository = "https://
+Repository = "https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper"
{sws_spark_dissemination_helper-0.0.99 → sws_spark_dissemination_helper-0.0.168}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
RENAMED
@@ -1,7 +1,7 @@
 import logging
 import time
 from copy import copy
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
 
 import pyspark.sql.functions as F
 from pyspark.sql import DataFrame, SparkSession
@@ -26,6 +26,9 @@ class SWSBronzeIcebergSparkHelper:
         domain_code: str,
         dataset_details: dict = None,
         dataset_tables: DatasetTables = None,
+        keep_history: bool = False,
+        write_csv: bool = True,
+        source_tag: Union[str, None] = None,
     ) -> None:
         self.spark: SparkSession = spark
         self.dataset_details: dict = dataset_details
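The three constructor flags added above drive the new code paths in this release: keep_history keeps superseded observations (rows with a non-null replaced_on), write_csv toggles the CSV cache, and source_tag names the SWS tag consumed by the new *_from_tag methods. A minimal construction sketch; the surrounding keyword arguments are assumed wiring (they appear as self attributes elsewhere in the class), and only the last three keywords come from this diff:

helper = SWSBronzeIcebergSparkHelper(
    spark=spark,                    # an existing SparkSession
    iceberg_tables=iceberg_tables,  # IcebergTables instance, assumed built upstream
    dataset_id=dataset_id,          # assumed identifiers
    tag_name=tag_name,
    domain_code=domain_code,
    dataset_tables=dataset_tables,
    keep_history=False,         # False: only rows with replaced_on IS NULL survive
    write_csv=True,             # also cache the bronze output as CSV
    source_tag="2024_release",  # hypothetical SWS tag name for the from_tag path
)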
@@ -36,6 +39,9 @@ class SWSBronzeIcebergSparkHelper:
         self.dataset_tables: DatasetTables = dataset_tables
         self.iceberg_tables: IcebergTables = iceberg_tables
         self.domain_code = domain_code
+        self.keep_history: bool = keep_history
+        self.write_csv: bool = write_csv
+        self.source_tag: Union[str, None] = source_tag
 
         if dataset_details is not None:
             (
@@ -83,6 +89,7 @@ class SWSBronzeIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data
 
         (
@@ -92,10 +99,11 @@ class SWSBronzeIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data
 
-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -150,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
 
         return dfs_dimension
 
-    def _prepare_element_uom(self) -> DataFrame:
+    def _prepare_element_uom(self) -> Union[DataFrame, None]:
         """Prepare the element and unit of measure join."""
 
         # Get the element DataFrame
@@ -162,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
             if dimension_column == self.element_column
         )
 
-        # Join the element and the unit_of_measure
-        df_element_uom = (
-            df_element.alias("e")
-            .join(
-                self.df_unit_of_measure.alias("u"),
-                col("e.unit_of_measure") == col("u.id"),
-            )
-            .select(
-                col("e.code").alias("element_code"),
-                col("u.code").alias("unit_of_measure"),
-                col("u.symbol").alias("unit_of_measure_symbol"),
-                col("u.base_unit").alias("unit_of_measure_base_unit"),
-                col("u.multiplier").alias("unit_of_measure_multiplier"),
+        if any("unit_of_measure" == column.lower() for column in df_element.columns):
+            # Join the element and the unit_of_measure
+            df_element_uom = (
+                df_element.alias("e")
+                .join(
+                    self.df_unit_of_measure.alias("u"),
+                    col("e.unit_of_measure") == col("u.id"),
+                )
+                .select(
+                    col("e.code").alias("element_code"),
+                    col("u.code").alias("unit_of_measure"),
+                    col("u.symbol").alias("unit_of_measure_symbol"),
+                    col("u.base_unit").alias("unit_of_measure_base_unit"),
+                    col("u.multiplier").alias("unit_of_measure_multiplier"),
+                )
             )
-        )
-
+            return df_element_uom
 
     def _gen_denormalized_observation(self) -> DataFrame:
         """Original query upon which the below computation is based
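With this change _prepare_element_uom returns a DataFrame only when the element codelist actually carries a unit_of_measure column, and implicitly returns None otherwise, so callers now guard the join. A self-contained sketch of the same guard-then-join pattern on toy data (all column values invented, not the package's):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

df_element = spark.createDataFrame([("E1", 10)], ["code", "unit_of_measure"])
df_uom = spark.createDataFrame([(10, "t", "t")], ["id", "code", "symbol"])

df_element_uom = None
if any(c.lower() == "unit_of_measure" for c in df_element.columns):
    df_element_uom = (
        df_element.alias("e")
        .join(df_uom.alias("u"), F.col("e.unit_of_measure") == F.col("u.id"))
        .select(F.col("e.code").alias("element_code"), F.col("u.code").alias("uom"))
    )

# Downstream joins run only when the mapping exists:
if df_element_uom is not None:
    df_element_uom.show()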
@@ -270,20 +279,170 @@ class SWSBronzeIcebergSparkHelper:
                 .withColumnRenamed("code", dimension_column)
             )
 
-        df_intermediate = (
-            df_intermediate.alias("d")
-            .join(
-                F.broadcast(df_element_uom).alias("e"),
-                col(f"d.{self.element_column}") == col("e.element_code"),
-                "left",
+        if df_element_uom is not None:
+            df_intermediate = (
+                df_intermediate.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
             )
-            .drop("element_code")
-        )
 
         df_obs_denorm = df_intermediate
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        final_query = "SELECT " + hint_statement + select_statement + from_statement
+        if not self.keep_history:
+            final_query += "\nWHERE o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        df_element_uom = self._prepare_element_uom()
+
+        dfs_dimension_w_validity = self._convert_dim_start_end_date_to_data()
+
+        # Join all the dimension codelists
+        for dimension_column, df_dimension in zip(
+            self.dim_columns_w_time, dfs_dimension_w_validity
+        ):
+            logging.debug(f"Joining dimension column: {dimension_column}")
+            logging.debug(f"df_obs_denorm columns: {df_obs_denorm.columns}")
+            logging.debug(
+                f"Is dimension {dimension_column} in the dataframe? {dimension_column in df_obs_denorm.columns}"
+            )
+            df_obs_denorm = (
+                df_obs_denorm.alias("o")
+                .join(
+                    F.broadcast(df_dimension.withColumnRenamed("id", "join_id")).alias(
+                        "d"
+                    ),
+                    col(f"{dimension_column}") == col("d.code"),
+                )
+                .drop("code", "join_id")
+            )
+            logging.debug(f"After join count: {df_obs_denorm.count()}")
+
+        if df_element_uom is not None:
+            df_obs_denorm = (
+                df_obs_denorm.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
+            )
+            logging.debug(f"After uom count: {df_obs_denorm.count()}")
+
+        return df_obs_denorm
+
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
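For readers who want to see what _gen_denormalized_observation_sql emits, here is a standalone sketch of the same string assembly with toy mappings; every identifier below (table ids, column names) is invented for illustration, and running it prints a query of the shape the method hands to spark.sql:

# Toy stand-ins for self.flag_col_to_id_mapping etc. (all names hypothetical)
flag_cols = {"flag_observation_status": "flagObservationStatus"}  # db col -> output name
dim_cols = {"dim_geographic_area": "geographic_area"}             # db col -> output name
codelists = ["bronze.cl_geographic_area"]                         # Iceberg table ids

select_stmt = "\no.id,\no.value,\nu.email,\no.created_on,\no.replaced_on,\no.version"
from_stmt = (
    "\nFROM bronze.observation o"
    "\nJOIN bronze.user u ON u.id = o.created_by"
    "\nLEFT JOIN bronze.observation_coordinate AS oc ON oc.id = o.observation_coordinates"
)
hint = ""
for db_col, out_col in flag_cols.items():
    select_stmt += f",\no.{db_col} AS {out_col}"
for i, ((db_col, out_col), cl) in enumerate(zip(dim_cols.items(), codelists)):
    select_stmt += f",\nd{i}.code AS {out_col}"
    from_stmt += f"\nLEFT JOIN {cl} d{i} ON d{i}.id = oc.{db_col}"
    hint = hint + f", BROADCAST({cl})" if hint else f"BROADCAST({cl})"

final_query = "SELECT " + "/*+ " + hint + " */" + select_stmt + from_stmt
final_query += "\nWHERE o.replaced_on IS NULL"  # appended when keep_history is False
print(final_query)

Building one statement this way lets Spark plan all the codelist joins at once, with the broadcast strategy made explicit through the /*+ BROADCAST(...) */ hint rather than left to size heuristics.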
@@ -347,6 +506,32 @@ class SWSBronzeIcebergSparkHelper:
 
         return df_meta_denorm
 
+    def _gen_denormalized_metadata_sql(self) -> DataFrame:
+        # ----------------
+        # Generate denormalized observation table
+        # ----------------
+
+        logging.info("meta_denorm start")
+
+        df_meta_denorm = self.spark.sql(
+            f"""
+            select m.observation as observation_id,
+                mt.code as type,
+                met.code as element_type,
+                l.country_code as language,
+                me.value
+            from {self.dataset_tables.METADATA_ELEMENT.iceberg_id} me
+            left join {self.dataset_tables.METADATA.iceberg_id} m on m.id = me.metadata
+            left join {self.dataset_tables.METADATA_ELEMENT_TYPE.iceberg_id} met on met.id = me.metadata_element_type
+            left join {self.dataset_tables.METADATA_TYPE.iceberg_id} mt on mt.id = m.metadata_type
+            left join {self.dataset_tables.LANGUAGE.iceberg_id} l on l.id = m.language
+            """
+        )
+
+        logging.info("meta_denorm write")
+
+        return df_meta_denorm
+
     def _gen_grouped_metadata(self) -> DataFrame:
         return (
             self._gen_denormalized_metadata()
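The metadata denormalization above is a straight chain of left joins from metadata_element up to its element type, metadata type, and language. A toy, runnable sketch of the same chain with in-memory DataFrames (all ids and codes invented):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
me = spark.createDataFrame([(1, 100, 7, "note")], ["id", "metadata", "metadata_element_type", "value"])
m = spark.createDataFrame([(100, 55, 3, 2)], ["id", "observation", "metadata_type", "language"])
met = spark.createDataFrame([(7, "COMMENT")], ["id", "code"])
mt = spark.createDataFrame([(3, "GENERAL")], ["id", "code"])
lang = spark.createDataFrame([(2, "en")], ["id", "country_code"])

(me.alias("me")
 .join(m.alias("m"), F.col("m.id") == F.col("me.metadata"), "left")
 .join(met.alias("met"), F.col("met.id") == F.col("me.metadata_element_type"), "left")
 .join(mt.alias("mt"), F.col("mt.id") == F.col("m.metadata_type"), "left")
 .join(lang.alias("l"), F.col("l.id") == F.col("m.language"), "left")
 .select(F.col("m.observation").alias("observation_id"),
         F.col("mt.code").alias("type"),
         F.col("met.code").alias("element_type"),
         F.col("l.country_code").alias("language"),
         F.col("me.value"))
 ).show()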
@@ -367,6 +552,26 @@ class SWSBronzeIcebergSparkHelper:
             .agg(F.collect_list("metadata").alias("metadata"))
         )
 
+    def _gen_grouped_metadata_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_metadata_sql()
+            .select(
+                col("observation_id"),
+                F.create_map(
+                    lit("type"),
+                    col("type"),
+                    lit("element_type"),
+                    col("element_type"),
+                    lit("language"),
+                    col("language"),
+                    lit("value"),
+                    col("value"),
+                ).alias("metadata"),
+            )
+            .groupby("observation_id")
+            .agg(F.collect_list("metadata").alias("metadata"))
+        )
+
     def _gen_bronze_data(self) -> DataFrame:
         return (
             self._gen_denormalized_observation()
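Like its DataFrame-API twin, the new _gen_grouped_metadata_sql folds each observation's metadata rows into an array of string-to-string maps. A toy, runnable illustration of the resulting shape (values invented):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(1, "GENERAL", "COMMENT", "en", "estimated value")],
    ["observation_id", "type", "element_type", "language", "value"],
)
grouped = (
    df.select(
        col("observation_id"),
        F.create_map(
            lit("type"), col("type"),
            lit("element_type"), col("element_type"),
            lit("language"), col("language"),
            lit("value"), col("value"),
        ).alias("metadata"),
    )
    .groupby("observation_id")
    .agg(F.collect_list("metadata").alias("metadata"))
)
grouped.printSchema()  # metadata: array<map<string,string>>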
@@ -379,9 +584,37 @@ class SWSBronzeIcebergSparkHelper:
             .drop("m.observation_id")
         )
 
+    def _gen_bronze_data_sql(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
+    def _gen_bronze_data_sql_from_tag(self) -> DataFrame:
+        return (
+            self._gen_denormalized_observation_sql_from_tag()
+            .alias("o")
+            .join(
+                self._gen_grouped_metadata_sql().alias("m"),
+                col("o.id") == col("m.observation_id"),
+                "left",
+            )
+            .drop("m.observation_id")
+        )
+
     # TODO decouple data generation and data writing
-    def write_bronze_data_to_iceberg_and_csv(self) -> DataFrame:
-
+    def write_bronze_data_to_iceberg_and_csv(self, sql=True) -> DataFrame:
+
+        if sql:
+            self.df_bronze = self._gen_bronze_data_sql()
+        else:
+            self.df_bronze = self._gen_bronze_data()
 
         self.df_bronze.writeTo(self.iceberg_tables.BRONZE.iceberg_id).createOrReplace()
 
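write_bronze_data_to_iceberg_and_csv now defaults to the pure-SQL pipeline, with the DataFrame-API path kept behind sql=False. A usage sketch, assuming a helper constructed as in the earlier example:

helper.write_bronze_data_to_iceberg_and_csv(sql=True)   # new single-statement plan with BROADCAST hints
helper.write_bronze_data_to_iceberg_and_csv(sql=False)  # previous DataFrame-API join chain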
@@ -390,15 +623,6 @@ class SWSBronzeIcebergSparkHelper:
         self.spark.sql(
             f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id} CREATE TAG `{self.tag_name}`"
         )
-        while (
-            self.spark.sql(
-                f"SELECT * FROM {self.iceberg_tables.BRONZE.iceberg_id}.refs"
-            )
-            .filter((col("type") == lit("TAG")) & (col("name") == lit(self.tag_name)))
-            .count()
-        ) == 0:
-            logging.info(f"Waiting for the tag {self.tag_name} to be created")
-            time.sleep(2)
 
         logging.info(f"bronze tag '{self.tag_name}' created")
 
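The polling loop is dropped, presumably because spark.sql executes the ALTER TABLE ... CREATE TAG DDL eagerly, so the tag already exists when the call returns. If a one-off sanity check were still wanted, Iceberg's refs metadata table (the same one the removed loop queried) can be read once; table and tag names below are hypothetical:

refs = spark.sql("SELECT name, type FROM my_catalog.bronze.my_table.refs")
assert refs.filter("type = 'TAG' AND name = 'my_tag'").count() == 1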
@@ -426,11 +650,13 @@ class SWSBronzeIcebergSparkHelper:
             description="Bronze table containing all the raw data imported from the SWS and denormalized",
             layer=TableLayer.BRONZE,
             private=True,
+            debug=True,
             type=TableType.ICEBERG,
             database=IcebergDatabases.BRONZE_DATABASE,
             table=self.iceberg_tables.BRONZE.table,
             path=self.iceberg_tables.BRONZE.path,
             structure={"columns": self.df_bronze.schema.jsonValue()["fields"]},
+            pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
         )
         tag = tags.add_dissemination_table(
             self.dataset_id, self.tag_name, new_iceberg_table
@@ -443,6 +669,7 @@ class SWSBronzeIcebergSparkHelper:
             description="Bronze table containing all the raw data imported from the SWS and denormalized cached in csv",
             layer=TableLayer.BRONZE,
             private=True,
+            debug=True,
             type=TableType.CSV,
             path=self.iceberg_tables.BRONZE.csv_path,
             structure={"columns": self.df_bronze.schema.jsonValue()["fields"]},
@@ -455,66 +682,34 @@ class SWSBronzeIcebergSparkHelper:
         logging.info("Bronze Dissemination tags successfully written")
 
     def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
-        self, dimensions: Dict[str, List[str]]
+        self, dimensions: Dict[str, List[str]] = {}, from_tag=False
     ) -> DataFrame:
 
-
-
-
-
+        if from_tag:
+            self.disseminated_tag_df = self._gen_bronze_data_sql_from_tag()
+        else:
+            self.disseminated_tag_df = self.df_bronze
 
-
-
-
-
+        if not from_tag and dimensions is not None and len(dimensions) != 0:
+            for dimension_name, codes in dimensions.items():
+                logging.info(f"dimension_name: {dimension_name}")
+                logging.info(f"codes: {codes}")
+                if len(codes) != 0:
+                    self.disseminated_tag_df = self.disseminated_tag_df.filter(
+                        col(dimension_name).isin(codes)
+                    )
 
-
-        while (
-            self.spark.sql(
-                f"SELECT * FROM {self.iceberg_tables.BRONZE.iceberg_id}.refs"
-            )
-            .filter(
-                (col("type") == lit("BRANCH"))
-                & (col("name") == lit(f"diss_tag_{self.tag_name}"))
-            )
-            .count()
-        ) == 0:
-            logging.info(
-                f"Waiting for the branch {self.tag_name} diss_tag_{self.tag_name} to be created"
-            )
-            time.sleep(2)
-
-        logging.info(f"result of create_branch_query: {create_branch_query_result}")
-
-        self.disseminated_tag_df = self.spark.read.option(
-            "branch", f"`diss_tag_{self.tag_name}`"
-        ).table(self.iceberg_tables.BRONZE.iceberg_id)
-
-        logging.info(f"dimensions: {dimensions}")
-        for dimension_name, codes in dimensions.items():
-            logging.info(f"dimension_name: {dimension_name}")
-            logging.info(f"codes: {codes}")
-            if len(codes) != 0:
-                # not_in_codes = ",".join([f"'{code}'" for code in codes])
-                # delete_from_branch_query = f"DELETE FROM {self.iceberg_tables.BRONZE.iceberg_id}.`branch_diss_tag_{self.tag_name}` WHERE {dimension_name} NOT IN ({not_in_codes})"
-                # logging.info(f"delete_from_branch_query: {delete_from_branch_query}")
-                # delete_from_branch_query_result = self.spark.sql(
-                #     delete_from_branch_query
-                # ).collect()
-
-                # logging.info(
-                #     f"result of delete_from_branch_query: {delete_from_branch_query_result}"
-                # )
-                self.disseminated_tag_df = self.disseminated_tag_df.filter(
-                    col(dimension_name).isin(codes)
-                )
+        self.disseminated_tag_df.writeTo(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+        ).createOrReplace()
 
-
-
-
+        logging.info(
+            f"Bronze disseminated tag table written to {self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id}"
+        )
 
-        self.
-            f"{self.iceberg_tables.
-        )
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id} CREATE TAG `{self.tag_name}`"
+        )
 
         disseminated_tag_df = self.disseminated_tag_df.withColumn(
             "metadata", F.to_json(col("metadata"))
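The rewritten method no longer snapshots the bronze table into an Iceberg branch; it writes a dedicated BRONZE_DISS_TAG table instead, either filtering the in-memory bronze DataFrame by dimension codes or rebuilding the rows from a named SWS tag. A hedged usage sketch (dimension name and codes invented; helper as constructed earlier):

# Filter the bronze data down to selected dimension codes:
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(
    dimensions={"geographic_area": ["004", "008"]}
)

# Or source the rows from the SWS tag passed as source_tag at construction:
helper.write_bronze_disseminated_tag_data_to_iceberg_and_csv(from_tag=True)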
@@ -523,7 +718,7 @@ class SWSBronzeIcebergSparkHelper:
         save_cache_csv(
             df=disseminated_tag_df,
             bucket=self.bucket,
-            prefix=f"{self.iceberg_tables.
+            prefix=f"{self.iceberg_tables.BRONZE_DISS_TAG.csv_prefix}",
             tag_name=self.tag_name,
         )
 
@@ -542,11 +737,12 @@ class SWSBronzeIcebergSparkHelper:
             private=True,
             type=TableType.ICEBERG,
             database=IcebergDatabases.BRONZE_DATABASE,
-            table=self.iceberg_tables.
-            path=self.iceberg_tables.
+            table=self.iceberg_tables.BRONZE_DISS_TAG.table,
+            path=self.iceberg_tables.BRONZE_DISS_TAG.path,
             structure={
                 "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
             },
+            pinned_columns=[*self.dim_columns_w_time, "value", *self.flag_columns],
         )
         tag = tags.add_dissemination_table(
             self.dataset_id, self.tag_name, new_iceberg_table
@@ -561,7 +757,7 @@ class SWSBronzeIcebergSparkHelper:
             private=True,
             type=TableType.CSV,
             # TODO Correct the path in the origin library
-            path=self.iceberg_tables.
+            path=self.iceberg_tables.BRONZE_DISS_TAG.csv_path,
             structure={
                 "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
             },
@@ -573,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Disseminated tag with selection successfully written")
+
+
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
+2
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+2
+1
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
+2
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+2
+1
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1