sws-spark-dissemination-helper 0.0.93__py3-none-any.whl → 0.0.183__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +321 -57
- sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +324 -16
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +96 -31
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +117 -16
- sws_spark_dissemination_helper/__init__.py +1 -0
- sws_spark_dissemination_helper/constants.py +93 -25
- sws_spark_dissemination_helper/utils.py +24 -6
- {sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/METADATA +11 -11
- sws_spark_dissemination_helper-0.0.183.dist-info/RECORD +13 -0
- sws_spark_dissemination_helper-0.0.93.dist-info/RECORD +0 -11
- {sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/licenses/LICENSE +0 -0
sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py

@@ -8,13 +8,9 @@ from pyspark.sql.functions import col, lit
 from sws_api_client import Tags
 from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
 
-from .constants import IcebergDatabases, IcebergTables
+from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
-from .utils import (
-    get_or_create_tag,
-    save_cache_csv,
-    upsert_disseminated_table,
-)
+from .utils import get_or_create_tag, save_cache_csv, upsert_disseminated_table
 
 
 class SWSGoldIcebergSparkHelper:
@@ -66,6 +62,12 @@ class SWSGoldIcebergSparkHelper:
             if col_name in self.dim_columns
         }
 
+        self.display_decimals = (
+            self.sws_postgres_spark_reader.get_display_decimals_datatable(
+                domain_code=domain_code
+            )
+        )
+
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
         dim_columns_w_time = [
@@ -86,23 +88,97 @@ class SWSGoldIcebergSparkHelper:
     def apply_diss_flag_filter(self, df: DataFrame) -> DataFrame:
         return df.filter(col("diss_flag"))
 
-    def keep_dim_val_attr_columns(
-
+    def keep_dim_val_attr_columns(
+        self, df: DataFrame, additional_columns: List[str] = []
+    ):
+        cols_to_keep_sws = self.cols_to_keep_sws
+        for additional_column in additional_columns:
+            if additional_column in df.columns:
+                cols_to_keep_sws = cols_to_keep_sws + [additional_column]
+        if "unit_of_measure_symbol" in df.columns:
+            cols_to_keep_sws = cols_to_keep_sws + ["unit_of_measure_symbol"]
+        return df.select(*cols_to_keep_sws)
+
+    def round_to_display_decimals(self, df: DataFrame):
+        col1_name, col2_name = (
+            self.display_decimals.select("column_1_name", "column_2_name")
+            .distinct()
+            .collect()[0]
+        )
+        if col1_name.lower() not in [column.lower() for column in df.columns]:
+            raise ValueError(
+                f"{col1_name} is not part of the columns available for this dataset ({df.columns})"
+            )
+        if col2_name.lower() not in [column.lower() for column in df.columns]:
+            raise ValueError(
+                f"{col2_name} is not part of the columns available for this dataset ({df.columns})"
+            )
+
+        df = (
+            df.alias("d")
+            .join(
+                self.display_decimals.alias("dd"),
+                on=(col(f"d.{col1_name}") == col("dd.column_1_value"))
+                & (col(f"d.{col2_name}") == col("dd.column_2_value")),
+                how="left",
+            )
+            .select("d.*", "dd.display_decimals")
+        )
+
+        df.filter(col("display_decimals").isNull()).select(
+            col1_name, col2_name
+        ).distinct()
+        logging.warning(
+            f"The following combinations of {col1_name} and {col2_name} are not available in the table {DatasetDatatables.DISPLAY_DECIMALS.name} and will be assigned to 0"
+        )
+
+        df = df.withColumn(
+            "display_decimals",
+            F.coalesce(col("display_decimals"), lit("0")).cast("INT"),
+        ).withColumn(
+            "value",
+            F.round(
+                F.col("value").cast("FLOAT") * F.pow(10, F.col("display_decimals")), 0
+            )
+            / F.pow(10, F.col("display_decimals")).cast("STRING"),
+        )
+
+        # F.round(
+        #     col("value").cast("FLOAT"), col("display_decimals").cast("INT")
+        # ).cast("STRING"),
+
+        return df
+
+    def read_bronze_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+        )
 
     def read_silver_data(self) -> DataFrame:
         return self.spark.read.option("tag", self.tag_name).table(
             self.iceberg_tables.SILVER.iceberg_id
         )
 
-    def gen_gold_sws_disseminated_data(
+    def gen_gold_sws_disseminated_data(
+        self, additional_columns: List[str] = []
+    ) -> DataFrame:
         return (
             self.read_silver_data()
             .transform(self.apply_diss_flag_filter)
-            .transform(self.keep_dim_val_attr_columns)
+            .transform(self.keep_dim_val_attr_columns, additional_columns)
         )
 
-    def
-        return self.
+    def gen_gold_sws_data(self, additional_columns: List[str] = []) -> DataFrame:
+        return self.read_bronze_data().transform(
+            self.keep_dim_val_attr_columns, additional_columns
+        )
+
+    def gen_gold_sws_validated_data(
+        self, additional_columns: List[str] = []
+    ) -> DataFrame:
+        return self.read_silver_data().transform(
+            self.keep_dim_val_attr_columns, additional_columns
+        )
 
     def write_gold_sws_validated_data_to_iceberg_and_csv(
         self, df: DataFrame
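
Two details of this hunk are easy to miss. First, `.transform(self.keep_dim_val_attr_columns, additional_columns)` relies on `DataFrame.transform` forwarding extra positional arguments to the function, supported in PySpark 3.3+. A minimal, self-contained illustration with made-up column names:

from pyspark.sql import SparkSession, DataFrame

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a", "x")], ["id", "val", "extra"])

def keep_columns(df: DataFrame, columns):
    # Keep only the requested columns that actually exist in df,
    # mirroring the shape of keep_dim_val_attr_columns above
    return df.select(*[c for c in columns if c in df.columns])

# Extra positional arguments after the function are forwarded by transform()
df.transform(keep_columns, ["id", "val"]).show()  # "extra" is dropped

Second, `round_to_display_decimals` rounds by scaling: multiply by 10^display_decimals, round to an integer, then divide back. A standalone sketch of just that rule, with `display_decimals` hard-coded instead of joined in from the datatable:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("3.14159",), ("2.71828",)], ["value"])

# In the helper this column comes from the display-decimals datatable join
df = df.withColumn("display_decimals", F.lit(2))

df = df.withColumn(
    "value",
    (
        F.round(F.col("value").cast("float") * F.pow(10, F.col("display_decimals")), 0)
        / F.pow(10, F.col("display_decimals"))
    ).cast("string"),
)
df.show()  # 3.14 and 2.72
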
@@ -130,6 +206,37 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_sws_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        df.writeTo(self.iceberg_tables.GOLD_SWS.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold SWS table written to {self.iceberg_tables.GOLD_SWS.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_SWS.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold SWS tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_SWS.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
+        self.df_gold_sws = self.gen_gold_sws_data()
+
+        self.write_gold_sws_data_to_iceberg_and_csv(self.df_gold_sws)
+
+        return self.df_gold_sws
+
     def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
         self.df_gold_sws_validated = self.gen_gold_sws_validated_data()
 
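
All of the new write methods follow the same pattern: write (or replace) the Iceberg table, then pin the resulting snapshot with `ALTER TABLE ... CREATE OR REPLACE TAG` so the `read_*` methods can select it later via Iceberg's `tag` read option. A minimal round trip, assuming an Iceberg-enabled `spark` session and illustrative identifiers:

table_id = "gold.example_domain_gold_sws"  # hypothetical Iceberg table identifier
tag_name = "2024-12-dissemination"         # hypothetical tag name

# Pin the table's current snapshot under a named tag
spark.sql(f"ALTER TABLE {table_id} CREATE OR REPLACE TAG `{tag_name}`")

# Later reads can target that snapshot explicitly
df_at_tag = spark.read.option("tag", tag_name).table(table_id)
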
@@ -232,6 +339,60 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(self.iceberg_tables.GOLD_FAOSTAT.iceberg_id).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
+    def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(
+            self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
     def write_gold_sws_validated_sws_dissemination_tag(
         self, df: DataFrame, tags: Tags
     ) -> DataFrame:
@@ -242,7 +403,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
             name=f"{self.domain_code} gold SWS validated Iceberg",
-            description="Gold table containing all the data
+            description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -263,7 +424,7 @@ class SWSGoldIcebergSparkHelper:
         new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
             name=f"{self.domain_code} gold SWS validated csv",
-            description="Gold table containing all the data
+            description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -291,7 +452,7 @@ class SWSGoldIcebergSparkHelper:
         new_iceberg_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
             name=f"{self.domain_code} gold SWS disseminated Iceberg",
-            description="Gold table containing
+            description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.ICEBERG,
@@ -312,7 +473,7 @@ class SWSGoldIcebergSparkHelper:
         new_diss_table = BaseDisseminatedTagTable(
             id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
             name=f"{self.domain_code} gold SWS disseminated csv",
-            description="Gold table containing
+            description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format, cached in csv",
             layer=TableLayer.GOLD,
             private=True,
             type=TableType.CSV,
@@ -392,6 +553,7 @@ class SWSGoldIcebergSparkHelper:
             description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR",
             layer=TableLayer.GOLD,
             private=True,
+            debug=True,
             type=TableType.ICEBERG,
             database=IcebergDatabases.GOLD_DATABASE,
             table=self.iceberg_tables.GOLD_PRE_SDMX.table,
@@ -413,6 +575,7 @@ class SWSGoldIcebergSparkHelper:
             description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR and cached in csv",
             layer=TableLayer.GOLD,
             private=True,
+            debug=True,
             type=TableType.CSV,
             path=self.iceberg_tables.GOLD_PRE_SDMX.csv_path,
             structure={"columns": df.schema.jsonValue()["fields"]},
@@ -427,3 +590,148 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         return df
+
+    def write_gold_sws_dissemination_tag(self, df: DataFrame, tags: Tags) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_iceberg",
+            name=f"{self.domain_code} gold SWS Iceberg",
+            description="Gold table containing the tag data without any processing",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_SWS.table,
+            path=self.iceberg_tables.GOLD_SWS.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_sws_csv",
+            name=f"{self.domain_code} gold SWS csv",
+            description="Gold table containing the tag data without any processing cached in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_SWS.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT Iceberg",
+            description="Gold table containing the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_csv",
+            name=f"{self.domain_code} gold FAOSTAT csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
+
+    def write_gold_faostat_unfiltered_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+            description="Gold table containing all the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
sws_spark_dissemination_helper/SWSPostgresSparkReader.py

@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:
 
             logging.info(f"{pg_table} read start")
 
-            [19 removed lines; content not preserved in the diff view]
+            if min_id is None or max_id is None:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                )
+            else:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("partitionColumn", partition_column)
+                    .option("lowerBound", min_id)
+                    .option("upperBound", max_id)
+                    .option("numPartitions", num_partitions)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                    # .repartition(1024, partition_column)
+                    # .sortWithinPartitions(partition_column)
+                    # .cache()
+                )
         else:
             df = (
                 self.spark.read.format("jdbc")
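
For context on the new branch: Spark's JDBC source only parallelizes a read when partitionColumn, lowerBound, upperBound, and numPartitions are all supplied, so the helper falls back to a single-partition read when min_id/max_id are unknown. A generic sketch of the partitioned path, with placeholder connection details and a hypothetical observation table:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

url = "jdbc:postgresql://localhost:5432/sws"  # placeholder connection
props = {"user": "reader", "password": "secret", "driver": "org.postgresql.Driver"}

# Bounds for the partition column are typically fetched first with a tiny query
bounds = spark.read.jdbc(
    url, "(SELECT min(id) AS lo, max(id) AS hi FROM observation) b", properties=props
).first()

df = (
    spark.read.format("jdbc")
    .option("url", url)
    .option("dbtable", "observation")
    .option("partitionColumn", "id")  # must be numeric, date, or timestamp
    .option("lowerBound", bounds["lo"])
    .option("upperBound", bounds["hi"])
    .option("numPartitions", 10)  # one concurrent JDBC connection per partition
    .option("user", props["user"])
    .option("password", props["password"])
    .option("driver", props["driver"])
    .load()
)
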
@@ -195,6 +207,7 @@ class SWSPostgresSparkReader:
             (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
             (dataset_tables.METADATA, "id", 10),
             (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+            (dataset_tables.TAG_OBSERVATION, "tag", 10),
         ]
         return self._import_tables(data_tables)
 
@@ -209,25 +222,30 @@ class SWSPostgresSparkReader:
             dataset_tables.METADATA_ELEMENT_TYPE,
             dataset_tables.LANGUAGE,
             dataset_tables.UNIT_OF_MEASURE,
+            dataset_tables.DATASET,
             *dataset_tables.CODELISTS,
         ]
+        logging.info(
+            f"Importing reference data tables: {[(table.postgres_id, table.iceberg_id) for table in reference_data_tables]}"
+        )
         return self._import_tables(
             [(table, None, 1) for table in reference_data_tables]
         )
 
     def import_operational_data_tables(
         self, dataset_tables: DatasetTables
-    ) -> DataFrame:
+    ) -> List[DataFrame]:
         # Define and import operational data table without partitioning
         operational_data_tables = [
             (dataset_tables.USER, None, 1),
+            (dataset_tables.TAG, None, 1),
         ]
-        return self._import_tables(operational_data_tables)
+        return self._import_tables(operational_data_tables)
 
     def import_data_reference_data_operational_data(
         self, dataset_tables: DatasetTables
     ) -> Tuple[
-        Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+        Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
         Tuple[
             DataFrame,
             DataFrame,
@@ -235,22 +253,23 @@ class SWSPostgresSparkReader:
             DataFrame,
             DataFrame,
             DataFrame,
+            DataFrame,
             List[DataFrame],
         ],
-        DataFrame,
+        Tuple[DataFrame, DataFrame],
     ]:
         # Import and organize DataFrames into the desired output structure
         data_dfs = self.import_data_tables(dataset_tables)
         reference_data_dfs = self.import_reference_data_tables(dataset_tables)
-
+        operational_data_dfs = self.import_operational_data_tables(dataset_tables)
 
         return (
             tuple(data_dfs),
             (
-                *reference_data_dfs[:
-                reference_data_dfs[
+                *reference_data_dfs[:7],
+                reference_data_dfs[7:],
             ),
-
+            tuple(operational_data_dfs),
         )
 
     def get_codelist_type_mapping(
@@ -291,13 +310,17 @@ class SWSPostgresSparkReader:
         self,
         domain_code: str,
     ) -> DataFrame:
-        return self.read_pg_table(
+        df = self.read_pg_table(
             pg_table=DatasetDatatables.MAPPING_CODE_CORRECTION.id,
-            table_name=DatasetDatatables.MAPPING_CODE_CORRECTION.name,
             custom_schema=DatasetDatatables.MAPPING_CODE_CORRECTION.schema,
-            domain_code=domain_code,
-            unique_columns=["old_code"],
         )
+        df.filter(
+            col("mapping_type").isNull() | (col("mapping_type") == lit(""))
+        ).transform(
+            correct_domain_filter, domain=domain_code, unique_columns=["old_code"]
+        )
+
+        return df
 
     def get_domain_code_source_datasets_ids_dest_dataset_id(
         self, dataset_id: str, domain_code: str = None
@@ -474,3 +497,45 @@ class SWSPostgresSparkReader:
                 "aggregation",
             ],
         )
+
+    def get_display_decimals_datatable(
+        self,
+        domain_code: str,
+    ) -> DataFrame:
+        df = self.read_pg_table(
+            pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
+            custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
+        ).filter(col("domain") == lit(domain_code))
+
+        pairs = df.select("column_1_name", "column_2_name").distinct().collect()
+
+        # If no config exists for this domain, fail early
+        if not pairs:
+            msg = (
+                f'No display-decimals configuration found for domain "{domain_code}". '
+                f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
+            )
+            logging.error(msg)
+            # raise ValueError(msg)
+
+        # If more than one mapping exists, it's invalid
+        if len(pairs) > 1:
+            formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
+
+            msg = (
+                f'Invalid configuration for domain "{domain_code}". '
+                f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
+                f"{formatted_pairs}. "
+                f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
+            )
+
+            logging.error(
+                "Multiple display-decimals column pairs detected",
+                extra={
+                    "domain": domain_code,
+                    "pairs_found": formatted_pairs,
+                },
+            )
+            raise ValueError(msg)
+
+        return df
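
Read together with round_to_display_decimals in the gold helper, this implies the datatable carries exactly one (column_1_name, column_2_name) pair per domain, plus one row per value combination. A local mock of that inferred shape (column names come from the code above; the sample row is invented):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

display_decimals = spark.createDataFrame(
    [("aproduction", "item_code", "0221", "element_code", "5510", "2")],  # made-up row
    [
        "domain",
        "column_1_name",
        "column_1_value",
        "column_2_name",
        "column_2_value",
        "display_decimals",
    ],
)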