sws-spark-dissemination-helper 0.0.79__py3-none-any.whl → 0.0.183__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +380 -28
- sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +310 -347
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +152 -31
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +119 -14
- sws_spark_dissemination_helper/__init__.py +1 -0
- sws_spark_dissemination_helper/constants.py +93 -25
- sws_spark_dissemination_helper/utils.py +24 -6
- {sws_spark_dissemination_helper-0.0.79.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/METADATA +21 -17
- sws_spark_dissemination_helper-0.0.183.dist-info/RECORD +13 -0
- sws_spark_dissemination_helper-0.0.79.dist-info/RECORD +0 -11
- {sws_spark_dissemination_helper-0.0.79.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.79.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/licenses/LICENSE +0 -0
@@ -94,25 +94,37 @@ class SWSPostgresSparkReader:
 
             logging.info(f"{pg_table} read start")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if min_id is None or max_id is None:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                )
+            else:
+                df = (
+                    self.spark.read.format("jdbc")
+                    .option("customSchema", custom_schema)
+                    .option("dbtable", pg_table)
+                    .option("partitionColumn", partition_column)
+                    .option("lowerBound", min_id)
+                    .option("upperBound", max_id)
+                    .option("numPartitions", num_partitions)
+                    .option("fetchsize", "1000")
+                    .option("url", self.jdbc_url)
+                    .option("user", self.jdbc_conn_properties["user"])
+                    .option("password", self.jdbc_conn_properties["password"])
+                    .option("driver", SPARK_POSTGRES_DRIVER)
+                    .load()
+                    # .repartition(1024, partition_column)
+                    # .sortWithinPartitions(partition_column)
+                    # .cache()
+                )
         else:
             df = (
                 self.spark.read.format("jdbc")
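The added branch falls back to a single JDBC query when the id bounds are unknown and otherwise issues a range-partitioned read. Below is a minimal standalone sketch of the same Spark JDBC partitioning technique; the connection details, table, and column names are placeholders, not values taken from this package.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("jdbc-partitioned-read").getOrCreate()

# Placeholder connection details: point these at a reachable Postgres instance.
jdbc_url = "jdbc:postgresql://localhost:5432/sws"
user, password = "reader", "secret"

# The partition bounds are typically fetched first with a tiny aggregate query.
bounds = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "(SELECT min(id) AS lo, max(id) AS hi FROM observation) b")
    .option("user", user)
    .option("password", password)
    .option("driver", "org.postgresql.Driver")
    .load()
    .first()
)

# With bounds known, Spark issues numPartitions parallel range queries
# (WHERE id >= ... AND id < ...); without them it falls back to a single query.
df = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "observation")
    .option("partitionColumn", "id")
    .option("lowerBound", bounds["lo"])
    .option("upperBound", bounds["hi"])
    .option("numPartitions", 10)
    .option("fetchsize", "1000")
    .option("user", user)
    .option("password", password)
    .option("driver", "org.postgresql.Driver")
    .load()
)
```

Note that lowerBound and upperBound only control how the ranges are split across partitions; rows outside the bounds still land in the first and last partitions.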
@@ -195,6 +207,7 @@ class SWSPostgresSparkReader:
             (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
             (dataset_tables.METADATA, "id", 10),
             (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+            (dataset_tables.TAG_OBSERVATION, "tag", 10),
         ]
         return self._import_tables(data_tables)
 
@@ -209,25 +222,30 @@ class SWSPostgresSparkReader:
             dataset_tables.METADATA_ELEMENT_TYPE,
             dataset_tables.LANGUAGE,
             dataset_tables.UNIT_OF_MEASURE,
+            dataset_tables.DATASET,
             *dataset_tables.CODELISTS,
         ]
+        logging.info(
+            f"Importing reference data tables: {[(table.postgres_id, table.iceberg_id) for table in reference_data_tables]}"
+        )
         return self._import_tables(
             [(table, None, 1) for table in reference_data_tables]
         )
 
     def import_operational_data_tables(
         self, dataset_tables: DatasetTables
-    ) -> DataFrame:
+    ) -> List[DataFrame]:
         # Define and import operational data table without partitioning
         operational_data_tables = [
             (dataset_tables.USER, None, 1),
+            (dataset_tables.TAG, None, 1),
         ]
-        return self._import_tables(operational_data_tables)
+        return self._import_tables(operational_data_tables)
 
     def import_data_reference_data_operational_data(
         self, dataset_tables: DatasetTables
     ) -> Tuple[
-        Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+        Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
         Tuple[
             DataFrame,
             DataFrame,
@@ -235,22 +253,23 @@ class SWSPostgresSparkReader:
             DataFrame,
             DataFrame,
             DataFrame,
+            DataFrame,
             List[DataFrame],
         ],
-        DataFrame,
+        Tuple[DataFrame, DataFrame],
     ]:
         # Import and organize DataFrames into the desired output structure
         data_dfs = self.import_data_tables(dataset_tables)
         reference_data_dfs = self.import_reference_data_tables(dataset_tables)
-
+        operational_data_dfs = self.import_operational_data_tables(dataset_tables)
 
         return (
             tuple(data_dfs),
             (
-                *reference_data_dfs[:
-                reference_data_dfs[
+                *reference_data_dfs[:7],
+                reference_data_dfs[7:],
             ),
-
+            tuple(operational_data_dfs),
         )
 
     def get_codelist_type_mapping(
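With operational data now returned as well, the combined import yields a three-part structure. A hypothetical caller-side unpacking follows, with `reader` and `dataset_tables` as assumed names and the element order inferred from the type hints and table lists above.

```python
# Assumed names: `reader` is an SWSPostgresSparkReader, `dataset_tables` a DatasetTables.
data_dfs, reference_data_dfs, operational_data_dfs = (
    reader.import_data_reference_data_operational_data(dataset_tables)
)

# reference_data_dfs holds seven standalone DataFrames followed by the codelist list.
*single_reference_dfs, codelist_dfs = reference_data_dfs

# operational_data_dfs now carries two frames (user and tag) instead of one.
user_df, tag_df = operational_data_dfs
```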
@@ -291,13 +310,73 @@ class SWSPostgresSparkReader:
         self,
         domain_code: str,
     ) -> DataFrame:
-
+        df = self.read_pg_table(
             pg_table=DatasetDatatables.MAPPING_CODE_CORRECTION.id,
-            table_name=DatasetDatatables.MAPPING_CODE_CORRECTION.name,
             custom_schema=DatasetDatatables.MAPPING_CODE_CORRECTION.schema,
-            domain_code=domain_code,
-            unique_columns=["old_code"],
         )
+        df.filter(
+            col("mapping_type").isNull() | (col("mapping_type") == lit(""))
+        ).transform(
+            correct_domain_filter, domain=domain_code, unique_columns=["old_code"]
+        )
+
+        return df
+
+    def get_domain_code_source_datasets_ids_dest_dataset_id(
+        self, dataset_id: str, domain_code: str = None
+    ) -> Tuple[str, List[str], str]:
+        mapping_domains_id_df = self.read_pg_table(
+            pg_table=DatasetDatatables.MAPPING_DOMAINS_ID.id,
+            custom_schema=DatasetDatatables.MAPPING_DOMAINS_ID.schema,
+        )
+
+        if domain_code is None:
+            domain_code_df = mapping_domains_id_df.filter(
+                col("sws_source_id") == lit(dataset_id)
+            ).select("domain")
+
+            if domain_code_df.count() == 0:
+                raise ValueError(
+                    f'There is no row connecting the current source dataset id ({dataset_id}) to any domain in the table "{DatasetDatatables.MAPPING_DOMAINS_ID.name}"'
+                )
+
+            if domain_code_df.count() > 1:
+                raise ValueError(
+                    f'There is more than one domain referencing the current source dataset id ({dataset_id}) in the table "{DatasetDatatables.MAPPING_DOMAINS_ID.name}", please specify the domain code you want to process in the parameters'
+                )
+
+            domain_code = domain_code_df.collect()[0][0]
+
+        source_datasets_ids = [
+            row[0]
+            for row in (
+                mapping_domains_id_df.filter(col("domain") == lit(domain_code))
+                .select("sws_source_id")
+                .collect()
+            )
+        ]
+        dest_datasets_id_df = (
+            mapping_domains_id_df.filter(col("domain") == lit(domain_code))
+            .select("sws_destination_id")
+            .distinct()
+        )
+
+        if dest_datasets_id_df.count() == 0:
+            raise ValueError(
+                f'There is no row connecting the current source dataset id and domain pair ({dataset_id}, {domain_code}) to any destination dataset id in the table "{DatasetDatatables.MAPPING_DOMAINS_ID.name}"'
+            )
+        if dest_datasets_id_df.count() > 1:
+            raise ValueError(
+                f'The source dataset id and domain pair ({dataset_id}, {domain_code}) must point only to one destination dataset in the table "{DatasetDatatables.MAPPING_DOMAINS_ID.name}"'
+            )
+
+        dest_datasets_id = dest_datasets_id_df.collect()[0][0]
+
+        logging.info(f"domain code: {domain_code}")
+        logging.info(f"source datasets ids: {source_datasets_ids}")
+        logging.info(f"dest datasets ids: {dest_datasets_id}")
+
+        return (domain_code, source_datasets_ids, dest_datasets_id)
 
     def get_dest_dataset_id(self, domain_code: str, dataset_id: str) -> Tuple[str, str]:
 
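The new lookup runs two `count()` actions plus a `collect()` against the same small mapping table. Since that table is tiny, a single `collect()` followed by plain-Python validation would behave equivalently; here is a sketch under that assumption, reusing the column names from the hunk above.

```python
from pyspark.sql.functions import col, lit

# One Spark action over the small mapping table, then validate locally.
rows = mapping_domains_id_df.filter(
    col("sws_source_id") == lit(dataset_id)
).collect()

domains = {row["domain"] for row in rows}
if not domains:
    raise ValueError(f"No domain is mapped to source dataset id {dataset_id}")
if len(domains) > 1:
    raise ValueError(
        f"Source dataset id {dataset_id} maps to several domains {sorted(domains)}; "
        "pass the domain code explicitly"
    )
domain_code = domains.pop()
```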
@@ -418,3 +497,45 @@ class SWSPostgresSparkReader:
                 "aggregation",
             ],
         )
+
+    def get_display_decimals_datatable(
+        self,
+        domain_code: str,
+    ) -> DataFrame:
+        df = self.read_pg_table(
+            pg_table=DatasetDatatables.DISPLAY_DECIMALS.id,
+            custom_schema=DatasetDatatables.DISPLAY_DECIMALS.schema,
+        ).filter(col("domain") == lit(domain_code))
+
+        pairs = df.select("column_1_name", "column_2_name").distinct().collect()
+
+        # If no config exists for this domain, fail early
+        if not pairs:
+            msg = (
+                f'No display-decimals configuration found for domain "{domain_code}". '
+                f'Please add an entry in table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
+            )
+            logging.error(msg)
+            # raise ValueError(msg)
+
+        # If more than one mapping exists, it's invalid
+        if len(pairs) > 1:
+            formatted_pairs = [(p["column_1_name"], p["column_2_name"]) for p in pairs]
+
+            msg = (
+                f'Invalid configuration for domain "{domain_code}". '
+                f"Expected exactly one (column_1_name, column_2_name) pair, but found {len(pairs)}: "
+                f"{formatted_pairs}. "
+                f'Please correct the table "{DatasetDatatables.DISPLAY_DECIMALS.id}".'
+            )
+
+            logging.error(
+                "Multiple display-decimals column pairs detected",
+                extra={
+                    "domain": domain_code,
+                    "pairs_found": formatted_pairs,
+                },
+            )
+            raise ValueError(msg)
+
+        return df
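Note that the zero-configuration case above only logs the error (the `raise` is commented out), so an unconfigured domain comes back as an empty DataFrame. A hypothetical caller-side guard, with `reader` and the domain code as placeholder names:

```python
decimals_df = reader.get_display_decimals_datatable(domain_code="example_domain")

pair_row = decimals_df.select("column_1_name", "column_2_name").distinct().first()
if pair_row is None:
    # Unconfigured domain: decide whether to skip decimal handling or fail here.
    raise ValueError("No display-decimals configuration for this domain")

column_1_name, column_2_name = pair_row
```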
@@ -10,7 +10,7 @@ from pyspark.sql.window import Window
 from sws_api_client import Tags
 from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
 
-from .constants import IcebergDatabases, IcebergTables
+from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .utils import (
     get_or_create_tag,
@@ -103,7 +103,7 @@ class SWSSilverIcebergSparkHelper:
         # The diss_flag column is needed to initialize the condition expression
         # The note column will contain the eventual reasons why diss_flag has been set to false
         return df.withColumn("diss_flag", lit(True)).withColumn(
-            "
+            "diss_note", lit([]).cast(ArrayType(StringType()))
         )
 
     def read_bronze_data(self) -> DataFrame:
@@ -111,6 +111,11 @@ class SWSSilverIcebergSparkHelper:
             self.iceberg_tables.BRONZE.iceberg_id
         )
 
+    def read_bronze_diss_tag_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
+        )
+
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
         dim_columns_w_time = [
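The new `read_bronze_diss_tag_data` reads the bronze Iceberg table as of a named tag rather than its latest snapshot. Below is a minimal sketch of that time-travel technique; the catalog, table, and tag names are placeholders, and it assumes an Iceberg runtime recent enough (1.2+) to support branches and tags, with the Iceberg SQL extensions enabled for the DDL.

```python
# Pin the current snapshot under a human-readable tag (Iceberg SQL extensions).
spark.sql(
    "ALTER TABLE iceberg_catalog.bronze.commodities CREATE TAG `dissemination_2024`"
)

# DataFrame API: read the table exactly as it was when the tag was created.
df_at_tag = spark.read.option("tag", "dissemination_2024").table(
    "iceberg_catalog.bronze.commodities"
)

# SQL equivalent using time travel by tag name.
df_at_tag_sql = spark.sql(
    "SELECT * FROM iceberg_catalog.bronze.commodities "
    "VERSION AS OF 'dissemination_2024'"
)
```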
@@ -158,6 +163,99 @@ class SWSSilverIcebergSparkHelper:
 
         logging.info(f"Checking time validity for {col_name} of type {col_type}")
 
+        if col_type == "area":
+            logging.info(
+                f'Changing start and end year according to "{DatasetDatatables.MAPPING_CODE_CORRECTION.name}"'
+            )
+            df_start_year_correction = self.df_mapping_code_correction.filter(
+                col("var_type") == lit("start_year")
+            )
+            df_end_year_correction = self.df_mapping_code_correction.filter(
+                col("var_type") == lit("end_year")
+            )
+
+            original_col_order = df.columns
+            cols_to_select = df.columns
+            col_name_lower = col_name.lower()
+            cols_to_select = [
+                column
+                for column in cols_to_select
+                if column.lower()
+                not in (
+                    "diss_note",
+                    f"{col_name_lower}_start_date",
+                    f"{col_name_lower}_end_date",
+                )
+            ]
+
+            df = (
+                df.alias("d")
+                .join(
+                    F.broadcast(df_start_year_correction).alias("sy"),
+                    on=col(f"d.{col_name}") == col("sy.mapping_type"),
+                    how="left",
+                )
+                .join(
+                    F.broadcast(df_end_year_correction).alias("ey"),
+                    on=col(f"d.{col_name}") == col("ey.mapping_type"),
+                    how="left",
+                )
+                .withColumn("valid_new_start_year", col("sy.new_code").isNotNull())
+                .withColumn("valid_new_end_year", col("ey.new_code").isNotNull())
+                .withColumn(
+                    "new_diss_note",
+                    F.when(
+                        col("valid_new_start_year"),
+                        F.array_append(
+                            col("d.diss_note"),
+                            F.concat(
+                                col("sy.note"),
+                                lit(" from "),
+                                col("sy.old_code"),
+                                lit(" to "),
+                                col("sy.new_code"),
+                            ),
+                        ),
+                    ).otherwise(col("d.diss_note")),
+                )
+                .withColumn(
+                    "new_diss_note",
+                    F.when(
+                        col("valid_new_end_year"),
+                        F.array_append(
+                            col("new_diss_note"),
+                            F.concat(
+                                col("ey.note"),
+                                lit(" from "),
+                                col("ey.old_code"),
+                                lit(" to "),
+                                col("ey.new_code"),
+                            ),
+                        ),
+                    ).otherwise(col("new_diss_note")),
+                )
+                .withColumn(
+                    f"new_{col_name}_start_date",
+                    F.when(
+                        col("valid_new_start_year"), F.to_date(col("sy.new_code"))
+                    ).otherwise(col(f"d.{col_name}_start_date")),
+                )
+                .withColumn(
+                    f"new_{col_name}_end_date",
+                    F.when(
+                        col("valid_new_end_year"),
+                        F.to_date(F.concat(col("ey.new_code"), lit("-12-31"))),
+                    ).otherwise(col(f"d.{col_name}_end_date")),
+                )
+                .select(
+                    *cols_to_select,
+                    col("new_diss_note").alias("diss_note"),
+                    col(f"new_{col_name}_start_date").alias(f"{col_name}_start_date"),
+                    col(f"new_{col_name}_end_date").alias(f"{col_name}_end_date"),
+                )
+                .select(*original_col_order)
+            )
+
         # Iterate through columns and build conditions dynamically
         start_date_condition = col(f"{col_name}_start_date").isNull() | (
             col(f"{col_name}_start_date") <= col(f"{self.time_column}_start_date")
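The `area` branch above broadcasts the small correction lookups, left-joins them onto the observations, and appends a human-readable entry to the `diss_note` array only for rows that matched. A condensed, self-contained sketch of that pattern on toy data follows (it assumes PySpark 3.4+, where `F.array_append` and list literals in `lit` are available).

```python
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, lit
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession.builder.appName("note-append-sketch").getOrCreate()

# Toy observations with an (initially empty) diss_note array, as in the helper.
obs = spark.createDataFrame(
    [("USSR", 10.0), ("ITA", 5.0)], ["area", "value"]
).withColumn("diss_note", lit([]).cast(ArrayType(StringType())))

# Toy correction lookup: old code -> corrected year, plus a human-readable note.
corrections = spark.createDataFrame(
    [("USSR", "1992", "start year corrected")], ["old_code", "new_code", "note"]
)

result = (
    obs.alias("d")
    .join(
        F.broadcast(corrections).alias("c"),
        on=col("d.area") == col("c.old_code"),
        how="left",
    )
    # Append a note only for rows that matched a correction; leave others untouched.
    .withColumn(
        "diss_note",
        F.when(
            col("c.new_code").isNotNull(),
            F.array_append(
                col("d.diss_note"),
                F.concat(col("c.note"), lit(" to "), col("c.new_code")),
            ),
        ).otherwise(col("d.diss_note")),
    )
    .select("area", "value", "diss_note")
)
result.show(truncate=False)
```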
@@ -172,15 +270,15 @@ class SWSSilverIcebergSparkHelper:
                 start_date_condition & end_date_condition,
             )
             .withColumn("diss_flag", col("diss_flag") & col("condition_result"))
-            # In case the condition is satisfied update diss_flag accordingly and append a
+            # In case the condition is satisfied update diss_flag accordingly and append a diss_note indicating the reason for the observation exclusion from the dissemination
             .withColumn(
-                "
+                "diss_note",
                 F.when(
                     ~col("condition_result"),
                     F.array_append(
-                        col("
+                        col("diss_note"), lit(f"{col_type} out of time validity range")
                     ),
-                ).otherwise(col("
+                ).otherwise(col("diss_note")),
             )
             .drop("condition_result")
         )
@@ -292,7 +390,7 @@ class SWSSilverIcebergSparkHelper:
             col_name (str): The DataFrame column name on which to apply the filter
 
         Returns:
-            DataFrame: The DataFrame with updated `diss_flag` and `
+            DataFrame: The DataFrame with updated `diss_flag` and `diss_note` columns based on the check outcome
         """
 
         # Remove the duplicates that may be in the tables
@@ -330,14 +428,14 @@ class SWSSilverIcebergSparkHelper:
                 col("diss_flag") & col("condition_result"),
             )
             .withColumn(
-                "
+                "diss_note",
                 F.when(
                     ~col("condition_result"),
                     F.array_append(
-                        col("
+                        col("diss_note"),
                         lit(f"{col_type} not disseminated for this domain"),
                     ),
-                ).otherwise(col("
+                ).otherwise(col("diss_note")),
             )
             .drop("condition_result")
         )
@@ -424,16 +522,16 @@ class SWSSilverIcebergSparkHelper:
                 col("diss_flag") & col("condition_result"),
             )
             .withColumn(
-                "
+                "diss_note",
                 F.when(
                     ~col("condition_result"),
                     F.array_append(
-                        col("
+                        col("diss_note"),
                         lit(
                             f"not disseminated according to exception with note: {row_exception['note']}"
                         ),
                     ),
-                ).otherwise(col("
+                ).otherwise(col("diss_note")),
             )
             .drop("condition_result")
         )
@@ -518,7 +616,7 @@ class SWSSilverIcebergSparkHelper:
 
         df = (
             df.withColumn("metadata", F.to_json(col("metadata")))
-            .withColumn("
+            .withColumn("diss_note", F.to_json(col("diss_note")))
             .coalesce(1)
         )
 
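Serializing the `metadata` struct and the `diss_note` array with `F.to_json` before `coalesce(1)` makes the frame exportable to flat formats that reject complex types. A small sketch of the same step on a hypothetical `silver_df`:

```python
from pyspark.sql import functions as F

# The CSV writer rejects arrays and structs, so complex columns are serialized
# to JSON strings before producing a single output file.
export_df = (
    silver_df.withColumn("metadata", F.to_json(F.col("metadata")))
    .withColumn("diss_note", F.to_json(F.col("diss_note")))
    .coalesce(1)
)
export_df.write.mode("overwrite").option("header", True).csv("/tmp/silver_export")
```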
@@ -547,6 +645,13 @@ class SWSSilverIcebergSparkHelper:
             table=self.iceberg_tables.SILVER.table,
             path=self.iceberg_tables.SILVER.path,
             structure={"columns": df.schema.jsonValue()["fields"]},
+            pinned_columns=[
+                *self.dim_columns_w_time,
+                "value",
+                *self.flag_columns,
+                "diss_flag",
+                "diss_note",
+            ],
         )
         tag = upsert_disseminated_table(
             sws_tags=tags,
@@ -2,3 +2,4 @@ from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .SWSBronzeIcebergSparkHelper import SWSBronzeIcebergSparkHelper
 from .SWSSilverIcebergSparkHelper import SWSSilverIcebergSparkHelper
 from .SWSGoldIcebergSparkHelper import SWSGoldIcebergSparkHelper
+from .SWSEasyIcebergSparkHelper import SWSEasyIcebergSparkHelper