sws-spark-dissemination-helper 0.0.60__py3-none-any.whl → 0.0.171__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +380 -28
- sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +331 -353
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +110 -31
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +138 -23
- sws_spark_dissemination_helper/__init__.py +1 -0
- sws_spark_dissemination_helper/constants.py +76 -24
- sws_spark_dissemination_helper/utils.py +133 -68
- {sws_spark_dissemination_helper-0.0.60.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/METADATA +21 -17
- sws_spark_dissemination_helper-0.0.171.dist-info/RECORD +13 -0
- sws_spark_dissemination_helper-0.0.60.dist-info/RECORD +0 -11
- {sws_spark_dissemination_helper-0.0.60.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.60.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,7 +11,6 @@ from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
|
|
|
11
11
|
from .constants import IcebergDatabases, IcebergTables
|
|
12
12
|
from .SWSPostgresSparkReader import SWSPostgresSparkReader
|
|
13
13
|
from .utils import (
|
|
14
|
-
col_is_null_or_empty,
|
|
15
14
|
get_or_create_tag,
|
|
16
15
|
save_cache_csv,
|
|
17
16
|
upsert_disseminated_table,
|
|
@@ -47,11 +46,6 @@ class SWSGoldIcebergSparkHelper:
|
|
|
47
46
|
self.flag_columns,
|
|
48
47
|
) = self._get_dim_time_flag_columns()
|
|
49
48
|
|
|
50
|
-
self.cols_to_keep_sdmx = (
|
|
51
|
-
self.dim_columns_w_time
|
|
52
|
-
+ ["unit_of_measure", "unit_of_measure_multiplier", "value"]
|
|
53
|
-
+ self.flag_columns
|
|
54
|
-
)
|
|
55
49
|
self.cols_to_keep_sws = (
|
|
56
50
|
self.dim_columns_w_time + ["value"] + self.flag_columns
|
|
57
51
|
)
|
|
@@ -72,40 +66,6 @@ class SWSGoldIcebergSparkHelper:
|
|
|
72
66
|
if col_name in self.dim_columns
|
|
73
67
|
}
|
|
74
68
|
|
|
75
|
-
(
|
|
76
|
-
self.df_mapping_sdmx_codes,
|
|
77
|
-
self.df_mapping_sdmx_uom,
|
|
78
|
-
self.df_mapping_sdmx_col_names,
|
|
79
|
-
) = sws_postgres_spark_reader.import_sdmx_mapping_datatables(self.domain_code)
|
|
80
|
-
|
|
81
|
-
self._check_column_mappings(self.df_mapping_sdmx_col_names)
|
|
82
|
-
|
|
83
|
-
def _check_column_mappings(
|
|
84
|
-
self,
|
|
85
|
-
df_mapping_sdmx_col_names: DataFrame,
|
|
86
|
-
) -> DataFrame:
|
|
87
|
-
cols_to_keep_set = set(self.cols_to_keep_sdmx)
|
|
88
|
-
mapping_sdmx_col_names_internal_set = {
|
|
89
|
-
row[0]
|
|
90
|
-
for row in df_mapping_sdmx_col_names.filter(
|
|
91
|
-
col("internal_name").isNotNull() & (col("internal_name") != lit(""))
|
|
92
|
-
)
|
|
93
|
-
.select("internal_name")
|
|
94
|
-
.collect()
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
if not (cols_to_keep_set <= mapping_sdmx_col_names_internal_set):
|
|
98
|
-
missing_mappings = cols_to_keep_set - mapping_sdmx_col_names_internal_set
|
|
99
|
-
|
|
100
|
-
message = 'The mappings in the table "Mapping - SDMX columns names" are not correct'
|
|
101
|
-
|
|
102
|
-
if len(missing_mappings) > 0:
|
|
103
|
-
message += (
|
|
104
|
-
f"\nThe following column mappings are missing: {missing_mappings}"
|
|
105
|
-
)
|
|
106
|
-
|
|
107
|
-
raise ValueError(message)
|
|
108
|
-
|
|
109
69
|
def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
|
|
110
70
|
"""Extract the dimension columns with time, without time, the time column and the flag columns names."""
|
|
111
71
|
dim_columns_w_time = [
|
|
@@ -126,319 +86,31 @@ class SWSGoldIcebergSparkHelper:
|
|
|
126
86
|
def apply_diss_flag_filter(self, df: DataFrame) -> DataFrame:
    """Return only the rows of ``df`` whose ``diss_flag`` column is true."""
    dissemination_flag = col("diss_flag")
    return df.filter(dissemination_flag)
|
|
128
88
|
|
|
129
|
-
# TODO implement the delete flag
|
|
130
|
-
def apply_uom_mapping(
|
|
131
|
-
self,
|
|
132
|
-
df: DataFrame,
|
|
133
|
-
) -> DataFrame:
|
|
134
|
-
logging.info("mapping unit of measure for dissemination")
|
|
135
|
-
|
|
136
|
-
df = df.withColumn(
|
|
137
|
-
"official_sws_uom",
|
|
138
|
-
F.when(
|
|
139
|
-
col_is_null_or_empty("unit_of_measure_base_unit"),
|
|
140
|
-
col("unit_of_measure"),
|
|
141
|
-
).otherwise(col("unit_of_measure_base_unit")),
|
|
142
|
-
).withColumn(
|
|
143
|
-
"official_sws_multiplier",
|
|
144
|
-
F.coalesce(F.log10(col("unit_of_measure_multiplier")), lit(0)).cast("int"),
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
delete_df_uom_mapping = self.df_mapping_sdmx_uom.filter(
|
|
148
|
-
col("delete")
|
|
149
|
-
& col_is_null_or_empty("sdmx_code")
|
|
150
|
-
& col("sdmx_multiplier").isNull()
|
|
151
|
-
& col("value_multiplier").isNull()
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
generic_df_uom_mapping = self.df_mapping_sdmx_uom.filter(
|
|
155
|
-
~col("delete")
|
|
156
|
-
& col("sws_multiplier").isNull()
|
|
157
|
-
& col("sdmx_multiplier").isNull()
|
|
158
|
-
& (col("value_multiplier") == lit(0))
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
specific_df_uom_mapping = self.df_mapping_sdmx_uom.filter(
|
|
162
|
-
~col("delete")
|
|
163
|
-
& col("sws_multiplier").isNotNull()
|
|
164
|
-
& col("sdmx_multiplier").isNotNull()
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
# Apply generic uom mapping
|
|
168
|
-
df = (
|
|
169
|
-
df.alias("d")
|
|
170
|
-
.join(
|
|
171
|
-
generic_df_uom_mapping.alias("m"),
|
|
172
|
-
col("d.official_sws_uom") == col("m.sws_code"),
|
|
173
|
-
"left",
|
|
174
|
-
)
|
|
175
|
-
.select("d.*", col("sdmx_code").alias("generic_sdmx_uom"))
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
# Apply specific uom mapping
|
|
179
|
-
df = (
|
|
180
|
-
df.alias("d")
|
|
181
|
-
.join(
|
|
182
|
-
specific_df_uom_mapping.alias("m"),
|
|
183
|
-
(col("d.official_sws_uom") == col("m.sws_code"))
|
|
184
|
-
& (col("d.official_sws_multiplier") == col("m.sws_multiplier")),
|
|
185
|
-
"left",
|
|
186
|
-
)
|
|
187
|
-
.select(
|
|
188
|
-
"d.*",
|
|
189
|
-
col("sdmx_code").alias("specific_sdmx_uom"),
|
|
190
|
-
col("sdmx_multiplier").alias("specific_sdmx_multiplier"),
|
|
191
|
-
(col("value") * F.pow(lit(10), col("value_multiplier"))).alias(
|
|
192
|
-
"specific_sdmx_value"
|
|
193
|
-
),
|
|
194
|
-
)
|
|
195
|
-
)
|
|
196
|
-
|
|
197
|
-
# Select the official values according to descending specificity
|
|
198
|
-
df = (
|
|
199
|
-
df.withColumn(
|
|
200
|
-
"unit_of_measure",
|
|
201
|
-
F.coalesce(
|
|
202
|
-
col("specific_sdmx_uom"),
|
|
203
|
-
col("generic_sdmx_uom"),
|
|
204
|
-
col("official_sws_uom"),
|
|
205
|
-
),
|
|
206
|
-
)
|
|
207
|
-
.withColumn(
|
|
208
|
-
"unit_of_measure_multiplier",
|
|
209
|
-
F.coalesce(
|
|
210
|
-
col("specific_sdmx_multiplier"), col("official_sws_multiplier")
|
|
211
|
-
),
|
|
212
|
-
)
|
|
213
|
-
.withColumn(
|
|
214
|
-
"value",
|
|
215
|
-
F.coalesce(col("specific_sdmx_value"), col("value")),
|
|
216
|
-
)
|
|
217
|
-
# Remove the columns that were not in the original dataset
|
|
218
|
-
.drop(
|
|
219
|
-
col("specific_sdmx_uom"),
|
|
220
|
-
col("specific_sdmx_multiplier"),
|
|
221
|
-
col("specific_sdmx_value"),
|
|
222
|
-
col("generic_sdmx_uom"),
|
|
223
|
-
col("official_sws_uom"),
|
|
224
|
-
col("official_sws_multiplier"),
|
|
225
|
-
)
|
|
226
|
-
)
|
|
227
|
-
|
|
228
|
-
return df
|
|
229
|
-
|
|
230
|
-
def keep_dim_uom_val_attr_columns(self, df: DataFrame):
|
|
231
|
-
return df.select(*self.cols_to_keep_sdmx)
|
|
232
|
-
|
|
233
89
|
def keep_dim_val_attr_columns(self, df: DataFrame):
    """Project ``df`` onto the columns listed in ``self.cols_to_keep_sws``."""
    columns_to_keep = self.cols_to_keep_sws
    return df.select(*columns_to_keep)
|
|
235
91
|
|
|
236
|
-
def
|
|
237
|
-
self,
|
|
238
|
-
|
|
239
|
-
dimension_name: str,
|
|
240
|
-
dimension_type: str,
|
|
241
|
-
) -> DataFrame:
|
|
242
|
-
logging.info(
|
|
243
|
-
f"mapping column {dimension_name} of type {dimension_type} for dissemination"
|
|
244
|
-
)
|
|
245
|
-
return (
|
|
246
|
-
df.alias("d")
|
|
247
|
-
# Join the data with the standard mapping for the specific dimension
|
|
248
|
-
.join(
|
|
249
|
-
F.broadcast(
|
|
250
|
-
self.df_mapping_sdmx_codes.filter(
|
|
251
|
-
(col("domain").isNull() | (col("domain") == lit("")))
|
|
252
|
-
& (col("var_type") == lit(dimension_type))
|
|
253
|
-
& (
|
|
254
|
-
col("mapping_type").isNull()
|
|
255
|
-
| (col("mapping_type") == lit(""))
|
|
256
|
-
)
|
|
257
|
-
)
|
|
258
|
-
).alias("m_standard"),
|
|
259
|
-
col(f"d.{dimension_name}") == col("m_standard.internal_code"),
|
|
260
|
-
"left",
|
|
261
|
-
)
|
|
262
|
-
# Join the data with the domain specific mapping for the specific dimension
|
|
263
|
-
.join(
|
|
264
|
-
F.broadcast(
|
|
265
|
-
self.df_mapping_sdmx_codes.filter(
|
|
266
|
-
(col("domain") == lit(self.domain_code))
|
|
267
|
-
& (col("var_type") == lit(dimension_type))
|
|
268
|
-
& (
|
|
269
|
-
col("mapping_type").isNull()
|
|
270
|
-
| (col("mapping_type") == lit(""))
|
|
271
|
-
)
|
|
272
|
-
)
|
|
273
|
-
).alias("m_domain"),
|
|
274
|
-
col(f"d.{dimension_name}") == col("m_domain.internal_code"),
|
|
275
|
-
"left",
|
|
276
|
-
)
|
|
277
|
-
# Select only the columns we are interested in (this step is optional but recommended for debugging)
|
|
278
|
-
.select(
|
|
279
|
-
"d.*",
|
|
280
|
-
col("m_standard.external_code").alias("standard_external_code"),
|
|
281
|
-
col("m_standard.delete").alias("standard_delete"),
|
|
282
|
-
col("m_standard.multiplier").alias("standard_multiplier"),
|
|
283
|
-
col("m_domain.external_code").alias("domain_specific_external_code"),
|
|
284
|
-
col("m_domain.delete").alias("domain_specific_delete"),
|
|
285
|
-
col("m_domain.multiplier").alias("domain_specific_multiplier"),
|
|
286
|
-
)
|
|
287
|
-
# Filter out records to delete
|
|
288
|
-
.filter(
|
|
289
|
-
# Evaluate first the domain specific flag
|
|
290
|
-
F.when(
|
|
291
|
-
col("domain_specific_delete").isNotNull(),
|
|
292
|
-
~col("domain_specific_delete"),
|
|
293
|
-
)
|
|
294
|
-
# Then evaluate the general flag
|
|
295
|
-
.when(
|
|
296
|
-
col("standard_delete").isNotNull(), ~col("standard_delete")
|
|
297
|
-
).otherwise(lit(True))
|
|
298
|
-
)
|
|
299
|
-
.withColumn(
|
|
300
|
-
dimension_name,
|
|
301
|
-
# Evaluate first the domain specific mapping
|
|
302
|
-
F.when(
|
|
303
|
-
col("domain_specific_external_code").isNotNull(),
|
|
304
|
-
col("domain_specific_external_code"),
|
|
305
|
-
)
|
|
306
|
-
# Then evaluate the general mapping
|
|
307
|
-
.when(
|
|
308
|
-
col("standard_external_code").isNotNull(),
|
|
309
|
-
col("standard_external_code"),
|
|
310
|
-
).otherwise(col(dimension_name)),
|
|
311
|
-
)
|
|
312
|
-
.withColumn(
|
|
313
|
-
"value",
|
|
314
|
-
# Multiply first by the domain specific multiplier
|
|
315
|
-
F.when(
|
|
316
|
-
col("domain_specific_multiplier").isNotNull(),
|
|
317
|
-
col("value") * col("domain_specific_multiplier"),
|
|
318
|
-
)
|
|
319
|
-
# Then multiply by the general multiplier
|
|
320
|
-
.when(
|
|
321
|
-
col("standard_external_code").isNotNull(),
|
|
322
|
-
col("value") * col("standard_multiplier"),
|
|
323
|
-
).otherwise(col("value")),
|
|
324
|
-
)
|
|
325
|
-
# Remove the columns that were not in the original dataset
|
|
326
|
-
.drop(
|
|
327
|
-
"standard_external_code",
|
|
328
|
-
"standard_delete",
|
|
329
|
-
"standard_multiplier",
|
|
330
|
-
"domain_specific_external_code",
|
|
331
|
-
"domain_specific_delete",
|
|
332
|
-
"domain_specific_multiplier",
|
|
333
|
-
)
|
|
92
|
+
def read_bronze_data(self) -> DataFrame:
    """Read the ``BRONZE_DISS_TAG`` Iceberg table with the ``tag`` read option set to ``self.tag_name``."""
    tagged_reader = self.spark.read.option("tag", self.tag_name)
    bronze_table_id = self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
    return tagged_reader.table(bronze_table_id)
|
|
335
96
|
|
|
336
|
-
def
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
df = df.transform(
|
|
340
|
-
self._apply_sdmx_dimension_codes_mapping_single,
|
|
341
|
-
dimension_name=dimension_name,
|
|
342
|
-
dimension_type=dimension_type,
|
|
343
|
-
)
|
|
344
|
-
|
|
345
|
-
return df
|
|
346
|
-
|
|
347
|
-
def drop_non_sdmx_columns(self, df: DataFrame) -> DataFrame:
|
|
348
|
-
cols_to_drop = [
|
|
349
|
-
row["internal_name"]
|
|
350
|
-
for row in self.df_mapping_sdmx_col_names.collect()
|
|
351
|
-
if row["delete"] is True
|
|
352
|
-
]
|
|
353
|
-
logging.info(f"Dropping non-SDMX columns: {cols_to_drop}")
|
|
354
|
-
return df.drop(*cols_to_drop)
|
|
355
|
-
|
|
356
|
-
def apply_sdmx_column_names_mapping(self, df: DataFrame) -> DataFrame:
|
|
357
|
-
logging.info("Renaming columns to comply with SDMX standard")
|
|
358
|
-
|
|
359
|
-
mapping_sws_col_sdmx_col = {
|
|
360
|
-
row["internal_name"]: row["external_name"]
|
|
361
|
-
for row in self.df_mapping_sdmx_col_names.filter(
|
|
362
|
-
col("internal_name").isNotNull()
|
|
363
|
-
& (col("internal_name") != lit(""))
|
|
364
|
-
& ~col("delete")
|
|
365
|
-
).collect()
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
logging.info(f"Column names mappings: {mapping_sws_col_sdmx_col}")
|
|
369
|
-
|
|
370
|
-
return df.withColumnsRenamed(mapping_sws_col_sdmx_col)
|
|
371
|
-
|
|
372
|
-
def add_sdmx_default_columns(self, df: DataFrame) -> DataFrame:
|
|
373
|
-
col_w_default_value = {
|
|
374
|
-
row["external_name"]: row["default_value"]
|
|
375
|
-
for row in self.df_mapping_sdmx_col_names.collect()
|
|
376
|
-
if row["add"] is True
|
|
377
|
-
}
|
|
378
|
-
|
|
379
|
-
logging.info("Adding SDMX columns with default values")
|
|
380
|
-
|
|
381
|
-
for name, default_value in col_w_default_value.items():
|
|
382
|
-
logging.info(
|
|
383
|
-
f"Adding SDMX column {name} with default value {default_value}"
|
|
384
|
-
)
|
|
385
|
-
df = df.withColumn(name, lit(default_value))
|
|
386
|
-
|
|
387
|
-
return df
|
|
388
|
-
|
|
389
|
-
def rearrange_sdmx_columns(self, df: DataFrame) -> DataFrame:
|
|
390
|
-
logging.info(
|
|
391
|
-
"Rearranging the columns to have the following order: Dimensions, TimeDimension, PrimaryMeasure, Attributes"
|
|
97
|
+
def read_silver_data(self) -> DataFrame:
    """Read the ``SILVER`` Iceberg table with the ``tag`` read option set to ``self.tag_name``."""
    tagged_reader = self.spark.read.option("tag", self.tag_name)
    silver_table_id = self.iceberg_tables.SILVER.iceberg_id
    return tagged_reader.table(silver_table_id)
|
|
393
101
|
|
|
394
|
-
get_columns_for_type = lambda df, type: [
|
|
395
|
-
row[0]
|
|
396
|
-
for row in df.filter(col("type") == lit(type))
|
|
397
|
-
.select("external_name")
|
|
398
|
-
.collect()
|
|
399
|
-
]
|
|
400
|
-
|
|
401
|
-
df_mapping_sdmx_no_del = self.df_mapping_sdmx_col_names.filter(~col("delete"))
|
|
402
|
-
|
|
403
|
-
dimensions = get_columns_for_type(df_mapping_sdmx_no_del, "Dimension")
|
|
404
|
-
time_dimensions = get_columns_for_type(df_mapping_sdmx_no_del, "TimeDimension")
|
|
405
|
-
primary_measure = get_columns_for_type(df_mapping_sdmx_no_del, "PrimaryMeasure")
|
|
406
|
-
attributes = get_columns_for_type(df_mapping_sdmx_no_del, "Attribute")
|
|
407
|
-
|
|
408
|
-
logging.info(f"Dimensions: {dimensions}")
|
|
409
|
-
logging.info(f"Time Dimensions: {time_dimensions}")
|
|
410
|
-
logging.info(f"Primary Measure: {primary_measure}")
|
|
411
|
-
logging.info(f"Attributes: {attributes}")
|
|
412
|
-
|
|
413
|
-
return df.select(*dimensions, *time_dimensions, *primary_measure, *attributes)
|
|
414
|
-
|
|
415
102
|
def gen_gold_sws_disseminated_data(self) -> DataFrame:
    """Build the gold SWS disseminated dataset from the silver table.

    The silver data is read, filtered with ``apply_diss_flag_filter`` and
    then projected with ``keep_dim_val_attr_columns``.
    """
    silver_df = self.read_silver_data()
    flagged_df = silver_df.transform(self.apply_diss_flag_filter)
    return flagged_df.transform(self.keep_dim_val_attr_columns)
|
|
422
108
|
|
|
423
|
-
def
|
|
424
|
-
return (
|
|
425
|
-
self.spark.read.option("tag", self.tag_name)
|
|
426
|
-
.table(self.iceberg_tables.BRONZE.iceberg_id)
|
|
427
|
-
.transform(self.keep_dim_val_attr_columns)
|
|
428
|
-
)
|
|
109
|
+
def gen_gold_sws_data(self) -> DataFrame:
    """Build the gold SWS dataset: bronze data projected with ``keep_dim_val_attr_columns``."""
    bronze_df = self.read_bronze_data()
    return bronze_df.transform(self.keep_dim_val_attr_columns)
|
|
429
111
|
|
|
430
|
-
def
|
|
431
|
-
return (
|
|
432
|
-
self.spark.read.option("tag", self.tag_name)
|
|
433
|
-
.table(self.iceberg_tables.SILVER.iceberg_id)
|
|
434
|
-
.transform(self.apply_diss_flag_filter)
|
|
435
|
-
.transform(self.apply_uom_mapping)
|
|
436
|
-
.transform(self.keep_dim_uom_val_attr_columns)
|
|
437
|
-
.transform(self.apply_sdmx_dimension_codes_mapping)
|
|
438
|
-
.transform(self.apply_sdmx_column_names_mapping)
|
|
439
|
-
.transform(self.add_sdmx_default_columns)
|
|
440
|
-
.transform(self.rearrange_sdmx_columns)
|
|
441
|
-
)
|
|
112
|
+
def gen_gold_sws_validated_data(self) -> DataFrame:
    """Build the gold SWS validated dataset: silver data projected with ``keep_dim_val_attr_columns``."""
    silver_df = self.read_silver_data()
    return silver_df.transform(self.keep_dim_val_attr_columns)
|
|
442
114
|
|
|
443
115
|
def write_gold_sws_validated_data_to_iceberg_and_csv(
|
|
444
116
|
self, df: DataFrame
|
|
@@ -466,6 +138,37 @@ class SWSGoldIcebergSparkHelper:
|
|
|
466
138
|
|
|
467
139
|
return df
|
|
468
140
|
|
|
141
|
+
def write_gold_sws_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
    """Persist ``df`` as the gold SWS Iceberg table, tag it and cache it as csv.

    Steps, in order:
      1. create or replace the ``GOLD_SWS`` Iceberg table with ``df``;
      2. create or replace the Iceberg tag named ``self.tag_name`` on it;
      3. hand a single-partition copy of ``df`` to ``save_cache_csv``.

    Returns:
        The input ``df``, unchanged, so the call can be chained.
    """
    gold_sws = self.iceberg_tables.GOLD_SWS
    table_id = gold_sws.iceberg_id

    df.writeTo(table_id).createOrReplace()
    logging.info(f"Gold SWS table written to {table_id}")

    tag_statement = (
        f"ALTER TABLE {table_id} CREATE OR REPLACE TAG `{self.tag_name}`"
    )
    self.spark.sql(tag_statement)
    logging.info(f"gold SWS tag '{self.tag_name}' created")

    # Coalesce to a single partition before handing the data to the csv cache
    save_cache_csv(
        df=df.coalesce(1),
        bucket=self.bucket,
        prefix=gold_sws.csv_prefix,
        tag_name=self.tag_name,
    )

    return df
|
|
164
|
+
|
|
165
|
+
def gen_and_write_gold_sws_data_to_iceberg_and_csv(self) -> DataFrame:
    """Generate the gold SWS data, store it on ``self.df_gold_sws``, write it out and return it."""
    generated_df = self.gen_gold_sws_data()
    self.df_gold_sws = generated_df

    self.write_gold_sws_data_to_iceberg_and_csv(generated_df)

    return self.df_gold_sws
|
|
171
|
+
|
|
469
172
|
def gen_and_write_gold_sws_validated_data_to_iceberg_and_csv(self) -> DataFrame:
|
|
470
173
|
self.df_gold_sws_validated = self.gen_gold_sws_validated_data()
|
|
471
174
|
|
|
@@ -536,12 +239,91 @@ class SWSGoldIcebergSparkHelper:
|
|
|
536
239
|
|
|
537
240
|
return df
|
|
538
241
|
|
|
539
|
-
def
|
|
540
|
-
|
|
242
|
+
def write_gold_pre_sdmx_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
    """Write the pre-SDMX gold table to Iceberg, tag it and cache it as csv.

    The expected input to this function is the output of the sws
    disseminated function.

    Before writing, the data is reshaped:
      * in every column listed in ``self.dim_columns`` each literal ``.`` is
        replaced with ``_``;
      * the ``value`` column is renamed to ``OBS_VALUE``;
      * all column names are upper-cased.

    The reshaped frame is then written to the ``GOLD_PRE_SDMX`` Iceberg
    table (create or replace), an Iceberg tag named ``self.tag_name`` is
    created or replaced, and a single-partition copy is passed to
    ``save_cache_csv``.

    Returns:
        The reshaped DataFrame (not the original input).
    """
    for column in self.dim_columns:
        df = df.withColumn(
            # Raw string: "\." is a regex for a literal dot; a non-raw "\."
            # is an invalid Python escape sequence and triggers a warning.
            column, F.regexp_replace(col(column), lit(r"\."), lit("_"))
        )
    # NOTE: the mapping is built from the column names *before* the
    # "value" -> "OBS_VALUE" rename, so it still contains a "value" key;
    # presumably that entry is ignored because the column no longer exists
    # (see withColumnsRenamed docs) -- TODO confirm.
    df = df.withColumnRenamed("value", "OBS_VALUE").withColumnsRenamed(
        {column: column.upper() for column in df.columns}
    )
    df.writeTo(self.iceberg_tables.GOLD_PRE_SDMX.iceberg_id).createOrReplace()

    logging.info(
        f"Gold pre-SDMX table written to {self.iceberg_tables.GOLD_PRE_SDMX.iceberg_id}"
    )

    self.spark.sql(
        f"ALTER TABLE {self.iceberg_tables.GOLD_PRE_SDMX.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
    )

    logging.info(f"gold pre-SDMX tag '{self.tag_name}' created")

    # Coalesce to a single partition before handing the data to the csv cache
    df_1 = df.coalesce(1)

    save_cache_csv(
        df=df_1,
        bucket=self.bucket,
        prefix=self.iceberg_tables.GOLD_PRE_SDMX.csv_prefix,
        tag_name=self.tag_name,
    )

    return df
|
|
273
|
+
|
|
274
|
+
def write_gold_faostat_data_to_iceberg_and_csv(self, df: DataFrame) -> DataFrame:
    """Persist ``df`` as the gold FAOSTAT Iceberg table, tag it and cache it as csv.

    Steps, in order:
      1. create or replace the ``GOLD_FAOSTAT`` Iceberg table with ``df``;
      2. create or replace the Iceberg tag named ``self.tag_name`` on it;
      3. hand a single-partition copy of ``df`` to ``save_cache_csv``.

    Returns:
        The input ``df``, unchanged.
    """
    gold_faostat = self.iceberg_tables.GOLD_FAOSTAT
    table_id = gold_faostat.iceberg_id

    df.writeTo(table_id).createOrReplace()
    logging.info(f"Gold FAOSTAT table written to {table_id}")

    tag_statement = (
        f"ALTER TABLE {table_id} CREATE OR REPLACE TAG `{self.tag_name}`"
    )
    self.spark.sql(tag_statement)
    logging.info(f"gold FAOSTAT tag '{self.tag_name}' created")

    # Coalesce to a single partition before handing the data to the csv cache
    save_cache_csv(
        df=df.coalesce(1),
        bucket=self.bucket,
        prefix=gold_faostat.csv_prefix,
        tag_name=self.tag_name,
    )

    return df
|
|
298
|
+
|
|
299
|
+
def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
    self, df: DataFrame
) -> DataFrame:
    """Persist ``df`` as the gold FAOSTAT unfiltered Iceberg table, tag it and cache it as csv.

    Steps, in order:
      1. create or replace the ``GOLD_FAOSTAT_UNFILTERED`` Iceberg table
         with ``df``;
      2. create or replace the Iceberg tag named ``self.tag_name`` on it;
      3. hand a single-partition copy of ``df`` to ``save_cache_csv``.

    Returns:
        The input ``df``, unchanged.
    """
    df.writeTo(
        self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
    ).createOrReplace()

    # Log the table that was actually written (the unfiltered one); the
    # message previously reported the filtered GOLD_FAOSTAT table id.
    logging.info(
        f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id}"
    )

    self.spark.sql(
        f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
    )

    logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")

    # Coalesce to a single partition before handing the data to the csv cache
    df_1 = df.coalesce(1)

    save_cache_csv(
        df=df_1,
        bucket=self.bucket,
        prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
        tag_name=self.tag_name,
    )

    return df
|
|
545
327
|
|
|
546
328
|
def write_gold_sws_validated_sws_dissemination_tag(
|
|
547
329
|
self, df: DataFrame, tags: Tags
|
|
@@ -553,7 +335,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
553
335
|
new_iceberg_table = BaseDisseminatedTagTable(
|
|
554
336
|
id=f"{self.domain_code.lower()}_gold_sws_validated_iceberg",
|
|
555
337
|
name=f"{self.domain_code} gold SWS validated Iceberg",
|
|
556
|
-
description="Gold table containing all the data
|
|
338
|
+
description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format",
|
|
557
339
|
layer=TableLayer.GOLD,
|
|
558
340
|
private=True,
|
|
559
341
|
type=TableType.ICEBERG,
|
|
@@ -571,10 +353,10 @@ class SWSGoldIcebergSparkHelper:
|
|
|
571
353
|
)
|
|
572
354
|
logging.debug(f"Tag with Added Iceberg Table: {tag}")
|
|
573
355
|
|
|
574
|
-
|
|
356
|
+
new_diss_table = BaseDisseminatedTagTable(
|
|
575
357
|
id=f"{self.domain_code.lower()}_gold_sws_validated_csv",
|
|
576
358
|
name=f"{self.domain_code} gold SWS validated csv",
|
|
577
|
-
description="Gold table containing all the data
|
|
359
|
+
description="Gold table containing all the unfiltered tag data, with code correction appplied, in SWS compatible format, cached in csv",
|
|
578
360
|
layer=TableLayer.GOLD,
|
|
579
361
|
private=True,
|
|
580
362
|
type=TableType.CSV,
|
|
@@ -586,7 +368,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
586
368
|
tag=tag,
|
|
587
369
|
dataset_id=self.dataset_id,
|
|
588
370
|
tag_name=self.tag_name,
|
|
589
|
-
table=
|
|
371
|
+
table=new_diss_table,
|
|
590
372
|
)
|
|
591
373
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
592
374
|
|
|
@@ -602,7 +384,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
602
384
|
new_iceberg_table = BaseDisseminatedTagTable(
|
|
603
385
|
id=f"{self.domain_code.lower()}_gold_sws_disseminated_iceberg",
|
|
604
386
|
name=f"{self.domain_code} gold SWS disseminated Iceberg",
|
|
605
|
-
description="Gold table containing
|
|
387
|
+
description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format",
|
|
606
388
|
layer=TableLayer.GOLD,
|
|
607
389
|
private=True,
|
|
608
390
|
type=TableType.ICEBERG,
|
|
@@ -620,10 +402,10 @@ class SWSGoldIcebergSparkHelper:
|
|
|
620
402
|
)
|
|
621
403
|
logging.debug(f"Tag with Added Iceberg Table: {tag}")
|
|
622
404
|
|
|
623
|
-
|
|
405
|
+
new_diss_table = BaseDisseminatedTagTable(
|
|
624
406
|
id=f"{self.domain_code.lower()}_gold_sws_disseminated_csv",
|
|
625
407
|
name=f"{self.domain_code} gold SWS disseminated csv",
|
|
626
|
-
description="Gold table containing
|
|
408
|
+
description="Gold table containing only the filtered tag data, with code correction appplied, in SWS compatible format, cached in csv",
|
|
627
409
|
layer=TableLayer.GOLD,
|
|
628
410
|
private=True,
|
|
629
411
|
type=TableType.CSV,
|
|
@@ -635,7 +417,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
635
417
|
tag=tag,
|
|
636
418
|
dataset_id=self.dataset_id,
|
|
637
419
|
tag_name=self.tag_name,
|
|
638
|
-
table=
|
|
420
|
+
table=new_diss_table,
|
|
639
421
|
)
|
|
640
422
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
641
423
|
|
|
@@ -669,7 +451,7 @@ class SWSGoldIcebergSparkHelper:
|
|
|
669
451
|
)
|
|
670
452
|
logging.debug(f"Tag with Added Iceberg Table: {tag}")
|
|
671
453
|
|
|
672
|
-
|
|
454
|
+
new_diss_table = BaseDisseminatedTagTable(
|
|
673
455
|
id=f"{self.domain_code.lower()}_gold_sdmx_csv",
|
|
674
456
|
name=f"{self.domain_code} gold SDMX csv",
|
|
675
457
|
description="Gold table containing all the cleaned data in SDMX compatible format cached in csv",
|
|
@@ -684,7 +466,203 @@ class SWSGoldIcebergSparkHelper:
|
|
|
684
466
|
tag=tag,
|
|
685
467
|
dataset_id=self.dataset_id,
|
|
686
468
|
tag_name=self.tag_name,
|
|
687
|
-
table=
|
|
469
|
+
table=new_diss_table,
|
|
470
|
+
)
|
|
471
|
+
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
472
|
+
|
|
473
|
+
return df
|
|
474
|
+
|
|
475
|
+
def write_gold_pre_sdmx_sws_dissemination_tag(
    self, df: DataFrame, tags: Tags
) -> DataFrame:
    """Register the gold pre-SDMX Iceberg table and its csv cache on the tag.

    Two ``BaseDisseminatedTagTable`` entries (one Iceberg, one csv, both
    private and flagged ``debug``) are built from ``df``'s schema and
    upserted, in that order, through ``upsert_disseminated_table``.

    Args:
        df: DataFrame whose schema is recorded as the table structure.
        tags: Tags client passed to the tag helper functions.

    Returns:
        The input ``df``, unchanged.
    """
    # Get or create the tag
    tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
    logging.debug(f"Tag: {tag}")

    pre_sdmx = self.iceberg_tables.GOLD_PRE_SDMX
    domain_lower = self.domain_code.lower()
    upsert_context = {
        "sws_tags": tags,
        "dataset_id": self.dataset_id,
        "tag_name": self.tag_name,
    }

    iceberg_entry = BaseDisseminatedTagTable(
        id=f"{domain_lower}_gold_pre_sdmx_iceberg",
        name=f"{self.domain_code} gold pre-SDMX Iceberg",
        description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR",
        layer=TableLayer.GOLD,
        private=True,
        debug=True,
        type=TableType.ICEBERG,
        database=IcebergDatabases.GOLD_DATABASE,
        table=pre_sdmx.table,
        path=pre_sdmx.path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    tag = upsert_disseminated_table(tag=tag, table=iceberg_entry, **upsert_context)
    logging.debug(f"Tag with Added Iceberg Table: {tag}")

    csv_entry = BaseDisseminatedTagTable(
        id=f"{domain_lower}_gold_pre_sdmx_csv",
        name=f"{self.domain_code} gold pre-SDMX csv",
        description="Gold table containing all the cleaned data in SDMX compatible format, ready to be mapped using FMR and cached in csv",
        layer=TableLayer.GOLD,
        private=True,
        debug=True,
        type=TableType.CSV,
        path=pre_sdmx.csv_path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    tag = upsert_disseminated_table(tag=tag, table=csv_entry, **upsert_context)
    logging.debug(f"Tag with Added csv Table: {tag}")

    return df
|
|
525
|
+
|
|
526
|
+
def write_gold_sws_dissemination_tag(self, df: DataFrame, tags: Tags) -> DataFrame:
    """Register the gold SWS Iceberg table and its csv cache on the tag.

    Two private ``BaseDisseminatedTagTable`` entries (one Iceberg, one csv)
    are built from ``df``'s schema and upserted, in that order, through
    ``upsert_disseminated_table``.

    Args:
        df: DataFrame whose schema is recorded as the table structure.
        tags: Tags client passed to the tag helper functions.

    Returns:
        The input ``df``, unchanged.
    """
    # Get or create the tag
    tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
    logging.debug(f"Tag: {tag}")

    gold_sws = self.iceberg_tables.GOLD_SWS
    domain_lower = self.domain_code.lower()
    upsert_context = {
        "sws_tags": tags,
        "dataset_id": self.dataset_id,
        "tag_name": self.tag_name,
    }

    iceberg_entry = BaseDisseminatedTagTable(
        id=f"{domain_lower}_gold_sws_iceberg",
        name=f"{self.domain_code} gold SWS Iceberg",
        description="Gold table containing the tag data without any processing",
        layer=TableLayer.GOLD,
        private=True,
        type=TableType.ICEBERG,
        database=IcebergDatabases.GOLD_DATABASE,
        table=gold_sws.table,
        path=gold_sws.path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    tag = upsert_disseminated_table(tag=tag, table=iceberg_entry, **upsert_context)
    logging.debug(f"Tag with Added Iceberg Table: {tag}")

    csv_entry = BaseDisseminatedTagTable(
        id=f"{domain_lower}_gold_sws_csv",
        name=f"{self.domain_code} gold SWS csv",
        description="Gold table containing the tag data without any processing cached in csv",
        layer=TableLayer.GOLD,
        private=True,
        type=TableType.CSV,
        path=gold_sws.csv_path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    tag = upsert_disseminated_table(tag=tag, table=csv_entry, **upsert_context)
    logging.debug(f"Tag with Added csv Table: {tag}")

    return df
|
|
572
|
+
|
|
573
|
+
def write_gold_faostat_dissemination_tag(
    self, df: DataFrame, tags: Tags
) -> DataFrame:
    """Register the gold FAOSTAT Iceberg table and its csv cache on the tag.

    Two private ``BaseDisseminatedTagTable`` entries (one Iceberg, one csv)
    are built from ``df``'s schema and upserted, in that order, through
    ``upsert_disseminated_table``.

    Args:
        df: DataFrame whose schema is recorded as the table structure.
        tags: Tags client passed to the tag helper functions.

    Returns:
        The input ``df``, unchanged.
    """
    # Get or create the tag
    tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
    logging.debug(f"Tag: {tag}")

    gold_faostat = self.iceberg_tables.GOLD_FAOSTAT
    domain_lower = self.domain_code.lower()
    upsert_context = {
        "sws_tags": tags,
        "dataset_id": self.dataset_id,
        "tag_name": self.tag_name,
    }

    iceberg_entry = BaseDisseminatedTagTable(
        id=f"{domain_lower}_gold_faostat_iceberg",
        name=f"{self.domain_code} gold FAOSTAT Iceberg",
        description="Gold table containing the tag data in FAOSTAT format",
        layer=TableLayer.GOLD,
        private=True,
        type=TableType.ICEBERG,
        database=IcebergDatabases.GOLD_DATABASE,
        table=gold_faostat.table,
        path=gold_faostat.path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    tag = upsert_disseminated_table(tag=tag, table=iceberg_entry, **upsert_context)
    logging.debug(f"Tag with Added Iceberg Table: {tag}")

    csv_entry = BaseDisseminatedTagTable(
        id=f"{domain_lower}_gold_faostat_csv",
        name=f"{self.domain_code} gold FAOSTAT csv",
        description="Gold table containing the tag data in FAOSTAT format in csv",
        layer=TableLayer.GOLD,
        private=True,
        type=TableType.CSV,
        path=gold_faostat.csv_path,
        structure={"columns": df.schema.jsonValue()["fields"]},
    )
    tag = upsert_disseminated_table(tag=tag, table=csv_entry, **upsert_context)
    logging.debug(f"Tag with Added csv Table: {tag}")

    return df
|
|
621
|
+
|
|
622
|
+
def write_gold_faostat_unfiltered_dissemination_tag(
|
|
623
|
+
self, df: DataFrame, tags: Tags
|
|
624
|
+
) -> DataFrame:
|
|
625
|
+
# Get or create a new tag
|
|
626
|
+
tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
|
|
627
|
+
logging.debug(f"Tag: {tag}")
|
|
628
|
+
|
|
629
|
+
new_iceberg_table = BaseDisseminatedTagTable(
|
|
630
|
+
id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
|
|
631
|
+
name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
|
|
632
|
+
description="Gold table containing all the tag data in FAOSTAT format",
|
|
633
|
+
layer=TableLayer.GOLD,
|
|
634
|
+
private=True,
|
|
635
|
+
type=TableType.ICEBERG,
|
|
636
|
+
database=IcebergDatabases.GOLD_DATABASE,
|
|
637
|
+
table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
|
|
638
|
+
path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
|
|
639
|
+
structure={"columns": df.schema.jsonValue()["fields"]},
|
|
640
|
+
)
|
|
641
|
+
tag = upsert_disseminated_table(
|
|
642
|
+
sws_tags=tags,
|
|
643
|
+
tag=tag,
|
|
644
|
+
dataset_id=self.dataset_id,
|
|
645
|
+
tag_name=self.tag_name,
|
|
646
|
+
table=new_iceberg_table,
|
|
647
|
+
)
|
|
648
|
+
logging.debug(f"Tag with Added Iceberg Table: {tag}")
|
|
649
|
+
|
|
650
|
+
new_diss_table = BaseDisseminatedTagTable(
|
|
651
|
+
id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
|
|
652
|
+
name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
|
|
653
|
+
description="Gold table containing the tag data in FAOSTAT format in csv",
|
|
654
|
+
layer=TableLayer.GOLD,
|
|
655
|
+
private=True,
|
|
656
|
+
type=TableType.CSV,
|
|
657
|
+
path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
|
|
658
|
+
structure={"columns": df.schema.jsonValue()["fields"]},
|
|
659
|
+
)
|
|
660
|
+
tag = upsert_disseminated_table(
|
|
661
|
+
sws_tags=tags,
|
|
662
|
+
tag=tag,
|
|
663
|
+
dataset_id=self.dataset_id,
|
|
664
|
+
tag_name=self.tag_name,
|
|
665
|
+
table=new_diss_table,
|
|
688
666
|
)
|
|
689
667
|
logging.debug(f"Tag with Added csv Table: {tag}")
|
|
690
668
|
|