sws-spark-dissemination-helper 0.0.93__py3-none-any.whl → 0.0.183__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +321 -57
- sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +324 -16
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +96 -31
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +117 -16
- sws_spark_dissemination_helper/__init__.py +1 -0
- sws_spark_dissemination_helper/constants.py +93 -25
- sws_spark_dissemination_helper/utils.py +24 -6
- {sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/METADATA +11 -11
- sws_spark_dissemination_helper-0.0.183.dist-info/RECORD +13 -0
- sws_spark_dissemination_helper-0.0.93.dist-info/RECORD +0 -11
- {sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/licenses/LICENSE +0 -0
sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py

```diff
@@ -10,7 +10,7 @@ from pyspark.sql.window import Window
 from sws_api_client import Tags
 from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
 
-from .constants import IcebergDatabases, IcebergTables
+from .constants import IcebergDatabases, IcebergTables, DatasetDatatables
 from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .utils import (
     get_or_create_tag,
```
```diff
@@ -103,16 +103,17 @@ class SWSSilverIcebergSparkHelper:
         # The diss_flag column is needed to initialize the condition expression
         # The note column will contain the eventual reasons why diss_flag has been set to false
         return df.withColumn("diss_flag", lit(True)).withColumn(
-            "
+            "diss_note", lit([]).cast(ArrayType(StringType()))
         )
 
     def read_bronze_data(self) -> DataFrame:
         return self.spark.read.option("tag", self.tag_name).table(
             self.iceberg_tables.BRONZE.iceberg_id
         )
+
     def read_bronze_diss_tag_data(self) -> DataFrame:
-        return self.spark.read.option("
-        self.iceberg_tables.
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.BRONZE_DISS_TAG.iceberg_id
         )
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
```
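The `diss_flag`/`diss_note` pair initialized in this hunk is the backbone of every check that follows: rows start as disseminable with an empty note array, and each validation may flip the flag and append a reason. Both bronze reads use `option("tag", ...)`, Apache Iceberg's snapshot-tag read option, so they are pinned to the tagged snapshot rather than the current table head. A minimal, self-contained sketch of the flag/note pattern (the DataFrame and the failing condition are hypothetical stand-ins):

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("AF", 2024), ("FR", 2020)], ["area", "year"])

# Initialization, as in the hunk: flag True, note an empty array<string>
df = df.withColumn("diss_flag", lit(True)).withColumn(
    "diss_note", lit([]).cast(ArrayType(StringType()))
)

# A later check flips the flag and records why; F.array_append requires
# Spark >= 3.5, consistent with the pinned pyspark==3.5.4
failed = col("year") > lit(2023)  # hypothetical stand-in condition
df = df.withColumn("diss_flag", col("diss_flag") & ~failed).withColumn(
    "diss_note",
    F.when(
        failed, F.array_append(col("diss_note"), lit("year out of range"))
    ).otherwise(col("diss_note")),
)
```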
```diff
@@ -162,6 +163,99 @@ class SWSSilverIcebergSparkHelper:
 
         logging.info(f"Checking time validity for {col_name} of type {col_type}")
 
+        if col_type == "area":
+            logging.info(
+                f'Changing start and end year according to "{DatasetDatatables.MAPPING_CODE_CORRECTION.name}"'
+            )
+            df_start_year_correction = self.df_mapping_code_correction.filter(
+                col("var_type") == lit("start_year")
+            )
+            df_end_year_correction = self.df_mapping_code_correction.filter(
+                col("var_type") == lit("end_year")
+            )
+
+            original_col_order = df.columns
+            cols_to_select = df.columns
+            col_name_lower = col_name.lower()
+            cols_to_select = [
+                column
+                for column in cols_to_select
+                if column.lower()
+                not in (
+                    "diss_note",
+                    f"{col_name_lower}_start_date",
+                    f"{col_name_lower}_end_date",
+                )
+            ]
+
+            df = (
+                df.alias("d")
+                .join(
+                    F.broadcast(df_start_year_correction).alias("sy"),
+                    on=col(f"d.{col_name}") == col("sy.mapping_type"),
+                    how="left",
+                )
+                .join(
+                    F.broadcast(df_end_year_correction).alias("ey"),
+                    on=col(f"d.{col_name}") == col("ey.mapping_type"),
+                    how="left",
+                )
+                .withColumn("valid_new_start_year", col("sy.new_code").isNotNull())
+                .withColumn("valid_new_end_year", col("ey.new_code").isNotNull())
+                .withColumn(
+                    "new_diss_note",
+                    F.when(
+                        col("valid_new_start_year"),
+                        F.array_append(
+                            col("d.diss_note"),
+                            F.concat(
+                                col("sy.note"),
+                                lit(" from "),
+                                col("sy.old_code"),
+                                lit(" to "),
+                                col("sy.new_code"),
+                            ),
+                        ),
+                    ).otherwise(col("d.diss_note")),
+                )
+                .withColumn(
+                    "new_diss_note",
+                    F.when(
+                        col("valid_new_end_year"),
+                        F.array_append(
+                            col("new_diss_note"),
+                            F.concat(
+                                col("ey.note"),
+                                lit(" from "),
+                                col("ey.old_code"),
+                                lit(" to "),
+                                col("ey.new_code"),
+                            ),
+                        ),
+                    ).otherwise(col("new_diss_note")),
+                )
+                .withColumn(
+                    f"new_{col_name}_start_date",
+                    F.when(
+                        col("valid_new_start_year"), F.to_date(col("sy.new_code"))
+                    ).otherwise(col(f"d.{col_name}_start_date")),
+                )
+                .withColumn(
+                    f"new_{col_name}_end_date",
+                    F.when(
+                        col("valid_new_end_year"),
+                        F.to_date(F.concat(col("ey.new_code"), lit("-12-31"))),
+                    ).otherwise(col(f"d.{col_name}_end_date")),
+                )
+                .select(
+                    *cols_to_select,
+                    col("new_diss_note").alias("diss_note"),
+                    col(f"new_{col_name}_start_date").alias(f"{col_name}_start_date"),
+                    col(f"new_{col_name}_end_date").alias(f"{col_name}_end_date"),
+                )
+                .select(*original_col_order)
+            )
+
         # Iterate through columns and build conditions dynamically
         start_date_condition = col(f"{col_name}_start_date").isNull() | (
             col(f"{col_name}_start_date") <= col(f"{self.time_column}_start_date")
```
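In outline, the new `area` branch left-joins two thin slices of the code-correction datatable (one for `start_year`, one for `end_year`) onto the observations, broadcast because the mapping is small, then rewrites the validity window and appends a note only where a correction matched; the final pair of `.select` calls restores the exact input column order so the output schema is unchanged. A toy version of the same join-and-override step, with hypothetical mini-tables:

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.getOrCreate()

# Hypothetical observations and a tiny start-year correction table
obs = spark.createDataFrame([("FRA", "1990-01-01")], ["area", "area_start_date"])
corr = spark.createDataFrame(
    [("FRA", "1990-01-01", "2000-01-01", "start_year")],
    ["mapping_type", "old_code", "new_code", "var_type"],
)
start_corr = corr.filter(col("var_type") == lit("start_year"))

fixed = (
    obs.alias("d")
    # broadcast: ship the small mapping to every executor instead of shuffling obs
    .join(
        F.broadcast(start_corr).alias("sy"),
        on=col("d.area") == col("sy.mapping_type"),
        how="left",
    )
    # override only where a correction row matched (new_code not null)
    .withColumn(
        "area_start_date",
        F.when(
            col("sy.new_code").isNotNull(), F.to_date(col("sy.new_code"))
        ).otherwise(F.to_date(col("d.area_start_date"))),
    )
    .select("area", "area_start_date")
)
```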
```diff
@@ -176,15 +270,15 @@ class SWSSilverIcebergSparkHelper:
                 start_date_condition & end_date_condition,
             )
             .withColumn("diss_flag", col("diss_flag") & col("condition_result"))
-            # In case the condition is satisfied update diss_flag accordingly and append a
+            # In case the condition is satisfied update diss_flag accordingly and append a diss_note indicating the reason for the observation exclusion from the dissemination
             .withColumn(
-                "
+                "diss_note",
                 F.when(
                     ~col("condition_result"),
                     F.array_append(
-                        col("
+                        col("diss_note"), lit(f"{col_type} out of time validity range")
                     ),
-                ).otherwise(col("
+                ).otherwise(col("diss_note")),
             )
             .drop("condition_result")
         )
```
```diff
@@ -296,7 +390,7 @@ class SWSSilverIcebergSparkHelper:
             col_name (str): The DataFrame column name on which to apply the filter
 
         Returns:
-            DataFrame: The DataFrame with updated `diss_flag` and `
+            DataFrame: The DataFrame with updated `diss_flag` and `diss_note` columns based on the check outcome
         """
 
         # Remove the duplicates that may be in the tables
```
```diff
@@ -334,14 +428,14 @@ class SWSSilverIcebergSparkHelper:
                 col("diss_flag") & col("condition_result"),
             )
             .withColumn(
-                "
+                "diss_note",
                 F.when(
                     ~col("condition_result"),
                     F.array_append(
-                        col("
+                        col("diss_note"),
                         lit(f"{col_type} not disseminated for this domain"),
                     ),
-                ).otherwise(col("
+                ).otherwise(col("diss_note")),
             )
             .drop("condition_result")
         )
```
```diff
@@ -428,16 +522,16 @@ class SWSSilverIcebergSparkHelper:
                 col("diss_flag") & col("condition_result"),
             )
             .withColumn(
-                "
+                "diss_note",
                 F.when(
                     ~col("condition_result"),
                     F.array_append(
-                        col("
+                        col("diss_note"),
                         lit(
                             f"not disseminated according to exception with note: {row_exception['note']}"
                         ),
                     ),
-                ).otherwise(col("
+                ).otherwise(col("diss_note")),
             )
             .drop("condition_result")
         )
```
```diff
@@ -522,7 +616,7 @@ class SWSSilverIcebergSparkHelper:
 
         df = (
             df.withColumn("metadata", F.to_json(col("metadata")))
-            .withColumn("
+            .withColumn("diss_note", F.to_json(col("diss_note")))
             .coalesce(1)
         )
 
```
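Worth noting for anyone reproducing this export step: `metadata` and `diss_note` are non-atomic columns (struct/array), which flat-file writers such as CSV reject, so they are serialized to JSON text first, and `coalesce(1)` collapses the result into a single output file. A self-contained sketch (the output path and format are hypothetical):

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, ["note a", "note b"])], ["id", "diss_note"])

# Serialize the array<string> to a JSON string, then force a single partition
out = df.withColumn("diss_note", F.to_json(F.col("diss_note"))).coalesce(1)
out.write.mode("overwrite").csv("/tmp/silver_export")  # hypothetical path/format
```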
```diff
@@ -551,6 +645,13 @@ class SWSSilverIcebergSparkHelper:
             table=self.iceberg_tables.SILVER.table,
             path=self.iceberg_tables.SILVER.path,
             structure={"columns": df.schema.jsonValue()["fields"]},
+            pinned_columns=[
+                *self.dim_columns_w_time,
+                "value",
+                *self.flag_columns,
+                "diss_flag",
+                "diss_note",
+            ],
         )
         tag = upsert_disseminated_table(
             sws_tags=tags,
```
sws_spark_dissemination_helper/__init__.py

```diff
@@ -2,3 +2,4 @@ from .SWSPostgresSparkReader import SWSPostgresSparkReader
 from .SWSBronzeIcebergSparkHelper import SWSBronzeIcebergSparkHelper
 from .SWSSilverIcebergSparkHelper import SWSSilverIcebergSparkHelper
 from .SWSGoldIcebergSparkHelper import SWSGoldIcebergSparkHelper
+from .SWSEasyIcebergSparkHelper import SWSEasyIcebergSparkHelper
```
sws_spark_dissemination_helper/constants.py

```diff
@@ -1,3 +1,5 @@
+from typing import List
+
 from pyspark.sql.functions import col, lit
 
 SPARK_POSTGRES_DRIVER = "org.postgresql.Driver"
```
```diff
@@ -34,26 +36,70 @@ class DomainFilters:
 class DatasetDatatables:
 
     class __SWSDatatable:
-        def __init__(
+        def __init__(
+            self, id: str, name: str, schema: str, join_columns: List[str] = []
+        ):
             self.id = id
+            self.iceberg_id = f"{IcebergDatabases.BRONZE_DATABASE}.{id.split('.')[1]}"
             self.name = name
             self.schema = schema
+            self.join_columns = join_columns
+
+    # Aggregation Tables
+    AGGREGATES_COMPOSITION = __SWSDatatable(
+        id="datatables.aggregates_composition",
+        name="Aggregation - Composition",
+        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, aggregation_type STRING, group_code STRING, child_code STRING, group_name STRING, child_name STRING, link_code STRING, factor STRING",
+    )
+    AGGREGATES_ELEMENTS = __SWSDatatable(
+        id="datatables.aggregates_elements",
+        name="Aggregation - Aggregates per elements",
+        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, element STRING, aggregation_type STRING, code STRING",
+    )
 
     # Dissemination Tables
     DISSEMINATION_TYPE_LIST = __SWSDatatable(
         id="datatables.dissemination_{type}_list",
         name="Dissemination - {type} list",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
+        join_columns=["domain", "code"],
     )
     DISSEMINATION_EXCEPTIONS = __SWSDatatable(
         id="datatables.dissemination_exception",
         name="Dissemination - Exceptions",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, dim1_code STRING, dim2_code STRING, dim3_code STRING, dim4_code STRING, dim5_code STRING, dim6_code STRING, dim7_code STRING, status_flag STRING, method_flag STRING, dissemination BOOLEAN, aggregation BOOLEAN, note STRING",
+        join_columns=[
+            "domain",
+            " dim1_code",
+            " dim2_code",
+            " dim3_code",
+            " dim4_code",
+            " dim5_code",
+            " dim6_code",
+            " dim7_code",
+            " status_flag",
+            " method_flag",
+        ],
+    )
+    DISPLAY_DECIMALS = __SWSDatatable(
+        id="datatables.display_decimals",
+        name="Dissemination - Display Decimals",
+        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, column_1_name STRING, column_1_value STRING, column_2_name STRING, column_2_value STRING, display_decimals STRING",
+        join_columns=[
+            "domain",
+            "column_1_name",
+            "column_1_value",
+            "column_2_name",
+            "column_2_value",
+            "display_decimals",
+        ],
     )
+    # TODO Deprecate
     DISSEMINATION_ITEM_LIST_FAOSTAT = __SWSDatatable(
         id="datatables.dissemination_item_list_faostat",
         name="Dissemination - Item list - FAOSTAT",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
+        join_columns=["domain", "code"],
     )
 
     # Mapping Tables
```
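The constructor change is the heart of this hunk: every datatable descriptor now derives a bronze `iceberg_id` from the part of its `id` after `datatables.`, and carries the `join_columns` used to match it against observations. A standalone replica for illustration (the database name is a stand-in, and `Optional` replaces the mutable `[]` default, which Python shares across instances):

```python
from typing import List, Optional

BRONZE_DATABASE = "bronze_db"  # stand-in for IcebergDatabases.BRONZE_DATABASE

class SWSDatatable:
    """Illustrative replica of constants.DatasetDatatables.__SWSDatatable."""

    def __init__(
        self, id: str, name: str, schema: str,
        join_columns: Optional[List[str]] = None,
    ):
        self.id = id
        # "datatables.display_decimals" -> "bronze_db.display_decimals"
        self.iceberg_id = f"{BRONZE_DATABASE}.{id.split('.')[1]}"
        self.name = name
        self.schema = schema
        # None-default avoids one shared list across all instances
        self.join_columns = join_columns if join_columns is not None else []

dd = SWSDatatable(
    id="datatables.display_decimals",
    name="Dissemination - Display Decimals",
    schema="domain STRING, display_decimals STRING",
    join_columns=["domain"],
)
assert dd.iceberg_id == "bronze_db.display_decimals"
```

One detail a reviewer may want to flag: the `join_columns` entries of `DISSEMINATION_EXCEPTIONS` above carry leading spaces (`" dim1_code"` and so on), which would not match real column names if compared verbatim.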
```diff
@@ -61,34 +107,23 @@ class DatasetDatatables:
         id="datatables.aggregates_mapping_domains_id",
         name="Mapping - Domains ID",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, domain_name STRING, sws_source_id STRING, sws_destination_id STRING",
+        join_columns=["domain", "sws_source_id"],
     )
     MAPPING_CODELIST_TYPE = __SWSDatatable(
         id="datatables.mapping_codelist_type",
         name="Mapping Codelist type",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, col_name STRING, col_type STRING",
+        join_columns=["domain", "col_name"],
     )
     MAPPING_CODE_CORRECTION = __SWSDatatable(
         id="datatables.aggregates_mapping_code_correction",
         name="Mapping - Code correction",
         schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, old_code STRING, new_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-
-    MAPPING_SDMX_COLUMN_NAMES = __SWSDatatable(
-        id="datatables.mapping_sdmx_col_names",
-        name="Mapping - SDMX column names",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_name STRING, external_name STRING, delete BOOLEAN, add BOOLEAN, default_value STRING",
-    )
-    MAPPING_SDMX_CODES = __SWSDatatable(
-        id="datatables.mapping_pre_dissemination",
-        name="Mapping - Pre dissemination",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_code STRING, external_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-    MAPPING_UNITS_OF_MEASURE = __SWSDatatable(
-        id="datatables.mapping_units_of_measure",
-        name="Mapping - Units of measure",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, sws_code STRING, sws_multiplier INT, sdmx_code STRING, sdmx_multiplier INT, value_multiplier INT, delete BOOLEAN, mapping_type STRING",
+        join_columns=["domain", "old_code", "var_type", "mapping_type"],
     )
 
     # Non-SWS Sources Tables
+    # TODO To deprecate
     FAOSTAT_CODE_MAPPING = __SWSDatatable(
         id="datatables.faostat_code_mapping",
         name="FAOSTAT Code Mapping",
```
```diff
@@ -150,6 +185,11 @@ class DatasetTables:
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
             schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
         )
+        self.TAG_OBSERVATION = self.__SWSTable(
+            postgres_id=f"{self.__dataset_id}.tag_observation",
+            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_tag_observation",
+            schema="tag BIGINT, observation INT",
+        )
 
         # Reference data
         self.CODELISTS = [
```
```diff
@@ -181,18 +221,21 @@ class DatasetTables:
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
         schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
     )
-
     LANGUAGE = __SWSTable(
         postgres_id="reference_data.language",
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
         schema="id INT, country_code STRING, description STRING",
     )
-
     UNIT_OF_MEASURE = __SWSTable(
         postgres_id="reference_data.unit_of_measure",
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
         schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
     )
+    DATASET = __SWSTable(
+        postgres_id="reference_data.dataset",
+        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.dataset",
+        schema="id INT, xml_name STRING",
+    )
 
     # Operational data
     USER = __SWSTable(
```
```diff
@@ -200,6 +243,11 @@ class DatasetTables:
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
         schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
     )
+    TAG = __SWSTable(
+        postgres_id="operational_data.tag",
+        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.tag",
+        schema="id INT, name STRING, reference_date DATE, dataset INT, type STRING, released_ON DATE, released_by INT, properties STRING",
+    )
 
 
 class IcebergTable:
```
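Each `__SWSTable` pairs a Postgres source (`postgres_id`) with an Iceberg staging target (`iceberg_id`). The loading itself lives in `SWSPostgresSparkReader`, whose internals this diff does not show; below is a generic sketch of the equivalent plain-Spark step for the new `TAG` table, with placeholder connection details:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

tag_df = (
    spark.read.format("jdbc")
    .option("driver", "org.postgresql.Driver")  # SPARK_POSTGRES_DRIVER in constants.py
    .option("url", "jdbc:postgresql://db-host:5432/sws")  # placeholder
    .option("dbtable", "operational_data.tag")  # the TAG postgres_id
    .option("user", "sws_reader")  # placeholder
    .option("password", "***")  # placeholder
    .load()
)

# Stage into the Iceberg target named by iceberg_id ("staging_db" stands in
# for IcebergDatabases.STAGING_DATABASE)
tag_df.writeTo("staging_db.tag").createOrReplace()
```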
```diff
@@ -218,24 +266,44 @@ class IcebergTables:
         self.__dataset_id = dataset_id
         self.__tag_name = tag_name
 
-
-        self.
+        # TODO Fix later with a more appropriate DATABASE
+        self.DENORMALIZED_OBSERVATION = self.create_iceberg_table(
+            "BRONZE", suffix="denormalized_observation"
+        )
+        self.DENORMALIZED_METADATA = self.create_iceberg_table(
+            "BRONZE", suffix="denormalized_metadata"
+        )
+        self.GROUPED_METADATA = self.create_iceberg_table(
+            "BRONZE", suffix="grouped_metadata"
+        )
+        self.TABLE = self.create_iceberg_table("BRONZE")
+        self.TABLE_FILTERED = self.create_iceberg_table("BRONZE", suffix="filtered")
+        self.BRONZE = self.create_iceberg_table("BRONZE")
+        self.BRONZE_DISS_TAG = self.create_iceberg_table("BRONZE", suffix="diss_tag")
+        self.SILVER = self.create_iceberg_table("SILVER", prefix=domain)
 
         # GOLD tables with specific suffixes
-        self.
+        self.GOLD_SWS = self.create_iceberg_table("GOLD", prefix=domain, suffix="sws")
+        self.GOLD_SDMX = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sdmx_disseminated"
         )
-        self.GOLD_SWS_VALIDATED = self.
+        self.GOLD_SWS_VALIDATED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sws_validated"
         )
-        self.GOLD_SWS_DISSEMINATED = self.
+        self.GOLD_SWS_DISSEMINATED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sws_disseminated"
         )
-        self.GOLD_PRE_SDMX = self.
+        self.GOLD_PRE_SDMX = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="pre_sdmx"
         )
+        self.GOLD_FAOSTAT = self.create_iceberg_table(
+            "GOLD", prefix=domain, suffix="faostat"
+        )
+        self.GOLD_FAOSTAT_UNFILTERED = self.create_iceberg_table(
+            "GOLD", prefix=domain, suffix="faostat_unfiltered"
+        )
 
-    def
+    def create_iceberg_table(
         self, level: str, prefix: str = "", suffix: str = ""
     ) -> IcebergTable:
         database = getattr(IcebergDatabases, f"{level}_DATABASE")
```
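Only the new `create_iceberg_table` signature and its first line are visible in this hunk: the method resolves the target database from `IcebergDatabases` by level, and the call sites show it combining an optional domain prefix and a role suffix. As a rough mental model only (the name composition below is an assumption, not the package's actual logic, and the real method returns an `IcebergTable` object rather than a string):

```python
class IcebergDatabases:  # stand-in values
    BRONZE_DATABASE = "bronze_db"
    SILVER_DATABASE = "silver_db"
    GOLD_DATABASE = "gold_db"

def create_iceberg_table(
    level: str, dataset_id: str, tag_name: str, prefix: str = "", suffix: str = ""
) -> str:
    # This getattr dispatch is the one body line the diff actually shows
    database = getattr(IcebergDatabases, f"{level}_DATABASE")
    # Assumed composition: join the non-empty parts into a table name
    table = "_".join(p for p in (prefix, dataset_id, tag_name, suffix) if p)
    return f"{database}.{table}"

# e.g. a GOLD "sws" table for domain "qcl":
print(create_iceberg_table("GOLD", "aproduction", "release_2024",
                           prefix="qcl", suffix="sws"))
# -> gold_db.qcl_aproduction_release_2024_sws
```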
sws_spark_dissemination_helper/utils.py

```diff
@@ -363,16 +363,34 @@ def map_codes_and_remove_null_duplicates(
         "diss_flag", F.when(col("delete"), lit(False)).otherwise(col("diss_flag"))
     )
     .withColumn(
-        "
+        "diss_note",
         F.when(
             col("delete"),
             F.array_append(
-                col("
+                col("diss_note"),
                 lit(
                     f"The observation is not disseminated according to the Mapping - Code correction table"
                 ),
             ),
-        ).otherwise(col("
+        ).otherwise(col("diss_note")),
+    )
+    # Add mapping message to notes
+    .withColumn(
+        "diss_note",
+        F.when(
+            ~col("is_duplicate")
+            & col("new_dim_code").isNotNull()
+            & (col("new_dim_code") != lit("")),
+            F.array_append(
+                col("diss_note"),
+                F.concat(
+                    lit(f"Dimension {col_name} code was changed from "),
+                    col(col_name),
+                    lit(" to "),
+                    col("new_dim_code"),
+                ),
+            ),
+        ).otherwise(col("diss_note")),
     )
     .withColumn(
         col_name,
```
```diff
@@ -391,18 +409,18 @@ def map_codes_and_remove_null_duplicates(
         ).otherwise(col("diss_flag")),
     )
     .withColumn(
-        "
+        "diss_note",
         F.when(
             col("is_duplicate")
             & col("new_dim_code").isNotNull()
             & (col("new_dim_code") != lit("")),
             F.array_append(
-                col("
+                col("diss_note"),
                 lit(
                     f"The code correction was not applied to avoid observation duplications"
                 ),
             ),
-        ).otherwise(col("
+        ).otherwise(col("diss_note")),
     )
     # Check the domain specific multiplier first and then the standard multiplier
     .withColumn("value", col("value") * F.coalesce(col("multiplier"), lit(1)))
```
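The last line is easy to misread: `F.coalesce` here is the SQL function (first non-null argument), not the partition-count `DataFrame.coalesce`, so rows without a domain-specific multiplier keep their value unchanged. A toy check:

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(100.0, 0.001), (250.0, None)], ["value", "multiplier"])

# Missing multiplier falls back to 1, leaving the value untouched
df = df.withColumn("value", col("value") * F.coalesce(col("multiplier"), lit(1)))
# rows become value=0.1 and value=250.0
```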
{sws_spark_dissemination_helper-0.0.93.dist-info → sws_spark_dissemination_helper-0.0.183.dist-info}/METADATA

```diff
@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.93
+Version: 0.0.183
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
-Project-URL: Repository, https://
+Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
 License: MIT License
 
```
```diff
@@ -31,27 +31,27 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.9
 Requires-Dist: annotated-types==0.7.0
-Requires-Dist: boto3
-Requires-Dist: botocore
+Requires-Dist: boto3>=1.40.0
+Requires-Dist: botocore>=1.40.0
 Requires-Dist: certifi==2025.1.31
 Requires-Dist: charset-normalizer==3.4.1
-Requires-Dist: idna
+Requires-Dist: idna>=3.10
 Requires-Dist: jmespath==1.0.1
 Requires-Dist: numpy==2.0.2
-Requires-Dist: pandas==2.
+Requires-Dist: pandas==2.3.3
 Requires-Dist: py4j==0.10.9.7
 Requires-Dist: pydantic-core==2.27.2
 Requires-Dist: pydantic==2.10.6
 Requires-Dist: pyspark==3.5.4
 Requires-Dist: python-dateutil==2.9.0.post0
 Requires-Dist: python-dotenv==0.19.2
-Requires-Dist: pytz==2025.
+Requires-Dist: pytz==2025.2
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3transfer
+Requires-Dist: s3transfer>=0.11.2
 Requires-Dist: six==1.17.0
-Requires-Dist: sws-api-client==
-Requires-Dist: typing-extensions
-Requires-Dist: tzdata==2025.
+Requires-Dist: sws-api-client==2.3.0
+Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: tzdata==2025.2
 Requires-Dist: urllib3==1.26.20
 Description-Content-Type: text/markdown
 
```
sws_spark_dissemination_helper-0.0.183.dist-info/RECORD

```diff
@@ -0,0 +1,13 @@
+sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
+sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
+sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=atQFiY5Mmo-rzHY7WVWg-Guvg8i1ZcaaoKE4ymTaKdE,27750
+sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=V_rH4UYoFZfMUc82U-KxeL_o8F44HnMHfLLXoyNxHxs,20016
+sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=3l5zkEWksnEC-R4mJi8JEHL3ylCMbkMD9a0qbdZQU5E,26345
+sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
+sws_spark_dissemination_helper/constants.py,sha256=cVjTS3xbJNKz-1i7c1dJk2PcOZzQhvuHUp9i4PNIPh4,14055
+sws_spark_dissemination_helper/utils.py,sha256=Ge8zXsUIcvFihALDNLF5kCu_tAdRQUE04xE6Yn9xQF4,22008
+sws_spark_dissemination_helper-0.0.183.dist-info/METADATA,sha256=LDVmzDL6ZDhGrRBd3flpX0TPEIJONpdZJodUGrAvemw,2822
+sws_spark_dissemination_helper-0.0.183.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+sws_spark_dissemination_helper-0.0.183.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+sws_spark_dissemination_helper-0.0.183.dist-info/RECORD,,
```
sws_spark_dissemination_helper-0.0.93.dist-info/RECORD

```diff
@@ -1,11 +0,0 @@
-sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=ZPCpHgPVCsf7-7tWl6DDWgnXLkS02RoCvsomO3TmQ24,20418
-sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=ZC7hxkppo6qmfCc2z5vm2Y2iH1901F-rx9Er9cxuzP4,16037
-sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=ja7AbOfbmC_EXHCJk7UMDzzbA-LRxzPkaaUmuvcihJ8,17449
-sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=zEppNq5shiHZH2yt5faWGsb5QEmpAQS0ToIrG6fmv6o,22231
-sws_spark_dissemination_helper/__init__.py,sha256=Efjoe9V4vGXWVp-DY5P6NbRwIUr_zkZJkDmMi-lf5Bc,262
-sws_spark_dissemination_helper/constants.py,sha256=hpHHlbojShMWRfyIelXz6c5BqFzO48Oap1zmztlMMrs,11349
-sws_spark_dissemination_helper/utils.py,sha256=6SzrXX0xhvynRyv-vRFDbc6V4UNe_RzKKETZAtefnhg,21341
-sws_spark_dissemination_helper-0.0.93.dist-info/METADATA,sha256=y1PL3ZygwfoyBxglsrNeP6IZvaUGTYCM03RuIjrqDMc,2823
-sws_spark_dissemination_helper-0.0.93.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-sws_spark_dissemination_helper-0.0.93.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
-sws_spark_dissemination_helper-0.0.93.dist-info/RECORD,,
```