sws-spark-dissemination-helper 0.0.60__py3-none-any.whl → 0.0.171__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +380 -28
- sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
- sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +723 -0
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +331 -353
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py +110 -31
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +138 -23
- sws_spark_dissemination_helper/__init__.py +1 -0
- sws_spark_dissemination_helper/constants.py +76 -24
- sws_spark_dissemination_helper/utils.py +133 -68
- {sws_spark_dissemination_helper-0.0.60.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/METADATA +21 -17
- sws_spark_dissemination_helper-0.0.171.dist-info/RECORD +13 -0
- sws_spark_dissemination_helper-0.0.60.dist-info/RECORD +0 -11
- {sws_spark_dissemination_helper-0.0.60.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/WHEEL +0 -0
- {sws_spark_dissemination_helper-0.0.60.dist-info → sws_spark_dissemination_helper-0.0.171.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,13 @@ from typing import List
 
 import boto3
 import pyspark.sql.functions as F
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import Column, DataFrame, SparkSession
 from pyspark.sql.functions import col, lit
+from pyspark.sql.window import Window
 from sws_api_client import Tags
-from sws_api_client.tags import
+from sws_api_client.tags import BaseDisseminatedTagTable, DisseminatedTag
 
-from .constants import
+from .constants import DatasetDatatables, DomainFilters
 
 
 def get_spark() -> SparkSession:
@@ -272,7 +273,7 @@ def check_sdmx_col_names_mappings(
     return df_mapping_sdmx_column_names_unique
 
 
-def map_codes(
+def map_codes_and_remove_null_duplicates(
     df: DataFrame,
     df_mapping: DataFrame,
     domain_code: str,
@@ -280,98 +281,162 @@ def map_codes(
     col_type: str,
     src_column: str,
     dest_column: str,
+    dimension_columns: List[str],
+    flag_columns: List[str],
 ) -> DataFrame:
-
+
+    lower_col_name = col_name.lower()
+    lower_flag_columns = [column.lower() for column in flag_columns]
+    lower_dimension_columns = [column.lower() for column in dimension_columns]
+
+    # Define partitioning columns
+    if lower_col_name in lower_flag_columns:
+        partition_columns = dimension_columns
+    else:
+        partition_columns = [
+            column for column in lower_dimension_columns if column != lower_col_name
+        ] + ["partition_column"]
+
+    partitioning_window = Window.partitionBy(*partition_columns)
+
+    standard_mapping_df = df_mapping.filter(
+        (col("domain").isNull() | (col("domain") == lit("")))
+        & (col("var_type") == lit(col_type))
+        & (col("mapping_type").isNull() | (col("mapping_type") == lit("")))
+    )
+    domain_mapping_df = df_mapping.filter(
+        (col("domain") == lit(domain_code))
+        & (col("var_type") == lit(col_type))
+        & (col("mapping_type").isNull() | (col("mapping_type") == lit("")))
+    )
+
+    count_all = df.count()
+
+    df_no_nulls = (
         df.alias("d")
         # Join the data with the standard mapping for the specific dimension
         .join(
-            F.broadcast(
-                df_mapping.filter(
-                    (col("domain").isNull() | (col("domain") == lit("")))
-                    & (col("var_type") == lit(col_type))
-                    & (
-                        col("mapping_type").isNull()
-                        | (col("mapping_type").isNull() == lit(""))
-                    )
-                )
-            ).alias("m_standard"),
+            F.broadcast(standard_mapping_df).alias("m_standard"),
             col(f"d.{col_name}") == col(f"m_standard.{src_column}"),
             "left",
         )
         # Join the data with the domain specific mapping for the specific dimension
         .join(
-            F.broadcast(
-                df_mapping.filter(
-                    (col("domain") == lit(domain_code))
-                    & (col("var_type") == lit(col_type))
-                    & (
-                        col("mapping_type").isNull()
-                        | (col("mapping_type").isNull() == lit(""))
-                    )
-                )
-            ).alias("m_domain"),
+            F.broadcast(domain_mapping_df).alias("m_domain"),
             col(f"d.{col_name}") == col(f"m_domain.{src_column}"),
             "left",
         )
-        # Select only the columns we are interested in (this step is optional but recommended for debugging)
         .select(
             "d.*",
-
-
-
-
-
-
-
-
-
-
+            # Evaluate the domain specific rule first and then the general rule
+            F.coalesce(
+                col(f"m_domain.{dest_column}"), col(f"m_standard.{dest_column}")
+            ).alias("new_dim_code"),
+            F.coalesce(
+                col("m_domain.delete"),
+                col("m_standard.delete"),
+                lit(False),
+            ).alias("delete"),
+            F.coalesce(col("m_standard.multiplier"), col("m_domain.multiplier")).alias(
+                "multiplier"
+            ),
+        )
+        .withColumn("partition_column", F.coalesce(col("new_dim_code"), col(col_name)))
+        .withColumn("count_obs_per_point", F.count(lit(1)).over(partitioning_window))
+        .withColumn("is_duplicate", col("count_obs_per_point") > lit(1))
+        # Filter out all the rows that are duplicates with null value
+        .filter(~(col("is_duplicate") & col("value").isNull()))
+    )
+
+    count_no_null_dupes = df_no_nulls.count()
+    null_dupes_removed = count_all - count_no_null_dupes
+
+    logging.info(f"{null_dupes_removed} duplicates with null value removed")
+
+    df_mapped = (
+        df_no_nulls
+        # Count again the observations per coordinate after removing the null duplicates
+        .withColumn("count_obs_per_point", F.count(lit(1)).over(partitioning_window))
+        .withColumn("is_duplicate", col("count_obs_per_point") > lit(1))
+        # Update the diss_flag to false for records to delete
+        .withColumn(
+            "diss_flag", F.when(col("delete"), lit(False)).otherwise(col("diss_flag"))
+        )
+        .withColumn(
+            "note",
+            F.when(
+                col("delete"),
+                F.array_append(
+                    col("note"),
+                    lit(
+                        f"The observation is not disseminated according to the Mapping - Code correction table"
+                    ),
+                ),
+            ).otherwise(col("note")),
+        )
+        # Add mapping message to notes
+        .withColumn(
+            "note",
             F.when(
-                col("
-
-
-
-
-
+                ~col("is_duplicate")
+                & col("new_dim_code").isNotNull()
+                & (col("new_dim_code") != lit("")),
+                F.array_append(
+                    col("note"),
+                    F.concat(
+                        lit(f"Dimension {col_name} code was changed from "),
+                        col(col_name),
+                        lit(" to "),
+                        col("new_dim_code"),
+                    ),
+                ),
+            ).otherwise(col("note")),
         )
         .withColumn(
             col_name,
-            # Evaluate first the domain specific mapping
             F.when(
-                col(
-                col(
-                )
-                # Then evaluate the general mapping
-                .when(
-                    col(f"standard_{dest_column}").isNotNull(),
-                    col(f"standard_{dest_column}"),
+                ~col("is_duplicate"),
+                F.coalesce(col("new_dim_code"), col(col_name)),
             ).otherwise(col(col_name)),
         )
         .withColumn(
-            "
-
+            "diss_flag",
+            F.when(
+                col("is_duplicate")
+                & col("new_dim_code").isNotNull()
+                & (col("new_dim_code") != lit("")),
+                lit(False),
+            ).otherwise(col("diss_flag")),
+        )
+        .withColumn(
+            "note",
             F.when(
-                col("
-                col("
-
-
-
-
-
-
+                col("is_duplicate")
+                & col("new_dim_code").isNotNull()
+                & (col("new_dim_code") != lit("")),
+                F.array_append(
+                    col("note"),
+                    lit(
+                        f"The code correction was not applied to avoid observation duplications"
+                    ),
+                ),
+            ).otherwise(col("note")),
         )
+        # Check the domain specific multiplier first and then the standard multiplier
+        .withColumn("value", col("value") * F.coalesce(col("multiplier"), lit(1)))
         # Remove the columns that were not in the original dataset
         .drop(
-
-            "
-            "
-
-            "
-            "
+            "new_dim_code",
+            "delete",
+            "multiplier",
+            "partition_column",
+            "count_obs_per_point",
+            "is_duplicate",
         )
     )
 
+    return df_mapped
+
 
 def apply_code_correction(
     df: DataFrame,
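The heart of the new map_codes_and_remove_null_duplicates logic above is a window-based duplicate check: observations are counted per coordinate over partitioning_window, rows with a count greater than one are flagged as duplicates, and duplicate rows whose value is null are dropped before the code correction is applied. A minimal, self-contained sketch of that pattern follows; the sample data and the geo/year/value column names are hypothetical and only illustrate the technique, they are not part of the package:

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, lit
    from pyspark.sql.window import Window

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # Two rows share the ("IT", "2020") coordinate; one of them carries a null value.
    df = spark.createDataFrame(
        [("IT", "2020", 1.0), ("IT", "2020", None), ("FR", "2020", 2.0)],
        ["geo", "year", "value"],
    )

    partitioning_window = Window.partitionBy("geo", "year")

    deduped = (
        df.withColumn("count_obs_per_point", F.count(lit(1)).over(partitioning_window))
        .withColumn("is_duplicate", col("count_obs_per_point") > lit(1))
        # Drop only rows that are duplicates of a coordinate and carry a null value
        .filter(~(col("is_duplicate") & col("value").isNull()))
        .drop("count_obs_per_point", "is_duplicate")
    )

    deduped.show()  # the null-valued ("IT", "2020") row is gone, the other two rows remain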
@@ -381,7 +446,7 @@ def apply_code_correction(
     col_type: str,
 ) -> DataFrame:
     logging.info(f"correcting codes for column {col_name} of type {col_type}")
-    return map_codes(
+    return map_codes_and_remove_null_duplicates(
         df,
         df_mapping_code_correction,
         domain_code,
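apply_code_correction is now a thin wrapper around map_codes_and_remove_null_duplicates, so the correction itself still comes from left-joining the data to the broadcast mapping table twice and letting the domain-specific row take precedence over the standard one via F.coalesce. A small sketch of that precedence rule under the same approach; the item/src_code/dest_code columns and the "QCL" domain code are made up for illustration and are not the package's API:

    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    data = spark.createDataFrame([("0111",), ("0112",)], ["item"])
    mapping = spark.createDataFrame(
        [("", "0111", "0111.01"), ("QCL", "0111", "0111.99")],
        ["domain", "src_code", "dest_code"],
    )

    standard_mapping_df = mapping.filter(col("domain") == "")
    domain_mapping_df = mapping.filter(col("domain") == "QCL")

    mapped = (
        data.alias("d")
        # Standard mapping applies everywhere, domain mapping only to the current domain
        .join(
            F.broadcast(standard_mapping_df).alias("m_standard"),
            col("d.item") == col("m_standard.src_code"),
            "left",
        )
        .join(
            F.broadcast(domain_mapping_df).alias("m_domain"),
            col("d.item") == col("m_domain.src_code"),
            "left",
        )
        # Domain-specific destination wins, then the standard one, then the original code
        .select(
            "d.item",
            F.coalesce(
                col("m_domain.dest_code"), col("m_standard.dest_code"), col("d.item")
            ).alias("new_item"),
        )
    )

    mapped.show()  # "0111" maps to "0111.99"; "0112" keeps its original code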
@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.60
+Version: 0.0.171
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
-Project-URL: Repository, https://
+Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
 License: MIT License
 
@@ -29,26 +29,30 @@ License-File: LICENSE
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.
+Requires-Python: >=3.9
 Requires-Dist: annotated-types==0.7.0
-Requires-Dist: boto3
-Requires-Dist: botocore
-Requires-Dist: certifi==
-Requires-Dist: charset-normalizer==3.
-Requires-Dist: idna
+Requires-Dist: boto3>=1.40.75
+Requires-Dist: botocore>=1.40.75
+Requires-Dist: certifi==2025.1.31
+Requires-Dist: charset-normalizer==3.4.1
+Requires-Dist: idna>=3.10
 Requires-Dist: jmespath==1.0.1
+Requires-Dist: numpy==2.0.2
+Requires-Dist: pandas==2.3.3
 Requires-Dist: py4j==0.10.9.7
-Requires-Dist: pydantic-core==2.
-Requires-Dist: pydantic==2.
-Requires-Dist: pyspark==3.5.
+Requires-Dist: pydantic-core==2.27.2
+Requires-Dist: pydantic==2.10.6
+Requires-Dist: pyspark==3.5.4
 Requires-Dist: python-dateutil==2.9.0.post0
-Requires-Dist: python-dotenv==
+Requires-Dist: python-dotenv==0.19.2
+Requires-Dist: pytz==2025.2
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3transfer
-Requires-Dist: six==1.
-Requires-Dist: sws-api-client==
-Requires-Dist: typing-extensions
-Requires-Dist:
+Requires-Dist: s3transfer>=0.11.2
+Requires-Dist: six==1.17.0
+Requires-Dist: sws-api-client==2.3.0
+Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: tzdata==2025.2
+Requires-Dist: urllib3==1.26.20
 Description-Content-Type: text/markdown
 
 # Upload a new version
@@ -0,0 +1,13 @@
+sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
+sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
+sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=0dxbVkrhdaASapEffF5PFcgKwAMyJoWBxzgymjZ4JyY,25049
+sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
+sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
+sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
+sws_spark_dissemination_helper/constants.py,sha256=vQmalAqInwPAybgJOfYx99jn47KsKp8jeD8eqmjw-Rs,13471
+sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
+sws_spark_dissemination_helper-0.0.171.dist-info/METADATA,sha256=W4qkQISSzekzXhpmNhlNMfJEmaQlscu3hQTs4Vavawg,2824
+sws_spark_dissemination_helper-0.0.171.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+sws_spark_dissemination_helper-0.0.171.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+sws_spark_dissemination_helper-0.0.171.dist-info/RECORD,,
@@ -1,11 +0,0 @@
-sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=tyC3e2LNBes9J2UFR-j7bDlvEffeI0YsiYlMvk0wPxA,16382
-sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=43ZOaDtFdnuYHL9oMSh64F3YUiDDWbVHS5iuhrsE1C4,26074
-sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=wXSz4-SbIcfVfDsN5gsbg6ul5GvVoX59VkfjAmTCToo,14935
-sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=ZTpjkejKJpl6kue8DI1FVEJB-M7TlyUgrjXqF7GUEws,21978
-sws_spark_dissemination_helper/__init__.py,sha256=Efjoe9V4vGXWVp-DY5P6NbRwIUr_zkZJkDmMi-lf5Bc,262
-sws_spark_dissemination_helper/constants.py,sha256=KGfuudVovMxgzCIowe7L9gqDbHjbngqzhd3Zgowo5yk,11229
-sws_spark_dissemination_helper/utils.py,sha256=MLiQV1I-HJtc9gHHWn1mPiYCsfI_7bCMPv9GUDY6kO0,19768
-sws_spark_dissemination_helper-0.0.60.dist-info/METADATA,sha256=eNuM3tOPZQ1b0akwWtVqT43UrSlE7YFcTG8lSOHloQQ,2708
-sws_spark_dissemination_helper-0.0.60.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-sws_spark_dissemination_helper-0.0.60.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
-sws_spark_dissemination_helper-0.0.60.dist-info/RECORD,,

File without changes