sws-spark-dissemination-helper 0.0.60__py3-none-any.whl → 0.0.171__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,12 +4,13 @@ from typing import List
 
  import boto3
  import pyspark.sql.functions as F
- from pyspark.sql import DataFrame, SparkSession, Column
+ from pyspark.sql import Column, DataFrame, SparkSession
  from pyspark.sql.functions import col, lit
+ from pyspark.sql.window import Window
  from sws_api_client import Tags
- from sws_api_client.tags import DisseminatedTag, BaseDisseminatedTagTable
+ from sws_api_client.tags import BaseDisseminatedTagTable, DisseminatedTag
 
- from .constants import DomainFilters, DatasetDatatables
+ from .constants import DatasetDatatables, DomainFilters
 
 
  def get_spark() -> SparkSession:
@@ -272,7 +273,7 @@ def check_sdmx_col_names_mappings(
      return df_mapping_sdmx_column_names_unique
 
 
- def map_codes(
+ def map_codes_and_remove_null_duplicates(
      df: DataFrame,
      df_mapping: DataFrame,
      domain_code: str,
@@ -280,98 +281,162 @@ def map_codes(
      col_type: str,
      src_column: str,
      dest_column: str,
+     dimension_columns: List[str],
+     flag_columns: List[str],
  ) -> DataFrame:
-     return (
+
+     lower_col_name = col_name.lower()
+     lower_flag_columns = [column.lower() for column in flag_columns]
+     lower_dimension_columns = [column.lower() for column in dimension_columns]
+
+     # Define partitioning columns
+     if lower_col_name in lower_flag_columns:
+         partition_columns = dimension_columns
+     else:
+         partition_columns = [
+             column for column in lower_dimension_columns if column != lower_col_name
+         ] + ["partition_column"]
+
+     partitioning_window = Window.partitionBy(*partition_columns)
+
+     standard_mapping_df = df_mapping.filter(
+         (col("domain").isNull() | (col("domain") == lit("")))
+         & (col("var_type") == lit(col_type))
+         & (col("mapping_type").isNull() | (col("mapping_type") == lit("")))
+     )
+     domain_mapping_df = df_mapping.filter(
+         (col("domain") == lit(domain_code))
+         & (col("var_type") == lit(col_type))
+         & (col("mapping_type").isNull() | (col("mapping_type") == lit("")))
+     )
+
+     count_all = df.count()
+
+     df_no_nulls = (
          df.alias("d")
          # Join the data with the standard mapping for the specific dimension
          .join(
-             F.broadcast(
-                 df_mapping.filter(
-                     (col("domain").isNull() | (col("domain") == lit("")))
-                     & (col("var_type") == lit(col_type))
-                     & (
-                         col("mapping_type").isNull()
-                         | (col("mapping_type").isNull() == lit(""))
-                     )
-                 )
-             ).alias("m_standard"),
+             F.broadcast(standard_mapping_df).alias("m_standard"),
              col(f"d.{col_name}") == col(f"m_standard.{src_column}"),
              "left",
          )
          # Join the data with the domain specific mapping for the specific dimension
          .join(
-             F.broadcast(
-                 df_mapping.filter(
-                     (col("domain") == lit(domain_code))
-                     & (col("var_type") == lit(col_type))
-                     & (
-                         col("mapping_type").isNull()
-                         | (col("mapping_type").isNull() == lit(""))
-                     )
-                 )
-             ).alias("m_domain"),
+             F.broadcast(domain_mapping_df).alias("m_domain"),
              col(f"d.{col_name}") == col(f"m_domain.{src_column}"),
              "left",
          )
-         # Select only the columns we are interested in (this step is optional but recommended for debugging)
          .select(
              "d.*",
-             col(f"m_standard.{dest_column}").alias(f"standard_{dest_column}"),
-             col("m_standard.delete").alias("standard_delete"),
-             col("m_standard.multiplier").alias("standard_multiplier"),
-             col(f"m_domain.{dest_column}").alias(f"domain_specific_{dest_column}"),
-             col("m_domain.delete").alias("domain_specific_delete"),
-             col("m_domain.multiplier").alias("domain_specific_multiplier"),
-         )
-         # Filter out records to delete
-         .filter(
-             # Evaluate first the domain specific flag
+             # Evaluate the domain specific rule first and then the general rule
+             F.coalesce(
+                 col(f"m_domain.{dest_column}"), col(f"m_standard.{dest_column}")
+             ).alias("new_dim_code"),
+             F.coalesce(
+                 col("m_domain.delete"),
+                 col("m_standard.delete"),
+                 lit(False),
+             ).alias("delete"),
+             F.coalesce(col("m_standard.multiplier"), col("m_domain.multiplier")).alias(
+                 "multiplier"
+             ),
+         )
+         .withColumn("partition_column", F.coalesce(col("new_dim_code"), col(col_name)))
+         .withColumn("count_obs_per_point", F.count(lit(1)).over(partitioning_window))
+         .withColumn("is_duplicate", col("count_obs_per_point") > lit(1))
+         # Filter out all the rows that are duplicates with null value
+         .filter(~(col("is_duplicate") & col("value").isNull()))
+     )
+
+     count_no_null_dupes = df_no_nulls.count()
+     null_dupes_removed = count_all - count_no_null_dupes
+
+     logging.info(f"{null_dupes_removed} duplicates with null value removed")
+
+     df_mapped = (
+         df_no_nulls
+         # Count again the observations per coordinate after removing the null duplicates
+         .withColumn("count_obs_per_point", F.count(lit(1)).over(partitioning_window))
+         .withColumn("is_duplicate", col("count_obs_per_point") > lit(1))
+         # Update the diss_flag to false for records to delete
+         .withColumn(
+             "diss_flag", F.when(col("delete"), lit(False)).otherwise(col("diss_flag"))
+         )
+         .withColumn(
+             "note",
+             F.when(
+                 col("delete"),
+                 F.array_append(
+                     col("note"),
+                     lit(
+                         f"The observation is not disseminated according to the Mapping - Code correction table"
+                     ),
+                 ),
+             ).otherwise(col("note")),
+         )
+         # Add mapping message to notes
+         .withColumn(
+             "note",
              F.when(
-                 col("domain_specific_delete").isNotNull(),
-                 ~col("domain_specific_delete"),
-             )
-             # Then evaluate the general flag
-             .when(
-                 col("standard_delete").isNotNull(), ~col("standard_delete")
-             ).otherwise(lit(True))
+                 ~col("is_duplicate")
+                 & col("new_dim_code").isNotNull()
+                 & (col("new_dim_code") != lit("")),
+                 F.array_append(
+                     col("note"),
+                     F.concat(
+                         lit(f"Dimension {col_name} code was changed from "),
+                         col(col_name),
+                         lit(" to "),
+                         col("new_dim_code"),
+                     ),
+                 ),
+             ).otherwise(col("note")),
          )
          .withColumn(
              col_name,
-             # Evaluate first the domain specific mapping
              F.when(
-                 col(f"domain_specific_{dest_column}").isNotNull(),
-                 col(f"domain_specific_{dest_column}"),
-             )
-             # Then evaluate the general mapping
-             .when(
-                 col(f"standard_{dest_column}").isNotNull(),
-                 col(f"standard_{dest_column}"),
+                 ~col("is_duplicate"),
+                 F.coalesce(col("new_dim_code"), col(col_name)),
              ).otherwise(col(col_name)),
          )
          .withColumn(
-             "value",
-             # Multiply first by the domain specific multiplier
+             "diss_flag",
+             F.when(
+                 col("is_duplicate")
+                 & col("new_dim_code").isNotNull()
+                 & (col("new_dim_code") != lit("")),
+                 lit(False),
+             ).otherwise(col("diss_flag")),
+         )
+         .withColumn(
+             "note",
              F.when(
-                 col("domain_specific_multiplier").isNotNull(),
-                 col("value") * col("domain_specific_multiplier"),
-             )
-             # Then multiply by the general multiplier
-             .when(
-                 col(f"standard_{dest_column}").isNotNull(),
-                 col("value") * col("standard_multiplier"),
-             ).otherwise(col("value")),
+                 col("is_duplicate")
+                 & col("new_dim_code").isNotNull()
+                 & (col("new_dim_code") != lit("")),
+                 F.array_append(
+                     col("note"),
+                     lit(
+                         f"The code correction was not applied to avoid observation duplications"
+                     ),
+                 ),
+             ).otherwise(col("note")),
          )
+         # Check the domain specific multiplier first and then the standard multiplier
+         .withColumn("value", col("value") * F.coalesce(col("multiplier"), lit(1)))
          # Remove the columns that were not in the original dataset
          .drop(
-             f"standard_{dest_column}",
-             "standard_delete",
-             "standard_multiplier",
-             f"domain_specific_{dest_column}",
-             "domain_specific_delete",
-             "domain_specific_multiplier",
+             "new_dim_code",
+             "delete",
+             "multiplier",
+             "partition_column",
+             "count_obs_per_point",
+             "is_duplicate",
          )
      )
 
+     return df_mapped
+
 
  def apply_code_correction(
      df: DataFrame,
@@ -381,7 +446,7 @@ def apply_code_correction(
      col_type: str,
  ) -> DataFrame:
      logging.info(f"correcting codes for column {col_name} of type {col_type}")
-     return map_codes(
+     return map_codes_and_remove_null_duplicates(
          df,
          df_mapping_code_correction,
          domain_code,
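
The substance of this release is the hunk above: `map_codes` becomes `map_codes_and_remove_null_duplicates`. The two broadcast-join mappings are now pre-filtered DataFrames combined per row with `F.coalesce` (the domain-specific rule wins, the standard rule is the fallback, which incidentally replaces the old `col("mapping_type").isNull() == lit("")` comparison that could never match), and a window over the post-mapping coordinate counts observations so that rows which collide after a code correction and carry a null `value` can be dropped. What follows is a minimal, self-contained sketch of that dedup pattern, not code taken from the package: the dimension names `geo` and `item`, the sample rows, and the single mapping table are invented for illustration.

    # Sketch of the window-based null-duplicate removal introduced above.
    # The real function also handles domain-specific rules, delete flags,
    # multipliers and notes; everything named here is example data.
    import pyspark.sql.functions as F
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, lit
    from pyspark.sql.window import Window

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # "IT" remaps to "ITA", so it collides with the existing "ITA" row.
    df = spark.createDataFrame(
        [("IT", "WHEAT", None), ("ITA", "WHEAT", 42.0), ("FR", "WHEAT", 7.0)],
        ["geo", "item", "value"],
    )
    mapping = spark.createDataFrame([("IT", "ITA")], ["src_code", "dest_code"])

    remapped = df.join(
        F.broadcast(mapping), col("geo") == col("src_code"), "left"
    ).withColumn("new_dim_code", F.coalesce(col("dest_code"), col("geo")))

    # Count observations per post-mapping coordinate: the remapped code
    # replaces the original dimension in the partition key.
    w = Window.partitionBy("new_dim_code", "item")
    deduped = (
        remapped.withColumn("count_obs_per_point", F.count(lit(1)).over(w))
        .withColumn("is_duplicate", col("count_obs_per_point") > lit(1))
        # Drop colliding rows that carry no value; keep the populated one.
        .filter(~(col("is_duplicate") & col("value").isNull()))
        .drop("src_code", "dest_code", "count_obs_per_point", "is_duplicate")
    )
    deduped.show()
    # Survivors: ("ITA", "WHEAT", 42.0) kept over the null "IT" row,
    # plus the untouched ("FR", "WHEAT", 7.0).

Running the count over a window instead of a groupBy keeps every original column on the row, which is what lets the real function go further than this sketch: duplicates that survive with a non-null value keep their original code, have `diss_flag` set to false, and get an explanatory note appended rather than being corrected into a collision.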
@@ -1,8 +1,8 @@
  Metadata-Version: 2.4
  Name: sws-spark-dissemination-helper
- Version: 0.0.60
+ Version: 0.0.171
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
- Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
+ Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
  License: MIT License
 
@@ -29,26 +29,30 @@ License-File: LICENSE
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Requires-Dist: annotated-types==0.7.0
- Requires-Dist: boto3==1.34.147
- Requires-Dist: botocore==1.34.147
- Requires-Dist: certifi==2024.7.4
- Requires-Dist: charset-normalizer==3.3.2
- Requires-Dist: idna==3.7
+ Requires-Dist: boto3>=1.40.75
+ Requires-Dist: botocore>=1.40.75
+ Requires-Dist: certifi==2025.1.31
+ Requires-Dist: charset-normalizer==3.4.1
+ Requires-Dist: idna>=3.10
  Requires-Dist: jmespath==1.0.1
+ Requires-Dist: numpy==2.0.2
+ Requires-Dist: pandas==2.3.3
  Requires-Dist: py4j==0.10.9.7
- Requires-Dist: pydantic-core==2.20.1
- Requires-Dist: pydantic==2.8.2
- Requires-Dist: pyspark==3.5.1
+ Requires-Dist: pydantic-core==2.27.2
+ Requires-Dist: pydantic==2.10.6
+ Requires-Dist: pyspark==3.5.4
  Requires-Dist: python-dateutil==2.9.0.post0
- Requires-Dist: python-dotenv==1.0.1
+ Requires-Dist: python-dotenv==0.19.2
+ Requires-Dist: pytz==2025.2
  Requires-Dist: requests==2.32.3
- Requires-Dist: s3transfer==0.10.2
- Requires-Dist: six==1.16.0
- Requires-Dist: sws-api-client==1.0.7b0
- Requires-Dist: typing-extensions==4.12.2
- Requires-Dist: urllib3==1.26.19
+ Requires-Dist: s3transfer>=0.11.2
+ Requires-Dist: six==1.17.0
+ Requires-Dist: sws-api-client==2.3.0
+ Requires-Dist: typing-extensions>=4.12.2
+ Requires-Dist: tzdata==2025.2
+ Requires-Dist: urllib3==1.26.20
  Description-Content-Type: text/markdown
 
  # Upload a new version
@@ -0,0 +1,13 @@
+ sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
+ sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
+ sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=0dxbVkrhdaASapEffF5PFcgKwAMyJoWBxzgymjZ4JyY,25049
+ sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=KpG8gp8Ai9pHDiKhUOTcXWxxmFGeKEE3XKlI_Y-SveU,18453
+ sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=qioLv3SlJEfk0LzTiwfXRtZXVImPOJUeh9k1XwHC-pA,26225
+ sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
+ sws_spark_dissemination_helper/constants.py,sha256=vQmalAqInwPAybgJOfYx99jn47KsKp8jeD8eqmjw-Rs,13471
+ sws_spark_dissemination_helper/utils.py,sha256=G7lQqNRrvqZpgm9WmddD7fWsI8IVn09x1p3cV3458EA,21963
+ sws_spark_dissemination_helper-0.0.171.dist-info/METADATA,sha256=W4qkQISSzekzXhpmNhlNMfJEmaQlscu3hQTs4Vavawg,2824
+ sws_spark_dissemination_helper-0.0.171.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ sws_spark_dissemination_helper-0.0.171.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
+ sws_spark_dissemination_helper-0.0.171.dist-info/RECORD,,
@@ -1,11 +0,0 @@
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=tyC3e2LNBes9J2UFR-j7bDlvEffeI0YsiYlMvk0wPxA,16382
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=43ZOaDtFdnuYHL9oMSh64F3YUiDDWbVHS5iuhrsE1C4,26074
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=wXSz4-SbIcfVfDsN5gsbg6ul5GvVoX59VkfjAmTCToo,14935
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=ZTpjkejKJpl6kue8DI1FVEJB-M7TlyUgrjXqF7GUEws,21978
- sws_spark_dissemination_helper/__init__.py,sha256=Efjoe9V4vGXWVp-DY5P6NbRwIUr_zkZJkDmMi-lf5Bc,262
- sws_spark_dissemination_helper/constants.py,sha256=KGfuudVovMxgzCIowe7L9gqDbHjbngqzhd3Zgowo5yk,11229
- sws_spark_dissemination_helper/utils.py,sha256=MLiQV1I-HJtc9gHHWn1mPiYCsfI_7bCMPv9GUDY6kO0,19768
- sws_spark_dissemination_helper-0.0.60.dist-info/METADATA,sha256=eNuM3tOPZQ1b0akwWtVqT43UrSlE7YFcTG8lSOHloQQ,2708
- sws_spark_dissemination_helper-0.0.60.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- sws_spark_dissemination_helper-0.0.60.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
- sws_spark_dissemination_helper-0.0.60.dist-info/RECORD,,