sws-spark-dissemination-helper 0.0.164.tar.gz → 0.0.173.tar.gz

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (16)
  1. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/PKG-INFO +10 -10
  2. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/pyproject.toml +10 -10
  3. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +62 -33
  4. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +1 -1
  5. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +79 -1
  6. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/constants.py +16 -16
  7. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/utils.py +9 -9
  8. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/.gitignore +0 -0
  9. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/LICENSE +0 -0
  10. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/README.md +0 -0
  11. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/tests/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.164 → sws_spark_dissemination_helper-0.0.173}/tests/test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.164
+Version: 0.0.173
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
@@ -31,27 +31,27 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.9
 Requires-Dist: annotated-types==0.7.0
-Requires-Dist: boto3>=1.36.18
-Requires-Dist: botocore>=1.36.18
+Requires-Dist: boto3>=1.40.0
+Requires-Dist: botocore>=1.40.0
 Requires-Dist: certifi==2025.1.31
 Requires-Dist: charset-normalizer==3.4.1
-Requires-Dist: idna==3.10
+Requires-Dist: idna>=3.10
 Requires-Dist: jmespath==1.0.1
 Requires-Dist: numpy==2.0.2
-Requires-Dist: pandas==2.2.3
+Requires-Dist: pandas==2.3.3
 Requires-Dist: py4j==0.10.9.7
 Requires-Dist: pydantic-core==2.27.2
 Requires-Dist: pydantic==2.10.6
 Requires-Dist: pyspark==3.5.4
 Requires-Dist: python-dateutil==2.9.0.post0
 Requires-Dist: python-dotenv==0.19.2
-Requires-Dist: pytz==2025.1
+Requires-Dist: pytz==2025.2
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3transfer==0.11.2
+Requires-Dist: s3transfer>=0.11.2
 Requires-Dist: six==1.17.0
-Requires-Dist: sws-api-client==1.5.3
-Requires-Dist: typing-extensions==4.12.2
-Requires-Dist: tzdata==2025.1
+Requires-Dist: sws-api-client==2.3.0
+Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: tzdata==2025.2
 Requires-Dist: urllib3==1.26.20
 Description-Content-Type: text/markdown
 
pyproject.toml
@@ -4,30 +4,30 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.164"
+version = "0.0.173"
 dependencies = [
     "annotated-types==0.7.0",
-    "boto3>=1.36.18",
-    "botocore>=1.36.18",
+    "boto3>=1.40.0",
+    "botocore>=1.40.0",
     "certifi==2025.1.31",
     "charset-normalizer==3.4.1",
-    "idna==3.10",
+    "idna>=3.10",
     "jmespath==1.0.1",
     "numpy==2.0.2",
-    "pandas==2.2.3",
+    "pandas==2.3.3",
     "py4j==0.10.9.7",
     "pydantic==2.10.6",
     "pydantic_core==2.27.2",
     "pyspark==3.5.4",
     "python-dateutil==2.9.0.post0",
     "python-dotenv==0.19.2",
-    "pytz==2025.1",
+    "pytz==2025.2",
     "requests==2.32.3",
-    "s3transfer==0.11.2",
+    "s3transfer>=0.11.2",
     "six==1.17.0",
-    "sws_api_client==1.5.3",
-    "typing_extensions==4.12.2",
-    "tzdata==2025.1",
+    "sws_api_client==2.3.0",
+    "typing_extensions>=4.12.2",
+    "tzdata==2025.2",
     "urllib3==1.26.20"
 ]
 requires-python = ">=3.9"
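Most of these dependency changes loosen exact pins into lower bounds (boto3, botocore, idna, s3transfer, typing-extensions), refresh calendar-versioned packages (pytz and tzdata 2025.1 → 2025.2, pandas 2.2.3 → 2.3.3), and move sws-api-client across a major version (1.5.3 → 2.3.0), which may carry breaking API changes for downstream callers. A quick sanity check of what actually resolved in an environment (a sketch; assumes the third-party packaging library is installed):

# Sketch: verify the resolved versions against the new bounds in pyproject.toml.
from importlib.metadata import version
from packaging.version import Version

assert Version(version("boto3")) >= Version("1.40.0")
assert Version(version("botocore")) >= Version("1.40.0")
assert Version(version("sws-api-client")) == Version("2.3.0")
print("dependency bounds satisfied")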
src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -158,7 +158,7 @@ class SWSBronzeIcebergSparkHelper:
 
         return dfs_dimension
 
-    def _prepare_element_uom(self) -> DataFrame:
+    def _prepare_element_uom(self) -> Union[DataFrame, None]:
        """Prepare the element and unit of measure join."""
 
        # Get the element DataFrame
@@ -170,23 +170,24 @@ class SWSBronzeIcebergSparkHelper:
             if dimension_column == self.element_column
         )
 
-        # Join the element and the unit_of_measure
-        df_element_uom = (
-            df_element.alias("e")
-            .join(
-                self.df_unit_of_measure.alias("u"),
-                col("e.unit_of_measure") == col("u.id"),
-            )
-            .select(
-                col("e.code").alias("element_code"),
-                col("u.code").alias("unit_of_measure"),
-                col("u.symbol").alias("unit_of_measure_symbol"),
-                col("u.base_unit").alias("unit_of_measure_base_unit"),
-                col("u.multiplier").alias("unit_of_measure_multiplier"),
-            )
-        )
-
-        return df_element_uom
+        if any("unit_of_measure" == column.lower() for column in df_element.columns):
+            # Join the element and the unit_of_measure
+            df_element_uom = (
+                df_element.alias("e")
+                .join(
+                    self.df_unit_of_measure.alias("u"),
+                    col("e.unit_of_measure") == col("u.id"),
+                )
+                .select(
+                    col("e.code").alias("element_code"),
+                    col("u.code").alias("unit_of_measure"),
+                    col("u.symbol").alias("unit_of_measure_symbol"),
+                    col("u.base_unit").alias("unit_of_measure_base_unit"),
+                    col("u.multiplier").alias("unit_of_measure_multiplier"),
+                )
+            )
+
+            return df_element_uom
 
     def _gen_denormalized_observation(self) -> DataFrame:
         """Original query upon which the below computation is based
@@ -278,15 +279,16 @@ class SWSBronzeIcebergSparkHelper:
             .withColumnRenamed("code", dimension_column)
         )
 
-        df_intermediate = (
-            df_intermediate.alias("d")
-            .join(
-                F.broadcast(df_element_uom).alias("e"),
-                col(f"d.{self.element_column}") == col("e.element_code"),
-                "left",
-            )
-            .drop("element_code")
-        )
+        if df_element_uom is not None:
+            df_intermediate = (
+                df_intermediate.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
+            )
 
         df_obs_denorm = df_intermediate
 
@@ -364,16 +366,17 @@ class SWSBronzeIcebergSparkHelper:
         )
         logging.debug(f"After join count: {df_obs_denorm.count()}")
 
-        df_obs_denorm = (
-            df_obs_denorm.alias("d")
-            .join(
-                F.broadcast(df_element_uom).alias("e"),
-                col(f"d.{self.element_column}") == col("e.element_code"),
-                "left",
-            )
-            .drop("element_code")
-        )
-        logging.debug(f"After uom count: {df_obs_denorm.count()}")
+        if df_element_uom is not None:
+            df_obs_denorm = (
+                df_obs_denorm.alias("d")
+                .join(
+                    F.broadcast(df_element_uom).alias("e"),
+                    col(f"d.{self.element_column}") == col("e.element_code"),
+                    "left",
+                )
+                .drop("element_code")
+            )
+            logging.debug(f"After uom count: {df_obs_denorm.count()}")
 
         return df_obs_denorm
 
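A side note on the logging pattern these hunks keep: df_obs_denorm.count() inside logging.debug(f"...") launches a full Spark job even when DEBUG is disabled, because the f-string is evaluated before the call. If that cost ever matters, the standard-library guard looks like this (a sketch, not part of the package):

import logging

from pyspark.sql import SparkSession

logging.basicConfig(level=logging.INFO)  # DEBUG records are discarded here
logger = logging.getLogger(__name__)

spark = SparkSession.builder.master("local[1]").getOrCreate()
df_obs_denorm = spark.range(1_000)  # stand-in for the denormalized frame

# The guard defers the expensive count; %d defers the string formatting.
if logger.isEnabledFor(logging.DEBUG):
    logger.debug("After uom count: %d", df_obs_denorm.count())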
@@ -766,3 +769,29 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Disseminated tag with selection successfully written")
+
+
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
+2
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+2
+1
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
+2
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+2
+1
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
+1
+frozenset({"8", "4", "2", "5", "9", "1", "7", "6", "0", "3"})
+1
+1
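The 26 lines appended at the end of the module are bare expressions (integer literals and frozenset constructors that are never assigned or used), so Python evaluates and discards them at import time; they appear to be stray debug output that shipped with the release rather than functional code.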
src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -519,7 +519,7 @@ class SWSEasyIcebergSparkHelper:
             SELECT
                 o.*,
                 m.metadata
-            FROM {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS o
+            FROM {self.iceberg_tables.DENORMALIZED_OBSERVATION.iceberg_id} AS o
             LEFT JOIN {self.iceberg_tables.GROUPED_METADATA.iceberg_id} AS m
                 ON o.id = m.observation_id
             """
src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
@@ -296,6 +296,35 @@ class SWSGoldIcebergSparkHelper:
 
         return df
 
+    def write_gold_faostat_unfiltered_data_to_iceberg_and_csv(
+        self, df: DataFrame
+    ) -> DataFrame:
+        """The expected input to this function is the output of the sws disseminated function"""
+        df.writeTo(
+            self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id
+        ).createOrReplace()
+
+        logging.info(
+            f"Gold FAOSTAT unfiltered table written to {self.iceberg_tables.GOLD_FAOSTAT.iceberg_id}"
+        )
+
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.iceberg_id} CREATE OR REPLACE TAG `{self.tag_name}`"
+        )
+
+        logging.info(f"gold FAOSTAT unfiltered tag '{self.tag_name}' created")
+
+        df_1 = df.coalesce(1)
+
+        save_cache_csv(
+            df=df_1,
+            bucket=self.bucket,
+            prefix=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_prefix,
+            tag_name=self.tag_name,
+        )
+
+        return df
+
     def write_gold_sws_validated_sws_dissemination_tag(
         self, df: DataFrame, tags: Tags
     ) -> DataFrame:
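Two details in the new writer are worth flagging. First, the success message interpolates GOLD_FAOSTAT.iceberg_id while the write targets GOLD_FAOSTAT_UNFILTERED, so only the log text is off, apparently a copy-paste slip. Second, df.coalesce(1) collapses the frame to a single partition so save_cache_csv emits one CSV object, at the cost of funnelling every row through one task. The CREATE OR REPLACE TAG statement is standard Iceberg Spark SQL; tags can also carry a retention clause, for example (fragment with illustrative names; assumes an active session with the Iceberg SQL extensions enabled):

# Pin the table's current snapshot under a named tag and keep it for 30 days.
spark.sql(
    "ALTER TABLE gold.domain_faostat_unfiltered "
    "CREATE OR REPLACE TAG `my_diss_tag` RETAIN 30 DAYS"
)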
@@ -570,7 +599,7 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added Iceberg Table: {tag}")
 
         new_diss_table = BaseDisseminatedTagTable(
-            id=f"{self.domain_code.lower()}_gold_faosta_csv",
+            id=f"{self.domain_code.lower()}_gold_faostat_csv",
             name=f"{self.domain_code} gold FAOSTAT csv",
             description="Gold table containing the tag data in FAOSTAT format in csv",
             layer=TableLayer.GOLD,
@@ -589,3 +618,52 @@ class SWSGoldIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         return df
+
+    def write_gold_faostat_unfiltered_dissemination_tag(
+        self, df: DataFrame, tags: Tags
+    ) -> DataFrame:
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_iceberg",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered Iceberg",
+            description="Gold table containing all the tag data in FAOSTAT format",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.GOLD_DATABASE,
+            table=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.table,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_iceberg_table,
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_diss_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_gold_faostat_unfiltered_csv",
+            name=f"{self.domain_code} gold FAOSTAT unfiltered csv",
+            description="Gold table containing the tag data in FAOSTAT format in csv",
+            layer=TableLayer.GOLD,
+            private=True,
+            type=TableType.CSV,
+            path=self.iceberg_tables.GOLD_FAOSTAT_UNFILTERED.csv_path,
+            structure={"columns": df.schema.jsonValue()["fields"]},
+        )
+        tag = upsert_disseminated_table(
+            sws_tags=tags,
+            tag=tag,
+            dataset_id=self.dataset_id,
+            tag_name=self.tag_name,
+            table=new_diss_table,
+        )
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        return df
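Together, the two new methods mirror the existing FAOSTAT pair: one writes the unfiltered data to Iceberg and CSV, the other registers both representations under a single dissemination tag. A usage fragment (helper and tags stand for an already-initialized SWSGoldIcebergSparkHelper and an sws_api_client Tags client; both methods pass the DataFrame through, so the calls chain):

# Illustrative call sequence; construction of helper, tags, and df is elided.
df = helper.write_gold_faostat_unfiltered_data_to_iceberg_and_csv(df)
df = helper.write_gold_faostat_unfiltered_dissemination_tag(df, tags)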
src/sws_spark_dissemination_helper/constants.py
@@ -254,37 +254,37 @@ class IcebergTables:
         self.__tag_name = tag_name
 
         # TODO Fix later with a more appropriate DATABASE
-        self.DENORMALIZED_OBSERVATION = self._create_iceberg_table("BRONZE", suffix="denormalized_observation")
-        self.DENORMALIZED_METADATA = self._create_iceberg_table("BRONZE", suffix="denormalized_metadata")
-        self.GROUPED_METADATA = self._create_iceberg_table("BRONZE", suffix="grouped_metadata")
-        self.TABLE = self._create_iceberg_table("BRONZE")
-        self.TABLE_FILTERED = self._create_iceberg_table("BRONZE", suffix="filtered")
-        self.BRONZE = self._create_iceberg_table("BRONZE")
-        self.BRONZE_DISS_TAG = self._create_iceberg_table("BRONZE", suffix="diss_tag")
-        self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
+        self.DENORMALIZED_OBSERVATION = self.create_iceberg_table("BRONZE", suffix="denormalized_observation")
+        self.DENORMALIZED_METADATA = self.create_iceberg_table("BRONZE", suffix="denormalized_metadata")
+        self.GROUPED_METADATA = self.create_iceberg_table("BRONZE", suffix="grouped_metadata")
+        self.TABLE = self.create_iceberg_table("BRONZE")
+        self.TABLE_FILTERED = self.create_iceberg_table("BRONZE", suffix="filtered")
+        self.BRONZE = self.create_iceberg_table("BRONZE")
+        self.BRONZE_DISS_TAG = self.create_iceberg_table("BRONZE", suffix="diss_tag")
+        self.SILVER = self.create_iceberg_table("SILVER", prefix=domain)
 
         # GOLD tables with specific suffixes
-        self.GOLD_SWS = self._create_iceberg_table("GOLD", prefix=domain, suffix="sws")
-        self.GOLD_SDMX = self._create_iceberg_table(
+        self.GOLD_SWS = self.create_iceberg_table("GOLD", prefix=domain, suffix="sws")
+        self.GOLD_SDMX = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sdmx_disseminated"
         )
-        self.GOLD_SWS_VALIDATED = self._create_iceberg_table(
+        self.GOLD_SWS_VALIDATED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sws_validated"
         )
-        self.GOLD_SWS_DISSEMINATED = self._create_iceberg_table(
+        self.GOLD_SWS_DISSEMINATED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="sws_disseminated"
         )
-        self.GOLD_PRE_SDMX = self._create_iceberg_table(
+        self.GOLD_PRE_SDMX = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="pre_sdmx"
         )
-        self.GOLD_FAOSTAT = self._create_iceberg_table(
+        self.GOLD_FAOSTAT = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="faostat"
         )
-        self.GOLD_FAOSTAT_UNFILTERED = self._create_iceberg_table(
+        self.GOLD_FAOSTAT_UNFILTERED = self.create_iceberg_table(
             "GOLD", prefix=domain, suffix="faostat_unfiltered"
         )
 
-    def _create_iceberg_table(
+    def create_iceberg_table(
         self, level: str, prefix: str = "", suffix: str = ""
     ) -> IcebergTable:
         database = getattr(IcebergDatabases, f"{level}_DATABASE")
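The only change here is visibility: _create_iceberg_table is renamed to create_iceberg_table, promoting the factory to public API so table handles can be minted outside the constructor. The level-to-database dispatch it relies on is a plain getattr lookup; a self-contained sketch with stand-in values (the real constants live in IcebergDatabases):

# Stand-ins for the package's IcebergDatabases constants (values illustrative).
class IcebergDatabases:
    BRONZE_DATABASE = "bronze"
    SILVER_DATABASE = "silver"
    GOLD_DATABASE = "gold"

def database_for(level: str) -> str:
    # "GOLD" -> IcebergDatabases.GOLD_DATABASE -> "gold"
    return getattr(IcebergDatabases, f"{level}_DATABASE")

assert database_for("GOLD") == "gold"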
src/sws_spark_dissemination_helper/utils.py
@@ -363,26 +363,26 @@ def map_codes_and_remove_null_duplicates(
         "diss_flag", F.when(col("delete"), lit(False)).otherwise(col("diss_flag"))
     )
     .withColumn(
-        "note",
+        "diss_note",
         F.when(
             col("delete"),
             F.array_append(
-                col("note"),
+                col("diss_note"),
                 lit(
                     f"The observation is not disseminated according to the Mapping - Code correction table"
                 ),
             ),
-        ).otherwise(col("note")),
+        ).otherwise(col("diss_note")),
     )
     # Add mapping message to notes
     .withColumn(
-        "note",
+        "diss_note",
         F.when(
             ~col("is_duplicate")
             & col("new_dim_code").isNotNull()
             & (col("new_dim_code") != lit("")),
             F.array_append(
-                col("note"),
+                col("diss_note"),
                 F.concat(
                     lit(f"Dimension {col_name} code was changed from "),
                     col(col_name),
@@ -390,7 +390,7 @@ def map_codes_and_remove_null_duplicates(
                     col("new_dim_code"),
                 ),
             ),
-        ).otherwise(col("note")),
+        ).otherwise(col("diss_note")),
     )
     .withColumn(
         col_name,
@@ -409,18 +409,18 @@ def map_codes_and_remove_null_duplicates(
         ).otherwise(col("diss_flag")),
     )
     .withColumn(
-        "note",
+        "diss_note",
         F.when(
             col("is_duplicate")
             & col("new_dim_code").isNotNull()
             & (col("new_dim_code") != lit("")),
             F.array_append(
-                col("note"),
+                col("diss_note"),
                 lit(
                     f"The code correction was not applied to avoid observation duplications"
                 ),
             ),
-        ).otherwise(col("note")),
+        ).otherwise(col("diss_note")),
     )
     # Check the domain specific multiplier first and then the standard multiplier
     .withColumn("value", col("value") * F.coalesce(col("multiplier"), lit(1)))