sws-spark-dissemination-helper 0.0.79__py3-none-any.whl → 0.0.183__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ from typing import List
2
+
1
3
  from pyspark.sql.functions import col, lit
2
4
 
3
5
  SPARK_POSTGRES_DRIVER = "org.postgresql.Driver"
@@ -34,26 +36,70 @@ class DomainFilters:
34
36
  class DatasetDatatables:
35
37
 
36
38
  class __SWSDatatable:
37
- def __init__(self, id: str, name: str, schema: str):
39
+ def __init__(
40
+ self, id: str, name: str, schema: str, join_columns: List[str] = []
41
+ ):
38
42
  self.id = id
43
+ self.iceberg_id = f"{IcebergDatabases.BRONZE_DATABASE}.{id.split('.')[1]}"
39
44
  self.name = name
40
45
  self.schema = schema
46
+ self.join_columns = join_columns
47
+
48
+ # Aggregation Tables
49
+ AGGREGATES_COMPOSITION = __SWSDatatable(
50
+ id="datatables.aggregates_composition",
51
+ name="Aggregation - Composition",
52
+ schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, aggregation_type STRING, group_code STRING, child_code STRING, group_name STRING, child_name STRING, link_code STRING, factor STRING",
53
+ )
54
+ AGGREGATES_ELEMENTS = __SWSDatatable(
55
+ id="datatables.aggregates_elements",
56
+ name="Aggregation - Aggregates per elements",
57
+ schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, element STRING, aggregation_type STRING, code STRING",
58
+ )
41
59
 
42
60
  # Dissemination Tables
43
61
  DISSEMINATION_TYPE_LIST = __SWSDatatable(
44
62
  id="datatables.dissemination_{type}_list",
45
63
  name="Dissemination - {type} list",
46
64
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
65
+ join_columns=["domain", "code"],
47
66
  )
48
67
  DISSEMINATION_EXCEPTIONS = __SWSDatatable(
49
68
  id="datatables.dissemination_exception",
50
69
  name="Dissemination - Exceptions",
51
70
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, dim1_code STRING, dim2_code STRING, dim3_code STRING, dim4_code STRING, dim5_code STRING, dim6_code STRING, dim7_code STRING, status_flag STRING, method_flag STRING, dissemination BOOLEAN, aggregation BOOLEAN, note STRING",
71
+ join_columns=[
72
+ "domain",
73
+ " dim1_code",
74
+ " dim2_code",
75
+ " dim3_code",
76
+ " dim4_code",
77
+ " dim5_code",
78
+ " dim6_code",
79
+ " dim7_code",
80
+ " status_flag",
81
+ " method_flag",
82
+ ],
83
+ )
84
+ DISPLAY_DECIMALS = __SWSDatatable(
85
+ id="datatables.display_decimals",
86
+ name="Dissemination - Display Decimals",
87
+ schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, column_1_name STRING, column_1_value STRING, column_2_name STRING, column_2_value STRING, display_decimals STRING",
88
+ join_columns=[
89
+ "domain",
90
+ "column_1_name",
91
+ "column_1_value",
92
+ "column_2_name",
93
+ "column_2_value",
94
+ "display_decimals",
95
+ ],
52
96
  )
97
+ # TODO Deprecate
53
98
  DISSEMINATION_ITEM_LIST_FAOSTAT = __SWSDatatable(
54
99
  id="datatables.dissemination_item_list_faostat",
55
100
  name="Dissemination - Item list - FAOSTAT",
56
101
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
102
+ join_columns=["domain", "code"],
57
103
  )
58
104
 
59
105
  # Mapping Tables
@@ -61,34 +107,23 @@ class DatasetDatatables:
61
107
  id="datatables.aggregates_mapping_domains_id",
62
108
  name="Mapping - Domains ID",
63
109
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, domain_name STRING, sws_source_id STRING, sws_destination_id STRING",
110
+ join_columns=["domain", "sws_source_id"],
64
111
  )
65
112
  MAPPING_CODELIST_TYPE = __SWSDatatable(
66
113
  id="datatables.mapping_codelist_type",
67
114
  name="Mapping Codelist type",
68
115
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, col_name STRING, col_type STRING",
116
+ join_columns=["domain", "col_name"],
69
117
  )
70
118
  MAPPING_CODE_CORRECTION = __SWSDatatable(
71
119
  id="datatables.aggregates_mapping_code_correction",
72
120
  name="Mapping - Code correction",
73
121
  schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, old_code STRING, new_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
74
- )
75
- MAPPING_SDMX_COLUMN_NAMES = __SWSDatatable(
76
- id="datatables.mapping_sdmx_col_names",
77
- name="Mapping - SDMX column names",
78
- schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_name STRING, external_name STRING, delete BOOLEAN, add BOOLEAN, default_value STRING",
79
- )
80
- MAPPING_SDMX_CODES = __SWSDatatable(
81
- id="datatables.mapping_pre_dissemination",
82
- name="Mapping - Pre dissemination",
83
- schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_code STRING, external_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
84
- )
85
- MAPPING_UNITS_OF_MEASURE = __SWSDatatable(
86
- id="datatables.mapping_units_of_measure",
87
- name="Mapping - Units of measure",
88
- schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, sws_code STRING, sws_multiplier INT, sdmx_code STRING, sdmx_multiplier INT, value_multiplier INT, delete BOOLEAN, mapping_type STRING",
122
+ join_columns=["domain", "old_code", "var_type", "mapping_type"],
89
123
  )
90
124
 
91
125
  # Non-SWS Sources Tables
126
+ # TODO To deprecate
92
127
  FAOSTAT_CODE_MAPPING = __SWSDatatable(
93
128
  id="datatables.faostat_code_mapping",
94
129
  name="FAOSTAT Code Mapping",
@@ -150,6 +185,11 @@ class DatasetTables:
150
185
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
151
186
  schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
152
187
  )
188
+ self.TAG_OBSERVATION = self.__SWSTable(
189
+ postgres_id=f"{self.__dataset_id}.tag_observation",
190
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_tag_observation",
191
+ schema="tag BIGINT, observation INT",
192
+ )
153
193
 
154
194
  # Reference data
155
195
  self.CODELISTS = [
@@ -181,18 +221,21 @@ class DatasetTables:
181
221
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
182
222
  schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
183
223
  )
184
-
185
224
  LANGUAGE = __SWSTable(
186
225
  postgres_id="reference_data.language",
187
226
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
188
227
  schema="id INT, country_code STRING, description STRING",
189
228
  )
190
-
191
229
  UNIT_OF_MEASURE = __SWSTable(
192
230
  postgres_id="reference_data.unit_of_measure",
193
231
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
194
232
  schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
195
233
  )
234
+ DATASET = __SWSTable(
235
+ postgres_id="reference_data.dataset",
236
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.dataset",
237
+ schema="id INT, xml_name STRING",
238
+ )
196
239
 
197
240
  # Operational data
198
241
  USER = __SWSTable(
@@ -200,6 +243,11 @@ class DatasetTables:
200
243
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
201
244
  schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
202
245
  )
246
+ TAG = __SWSTable(
247
+ postgres_id="operational_data.tag",
248
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.tag",
249
+ schema="id INT, name STRING, reference_date DATE, dataset INT, type STRING, released_ON DATE, released_by INT, properties STRING",
250
+ )
203
251
 
204
252
 
205
253
  class IcebergTable:
@@ -218,24 +266,44 @@ class IcebergTables:
218
266
  self.__dataset_id = dataset_id
219
267
  self.__tag_name = tag_name
220
268
 
221
- self.BRONZE = self._create_iceberg_table("BRONZE")
222
- self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
269
+ # TODO Fix later with a more appropriate DATABASE
270
+ self.DENORMALIZED_OBSERVATION = self.create_iceberg_table(
271
+ "BRONZE", suffix="denormalized_observation"
272
+ )
273
+ self.DENORMALIZED_METADATA = self.create_iceberg_table(
274
+ "BRONZE", suffix="denormalized_metadata"
275
+ )
276
+ self.GROUPED_METADATA = self.create_iceberg_table(
277
+ "BRONZE", suffix="grouped_metadata"
278
+ )
279
+ self.TABLE = self.create_iceberg_table("BRONZE")
280
+ self.TABLE_FILTERED = self.create_iceberg_table("BRONZE", suffix="filtered")
281
+ self.BRONZE = self.create_iceberg_table("BRONZE")
282
+ self.BRONZE_DISS_TAG = self.create_iceberg_table("BRONZE", suffix="diss_tag")
283
+ self.SILVER = self.create_iceberg_table("SILVER", prefix=domain)
223
284
 
224
285
  # GOLD tables with specific suffixes
225
- self.GOLD_SDMX = self._create_iceberg_table(
286
+ self.GOLD_SWS = self.create_iceberg_table("GOLD", prefix=domain, suffix="sws")
287
+ self.GOLD_SDMX = self.create_iceberg_table(
226
288
  "GOLD", prefix=domain, suffix="sdmx_disseminated"
227
289
  )
228
- self.GOLD_SWS_VALIDATED = self._create_iceberg_table(
290
+ self.GOLD_SWS_VALIDATED = self.create_iceberg_table(
229
291
  "GOLD", prefix=domain, suffix="sws_validated"
230
292
  )
231
- self.GOLD_SWS_DISSEMINATED = self._create_iceberg_table(
293
+ self.GOLD_SWS_DISSEMINATED = self.create_iceberg_table(
232
294
  "GOLD", prefix=domain, suffix="sws_disseminated"
233
295
  )
234
- self.GOLD_PRE_SDMX = self._create_iceberg_table(
296
+ self.GOLD_PRE_SDMX = self.create_iceberg_table(
235
297
  "GOLD", prefix=domain, suffix="pre_sdmx"
236
298
  )
299
+ self.GOLD_FAOSTAT = self.create_iceberg_table(
300
+ "GOLD", prefix=domain, suffix="faostat"
301
+ )
302
+ self.GOLD_FAOSTAT_UNFILTERED = self.create_iceberg_table(
303
+ "GOLD", prefix=domain, suffix="faostat_unfiltered"
304
+ )
237
305
 
238
- def _create_iceberg_table(
306
+ def create_iceberg_table(
239
307
  self, level: str, prefix: str = "", suffix: str = ""
240
308
  ) -> IcebergTable:
241
309
  database = getattr(IcebergDatabases, f"{level}_DATABASE")
@@ -363,16 +363,34 @@ def map_codes_and_remove_null_duplicates(
363
363
  "diss_flag", F.when(col("delete"), lit(False)).otherwise(col("diss_flag"))
364
364
  )
365
365
  .withColumn(
366
- "note",
366
+ "diss_note",
367
367
  F.when(
368
368
  col("delete"),
369
369
  F.array_append(
370
- col("note"),
370
+ col("diss_note"),
371
371
  lit(
372
372
  f"The observation is not disseminated according to the Mapping - Code correction table"
373
373
  ),
374
374
  ),
375
- ).otherwise(col("note")),
375
+ ).otherwise(col("diss_note")),
376
+ )
377
+ # Add mapping message to notes
378
+ .withColumn(
379
+ "diss_note",
380
+ F.when(
381
+ ~col("is_duplicate")
382
+ & col("new_dim_code").isNotNull()
383
+ & (col("new_dim_code") != lit("")),
384
+ F.array_append(
385
+ col("diss_note"),
386
+ F.concat(
387
+ lit(f"Dimension {col_name} code was changed from "),
388
+ col(col_name),
389
+ lit(" to "),
390
+ col("new_dim_code"),
391
+ ),
392
+ ),
393
+ ).otherwise(col("diss_note")),
376
394
  )
377
395
  .withColumn(
378
396
  col_name,
@@ -391,18 +409,18 @@ def map_codes_and_remove_null_duplicates(
391
409
  ).otherwise(col("diss_flag")),
392
410
  )
393
411
  .withColumn(
394
- "note",
412
+ "diss_note",
395
413
  F.when(
396
414
  col("is_duplicate")
397
415
  & col("new_dim_code").isNotNull()
398
416
  & (col("new_dim_code") != lit("")),
399
417
  F.array_append(
400
- col("note"),
418
+ col("diss_note"),
401
419
  lit(
402
420
  f"The code correction was not applied to avoid observation duplications"
403
421
  ),
404
422
  ),
405
- ).otherwise(col("note")),
423
+ ).otherwise(col("diss_note")),
406
424
  )
407
425
  # Check the domain specific multiplier first and then the standard multiplier
408
426
  .withColumn("value", col("value") * F.coalesce(col("multiplier"), lit(1)))
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sws-spark-dissemination-helper
3
- Version: 0.0.79
3
+ Version: 0.0.183
4
4
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
5
- Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
5
+ Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
6
6
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
7
7
  License: MIT License
8
8
 
@@ -29,26 +29,30 @@ License-File: LICENSE
29
29
  Classifier: License :: OSI Approved :: MIT License
30
30
  Classifier: Operating System :: OS Independent
31
31
  Classifier: Programming Language :: Python :: 3
32
- Requires-Python: >=3.8
32
+ Requires-Python: >=3.9
33
33
  Requires-Dist: annotated-types==0.7.0
34
- Requires-Dist: boto3==1.34.147
35
- Requires-Dist: botocore==1.34.147
36
- Requires-Dist: certifi==2024.7.4
37
- Requires-Dist: charset-normalizer==3.3.2
38
- Requires-Dist: idna==3.7
34
+ Requires-Dist: boto3>=1.40.0
35
+ Requires-Dist: botocore>=1.40.0
36
+ Requires-Dist: certifi==2025.1.31
37
+ Requires-Dist: charset-normalizer==3.4.1
38
+ Requires-Dist: idna>=3.10
39
39
  Requires-Dist: jmespath==1.0.1
40
+ Requires-Dist: numpy==2.0.2
41
+ Requires-Dist: pandas==2.3.3
40
42
  Requires-Dist: py4j==0.10.9.7
41
- Requires-Dist: pydantic-core==2.20.1
42
- Requires-Dist: pydantic==2.8.2
43
- Requires-Dist: pyspark==3.5.1
43
+ Requires-Dist: pydantic-core==2.27.2
44
+ Requires-Dist: pydantic==2.10.6
45
+ Requires-Dist: pyspark==3.5.4
44
46
  Requires-Dist: python-dateutil==2.9.0.post0
45
- Requires-Dist: python-dotenv==1.0.1
47
+ Requires-Dist: python-dotenv==0.19.2
48
+ Requires-Dist: pytz==2025.2
46
49
  Requires-Dist: requests==2.32.3
47
- Requires-Dist: s3transfer==0.10.2
48
- Requires-Dist: six==1.16.0
49
- Requires-Dist: sws-api-client==1.0.7b0
50
- Requires-Dist: typing-extensions==4.12.2
51
- Requires-Dist: urllib3==1.26.19
50
+ Requires-Dist: s3transfer>=0.11.2
51
+ Requires-Dist: six==1.17.0
52
+ Requires-Dist: sws-api-client==2.3.0
53
+ Requires-Dist: typing-extensions>=4.12.2
54
+ Requires-Dist: tzdata==2025.2
55
+ Requires-Dist: urllib3==1.26.20
52
56
  Description-Content-Type: text/markdown
53
57
 
54
58
  # Upload a new version
@@ -0,0 +1,13 @@
1
+ sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=N0eQ2LXtpPeZQCWYi85sMLmpXRzLA2erECiba8tqOAY,29595
2
+ sws_spark_dissemination_helper/SWSDatatablesExportHelper.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py,sha256=csqKyYglBkJSBvEkEa1_keHarZZAIJHaV0d64gGJy98,26379
4
+ sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=atQFiY5Mmo-rzHY7WVWg-Guvg8i1ZcaaoKE4ymTaKdE,27750
5
+ sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=V_rH4UYoFZfMUc82U-KxeL_o8F44HnMHfLLXoyNxHxs,20016
6
+ sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=3l5zkEWksnEC-R4mJi8JEHL3ylCMbkMD9a0qbdZQU5E,26345
7
+ sws_spark_dissemination_helper/__init__.py,sha256=42TPbk7KxAud_qY3Sr_F4F7VjyofUlxEJkUXAFQsjRo,327
8
+ sws_spark_dissemination_helper/constants.py,sha256=cVjTS3xbJNKz-1i7c1dJk2PcOZzQhvuHUp9i4PNIPh4,14055
9
+ sws_spark_dissemination_helper/utils.py,sha256=Ge8zXsUIcvFihALDNLF5kCu_tAdRQUE04xE6Yn9xQF4,22008
10
+ sws_spark_dissemination_helper-0.0.183.dist-info/METADATA,sha256=LDVmzDL6ZDhGrRBd3flpX0TPEIJONpdZJodUGrAvemw,2822
11
+ sws_spark_dissemination_helper-0.0.183.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
+ sws_spark_dissemination_helper-0.0.183.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
13
+ sws_spark_dissemination_helper-0.0.183.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py,sha256=tyC3e2LNBes9J2UFR-j7bDlvEffeI0YsiYlMvk0wPxA,16382
2
- sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py,sha256=5BI9uOmYo9E1Q4JdDa4rlQ1LBaNTpzfbwUZjTY1VsYQ,29251
3
- sws_spark_dissemination_helper/SWSPostgresSparkReader.py,sha256=wXSz4-SbIcfVfDsN5gsbg6ul5GvVoX59VkfjAmTCToo,14935
4
- sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py,sha256=F0g4N95QIApVNvPFWuQfHphGE320LKoimBRisln7Luk,22033
5
- sws_spark_dissemination_helper/__init__.py,sha256=Efjoe9V4vGXWVp-DY5P6NbRwIUr_zkZJkDmMi-lf5Bc,262
6
- sws_spark_dissemination_helper/constants.py,sha256=hpHHlbojShMWRfyIelXz6c5BqFzO48Oap1zmztlMMrs,11349
7
- sws_spark_dissemination_helper/utils.py,sha256=6SzrXX0xhvynRyv-vRFDbc6V4UNe_RzKKETZAtefnhg,21341
8
- sws_spark_dissemination_helper-0.0.79.dist-info/METADATA,sha256=EXaooj8Ss9G9EfvKets1e7bUc0fTVbc-a9X4zjE26FI,2708
9
- sws_spark_dissemination_helper-0.0.79.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- sws_spark_dissemination_helper-0.0.79.dist-info/licenses/LICENSE,sha256=zFzeb_j_6pXEHwH8Z0OpIkKFJk7vmhZjdem-K0d4zU4,1073
11
- sws_spark_dissemination_helper-0.0.79.dist-info/RECORD,,