sws-spark-dissemination-helper 0.0.144__tar.gz → 0.0.146__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/.gitignore +1 -1
  2. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/PKG-INFO +5 -5
  3. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/pyproject.toml +5 -5
  4. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +83 -2
  5. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +10 -6
  6. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/constants.py +15 -2
  7. sws_spark_dissemination_helper-0.0.144/old_requirements.txt +0 -23
  8. sws_spark_dissemination_helper-0.0.144/requirements.txt +0 -23
  9. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/LICENSE +0 -0
  10. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/README.md +0 -0
  11. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/src/sws_spark_dissemination_helper/utils.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/tests/__init__.py +0 -0
  18. {sws_spark_dissemination_helper-0.0.144 → sws_spark_dissemination_helper-0.0.146}/tests/test.py +0 -0
.gitignore
@@ -2,7 +2,6 @@
  # You should customize this list as applicable to your project.
  # Learn more about .gitignore:
  # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
- .*
 
  # Node artifact files
  node_modules/
@@ -49,3 +48,4 @@ Thumbs.db
  *.mov
  *.wmv
 
+ .venv/
PKG-INFO
@@ -1,8 +1,8 @@
  Metadata-Version: 2.4
  Name: sws-spark-dissemination-helper
- Version: 0.0.144
+ Version: 0.0.146
  Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
- Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
+ Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
  Author-email: Daniele Mansillo <danielemansillo@gmail.com>
  License: MIT License
 
@@ -31,8 +31,8 @@ Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.9
  Requires-Dist: annotated-types==0.7.0
- Requires-Dist: boto3==1.36.18
- Requires-Dist: botocore==1.36.18
+ Requires-Dist: boto3>=1.36.18
+ Requires-Dist: botocore>=1.36.18
  Requires-Dist: certifi==2025.1.31
  Requires-Dist: charset-normalizer==3.4.1
  Requires-Dist: idna==3.10
@@ -49,7 +49,7 @@ Requires-Dist: pytz==2025.1
  Requires-Dist: requests==2.32.3
  Requires-Dist: s3transfer==0.11.2
  Requires-Dist: six==1.17.0
- Requires-Dist: sws-api-client>=1.5.1
+ Requires-Dist: sws-api-client==1.5.3
  Requires-Dist: typing-extensions==4.12.2
  Requires-Dist: tzdata==2025.1
  Requires-Dist: urllib3==1.26.20
pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "hatchling.build"
 
  [project]
  name = "sws-spark-dissemination-helper"
- version = "0.0.144"
+ version = "0.0.146"
  dependencies = [
  "annotated-types==0.7.0",
- "boto3==1.36.18",
- "botocore==1.36.18",
+ "boto3>=1.36.18",
+ "botocore>=1.36.18",
  "certifi==2025.1.31",
  "charset-normalizer==3.4.1",
  "idna==3.10",
@@ -25,7 +25,7 @@ dependencies = [
  "requests==2.32.3",
  "s3transfer==0.11.2",
  "six==1.17.0",
- "sws_api_client>=1.5.1",
+ "sws_api_client==1.5.3",
  "typing_extensions==4.12.2",
  "tzdata==2025.1",
  "urllib3==1.26.20"
@@ -42,4 +42,4 @@ classifiers = [
  ]
 
  [project.urls]
- Repository = "https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper"
+ Repository = "https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper"
src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py
@@ -26,6 +26,7 @@ class SWSEasyIcebergSparkHelper:
  dataset_tables: DatasetTables = None,
  keep_history: bool = False,
  write_csv: bool = True,
+ source_tag: str | None = None,
  ) -> None:
  self.spark: SparkSession = spark
  self.dataset_details: dict = dataset_details
@@ -37,6 +38,7 @@ class SWSEasyIcebergSparkHelper:
  self.iceberg_tables: IcebergTables = iceberg_tables
  self.keep_history: bool = keep_history
  self.write_csv: bool = write_csv
+ self.source_tag: str | None = source_tag
 
  if dataset_details is not None:
  (
@@ -69,6 +71,7 @@ class SWSEasyIcebergSparkHelper:
  self.df_obs_coord,
  self.df_metadata,
  self.df_meta_elem,
+ self.df_tag_observation,
  ) = self.raw_data
 
  (
@@ -78,10 +81,11 @@ class SWSEasyIcebergSparkHelper:
  self.df_meta_elem_type,
  self.df_language,
  self.df_unit_of_measure,
+ self.df_dataset,
  self.dfs_dimension,
  ) = self.raw_reference_data
 
- self.df_user = self.raw_operational_data
+ (self.df_user, self.df_tag) = self.raw_operational_data
 
  def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
  """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -278,6 +282,69 @@
 
  return df_obs_denorm
 
+ def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+ # ----------------
+ # Prepare dataframes for the joins
+ # ----------------
+
+ select_statement = """
+ o.id,
+ o.value,
+ u.email,
+ o.created_on,
+ o.replaced_on,
+ o.version"""
+
+ from_statement = f"""
+ FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+ INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+ INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+ INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+ LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+ LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+ hint_statement = ""
+
+ id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+ for flag_col in self.flag_columns:
+ select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+ id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+ for i, (dim_col, cl) in enumerate(
+ zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+ ):
+ select_statement += f",\nd{i}.code AS {dim_col}"
+ from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+ hint_statement = (
+ hint_statement + f", BROADCAST({cl.iceberg_id})"
+ if hint_statement
+ else f"BROADCAST({cl.iceberg_id})"
+ )
+
+ hint_statement = "/*+ " + hint_statement + " */"
+
+ # TODO Add tag name as a parameter
+ where_statement = (
+ f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+ )
+
+ final_query = (
+ "SELECT "
+ + hint_statement
+ + select_statement
+ + from_statement
+ + where_statement
+ )
+ if not self.keep_history:
+ final_query += "\n AND o.replaced_on IS NULL"
+
+ logging.info("Final query for merging observation and observation_coordinares")
+ logging.info(final_query)
+
+ df_obs_denorm = self.spark.sql(final_query)
+
+ return df_obs_denorm
+
  def _gen_denormalized_metadata(self) -> DataFrame:
  """Original query upon which the below computation is based
 
@@ -431,9 +498,23 @@
  .drop("m.observation_id")
  )
 
- def write_data_to_iceberg_and_csv(self, sql=False) -> DataFrame:
+ def _gen_denormalied_data_sql_from_tag(self) -> DataFrame:
+ return (
+ self._gen_denormalized_observation_sql_from_tag()
+ .alias("o")
+ .join(
+ self._gen_grouped_metadata_sql().alias("m"),
+ col("o.id") == col("m.observation_id"),
+ "left",
+ )
+ .drop("m.observation_id")
+ )
+
+ def write_data_to_iceberg_and_csv(self, sql=False, from_tag=False) -> DataFrame:
  if sql:
  self.df_denorm = self._gen_denormalied_data_sql()
+ elif from_tag:
+ self.df_denorm = self._gen_denormalied_data_sql_from_tag()
  else:
  self.df_denorm = self._gen_denormalied_data()
 
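Taken together, these changes add a tag-filtered export path to SWSEasyIcebergSparkHelper: the new source_tag constructor argument feeds the WHERE clause of _gen_denormalized_observation_sql_from_tag, and the new from_tag flag on write_data_to_iceberg_and_csv selects that path. A minimal usage sketch follows; the import path, SparkSession, dataset_details, dataset_tables, and tag name are illustrative placeholders, not values taken from this diff.

# Hypothetical call site; arguments other than source_tag follow the pre-existing
# constructor signature and are shown only for orientation.
from sws_spark_dissemination_helper.SWSEasyIcebergSparkHelper import SWSEasyIcebergSparkHelper  # assumed import path

helper = SWSEasyIcebergSparkHelper(
    spark=spark,
    dataset_details=dataset_details,
    dataset_tables=dataset_tables,
    keep_history=False,
    write_csv=True,
    source_tag="2024_release",  # placeholder tag name; compared against t.name in the generated SQL
)

# from_tag=True (new in 0.0.146) builds the denormalized data via
# _gen_denormalied_data_sql_from_tag(), joining observation -> tag_observation -> tag -> dataset.
df_denorm = helper.write_data_to_iceberg_and_csv(from_tag=True)

Note that sql=True still takes precedence over from_tag=True, since the sql branch is checked first.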
src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py
@@ -195,6 +195,7 @@ class SWSPostgresSparkReader:
  (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
  (dataset_tables.METADATA, "id", 10),
  (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+ (dataset_tables.TAG_OBSERVATION, "tag", 10),
  ]
  return self._import_tables(data_tables)
 
@@ -209,6 +210,7 @@
  dataset_tables.METADATA_ELEMENT_TYPE,
  dataset_tables.LANGUAGE,
  dataset_tables.UNIT_OF_MEASURE,
+ dataset_tables.DATSET,
  *dataset_tables.CODELISTS,
  ]
  return self._import_tables(
@@ -217,17 +219,18 @@
 
  def import_operational_data_tables(
  self, dataset_tables: DatasetTables
- ) -> DataFrame:
+ ) -> List[DataFrame]:
  # Define and import operational data table without partitioning
  operational_data_tables = [
  (dataset_tables.USER, None, 1),
+ (dataset_tables.TAG, None, 1),
  ]
- return self._import_tables(operational_data_tables)[0]
+ return self._import_tables(operational_data_tables)
 
  def import_data_reference_data_operational_data(
  self, dataset_tables: DatasetTables
  ) -> Tuple[
- Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+ Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
  Tuple[
  DataFrame,
  DataFrame,
@@ -235,14 +238,15 @@
  DataFrame,
  DataFrame,
  DataFrame,
+ DataFrame,
  List[DataFrame],
  ],
- DataFrame,
+ Tuple[DataFrame, DataFrame],
  ]:
  # Import and organize DataFrames into the desired output structure
  data_dfs = self.import_data_tables(dataset_tables)
  reference_data_dfs = self.import_reference_data_tables(dataset_tables)
- operational_data_df = self.import_operational_data_tables(dataset_tables)
+ operational_data_dfs = self.import_operational_data_tables(dataset_tables)
 
  return (
  tuple(data_dfs),
@@ -250,7 +254,7 @@
  *reference_data_dfs[:6],
  reference_data_dfs[6:],
  ),
- operational_data_df,
+ tuple(operational_data_dfs),
  )
 
  def get_codelist_type_mapping(
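With import_operational_data_tables now returning a list of DataFrames (user and tag) instead of a single DataFrame, import_data_reference_data_operational_data hands the operational data back as a two-element tuple. A hedged sketch of an updated call site; the reader variable and the unpacking names are illustrative, not code from the package.

# Sketch only: unpacking the 0.0.146 return structure of
# import_data_reference_data_operational_data().
(
    data_dfs,            # data tables; now also ends with the tag_observation DataFrame
    reference_data_dfs,  # reference tables; now also includes the dataset DataFrame
    (df_user, df_tag),   # operational data; previously a single user DataFrame
) = reader.import_data_reference_data_operational_data(dataset_tables)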
src/sws_spark_dissemination_helper/constants.py
@@ -172,6 +172,11 @@ class DatasetTables:
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
  schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
  )
+ self.TAG_OBSERVATION = self.__SWSTable(
+ postgres_id=f"{self.__dataset_id}.tag_observation",
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_tag_observation",
+ schema="tag BIGINT, observation INT",
+ )
 
  # Reference data
  self.CODELISTS = [
@@ -203,18 +208,21 @@
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
  schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
  )
-
  LANGUAGE = __SWSTable(
  postgres_id="reference_data.language",
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
  schema="id INT, country_code STRING, description STRING",
  )
-
  UNIT_OF_MEASURE = __SWSTable(
  postgres_id="reference_data.unit_of_measure",
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
  schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
  )
+ DATASET = __SWSTable(
+ postgres_id="reference_data.dataset",
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.dataset",
+ schema="id INT, xml_name STRING",
+ )
 
  # Operational data
  USER = __SWSTable(
@@ -222,6 +230,11 @@
  iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
  schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
  )
+ TAG = __SWSTable(
+ postgres_id="operational_data.tag",
+ iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.tag",
+ schema="id INT, name STRING, reference_date DATE, dataset INT, type STRING, released_ON DATE, released_by INT, properties STRING",
+ )
 
 
  class IcebergTable:
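The new TAG_OBSERVATION, DATASET, and TAG table definitions back the tag join assembled in _gen_denormalized_observation_sql_from_tag. As a rough, assumption-laden illustration of the resolved query shape: the table names below assume IcebergDatabases.STAGING_DATABASE resolves to "staging", a dataset id of "my_dataset", and the "{dataset_id}_..." naming pattern shown above; the real query additionally selects flag columns, adds one LEFT JOIN per codelist dimension, and prepends BROADCAST hints.

# Illustration only; database, aliases, and literal values are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.sql("""
    SELECT o.id, o.value, u.email, o.created_on, o.replaced_on, o.version
    FROM staging.my_dataset_observation o
    INNER JOIN staging.my_dataset_tag_observation tobs ON o.id = tobs.observation
    INNER JOIN staging.tag t ON tobs.tag = t.id
    INNER JOIN staging.dataset d ON t.dataset = d.id
    LEFT JOIN staging.user u ON u.id = o.created_by
    WHERE t.name = 'my_tag' AND d.xml_name = 'my_dataset'
      AND o.replaced_on IS NULL  -- appended only when keep_history is False
""")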
old_requirements.txt (deleted)
@@ -1,23 +0,0 @@
- annotated-types==0.7.0
- boto3==1.36.18
- botocore==1.36.18
- certifi==2025.1.31
- charset-normalizer==3.4.1
- idna==3.10
- jmespath==1.0.1
- numpy==2.0.2
- pandas==2.2.3
- py4j==0.10.9.7
- pydantic==2.10.6
- pydantic_core==2.27.2
- pyspark==3.5.4
- python-dateutil==2.9.0.post0
- python-dotenv==0.19.2
- pytz==2025.1
- requests==2.32.3
- s3transfer==0.11.2
- six==1.17.0
- sws_api_client==1.4.4
- typing_extensions==4.12.2
- tzdata==2025.1
- urllib3==1.26.20
requirements.txt (deleted)
@@ -1,23 +0,0 @@
- annotated-types==0.7.0
- boto3==1.36.18
- botocore==1.36.18
- certifi==2025.1.31
- charset-normalizer==3.4.1
- idna==3.10
- jmespath==1.0.1
- numpy==2.0.2
- pandas==2.2.3
- py4j==0.10.9.7
- pydantic==2.10.6
- pydantic_core==2.27.2
- pyspark==3.5.4
- python-dateutil==2.9.0.post0
- python-dotenv==0.19.2
- pytz==2025.1
- requests==2.32.3
- s3transfer==0.11.2
- six==1.17.0
- sws_api_client==1.5.0
- typing_extensions==4.12.2
- tzdata==2025.1
- urllib3==1.26.20