sws-spark-dissemination-helper 0.0.143.tar.gz → 0.0.145.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/.gitignore +1 -1
  2. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/PKG-INFO +5 -5
  3. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/pyproject.toml +5 -5
  4. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py +66 -1
  5. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +10 -6
  6. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/constants.py +16 -3
  7. sws_spark_dissemination_helper-0.0.143/old_requirements.txt +0 -23
  8. sws_spark_dissemination_helper-0.0.143/requirements.txt +0 -23
  9. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/LICENSE +0 -0
  10. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/README.md +0 -0
  11. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +0 -0
  12. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/SWSDatatablesExportHelper.py +0 -0
  13. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +0 -0
  14. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +0 -0
  15. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  16. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/src/sws_spark_dissemination_helper/utils.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/tests/__init__.py +0 -0
  18. {sws_spark_dissemination_helper-0.0.143 → sws_spark_dissemination_helper-0.0.145}/tests/test.py +0 -0
.gitignore

@@ -2,7 +2,6 @@
 # You should customize this list as applicable to your project.
 # Learn more about .gitignore:
 # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
-.*
 
 # Node artifact files
 node_modules/
@@ -49,3 +48,4 @@ Thumbs.db
 *.mov
 *.wmv
 
+.venv/
PKG-INFO

@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.143
+Version: 0.0.145
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
-Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
+Project-URL: Repository, https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
 License: MIT License
 
@@ -31,8 +31,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.9
 Requires-Dist: annotated-types==0.7.0
-Requires-Dist: boto3==1.36.18
-Requires-Dist: botocore==1.36.18
+Requires-Dist: boto3>=1.36.18
+Requires-Dist: botocore>=1.36.18
 Requires-Dist: certifi==2025.1.31
 Requires-Dist: charset-normalizer==3.4.1
 Requires-Dist: idna==3.10
@@ -49,7 +49,7 @@ Requires-Dist: pytz==2025.1
 Requires-Dist: requests==2.32.3
 Requires-Dist: s3transfer==0.11.2
 Requires-Dist: six==1.17.0
-Requires-Dist: sws-api-client>=1.5.1
+Requires-Dist: sws-api-client==1.5.3
 Requires-Dist: typing-extensions==4.12.2
 Requires-Dist: tzdata==2025.1
 Requires-Dist: urllib3==1.26.20
pyproject.toml

@@ -4,11 +4,11 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.143"
+version = "0.0.145"
 dependencies = [
     "annotated-types==0.7.0",
-    "boto3==1.36.18",
-    "botocore==1.36.18",
+    "boto3>=1.36.18",
+    "botocore>=1.36.18",
     "certifi==2025.1.31",
     "charset-normalizer==3.4.1",
     "idna==3.10",
@@ -25,7 +25,7 @@ dependencies = [
     "requests==2.32.3",
     "s3transfer==0.11.2",
     "six==1.17.0",
-    "sws_api_client>=1.5.1",
+    "sws_api_client==1.5.3",
     "typing_extensions==4.12.2",
     "tzdata==2025.1",
     "urllib3==1.26.20"
@@ -42,4 +42,4 @@ classifiers = [
 ]
 
 [project.urls]
-Repository = "https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper"
+Repository = "https://github.com/un-fao/fao-sws-it-python-spark-dissemination-helper"
src/sws_spark_dissemination_helper/SWSEasyIcebergSparkHelper.py

@@ -69,6 +69,7 @@ class SWSEasyIcebergSparkHelper:
             self.df_obs_coord,
             self.df_metadata,
             self.df_meta_elem,
+            self.df_tag_observation,
         ) = self.raw_data
 
         (
@@ -78,10 +79,11 @@ class SWSEasyIcebergSparkHelper:
             self.df_meta_elem_type,
             self.df_language,
             self.df_unit_of_measure,
+            self.df_dataset,
             self.dfs_dimension,
         ) = self.raw_reference_data
 
-        self.df_user = self.raw_operational_data
+        (self.df_user, self.df_tag) = self.raw_operational_data
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
@@ -278,6 +280,69 @@ class SWSEasyIcebergSparkHelper:
 
         return df_obs_denorm
 
+    def _gen_denormalized_observation_sql_from_tag(self) -> DataFrame:
+        # ----------------
+        # Prepare dataframes for the joins
+        # ----------------
+
+        select_statement = """
+            o.id,
+            o.value,
+            u.email,
+            o.created_on,
+            o.replaced_on,
+            o.version"""
+
+        from_statement = f"""
+        FROM {self.dataset_tables.OBSERVATION.iceberg_id} o
+        INNER JOIN {self.dataset_tables.TAG_OBSERVATION.iceberg_id} to ON o.id = to.observation
+        INNER JOIN {self.dataset_tables.TAG.iceberg_id} t ON to.tag = t.id
+        INNER JOIN {self.dataset_tables.DATASET.iceberg_id} d ON t.dataset = d.id
+        LEFT JOIN {self.dataset_tables.USER.iceberg_id} u ON u.id = o.created_by
+        LEFT JOIN {self.dataset_tables.OBSERVATION_COORDINATE.iceberg_id} AS oc ON oc.id = o.observation_coordinates"""
+
+        hint_statement = ""
+
+        id_to_flag_col_mapping = {v: k for k, v in self.flag_col_to_id_mapping.items()}
+        for flag_col in self.flag_columns:
+            select_statement += f",\no.{id_to_flag_col_mapping[flag_col]} AS {flag_col}"
+
+        id_to_dim_col_mapping = {v: k for k, v in self.dim_col_to_id_mapping.items()}
+        for i, (dim_col, cl) in enumerate(
+            zip(self.dim_columns_w_time, self.dataset_tables.CODELISTS)
+        ):
+            select_statement += f",\nd{i}.code AS {dim_col}"
+            from_statement += f"\nLEFT JOIN {cl.iceberg_id} d{i} ON d{i}.id = oc.{id_to_dim_col_mapping[dim_col]}"
+            hint_statement = (
+                hint_statement + f", BROADCAST({cl.iceberg_id})"
+                if hint_statement
+                else f"BROADCAST({cl.iceberg_id})"
+            )
+
+        hint_statement = "/*+ " + hint_statement + " */"
+
+        # TODO Add tag name as a parameter
+        where_statement = (
+            f"\nWHERE t.name = '{self.source_tag}' AND d.xml_name = '{self.dataset_id}'"
+        )
+
+        final_query = (
+            "SELECT "
+            + hint_statement
+            + select_statement
+            + from_statement
+            + where_statement
+        )
+        if not self.keep_history:
+            final_query += "\n AND o.replaced_on IS NULL"
+
+        logging.info("Final query for merging observation and observation_coordinares")
+        logging.info(final_query)
+
+        df_obs_denorm = self.spark.sql(final_query)
+
+        return df_obs_denorm
+
     def _gen_denormalized_metadata(self) -> DataFrame:
         """Original query upon which the below computation is based
 
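The new _gen_denormalized_observation_sql_from_tag method assembles a single Spark SQL statement that starts from the observations attached to a tag: it joins observation to tag_observation, tag and dataset, adds the creating user and the observation coordinates, resolves each dimension id to its codelist code (with a BROADCAST hint per codelist), and filters on the source tag name and the dataset xml_name, dropping replaced observations unless history is kept. As a rough sketch of the rendered query only, for a hypothetical two-dimension dataset, every identifier below is invented; the real ones come from DatasetTables and the dataset configuration:

# Illustrative only: database, table, tag, dimension and flag names are made up.
rendered_query = """SELECT /*+ BROADCAST(staging.geographic_area_m49), BROADCAST(staging.time_point_years) */
o.id,
o.value,
u.email,
o.created_on,
o.replaced_on,
o.version,
o.flag_1 AS flag_obs_status,
d0.code AS geographic_area_m49,
d1.code AS time_point_years
FROM staging.example_observation o
INNER JOIN staging.example_tag_observation to ON o.id = to.observation
INNER JOIN staging.tag t ON to.tag = t.id
INNER JOIN staging.dataset d ON t.dataset = d.id
LEFT JOIN staging.user u ON u.id = o.created_by
LEFT JOIN staging.example_observation_coordinate AS oc ON oc.id = o.observation_coordinates
LEFT JOIN staging.geographic_area_m49 d0 ON d0.id = oc.dim_1
LEFT JOIN staging.time_point_years d1 ON d1.id = oc.dim_2
WHERE t.name = 'DISS_2025' AND d.xml_name = 'example_dataset'
 AND o.replaced_on IS NULL"""

print(rendered_query)  # the helper passes the equivalent string to self.spark.sql(...)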
src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py

@@ -195,6 +195,7 @@ class SWSPostgresSparkReader:
             (dataset_tables.OBSERVATION_COORDINATE, "id", 10),
             (dataset_tables.METADATA, "id", 10),
             (dataset_tables.METADATA_ELEMENT, "metadata", 10),
+            (dataset_tables.TAG_OBSERVATION, "tag", 10),
         ]
         return self._import_tables(data_tables)
 
@@ -209,6 +210,7 @@ class SWSPostgresSparkReader:
             dataset_tables.METADATA_ELEMENT_TYPE,
             dataset_tables.LANGUAGE,
             dataset_tables.UNIT_OF_MEASURE,
+            dataset_tables.DATSET,
             *dataset_tables.CODELISTS,
         ]
         return self._import_tables(
@@ -217,17 +219,18 @@
 
     def import_operational_data_tables(
         self, dataset_tables: DatasetTables
-    ) -> DataFrame:
+    ) -> List[DataFrame]:
         # Define and import operational data table without partitioning
        operational_data_tables = [
             (dataset_tables.USER, None, 1),
+            (dataset_tables.TAG, None, 1),
         ]
-        return self._import_tables(operational_data_tables)[0]
+        return self._import_tables(operational_data_tables)
 
     def import_data_reference_data_operational_data(
         self, dataset_tables: DatasetTables
     ) -> Tuple[
-        Tuple[DataFrame, DataFrame, DataFrame, DataFrame],
+        Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame],
         Tuple[
             DataFrame,
             DataFrame,
@@ -235,14 +238,15 @@
             DataFrame,
             DataFrame,
             DataFrame,
+            DataFrame,
             List[DataFrame],
         ],
-        DataFrame,
+        Tuple[DataFrame, DataFrame],
     ]:
         # Import and organize DataFrames into the desired output structure
         data_dfs = self.import_data_tables(dataset_tables)
         reference_data_dfs = self.import_reference_data_tables(dataset_tables)
-        operational_data_df = self.import_operational_data_tables(dataset_tables)
+        operational_data_dfs = self.import_operational_data_tables(dataset_tables)
 
         return (
             tuple(data_dfs),
@@ -250,7 +254,7 @@
                 *reference_data_dfs[:6],
                 reference_data_dfs[6:],
             ),
-            operational_data_df,
+            tuple(operational_data_dfs),
         )
 
     def get_codelist_type_mapping(
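On the reader side, import_operational_data_tables now imports both the user and tag tables and returns the resulting list of DataFrames, so import_data_reference_data_operational_data hands back a (user, tag) pair as its operational element. A minimal, hypothetical sketch of a caller unpacking the new shape, mirroring the unpacking shown above in SWSEasyIcebergSparkHelper (variable names are illustrative; reader and dataset_tables are assumed to exist already):

# Hypothetical caller: `reader` is an SWSPostgresSparkReader and `dataset_tables`
# a DatasetTables instance, both created elsewhere.
raw_data, raw_reference_data, raw_operational_data = (
    reader.import_data_reference_data_operational_data(dataset_tables)
)

# Operational data is now a (user, tag) pair instead of a single user DataFrame.
df_user, df_tag = raw_operational_data
df_tag.show(5)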
src/sws_spark_dissemination_helper/constants.py

@@ -40,7 +40,7 @@ class DatasetDatatables:
         self, id: str, name: str, schema: str, join_columns: List[str] = []
     ):
         self.id = id
-        self.iceberg_id = f"{IcebergDatabases.BRONZE_DATABASE}{id.split('.')[1]}"
+        self.iceberg_id = f"{IcebergDatabases.BRONZE_DATABASE}.{id.split('.')[1]}"
         self.name = name
         self.schema = schema
         self.join_columns = join_columns
@@ -172,6 +172,11 @@ class DatasetTables:
             iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
             schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
         )
+        self.TAG_OBSERVATION = self.__SWSTable(
+            postgres_id=f"{self.__dataset_id}.tag_observation",
+            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_tag_observation",
+            schema="tag BIGINT, observation INT",
+        )
 
         # Reference data
         self.CODELISTS = [
@@ -203,18 +208,21 @@
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
         schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
     )
-
     LANGUAGE = __SWSTable(
         postgres_id="reference_data.language",
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
         schema="id INT, country_code STRING, description STRING",
     )
-
     UNIT_OF_MEASURE = __SWSTable(
         postgres_id="reference_data.unit_of_measure",
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
         schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
     )
+    DATASET = __SWSTable(
+        postgres_id="reference_data.dataset",
+        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.dataset",
+        schema="id INT, xml_name STRING",
+    )
 
     # Operational data
     USER = __SWSTable(
@@ -222,6 +230,11 @@
         iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
         schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
     )
+    TAG = __SWSTable(
+        postgres_id="operational_data.tag",
+        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.tag",
+        schema="id INT, name STRING, reference_date DATE, dataset INT, type STRING, released_ON DATE, released_by INT, properties STRING",
+    )
 
 
 class IcebergTable:
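The constants changes do two things: they fix the bronze iceberg_id, which previously concatenated the database name and table name with no separator, and they register three new staging tables (a per-dataset tag_observation table plus the shared dataset and tag tables). A small sketch of the identifier construction, with made-up database and table names, showing the effect of the added dot:

# Made-up values for illustration; the real names come from IcebergDatabases
# and the SWS PostgreSQL schema.
BRONZE_DATABASE = "iceberg.bronze"
table_id = "operational_data.tag"

# Before the fix the two parts were concatenated directly:
old_iceberg_id = f"{BRONZE_DATABASE}{table_id.split('.')[1]}"   # "iceberg.bronzetag"
# After the fix a dot separates database and table:
new_iceberg_id = f"{BRONZE_DATABASE}.{table_id.split('.')[1]}"  # "iceberg.bronze.tag"

print(old_iceberg_id, new_iceberg_id)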
old_requirements.txt (removed in 0.0.145)

@@ -1,23 +0,0 @@
-annotated-types==0.7.0
-boto3==1.36.18
-botocore==1.36.18
-certifi==2025.1.31
-charset-normalizer==3.4.1
-idna==3.10
-jmespath==1.0.1
-numpy==2.0.2
-pandas==2.2.3
-py4j==0.10.9.7
-pydantic==2.10.6
-pydantic_core==2.27.2
-pyspark==3.5.4
-python-dateutil==2.9.0.post0
-python-dotenv==0.19.2
-pytz==2025.1
-requests==2.32.3
-s3transfer==0.11.2
-six==1.17.0
-sws_api_client==1.4.4
-typing_extensions==4.12.2
-tzdata==2025.1
-urllib3==1.26.20
requirements.txt (removed in 0.0.145)

@@ -1,23 +0,0 @@
-annotated-types==0.7.0
-boto3==1.36.18
-botocore==1.36.18
-certifi==2025.1.31
-charset-normalizer==3.4.1
-idna==3.10
-jmespath==1.0.1
-numpy==2.0.2
-pandas==2.2.3
-py4j==0.10.9.7
-pydantic==2.10.6
-pydantic_core==2.27.2
-pyspark==3.5.4
-python-dateutil==2.9.0.post0
-python-dotenv==0.19.2
-pytz==2025.1
-requests==2.32.3
-s3transfer==0.11.2
-six==1.17.0
-sws_api_client==1.5.0
-typing_extensions==4.12.2
-tzdata==2025.1
-urllib3==1.26.20