sws-spark-dissemination-helper 0.0.86__tar.gz → 0.0.88__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/.gitignore +1 -0
  2. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/PKG-INFO +1 -1
  3. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/pyproject.toml +1 -1
  4. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +75 -0
  5. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +7 -7
  6. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +4 -0
  7. sws_spark_dissemination_helper-0.0.86/.pipeline/bitbucket-pipelines.yml +0 -69
  8. sws_spark_dissemination_helper-0.0.86/.pipeline/pyproject.toml +0 -34
  9. sws_spark_dissemination_helper-0.0.86/.python-version +0 -1
  10. sws_spark_dissemination_helper-0.0.86/.wip/SWSBaseIcebergSparkHelper.py +0 -129
  11. sws_spark_dissemination_helper-0.0.86/.wip/model.py +0 -263
  12. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/LICENSE +0 -0
  13. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/README.md +0 -0
  14. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/old_requirements.txt +0 -0
  15. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/requirements.txt +0 -0
  16. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
  17. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/__init__.py +0 -0
  18. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/constants.py +0 -0
  19. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/utils.py +0 -0
  20. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/tests/__init__.py +0 -0
  21. {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/tests/test.py +0 -0
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/.gitignore
@@ -2,6 +2,7 @@
 # You should customize this list as applicable to your project.
 # Learn more about .gitignore:
 # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
+.*
 
 # Node artifact files
 node_modules/
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.86
+Version: 0.0.88
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sws-spark-dissemination-helper"
-version = "0.0.86"
+version = "0.0.88"
 dependencies = [
     "annotated-types==0.7.0",
     "boto3==1.36.18",
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py
@@ -443,3 +443,78 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Dissemination tags successfully written")
+
+    def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
+        self, dimensions: Dict[str, List[str]]
+    ) -> DataFrame:
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id}.`tag_{self.tag_name}` CREATE OR REPLACE BRANCH `diss_tag_{self.tag_name}`"  # AS OF VERSION `{tag_name}`
+        )
+
+        for dimension_name, codes in dimensions.items():
+            if len(codes) != 0:
+                not_in_codes = ",".join([f"'{code}'" for code in codes])
+                self.spark.sql(
+                    f"DELETE FROM {self.iceberg_tables.BRONZE.iceberg_id}.`branch_diss_tag_{self.tag_name}` WHERE {dimension_name} NOT IN ({not_in_codes})"
+                )
+        disseminated_tag_df = self.spark.read.option("branch", self.tag_name).table(
+            self.iceberg_tables.BRONZE.iceberg_id
+        )
+
+        disseminated_tag_df = disseminated_tag_df.withColumn(
+            "metadata", F.to_json(col("metadata"))
+        ).coalesce(1)
+
+        save_cache_csv(
+            df=disseminated_tag_df,
+            bucket=self.bucket,
+            prefix=f"{self.iceberg_tables.BRONZE.csv_prefix}_disseminated_tag",
+            tag_name=self.tag_name,
+        )
+
+        return disseminated_tag_df
+
+    def write_bronze_sws_filtered_disseminated_tag(self, tags: Tags):
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_bronze_disseminated_tag_iceberg",
+            name=f"{self.domain_code} bronze disseminated tag Iceberg",
+            description="Bronze table containing the raw data imported from the SWS, denormalized and filtered per dimension",
+            layer=TableLayer.BRONZE,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.BRONZE_DATABASE,
+            table=self.iceberg_tables.BRONZE.table,
+            path=self.iceberg_tables.BRONZE.path,
+            structure={
+                "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
+            },
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_iceberg_table
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_csv_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_bronze_disseminated_tag_csv",
+            name=f"{self.domain_code} bronze disseminated tag csv",
+            description="Bronze table containing the raw data imported from the SWS, denormalized and filtered per dimension cached in csv",
+            layer=TableLayer.BRONZE,
+            private=True,
+            type=TableType.CSV,
+            # TODO Correct the path in the origin library
+            path=self.iceberg_tables.BRONZE.csv_path,
+            structure={
+                "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
+            },
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_csv_table
+        )
+
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        logging.info("Bronze Disseminated tag with selection successfully written")
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py
@@ -89,20 +89,20 @@ class SWSGoldIcebergSparkHelper:
     def keep_dim_val_attr_columns(self, df: DataFrame):
         return df.select(*self.cols_to_keep_sws)
 
+    def read_silver_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.SILVER.iceberg_id
+        )
+
     def gen_gold_sws_disseminated_data(self) -> DataFrame:
         return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.SILVER.iceberg_id)
+            self.read_silver_data()
             .transform(self.apply_diss_flag_filter)
             .transform(self.keep_dim_val_attr_columns)
         )
 
     def gen_gold_sws_validated_data(self) -> DataFrame:
-        return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.BRONZE.iceberg_id)
-            .transform(self.keep_dim_val_attr_columns)
-        )
+        return self.read_silver_data().transform(self.keep_dim_val_attr_columns)
 
     def write_gold_sws_validated_data_to_iceberg_and_csv(
         self, df: DataFrame
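The refactor above routes both gold generators through a single `read_silver_data` helper, so `gen_gold_sws_validated_data` now reads the tagged silver table rather than the bronze one. A standalone sketch of the equivalent chain, with the SparkSession, silver table identifier, and column list passed in explicitly (names are illustrative):

```python
from typing import List

from pyspark.sql import DataFrame, SparkSession


def read_silver_data(spark: SparkSession, silver_table: str, tag_name: str) -> DataFrame:
    # Read the Iceberg snapshot labelled by the dissemination tag, as the new method does.
    return spark.read.option("tag", tag_name).table(silver_table)


def gen_gold_sws_validated_data(
    spark: SparkSession, silver_table: str, tag_name: str, cols_to_keep: List[str]
) -> DataFrame:
    # Validated gold data is the tagged silver read with only dim/value/attribute columns kept.
    return read_silver_data(spark, silver_table, tag_name).select(*cols_to_keep)
```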
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py
@@ -110,6 +110,10 @@ class SWSSilverIcebergSparkHelper:
         return self.spark.read.option("tag", self.tag_name).table(
             self.iceberg_tables.BRONZE.iceberg_id
         )
+    def read_bronze_diss_tag_data(self) -> DataFrame:
+        return self.spark.read.option("branch", f"diss_tag_{self.tag_name}").table(
+            self.iceberg_tables.BRONZE.iceberg_id
+        )
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
sws_spark_dissemination_helper-0.0.86/.pipeline/bitbucket-pipelines.yml
@@ -1,69 +0,0 @@
-image: python:3.12 # Choose the base image with Python
-
-pipelines:
-  default:
-    - step:
-        name: Run Tests
-        caches:
-          - pip
-        script:
-          - curl -sSL https://install.python-poetry.org | python3 - # Install Poetry
-          - export PATH="$HOME/.local/bin:$PATH" # Add Poetry to PATH
-          - poetry install # Install your dependencies
-          - poetry run pytest --cov=sws_api_client --cov-report=xml # Run your tests with coverage
-
-  branches:
-    main:
-      - step:
-          name: Run Tests and Version
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 - # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH" # Add Poetry to PATH
-            - poetry install # Install your dependencies
-            - poetry run pytest # Run your tests
-            - export BB_TOKEN=$BITBUCKET_APP_PASSWORD # Set Bitbucket token
-            - export BB_USERNAME=$BITBUCKET_USERNAME # Set Bitbucket username
-            - poetry run semantic-release version --no-vcs-release # Run semantic release
-
-    feature/*:
-      - step:
-          name: Run Tests and Version
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 - # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH" # Add Poetry to PATH
-            - poetry install # Install your dependencies
-            - poetry run pytest # Run your tests
-            - export BB_TOKEN=$BITBUCKET_APP_PASSWORD # Set Bitbucket token
-            - export BB_USERNAME=$BITBUCKET_USERNAME # Set Bitbucket username
-            - poetry run semantic-release version --no-vcs-release --no-tag # Run semantic release
-
-    dev:
-      - step:
-          name: Run Tests and Version
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 - # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH" # Add Poetry to PATH
-            - poetry install # Install your dependencies
-            - poetry run pytest # Run your tests
-            - export BB_TOKEN=$BITBUCKET_APP_PASSWORD # Set Bitbucket token
-            - export BB_USERNAME=$BITBUCKET_USERNAME # Set Bitbucket username
-            - poetry run semantic-release version --no-vcs-release --no-tag # Run semantic release
-
-  tags:
-    v*.*.*: # Trigger on version tags like v1.0.0, v2.1.0, etc.
-      - step:
-          name: Publish to PyPI
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 - # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH" # Add Poetry to PATH
-            - poetry install --no-dev # Install dependencies without dev dependencies
-            - poetry config pypi-token.pypi $PYPI_TOKEN # Set the PyPI token
-            - poetry publish --build # Build and publish the package to PyPI. force is needed because I already uploaded pypi version till 0.0.7
sws_spark_dissemination_helper-0.0.86/.pipeline/pyproject.toml
@@ -1,34 +0,0 @@
-[tool.poetry]
-name = "sws-spark-dissemination-helper"
-version = "0.0.49"
-description = "A Python helper package providing streamlined Spark functions for efficient data dissemination processes"
-authors = ["Daniele Mansillo <danielemansillo@gmail.com>"]
-license = "MIT"
-readme = "README.md"
-
-[tool.poetry.dependencies]
-python = "^3.8"
-annotated-types = "0.7.0"
-boto3 = "1.34.147"
-botocore = "1.34.147"
-certifi = "2024.7.4"
-charset-normalizer = "3.3.2"
-idna = "3.7"
-jmespath = "1.0.1"
-py4j = "0.10.9.7"
-pydantic = "2.8.2"
-pydantic-core = "2.20.1"
-pyspark = "3.5.1"
-python-dateutil = "2.9.0.post0"
-python-dotenv = "1.0.1"
-requests = "2.32.3"
-s3transfer = "0.10.2"
-six = "1.16.0"
-sws-api-client = "1.0.7b0"
-typing-extensions = "4.12.2"
-urllib3 = "1.26.19"
-
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
sws_spark_dissemination_helper-0.0.86/.python-version
@@ -1 +0,0 @@
-3.9.20
sws_spark_dissemination_helper-0.0.86/.wip/SWSBaseIcebergSparkHelper.py
@@ -1,129 +0,0 @@
-import logging
-from copy import copy
-from typing import List, Tuple
-
-import pyspark.sql.functions as F
-from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, lit
-from sws_api_client import Tags
-from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
-
-from .constants import IcebergDatabases, IcebergTables, IcebergTable
-from .SWSPostgresSparkReader import SWSPostgresSparkReader
-from .utils import get_or_create_tag, save_cache_csv
-import boto3
-
-
-class SWSBaseIcebergSparkHelper:
-
-    def _write_to_iceberg(
-        self,
-        df: DataFrame,
-        iceberg_table: IcebergTable,
-        tag_name: str,
-    ) -> DataFrame:
-        # Write to Iceberg
-        df.writeTo(iceberg_table.iceberg_id).createOrReplace()
-        logging.info(f"Table written to {iceberg_table.iceberg_id}")
-
-        # Create a tag in Iceberg if required
-        self.spark.sql(
-            f"ALTER TABLE {iceberg_table.iceberg_id} CREATE TAG `{tag_name}`"
-        )
-        logging.info(f"Tag '{tag_name}' created for {iceberg_table.iceberg_id}")
-
-        return df
-
-    def _write_to_csv(
-        self, df: DataFrame, iceberg_table: IcebergTable, bucket: str, tag_name: str
-    ) -> DataFrame:
-        s3 = boto3.client("s3")
-
-        latest_path = f"s3://{bucket}/{iceberg_table.csv_prefix}/latest"
-        tag_path = f"s3://{bucket}/{iceberg_table.csv_prefix}/{tag_name}"
-
-        latest_prefix = f"{iceberg_table.csv_prefix}/latest"
-        tag_prefix = f"{iceberg_table.csv_prefix}/{tag_name}"
-
-        s3.delete_object(Bucket=bucket, Key=f"{latest_prefix}.csv")
-        df.coalesce(1).write.option("header", True).mode("overwrite").csv(latest_path)
-
-        response = s3.list_objects_v2(Bucket=bucket, Prefix=latest_prefix)
-
-        s3_path_objects_keys = [
-            content["Key"] for content in response.get("Contents", {})
-        ]
-        s3_path_csv = [
-            s3_object
-            for s3_object in s3_path_objects_keys
-            if s3_object.endswith(".csv")
-        ][0]
-
-        # Extract the csv from the folder and delete the folder
-        result_latest = s3.copy_object(
-            Bucket=bucket,
-            CopySource={"Bucket": bucket, "Key": s3_path_csv},
-            Key=f"{latest_prefix}.csv",
-        )
-        logging.info(f"Updated latest version of cached csv at {latest_path}.csv")
-
-        result_tag = s3.copy_object(
-            Bucket=bucket,
-            CopySource={"Bucket": bucket, "Key": s3_path_csv},
-            Key=f"{tag_prefix}.csv",
-        )
-        logging.info(f"Wrote the tag version of cached csv at {tag_path}.csv")
-
-        for object in s3_path_objects_keys:
-            s3.delete_object(Bucket=bucket, Key=object)
-        logging.debug("Cleaning the temporary folder of the csv files")
-
-        return df
-
-    def _create_dissemination_tag(
-        self,
-        df: DataFrame,
-        tags: Tags,
-        iceberg_table: IcebergTable,
-        level: str,
-        domain_code: str,
-        iceberg_description: str,
-        csv_description: str,
-    ) -> Tags:
-        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
-        logging.debug(f"Initial Tag: {tag}")
-
-        # Common table structure
-        base_table_structure = {
-            "layer": level,
-            "private": True,
-            "structure": {"columns": df.schema.jsonValue()["fields"]},
-        }
-
-        # Add Iceberg table to tag
-        iceberg_table = BaseDisseminatedTagTable(
-            id=f"{domain_code}_{iceberg_table.level}_iceberg",
-            name=f"{domain_code} {iceberg_table.level} Iceberg",
-            description=iceberg_description,
-            type=TableType.ICEBERG,
-            database=iceberg_table.database,
-            table=iceberg_table.table,
-            path=iceberg_table.path,
-            **base_table_structure,
-        )
-        tags.add_dissemination_table(self.dataset_id, self.tag_name, iceberg_table)
-        logging.debug(f"Tag with added Iceberg Table: {tag}")
-
-        # Add CSV table to tag
-        csv_table = BaseDisseminatedTagTable(
-            id=f"{domain_code}_{iceberg_table.level}_csv",
-            name=f"{domain_code} {iceberg_table.level} CSV",
-            description=csv_description,
-            type=TableType.CSV,
-            path=iceberg_table.csv_path,
-            **base_table_structure,
-        )
-        tags.add_dissemination_table(self.dataset_id, self.tag_name, csv_table)
-        logging.debug(f"Tag with added CSV Table: {tag}")
-
-        return tag
sws_spark_dissemination_helper-0.0.86/.wip/model.py
@@ -1,263 +0,0 @@
-class IcebergCatalog:
-    def __init__(self, id: str, bucket: str) -> None:
-        self.id = "AwsDataCatalog"
-        self.bucket = bucket
-
-
-class IcebergDatabase:
-    def __init__(self, catalog: IcebergCatalog, id: str, level: str) -> None:
-        self.catalog = catalog
-        self.level = level
-        self.id = id
-        self.database = f"{catalog.id}.{id}"
-
-
-class IcebergTable:
-    def __init__(self, database: IcebergDatabase, id) -> None:
-        self.database = database
-        self.id = id
-        self.iceberg_id = f"{database.catalog.id}.{database.id}.{id}"
-        self.path = f"{database.id}"
-        self.csv_prefix = csv_path.rsplit("/", 1)[0]
-        self.csv_path = csv_path
-
-
-class IcebergDatabases:
-
-    STAGING_SCHEME = "sws_dissemination_tags_bronze"
-    BRONZE_SCHEME = "sws_dissemination_tags_bronze"
-    SILVER_SCHEME = "sws_dissemination_tags_silver"
-    GOLD_SCHEME = "sws_dissemination_tags_gold"
-    STAGING_DATABASE = f"{CATALOG}.{STAGING_SCHEME}"
-    BRONZE_DATABASE = f"{CATALOG}.{BRONZE_SCHEME}"
-    SILVER_DATABASE = f"{CATALOG}.{SILVER_SCHEME}"
-    GOLD_DATABASE = f"{CATALOG}.{GOLD_SCHEME}"
-
-
-class DomainFilters:
-    GENERIC = lambda domain_code: (
-        col("domain").isNull()
-        | (col("domain") == lit(""))
-        | (col("domain") == lit(domain_code))
-    )
-    MATCH = lambda domain_code: (col("domain") == lit(domain_code))
-    EMPTY = lambda: (col("domain").isNull() | (col("domain") == lit("")))
-
-
-class DatasetDatatables:
-
-    class __SWSDatatable:
-        def __init__(self, id: str, name: str, schema: str):
-            self.id = id
-            self.name = name
-            self.schema = schema
-
-    # Dissemination Tables
-    DISSEMINATION_TYPE_LIST = __SWSDatatable(
-        id="datatables.dissemination_{type}_list",
-        name="Dissemination - {type} list",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
-    )
-    DISSEMINATION_EXCEPTIONS = __SWSDatatable(
-        id="datatables.dissemination_exception",
-        name="Dissemination - Exceptions",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, dim1_code STRING, dim2_code STRING, dim3_code STRING, dim4_code STRING, dim5_code STRING, dim6_code STRING, dim7_code STRING, status_flag STRING, method_flag STRING, dissemination BOOLEAN, aggregation BOOLEAN, note STRING",
-    )
-    DISSEMINATION_ITEM_LIST_FAOSTAT = __SWSDatatable(
-        id="datatables.dissemination_item_list_faostat",
-        name="Dissemination - Item list - FAOSTAT",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
-    )
-
-    # Mapping Tables
-    MAPPING_DOMAINS_ID = __SWSDatatable(
-        id="datatables.aggregates_mapping_domains_id",
-        name="Mapping - Domains ID",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, domain_name STRING, sws_source_id STRING, sws_destination_id STRING",
-    )
-    MAPPING_CODELIST_TYPE = __SWSDatatable(
-        id="datatables.mapping_codelist_type",
-        name="Mapping Codelist type",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, col_name STRING, col_type STRING",
-    )
-    MAPPING_CODE_CORRECTION = __SWSDatatable(
-        id="datatables.aggregates_mapping_code_correction",
-        name="Mapping - Code correction",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, old_code STRING, new_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-    MAPPING_SDMX_COLUMN_NAMES = __SWSDatatable(
-        id="datatables.mapping_sdmx_col_names",
-        name="Mapping - SDMX column names",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_name STRING, external_name STRING, delete BOOLEAN, add BOOLEAN, default_value STRING",
-    )
-    MAPPING_SDMX_CODES = __SWSDatatable(
-        id="datatables.mapping_pre_dissemination",
-        name="Mapping - Pre dissemination",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_code STRING, external_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-    MAPPING_UNITS_OF_MEASURE = __SWSDatatable(
-        id="datatables.mapping_units_of_measure",
-        name="Mapping - Units of measure",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_code STRING, external_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-
-    # Non-SWS Sources Tables
-    FAOSTAT_CODE_MAPPING = __SWSDatatable(
-        id="datatables.faostat_code_mapping",
-        name="FAOSTAT Code Mapping",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, faostat_code_type STRING, faostat_code STRING, mapping_type STRING, mapped_code STRING",
-    )
-    FS_INPUT_MAPPING = __SWSDatatable(
-        id="datatables.fs_input_mapping",
-        name="FS Input Mapping",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, DomainCode STRING, Code STRING, Var2Code STRING, Var3Code STRING, Mult INT, 3YrAvgFlag INT, Flag STRING",
-    )
-    HCES_INPUT_MAPPING = __SWSDatatable(
-        id="datatables.hces_input_mapping",
-        name="HCES Input Mapping",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, variable STRING, indicator STRING, element STRING, decimals STRING",
-    )
-
-
-class DatasetTables:
-    class __SWSTable:
-        def __init__(self, postgres_id: str, iceberg_id: str, schema: str):
-            self.postgres_id = postgres_id
-            self.iceberg_id = iceberg_id
-            self.schema = schema
-
-    def __get_obs_coord_schema(self) -> str:
-
-        obs_coord_schema_prefix = (
-            "id BIGINT, approved_observation BIGINT, num_version BIGINT, "
-        )
-
-        dimensions_schema = (
-            " INT, ".join(self.__dataset_details["dimensionColumns"]) + " INT"
-        )
-
-        return obs_coord_schema_prefix + dimensions_schema
-
-    def __init__(self, dataset_id: str, dataset_details: dict) -> None:
-        self.__dataset_id = dataset_id
-        self.__dataset_details = dataset_details
-
-        # Data
-        self.OBSERVATION = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.observation",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_observation",
-            schema="id BIGINT, observation_coordinates BIGINT, version INT, value FLOAT, flag_obs_status STRING, flag_method STRING, created_on TIMESTAMP, created_by INT, replaced_on TIMESTAMP",
-        )
-        self.OBSERVATION_COORDINATE = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.observation_coordinate",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_observation_coordinate",
-            schema=self.__get_obs_coord_schema(),
-        )
-        self.METADATA = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.metadata",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata",
-            schema="id BIGINT, observation BIGINT, metadata_type INT, language INT, copy_metadata BIGINT",
-        )
-        self.METADATA_ELEMENT = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.metadata_element",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
-            schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
-        )
-
-        # Reference data
-        self.CODELISTS = [
-            self.__SWSTable(
-                postgres_id=dimension["codelist"]["table"],
-                iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{dimension['codelist']['table'].split('.')[1]}",
-                schema=CODELIST_SCHEMA,
-            )
-            for dimension in dataset_details["dimensions"]
-        ]
-
-    FLAG_METHOD = __SWSTable(
-        postgres_id="reference_data.flag_method",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.flag_method",
-        schema=FLAGLIST_SCHEMA,
-    )
-    FLAG_OBS_STATUS = __SWSTable(
-        postgres_id="reference_data.flag_obs_status",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.flag_obs_status",
-        schema=FLAGLIST_SCHEMA,
-    )
-    METADATA_TYPE = __SWSTable(
-        postgres_id="reference_data.metadata_type",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_type",
-        schema="id INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN",
-    )
-    METADATA_ELEMENT_TYPE = __SWSTable(
-        postgres_id="reference_data.metadata_element_type",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
-        schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
-    )
-
-    LANGUAGE = __SWSTable(
-        postgres_id="reference_data.language",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
-        schema="id INT, country_code STRING, description STRING",
-    )
-
-    UNIT_OF_MEASURE = __SWSTable(
-        postgres_id="reference_data.unit_of_measure",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
-        schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
-    )
-
-    # Operational data
-    USER = __SWSTable(
-        postgres_id="operational_data.user",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
-        schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
-    )
-
-
-class IcebergTable:
-    def __init__(self, level: str, iceberg_id: str, path: str, csv_path: str):
-        self.level = level.lower()
-        self.iceberg_id = iceberg_id
-        self.table = iceberg_id.split(".")[-1]
-        self.path = path
-        self.csv_prefix = csv_path.rsplit("/", 1)[0]
-        self.csv_path = csv_path
-
-
-class IcebergTables:
-
-    def __init__(self, dataset_id: str, tag_name: str, domain: str = "") -> None:
-        self.__dataset_id = dataset_id
-        self.__tag_name = tag_name
-
-        self.BRONZE = self._create_iceberg_table("BRONZE")
-        self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
-
-        # GOLD tables with specific suffixes
-        self.GOLD_SDMX = self._create_iceberg_table(
-            "GOLD", prefix=domain, suffix="sdmx_disseminated"
-        )
-        self.GOLD_SWS_VALIDATED = self._create_iceberg_table(
-            "GOLD", prefix=domain, suffix="sws_validated"
-        )
-        self.GOLD_SWS_DISSEMINATED = self._create_iceberg_table(
-            "GOLD", prefix=domain, suffix="sws_disseminated"
-        )
-
-    def _create_iceberg_table(
-        self, level: str, prefix: str = "", suffix: str = ""
-    ) -> IcebergTable:
-        database = getattr(IcebergDatabases, f"{level}_DATABASE")
-        scheme = getattr(IcebergDatabases, f"{level}_SCHEME")
-
-        if prefix != "":
-            prefix = f"{prefix}_".lower()
-        if suffix != "":
-            suffix = f"_{suffix}".lower()
-
-        iceberg_id = f"{database}.{prefix}{self.__dataset_id}{suffix}"
-        path = f"{scheme}/{prefix}{self.__dataset_id}{suffix}"
-        csv_path = f"{CACHED_CSV_FOLDER}/{scheme}/{prefix}{self.__dataset_id}{suffix}/{self.__tag_name}.csv"
-
-        return IcebergTable(level, iceberg_id, path, csv_path)
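For orientation, the removed WIP `IcebergTables._create_iceberg_table` composed table identifiers and paths from the level's database and scheme plus an optional domain prefix and suffix. A self-contained sketch of that naming logic, with illustrative values for `CATALOG` and `CACHED_CSV_FOLDER` (both left undefined in the WIP file):

```python
CATALOG = "AwsDataCatalog"        # illustrative; not defined in the removed file
CACHED_CSV_FOLDER = "cached_csv"  # illustrative; not defined in the removed file

GOLD_SCHEME = "sws_dissemination_tags_gold"
GOLD_DATABASE = f"{CATALOG}.{GOLD_SCHEME}"


def gold_sdmx_ids(dataset_id: str, tag_name: str, domain: str = "") -> dict:
    # Mirrors _create_iceberg_table("GOLD", prefix=domain, suffix="sdmx_disseminated").
    prefix = f"{domain}_".lower() if domain else ""
    name = f"{prefix}{dataset_id}_sdmx_disseminated"
    return {
        "iceberg_id": f"{GOLD_DATABASE}.{name}",
        "path": f"{GOLD_SCHEME}/{name}",
        "csv_path": f"{CACHED_CSV_FOLDER}/{GOLD_SCHEME}/{name}/{tag_name}.csv",
    }


# e.g. gold_sdmx_ids("aproduction", "2024.1", "QCL")["iceberg_id"]
# -> "AwsDataCatalog.sws_dissemination_tags_gold.qcl_aproduction_sdmx_disseminated"
```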