sws-spark-dissemination-helper 0.0.86__tar.gz → 0.0.88__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/.gitignore +1 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/PKG-INFO +1 -1
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/pyproject.toml +1 -1
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py +75 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py +7 -7
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py +4 -0
- sws_spark_dissemination_helper-0.0.86/.pipeline/bitbucket-pipelines.yml +0 -69
- sws_spark_dissemination_helper-0.0.86/.pipeline/pyproject.toml +0 -34
- sws_spark_dissemination_helper-0.0.86/.python-version +0 -1
- sws_spark_dissemination_helper-0.0.86/.wip/SWSBaseIcebergSparkHelper.py +0 -129
- sws_spark_dissemination_helper-0.0.86/.wip/model.py +0 -263
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/LICENSE +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/README.md +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/old_requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/requirements.txt +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSPostgresSparkReader.py +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/constants.py +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/utils.py +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/tests/__init__.py +0 -0
- {sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/tests/test.py +0 -0
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sws-spark-dissemination-helper
-Version: 0.0.86
+Version: 0.0.88
 Summary: A Python helper package providing streamlined Spark functions for efficient data dissemination processes
 Project-URL: Repository, https://bitbucket.org/cioapps/sws-it-python-spark-dissemination-helper
 Author-email: Daniele Mansillo <danielemansillo@gmail.com>
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSBronzeIcebergSparkHelper.py CHANGED
@@ -443,3 +443,78 @@ class SWSBronzeIcebergSparkHelper:
         logging.debug(f"Tag with Added csv Table: {tag}")
 
         logging.info("Bronze Dissemination tags successfully written")
+
+    def write_bronze_disseminated_tag_data_to_iceberg_and_csv(
+        self, dimensions: Dict[str, List[str]]
+    ) -> DataFrame:
+        self.spark.sql(
+            f"ALTER TABLE {self.iceberg_tables.BRONZE.iceberg_id}.`tag_{self.tag_name}` CREATE OR REPLACE BRANCH `diss_tag_{self.tag_name}`"  # AS OF VERSION `{tag_name}`
+        )
+
+        for dimension_name, codes in dimensions.items():
+            if len(codes) != 0:
+                not_in_codes = ",".join([f"'{code}'" for code in codes])
+                self.spark.sql(
+                    f"DELETE FROM {self.iceberg_tables.BRONZE.iceberg_id}.`branch_diss_tag_{self.tag_name}` WHERE {dimension_name} NOT IN ({not_in_codes})"
+                )
+        disseminated_tag_df = self.spark.read.option("branch", self.tag_name).table(
+            self.iceberg_tables.BRONZE.iceberg_id
+        )
+
+        disseminated_tag_df = disseminated_tag_df.withColumn(
+            "metadata", F.to_json(col("metadata"))
+        ).coalesce(1)
+
+        save_cache_csv(
+            df=disseminated_tag_df,
+            bucket=self.bucket,
+            prefix=f"{self.iceberg_tables.BRONZE.csv_prefix}_disseminated_tag",
+            tag_name=self.tag_name,
+        )
+
+        return disseminated_tag_df
+
+    def write_bronze_sws_filtered_disseminated_tag(self, tags: Tags):
+        # Get or create a new tag
+        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
+        logging.debug(f"Tag: {tag}")
+
+        new_iceberg_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_bronze_disseminated_tag_iceberg",
+            name=f"{self.domain_code} bronze disseminated tag Iceberg",
+            description="Bronze table containing the raw data imported from the SWS, denormalized and filtered per dimension",
+            layer=TableLayer.BRONZE,
+            private=True,
+            type=TableType.ICEBERG,
+            database=IcebergDatabases.BRONZE_DATABASE,
+            table=self.iceberg_tables.BRONZE.table,
+            path=self.iceberg_tables.BRONZE.path,
+            structure={
+                "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
+            },
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_iceberg_table
+        )
+        logging.debug(f"Tag with Added Iceberg Table: {tag}")
+
+        new_csv_table = BaseDisseminatedTagTable(
+            id=f"{self.domain_code.lower()}_bronze_disseminated_tag_csv",
+            name=f"{self.domain_code} bronze disseminated tag csv",
+            description="Bronze table containing the raw data imported from the SWS, denormalized and filtered per dimension cached in csv",
+            layer=TableLayer.BRONZE,
+            private=True,
+            type=TableType.CSV,
+            # TODO Correct the path in the origin library
+            path=self.iceberg_tables.BRONZE.csv_path,
+            structure={
+                "columns": self.disseminated_tag_df.schema.jsonValue()["fields"]
+            },
+        )
+        tag = tags.add_dissemination_table(
+            self.dataset_id, self.tag_name, new_csv_table
+        )
+
+        logging.debug(f"Tag with Added csv Table: {tag}")
+
+        logging.info("Bronze Disseminated tag with selection successfully written")
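
The new write_bronze_disseminated_tag_data_to_iceberg_and_csv method relies on Iceberg branching to filter without rewriting history: it forks a diss_tag_* branch, deletes rows whose dimension codes fall outside the selection on that branch only, then reads the result back and caches it as a single CSV. Below is a minimal standalone sketch of the same branch-filter-read cycle; the catalog, table, branch, and column names are illustrative, and it assumes a SparkSession already configured with an Iceberg catalog.

# Minimal sketch of the branch-filter-read cycle used above.
# Assumes an Iceberg-enabled SparkSession; all names are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

table = "my_catalog.bronze_db.observations"
branch = "diss_tag_2024_release"

# Fork a mutable branch so filtering never touches the main table history.
spark.sql(f"ALTER TABLE {table} CREATE OR REPLACE BRANCH `{branch}`")

# Delete everything outside the selected codes, on the branch only.
keep_codes = ",".join(f"'{code}'" for code in ["012", "015", "027"])
spark.sql(
    f"DELETE FROM {table}.`branch_{branch}` "
    f"WHERE geographic_area NOT IN ({keep_codes})"
)

# Read the filtered snapshot back through the branch read option.
filtered_df = spark.read.option("branch", branch).table(table)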
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSGoldIcebergSparkHelper.py CHANGED
@@ -89,20 +89,20 @@ class SWSGoldIcebergSparkHelper:
     def keep_dim_val_attr_columns(self, df: DataFrame):
         return df.select(*self.cols_to_keep_sws)
 
+    def read_silver_data(self) -> DataFrame:
+        return self.spark.read.option("tag", self.tag_name).table(
+            self.iceberg_tables.SILVER.iceberg_id
+        )
+
     def gen_gold_sws_disseminated_data(self) -> DataFrame:
         return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.SILVER.iceberg_id)
+            self.read_silver_data()
             .transform(self.apply_diss_flag_filter)
             .transform(self.keep_dim_val_attr_columns)
         )
 
     def gen_gold_sws_validated_data(self) -> DataFrame:
-        return (
-            self.spark.read.option("tag", self.tag_name)
-            .table(self.iceberg_tables.BRONZE.iceberg_id)
-            .transform(self.keep_dim_val_attr_columns)
-        )
+        return self.read_silver_data().transform(self.keep_dim_val_attr_columns)
 
     def write_gold_sws_validated_data_to_iceberg_and_csv(
         self, df: DataFrame
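
This hunk extracts the duplicated tag-pinned read into a single read_silver_data helper; in the process, gen_gold_sws_validated_data switches its source from the BRONZE table to the SILVER one. For context, here is a minimal sketch of what a tag-pinned Iceberg read does, with illustrative names: the "tag" option fixes the scan to the snapshot the tag points at, so later writes to the table do not change the result.

# Minimal sketch: reading an Iceberg table as of a named tag.
# Assumes an Iceberg-enabled SparkSession; names are illustrative.
from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.getOrCreate()

def read_at_tag(table: str, tag: str) -> DataFrame:
    # Pins the scan to the snapshot referenced by the tag, giving
    # repeatable reads even while the table keeps receiving writes.
    return spark.read.option("tag", tag).table(table)

silver_df = read_at_tag("my_catalog.silver_db.observations", "2024_release")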
{sws_spark_dissemination_helper-0.0.86 → sws_spark_dissemination_helper-0.0.88}/src/sws_spark_dissemination_helper/SWSSilverIcebergSparkHelper.py CHANGED
@@ -110,6 +110,10 @@ class SWSSilverIcebergSparkHelper:
         return self.spark.read.option("tag", self.tag_name).table(
             self.iceberg_tables.BRONZE.iceberg_id
         )
+    def read_bronze_diss_tag_data(self) -> DataFrame:
+        return self.spark.read.option("branch", f"diss_tag_{self.tag_name}").table(
+            self.iceberg_tables.BRONZE.iceberg_id
+        )
 
     def _get_dim_time_flag_columns(self) -> Tuple[List[str], List[str], str, List[str]]:
         """Extract the dimension columns with time, without time, the time column and the flag columns names."""
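
The new read_bronze_diss_tag_data reads through a branch rather than a tag. Tags are immutable named snapshots, while branches are independent, writable lines of history, which is what lets the bronze helper above delete rows on diss_tag_* without touching the tagged data. A short sketch contrasting the two read paths, with illustrative names:

# Sketch: tag reads vs branch reads on the same Iceberg table.
# Assumes an Iceberg-enabled SparkSession; names are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
bronze = "my_catalog.bronze_db.observations"

# Immutable: always the exact snapshot the tag was created on.
tagged_df = spark.read.option("tag", "2024_release").table(bronze)

# Mutable: reflects any deletes/appends made on the branch since the fork.
branch_df = spark.read.option("branch", "diss_tag_2024_release").table(bronze)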
sws_spark_dissemination_helper-0.0.86/.pipeline/bitbucket-pipelines.yml DELETED
@@ -1,69 +0,0 @@
-image: python:3.12  # Choose the base image with Python
-
-pipelines:
-  default:
-    - step:
-        name: Run Tests
-        caches:
-          - pip
-        script:
-          - curl -sSL https://install.python-poetry.org | python3 -  # Install Poetry
-          - export PATH="$HOME/.local/bin:$PATH"  # Add Poetry to PATH
-          - poetry install  # Install your dependencies
-          - poetry run pytest --cov=sws_api_client --cov-report=xml  # Run your tests with coverage
-
-  branches:
-    main:
-      - step:
-          name: Run Tests and Version
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 -  # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH"  # Add Poetry to PATH
-            - poetry install  # Install your dependencies
-            - poetry run pytest  # Run your tests
-            - export BB_TOKEN=$BITBUCKET_APP_PASSWORD  # Set Bitbucket token
-            - export BB_USERNAME=$BITBUCKET_USERNAME  # Set Bitbucket username
-            - poetry run semantic-release version --no-vcs-release  # Run semantic release
-
-    feature/*:
-      - step:
-          name: Run Tests and Version
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 -  # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH"  # Add Poetry to PATH
-            - poetry install  # Install your dependencies
-            - poetry run pytest  # Run your tests
-            - export BB_TOKEN=$BITBUCKET_APP_PASSWORD  # Set Bitbucket token
-            - export BB_USERNAME=$BITBUCKET_USERNAME  # Set Bitbucket username
-            - poetry run semantic-release version --no-vcs-release --no-tag  # Run semantic release
-
-    dev:
-      - step:
-          name: Run Tests and Version
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 -  # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH"  # Add Poetry to PATH
-            - poetry install  # Install your dependencies
-            - poetry run pytest  # Run your tests
-            - export BB_TOKEN=$BITBUCKET_APP_PASSWORD  # Set Bitbucket token
-            - export BB_USERNAME=$BITBUCKET_USERNAME  # Set Bitbucket username
-            - poetry run semantic-release version --no-vcs-release --no-tag  # Run semantic release
-
-  tags:
-    v*.*.*:  # Trigger on version tags like v1.0.0, v2.1.0, etc.
-      - step:
-          name: Publish to PyPI
-          caches:
-            - pip
-          script:
-            - curl -sSL https://install.python-poetry.org | python3 -  # Install Poetry
-            - export PATH="$HOME/.local/bin:$PATH"  # Add Poetry to PATH
-            - poetry install --no-dev  # Install dependencies without dev dependencies
-            - poetry config pypi-token.pypi $PYPI_TOKEN  # Set the PyPI token
-            - poetry publish --build  # Build and publish the package to PyPI. force is needed because I already uploaded pypi version till 0.0.7
sws_spark_dissemination_helper-0.0.86/.pipeline/pyproject.toml DELETED
@@ -1,34 +0,0 @@
-[tool.poetry]
-name = "sws-spark-dissemination-helper"
-version = "0.0.49"
-description = "A Python helper package providing streamlined Spark functions for efficient data dissemination processes"
-authors = ["Daniele Mansillo <danielemansillo@gmail.com>"]
-license = "MIT"
-readme = "README.md"
-
-[tool.poetry.dependencies]
-python = "^3.8"
-annotated-types = "0.7.0"
-boto3 = "1.34.147"
-botocore = "1.34.147"
-certifi = "2024.7.4"
-charset-normalizer = "3.3.2"
-idna = "3.7"
-jmespath = "1.0.1"
-py4j = "0.10.9.7"
-pydantic = "2.8.2"
-pydantic-core = "2.20.1"
-pyspark = "3.5.1"
-python-dateutil = "2.9.0.post0"
-python-dotenv = "1.0.1"
-requests = "2.32.3"
-s3transfer = "0.10.2"
-six = "1.16.0"
-sws-api-client = "1.0.7b0"
-typing-extensions = "4.12.2"
-urllib3 = "1.26.19"
-
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
sws_spark_dissemination_helper-0.0.86/.python-version DELETED
@@ -1 +0,0 @@
-3.9.20
sws_spark_dissemination_helper-0.0.86/.wip/SWSBaseIcebergSparkHelper.py DELETED
@@ -1,129 +0,0 @@
-import logging
-from copy import copy
-from typing import List, Tuple
-
-import pyspark.sql.functions as F
-from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import col, lit
-from sws_api_client import Tags
-from sws_api_client.tags import BaseDisseminatedTagTable, TableLayer, TableType
-
-from .constants import IcebergDatabases, IcebergTables, IcebergTable
-from .SWSPostgresSparkReader import SWSPostgresSparkReader
-from .utils import get_or_create_tag, save_cache_csv
-import boto3
-
-
-class SWSBaseIcebergSparkHelper:
-
-    def _write_to_iceberg(
-        self,
-        df: DataFrame,
-        iceberg_table: IcebergTable,
-        tag_name: str,
-    ) -> DataFrame:
-        # Write to Iceberg
-        df.writeTo(iceberg_table.iceberg_id).createOrReplace()
-        logging.info(f"Table written to {iceberg_table.iceberg_id}")
-
-        # Create a tag in Iceberg if required
-        self.spark.sql(
-            f"ALTER TABLE {iceberg_table.iceberg_id} CREATE TAG `{tag_name}`"
-        )
-        logging.info(f"Tag '{tag_name}' created for {iceberg_table.iceberg_id}")
-
-        return df
-
-    def _write_to_csv(
-        self, df: DataFrame, iceberg_table: IcebergTable, bucket: str, tag_name: str
-    ) -> DataFrame:
-        s3 = boto3.client("s3")
-
-        latest_path = f"s3://{bucket}/{iceberg_table.csv_prefix}/latest"
-        tag_path = f"s3://{bucket}/{iceberg_table.csv_prefix}/{tag_name}"
-
-        latest_prefix = f"{iceberg_table.csv_prefix}/latest"
-        tag_prefix = f"{iceberg_table.csv_prefix}/{tag_name}"
-
-        s3.delete_object(Bucket=bucket, Key=f"{latest_prefix}.csv")
-        df.coalesce(1).write.option("header", True).mode("overwrite").csv(latest_path)
-
-        response = s3.list_objects_v2(Bucket=bucket, Prefix=latest_prefix)
-
-        s3_path_objects_keys = [
-            content["Key"] for content in response.get("Contents", {})
-        ]
-        s3_path_csv = [
-            s3_object
-            for s3_object in s3_path_objects_keys
-            if s3_object.endswith(".csv")
-        ][0]
-
-        # Extract the csv from the folder and delete the folder
-        result_latest = s3.copy_object(
-            Bucket=bucket,
-            CopySource={"Bucket": bucket, "Key": s3_path_csv},
-            Key=f"{latest_prefix}.csv",
-        )
-        logging.info(f"Updated latest version of cached csv at {latest_path}.csv")
-
-        result_tag = s3.copy_object(
-            Bucket=bucket,
-            CopySource={"Bucket": bucket, "Key": s3_path_csv},
-            Key=f"{tag_prefix}.csv",
-        )
-        logging.info(f"Wrote the tag version of cached csv at {tag_path}.csv")
-
-        for object in s3_path_objects_keys:
-            s3.delete_object(Bucket=bucket, Key=object)
-        logging.debug("Cleaning the temporary folder of the csv files")
-
-        return df
-
-    def _create_dissemination_tag(
-        self,
-        df: DataFrame,
-        tags: Tags,
-        iceberg_table: IcebergTable,
-        level: str,
-        domain_code: str,
-        iceberg_description: str,
-        csv_description: str,
-    ) -> Tags:
-        tag = get_or_create_tag(tags, self.dataset_id, self.tag_name, self.tag_name)
-        logging.debug(f"Initial Tag: {tag}")
-
-        # Common table structure
-        base_table_structure = {
-            "layer": level,
-            "private": True,
-            "structure": {"columns": df.schema.jsonValue()["fields"]},
-        }
-
-        # Add Iceberg table to tag
-        iceberg_table = BaseDisseminatedTagTable(
-            id=f"{domain_code}_{iceberg_table.level}_iceberg",
-            name=f"{domain_code} {iceberg_table.level} Iceberg",
-            description=iceberg_description,
-            type=TableType.ICEBERG,
-            database=iceberg_table.database,
-            table=iceberg_table.table,
-            path=iceberg_table.path,
-            **base_table_structure,
-        )
-        tags.add_dissemination_table(self.dataset_id, self.tag_name, iceberg_table)
-        logging.debug(f"Tag with added Iceberg Table: {tag}")
-
-        # Add CSV table to tag
-        csv_table = BaseDisseminatedTagTable(
-            id=f"{domain_code}_{iceberg_table.level}_csv",
-            name=f"{domain_code} {iceberg_table.level} CSV",
-            description=csv_description,
-            type=TableType.CSV,
-            path=iceberg_table.csv_path,
-            **base_table_structure,
-        )
-        tags.add_dissemination_table(self.dataset_id, self.tag_name, csv_table)
-        logging.debug(f"Tag with added CSV Table: {tag}")
-
-        return tag
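
The deleted WIP helper's _write_to_csv captures the standard workaround for Spark writing CSVs as a directory of part files: coalesce to one partition, write to a scratch prefix, promote the lone part-*.csv to a stable key with copy_object, then delete the scratch objects. A condensed sketch of that promotion step, with an illustrative bucket and keys:

# Sketch: promote Spark's single part-*.csv file to a stable S3 key.
# Assumes AWS credentials in the environment; names are illustrative.
import boto3

def promote_single_csv(bucket: str, scratch_prefix: str, final_key: str) -> None:
    s3 = boto3.client("s3")
    # Spark wrote a directory of part files; find the lone CSV among them.
    contents = s3.list_objects_v2(Bucket=bucket, Prefix=scratch_prefix).get(
        "Contents", []
    )
    keys = [obj["Key"] for obj in contents]
    part_csv = next(key for key in keys if key.endswith(".csv"))
    # Copy it to the stable key, then clean up the scratch directory.
    s3.copy_object(
        Bucket=bucket,
        CopySource={"Bucket": bucket, "Key": part_csv},
        Key=final_key,
    )
    for key in keys:
        s3.delete_object(Bucket=bucket, Key=key)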
sws_spark_dissemination_helper-0.0.86/.wip/model.py DELETED
@@ -1,263 +0,0 @@
-class IcebergCatalog:
-    def __init__(self, id: str, bucket: str) -> None:
-        self.id = "AwsDataCatalog"
-        self.bucket = bucket
-
-
-class IcebergDatabase:
-    def __init__(self, catalog: IcebergCatalog, id: str, level: str) -> None:
-        self.catalog = catalog
-        self.level = level
-        self.id = id
-        self.database = f"{catalog.id}.{id}"
-
-
-class IcebergTable:
-    def __init__(self, database: IcebergDatabase, id) -> None:
-        self.database = database
-        self.id = id
-        self.iceberg_id = f"{database.catalog.id}.{database.id}.{id}"
-        self.path = f"{database.id}"
-        self.csv_prefix = csv_path.rsplit("/", 1)[0]
-        self.csv_path = csv_path
-
-
-class IcebergDatabases:
-
-    STAGING_SCHEME = "sws_dissemination_tags_bronze"
-    BRONZE_SCHEME = "sws_dissemination_tags_bronze"
-    SILVER_SCHEME = "sws_dissemination_tags_silver"
-    GOLD_SCHEME = "sws_dissemination_tags_gold"
-    STAGING_DATABASE = f"{CATALOG}.{STAGING_SCHEME}"
-    BRONZE_DATABASE = f"{CATALOG}.{BRONZE_SCHEME}"
-    SILVER_DATABASE = f"{CATALOG}.{SILVER_SCHEME}"
-    GOLD_DATABASE = f"{CATALOG}.{GOLD_SCHEME}"
-
-
-class DomainFilters:
-    GENERIC = lambda domain_code: (
-        col("domain").isNull()
-        | (col("domain") == lit(""))
-        | (col("domain") == lit(domain_code))
-    )
-    MATCH = lambda domain_code: (col("domain") == lit(domain_code))
-    EMPTY = lambda: (col("domain").isNull() | (col("domain") == lit("")))
-
-
-class DatasetDatatables:
-
-    class __SWSDatatable:
-        def __init__(self, id: str, name: str, schema: str):
-            self.id = id
-            self.name = name
-            self.schema = schema
-
-    # Dissemination Tables
-    DISSEMINATION_TYPE_LIST = __SWSDatatable(
-        id="datatables.dissemination_{type}_list",
-        name="Dissemination - {type} list",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
-    )
-    DISSEMINATION_EXCEPTIONS = __SWSDatatable(
-        id="datatables.dissemination_exception",
-        name="Dissemination - Exceptions",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, dim1_code STRING, dim2_code STRING, dim3_code STRING, dim4_code STRING, dim5_code STRING, dim6_code STRING, dim7_code STRING, status_flag STRING, method_flag STRING, dissemination BOOLEAN, aggregation BOOLEAN, note STRING",
-    )
-    DISSEMINATION_ITEM_LIST_FAOSTAT = __SWSDatatable(
-        id="datatables.dissemination_item_list_faostat",
-        name="Dissemination - Item list - FAOSTAT",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, code STRING, name STRING, aggregation_type STRING, dissemination BOOLEAN, aggregation BOOLEAN",
-    )
-
-    # Mapping Tables
-    MAPPING_DOMAINS_ID = __SWSDatatable(
-        id="datatables.aggregates_mapping_domains_id",
-        name="Mapping - Domains ID",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, domain_name STRING, sws_source_id STRING, sws_destination_id STRING",
-    )
-    MAPPING_CODELIST_TYPE = __SWSDatatable(
-        id="datatables.mapping_codelist_type",
-        name="Mapping Codelist type",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, col_name STRING, col_type STRING",
-    )
-    MAPPING_CODE_CORRECTION = __SWSDatatable(
-        id="datatables.aggregates_mapping_code_correction",
-        name="Mapping - Code correction",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, old_code STRING, new_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-    MAPPING_SDMX_COLUMN_NAMES = __SWSDatatable(
-        id="datatables.mapping_sdmx_col_names",
-        name="Mapping - SDMX column names",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_name STRING, external_name STRING, delete BOOLEAN, add BOOLEAN, default_value STRING",
-    )
-    MAPPING_SDMX_CODES = __SWSDatatable(
-        id="datatables.mapping_pre_dissemination",
-        name="Mapping - Pre dissemination",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_code STRING, external_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-    MAPPING_UNITS_OF_MEASURE = __SWSDatatable(
-        id="datatables.mapping_units_of_measure",
-        name="Mapping - Units of measure",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, domain STRING, internal_code STRING, external_code STRING, var_type STRING, delete BOOLEAN, multiplier FLOAT, mapping_type STRING",
-    )
-
-    # Non-SWS Sources Tables
-    FAOSTAT_CODE_MAPPING = __SWSDatatable(
-        id="datatables.faostat_code_mapping",
-        name="FAOSTAT Code Mapping",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, faostat_code_type STRING, faostat_code STRING, mapping_type STRING, mapped_code STRING",
-    )
-    FS_INPUT_MAPPING = __SWSDatatable(
-        id="datatables.fs_input_mapping",
-        name="FS Input Mapping",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, DomainCode STRING, Code STRING, Var2Code STRING, Var3Code STRING, Mult INT, 3YrAvgFlag INT, Flag STRING",
-    )
-    HCES_INPUT_MAPPING = __SWSDatatable(
-        id="datatables.hces_input_mapping",
-        name="HCES Input Mapping",
-        schema=f"{DATATABLE_COLUMNS_SCHEMA}, variable STRING, indicator STRING, element STRING, decimals STRING",
-    )
-
-
-class DatasetTables:
-    class __SWSTable:
-        def __init__(self, postgres_id: str, iceberg_id: str, schema: str):
-            self.postgres_id = postgres_id
-            self.iceberg_id = iceberg_id
-            self.schema = schema
-
-    def __get_obs_coord_schema(self) -> str:
-
-        obs_coord_schema_prefix = (
-            "id BIGINT, approved_observation BIGINT, num_version BIGINT, "
-        )
-
-        dimensions_schema = (
-            " INT, ".join(self.__dataset_details["dimensionColumns"]) + " INT"
-        )
-
-        return obs_coord_schema_prefix + dimensions_schema
-
-    def __init__(self, dataset_id: str, dataset_details: dict) -> None:
-        self.__dataset_id = dataset_id
-        self.__dataset_details = dataset_details
-
-        # Data
-        self.OBSERVATION = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.observation",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_observation",
-            schema="id BIGINT, observation_coordinates BIGINT, version INT, value FLOAT, flag_obs_status STRING, flag_method STRING, created_on TIMESTAMP, created_by INT, replaced_on TIMESTAMP",
-        )
-        self.OBSERVATION_COORDINATE = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.observation_coordinate",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_observation_coordinate",
-            schema=self.__get_obs_coord_schema(),
-        )
-        self.METADATA = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.metadata",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata",
-            schema="id BIGINT, observation BIGINT, metadata_type INT, language INT, copy_metadata BIGINT",
-        )
-        self.METADATA_ELEMENT = self.__SWSTable(
-            postgres_id=f"{self.__dataset_id}.metadata_element",
-            iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{self.__dataset_id}_metadata_element",
-            schema="id BIGINT, metadata INT, metadata_element_type INT, value STRING",
-        )
-
-        # Reference data
-        self.CODELISTS = [
-            self.__SWSTable(
-                postgres_id=dimension["codelist"]["table"],
-                iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.{dimension['codelist']['table'].split('.')[1]}",
-                schema=CODELIST_SCHEMA,
-            )
-            for dimension in dataset_details["dimensions"]
-        ]
-
-    FLAG_METHOD = __SWSTable(
-        postgres_id="reference_data.flag_method",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.flag_method",
-        schema=FLAGLIST_SCHEMA,
-    )
-    FLAG_OBS_STATUS = __SWSTable(
-        postgres_id="reference_data.flag_obs_status",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.flag_obs_status",
-        schema=FLAGLIST_SCHEMA,
-    )
-    METADATA_TYPE = __SWSTable(
-        postgres_id="reference_data.metadata_type",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_type",
-        schema="id INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN",
-    )
-    METADATA_ELEMENT_TYPE = __SWSTable(
-        postgres_id="reference_data.metadata_element_type",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.metadata_element_type",
-        schema="id INT, metadata_type INT, code STRING, description STRING, mandatory BOOLEAN, repeatable BOOLEAN, private BOOLEAN",
-    )
-
-    LANGUAGE = __SWSTable(
-        postgres_id="reference_data.language",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.language",
-        schema="id INT, country_code STRING, description STRING",
-    )
-
-    UNIT_OF_MEASURE = __SWSTable(
-        postgres_id="reference_data.unit_of_measure",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.unit_of_measure",
-        schema="id INT, code STRING, sdmx_code STRING, metric BOOLEAN, description STRING, symbol STRING, base_unit STRING, multiplier DECIMAL",
-    )
-
-    # Operational data
-    USER = __SWSTable(
-        postgres_id="operational_data.user",
-        iceberg_id=f"{IcebergDatabases.STAGING_DATABASE}.user",
-        schema="id INT, username STRING, preferences INT, email STRING, active BOOLEAN, settings STRING",
-    )
-
-
-class IcebergTable:
-    def __init__(self, level: str, iceberg_id: str, path: str, csv_path: str):
-        self.level = level.lower()
-        self.iceberg_id = iceberg_id
-        self.table = iceberg_id.split(".")[-1]
-        self.path = path
-        self.csv_prefix = csv_path.rsplit("/", 1)[0]
-        self.csv_path = csv_path
-
-
-class IcebergTables:
-
-    def __init__(self, dataset_id: str, tag_name: str, domain: str = "") -> None:
-        self.__dataset_id = dataset_id
-        self.__tag_name = tag_name
-
-        self.BRONZE = self._create_iceberg_table("BRONZE")
-        self.SILVER = self._create_iceberg_table("SILVER", prefix=domain)
-
-        # GOLD tables with specific suffixes
-        self.GOLD_SDMX = self._create_iceberg_table(
-            "GOLD", prefix=domain, suffix="sdmx_disseminated"
-        )
-        self.GOLD_SWS_VALIDATED = self._create_iceberg_table(
-            "GOLD", prefix=domain, suffix="sws_validated"
-        )
-        self.GOLD_SWS_DISSEMINATED = self._create_iceberg_table(
-            "GOLD", prefix=domain, suffix="sws_disseminated"
-        )
-
-    def _create_iceberg_table(
-        self, level: str, prefix: str = "", suffix: str = ""
-    ) -> IcebergTable:
-        database = getattr(IcebergDatabases, f"{level}_DATABASE")
-        scheme = getattr(IcebergDatabases, f"{level}_SCHEME")
-
-        if prefix != "":
-            prefix = f"{prefix}_".lower()
-        if suffix != "":
-            suffix = f"_{suffix}".lower()
-
-        iceberg_id = f"{database}.{prefix}{self.__dataset_id}{suffix}"
-        path = f"{scheme}/{prefix}{self.__dataset_id}{suffix}"
-        csv_path = f"{CACHED_CSV_FOLDER}/{scheme}/{prefix}{self.__dataset_id}{suffix}/{self.__tag_name}.csv"
-
-        return IcebergTable(level, iceberg_id, path, csv_path)
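
The removed WIP model.py mainly encoded the naming convention behind IcebergTables._create_iceberg_table. Here is a worked example of the identifiers it would build for a GOLD table; every concrete value is illustrative, and CACHED_CSV_FOLDER was never defined in the WIP file, so its value below is assumed.

# Worked example of the naming convention in the removed WIP model.py.
# All concrete values are illustrative; CACHED_CSV_FOLDER is assumed.
dataset_id = "agriculture_production"
tag_name = "2024_release"
prefix = "faostat_"        # lower-cased domain prefix plus "_"
suffix = "_sws_validated"  # "_" plus lower-cased suffix

database = "AwsDataCatalog.sws_dissemination_tags_gold"
scheme = "sws_dissemination_tags_gold"
CACHED_CSV_FOLDER = "cached_csv"  # assumed; undefined in the WIP file

iceberg_id = f"{database}.{prefix}{dataset_id}{suffix}"
# AwsDataCatalog.sws_dissemination_tags_gold.faostat_agriculture_production_sws_validated
path = f"{scheme}/{prefix}{dataset_id}{suffix}"
csv_path = f"{CACHED_CSV_FOLDER}/{scheme}/{prefix}{dataset_id}{suffix}/{tag_name}.csv"
# cached_csv/sws_dissemination_tags_gold/faostat_agriculture_production_sws_validated/2024_release.csv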