dataorc-utils 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataorc_utils-0.1.0/PKG-INFO +15 -0
- dataorc_utils-0.1.0/pyproject.toml +40 -0
- dataorc_utils-0.1.0/setup.cfg +4 -0
- dataorc_utils-0.1.0/src/dataorc_utils/__init__.py +22 -0
- dataorc_utils-0.1.0/src/dataorc_utils/config/__init__.py +21 -0
- dataorc_utils-0.1.0/src/dataorc_utils/config/enums.py +51 -0
- dataorc_utils-0.1.0/src/dataorc_utils/config/manager.py +174 -0
- dataorc_utils-0.1.0/src/dataorc_utils/config/models.py +155 -0
- dataorc_utils-0.1.0/src/dataorc_utils/config/rules.py +88 -0
- dataorc_utils-0.1.0/src/dataorc_utils/config/validation.py +49 -0
- dataorc_utils-0.1.0/src/dataorc_utils.egg-info/PKG-INFO +15 -0
- dataorc_utils-0.1.0/src/dataorc_utils.egg-info/SOURCES.txt +14 -0
- dataorc_utils-0.1.0/src/dataorc_utils.egg-info/dependency_links.txt +1 -0
- dataorc_utils-0.1.0/src/dataorc_utils.egg-info/requires.txt +5 -0
- dataorc_utils-0.1.0/src/dataorc_utils.egg-info/top_level.txt +1 -0
- dataorc_utils-0.1.0/tests/test_config_basic.py +114 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataorc-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Utility functions for ETL operations
|
|
5
|
+
Classifier: Development Status :: 3 - Alpha
|
|
6
|
+
Classifier: Intended Audience :: Developers
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
14
|
+
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
15
|
+
Requires-Dist: ruff>=0.8.0; extra == "dev"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dataorc-utils"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Utility functions for ETL operations"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
]
|
|
18
|
+
dependencies = []
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
dev = [
|
|
22
|
+
"pytest>=8.0",
|
|
23
|
+
"pytest-cov>=7.0",
|
|
24
|
+
"ruff>=0.8.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
29
|
+
|
|
30
|
+
[tool.ruff]
|
|
31
|
+
# Set the maximum line length to 88.
|
|
32
|
+
line-length = 88
|
|
33
|
+
target-version = "py312"
|
|
34
|
+
|
|
35
|
+
[tool.ruff.lint]
|
|
36
|
+
select = ["E4", "E7", "E9", "F", "B", "I"]
|
|
37
|
+
|
|
38
|
+
[tool.isort]
|
|
39
|
+
profile = "black"
|
|
40
|
+
line_length = 88
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Dataorc Utils
|
|
2
|
+
|
|
3
|
+
A collection of utility functions for ETL operations.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__author__ = "Equinor"
|
|
7
|
+
__email__ = "toarst@equinor.com"
|
|
8
|
+
|
|
9
|
+
# Import main functions/classes here to make them available at package level
|
|
10
|
+
# Example:
|
|
11
|
+
# from .core import some_function
|
|
12
|
+
# from .utils import another_function
|
|
13
|
+
|
|
14
|
+
# Re-export subpackages / common symbols for convenience.
|
|
15
|
+
# This allows: `from dataorc_utils import config` or
|
|
16
|
+
# `from dataorc_utils.config import CorePipelineConfig`.
|
|
17
|
+
from . import config # convenient access to the config subpackage
|
|
18
|
+
|
|
19
|
+
# __all__ defines what gets imported with "from dataorc_utils import *"
|
|
20
|
+
__all__ = [
|
|
21
|
+
"config",
|
|
22
|
+
]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Config package public API.
|
|
3
|
+
|
|
4
|
+
This module re-exports the most commonly used symbols from the
|
|
5
|
+
submodules so callers can `from dataorc_utils.config import ...`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .enums import CoreParam, Defaults, Environment
|
|
9
|
+
from .manager import PipelineParameterManager
|
|
10
|
+
from .models import CorePipelineConfig, InfraContext
|
|
11
|
+
from .validation import print_config
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"Environment",
|
|
15
|
+
"CoreParam",
|
|
16
|
+
"Defaults",
|
|
17
|
+
"InfraContext",
|
|
18
|
+
"CorePipelineConfig",
|
|
19
|
+
"print_config",
|
|
20
|
+
"PipelineParameterManager",
|
|
21
|
+
]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core parameter definitions and enums.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Environment(str, Enum):
    """Execution environments a pipeline can run against.

    Subclassing `str` lets the members compare equal to their plain
    string values (e.g. `Environment.DEV == "dev"`).
    """

    DEV = "dev"
    TEST = "test"
    PROD = "prod"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CoreParam(str, Enum):
    """Core parameter names shared by every pipeline.

    The string values double as the environment-variable / parameter keys.
    """

    # Environment and data lake identifiers.
    DATALAKE_NAME = "datalake_name"
    DATALAKE_CONTAINER_NAME = "datalake_container_name"
    ENV = "env"

    # Data lake structure parameters, following the pattern:
    # containername/{layer}/{domain}/{product}/{version}/output/{processing_method}
    DOMAIN = "domain"  # Business domain -> Catalog name
    PRODUCT = "product"  # Product/project -> Database name
    TABLE_NAME = "table_name"  # Table name within the product/database

    # Per-layer dataset versions.
    BRONZE_VERSION = "bronze_version"  # Version for bronze layer
    SILVER_VERSION = "silver_version"  # Version for silver layer
    GOLD_VERSION = "gold_version"  # Version for gold layer

    # Per-layer processing methods (user-configurable).
    BRONZE_PROCESSING_METHOD = "bronze_processing_method"  # incremental, full, delta
    SILVER_PROCESSING_METHOD = "silver_processing_method"  # incremental, full, delta
    GOLD_PROCESSING_METHOD = "gold_processing_method"  # incremental, full, delta
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Default values kept next to the parameter definitions they belong to.
class Defaults:
    """Built-in fallback values for pipeline configuration."""

    # Default dataset version for all layers.
    VERSION = "v1"

    # Default processing method per layer.
    BRONZE_PROCESSING_METHOD = "incremental"
    SILVER_PROCESSING_METHOD = "incremental"
    GOLD_PROCESSING_METHOD = "delta"
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Parameter management for pipeline configuration.
|
|
3
|
+
|
|
4
|
+
This manager reads configuration from environment variables.
|
|
5
|
+
Cluster environment variables / wheel packaging.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
from .enums import CoreParam, Defaults, Environment
|
|
11
|
+
from .models import CorePipelineConfig, InfraContext
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PipelineParameterManager:
|
|
15
|
+
"""
|
|
16
|
+
General parameter manager for data pipelines.
|
|
17
|
+
|
|
18
|
+
This manager is designed to work with repository-specific configurations.
|
|
19
|
+
Most repositories should create their own wrapper that provides the
|
|
20
|
+
repository-specific configuration dictionaries.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
environments_config: dict = None,
|
|
26
|
+
domain_configs: dict = None,
|
|
27
|
+
product_configs: dict = None,
|
|
28
|
+
case_fallback: bool = False,
|
|
29
|
+
):
|
|
30
|
+
"""
|
|
31
|
+
Initialize parameter manager.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
environments_config: Dictionary of environment configurations
|
|
35
|
+
domain_configs: Dictionary of domain configurations
|
|
36
|
+
product_configs: Dictionary of product configurations
|
|
37
|
+
"""
|
|
38
|
+
# Wheel-based packaging for configuration delivery.
|
|
39
|
+
self.environments_config = environments_config or {}
|
|
40
|
+
self.domain_configs = domain_configs or {}
|
|
41
|
+
self.product_configs = product_configs or {}
|
|
42
|
+
self.case_fallback = case_fallback
|
|
43
|
+
self._local_environment = Environment.DEV # Default for local development
|
|
44
|
+
|
|
45
|
+
def _get_default_value(self, param: CoreParam) -> str:
|
|
46
|
+
"""Get default value for a core parameter."""
|
|
47
|
+
defaults_map = {
|
|
48
|
+
CoreParam.BRONZE_VERSION: Defaults.VERSION,
|
|
49
|
+
CoreParam.SILVER_VERSION: Defaults.VERSION,
|
|
50
|
+
CoreParam.GOLD_VERSION: Defaults.VERSION,
|
|
51
|
+
CoreParam.BRONZE_PROCESSING_METHOD: Defaults.BRONZE_PROCESSING_METHOD,
|
|
52
|
+
CoreParam.SILVER_PROCESSING_METHOD: Defaults.SILVER_PROCESSING_METHOD,
|
|
53
|
+
CoreParam.GOLD_PROCESSING_METHOD: Defaults.GOLD_PROCESSING_METHOD,
|
|
54
|
+
}
|
|
55
|
+
return defaults_map.get(param, "")
|
|
56
|
+
|
|
57
|
+
def get_env_variables(
|
|
58
|
+
self, var_names: list[str], required: bool = False
|
|
59
|
+
) -> dict[str, str]:
|
|
60
|
+
"""
|
|
61
|
+
Retrieve environment variables by name.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
var_names: List of environment variable names to retrieve
|
|
65
|
+
required: If True, raises ValueError when a variable is missing
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
Dictionary mapping variable names to their values (empty string for missing vars)
|
|
69
|
+
|
|
70
|
+
Raises:
|
|
71
|
+
ValueError: If required=True and any variable is not set
|
|
72
|
+
"""
|
|
73
|
+
result = {}
|
|
74
|
+
missing = []
|
|
75
|
+
|
|
76
|
+
for var_name in var_names:
|
|
77
|
+
# Lookup strategy: exact first. If case_fallback enabled, try UPPER then lower.
|
|
78
|
+
env_value = os.getenv(var_name)
|
|
79
|
+
if env_value is None and self.case_fallback:
|
|
80
|
+
if var_name.upper() != var_name:
|
|
81
|
+
env_value = os.getenv(var_name.upper())
|
|
82
|
+
if env_value is None and var_name.lower() != var_name:
|
|
83
|
+
env_value = os.getenv(var_name.lower())
|
|
84
|
+
|
|
85
|
+
if env_value is not None:
|
|
86
|
+
result[var_name] = env_value
|
|
87
|
+
else:
|
|
88
|
+
if required:
|
|
89
|
+
missing.append(var_name)
|
|
90
|
+
else:
|
|
91
|
+
result[var_name] = ""
|
|
92
|
+
|
|
93
|
+
if missing:
|
|
94
|
+
raise ValueError(
|
|
95
|
+
f"Required environment variables not set: {', '.join(missing)}"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
return result
|
|
99
|
+
|
|
100
|
+
def prepare_infrastructure(self, env_vars: list[str]) -> InfraContext:
|
|
101
|
+
"""Read and return infrastructure context (no dataset identifiers).
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
env_vars: List of infrastructure environment variable names to capture
|
|
105
|
+
(e.g., ["datalake_name", "datalake_container_name", "az_tenant_id"]).
|
|
106
|
+
These will be stored in InfraContext.variables.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
InfraContext with env and requested infrastructure variables
|
|
110
|
+
|
|
111
|
+
Raises:
|
|
112
|
+
ValueError: If the ENV environment variable is not set in environment
|
|
113
|
+
"""
|
|
114
|
+
# Get the environment (always required)
|
|
115
|
+
env_value = os.getenv(CoreParam.ENV.value)
|
|
116
|
+
if not env_value:
|
|
117
|
+
raise ValueError(
|
|
118
|
+
f"Required environment variable '{CoreParam.ENV.value}' is not set"
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
env = Environment(env_value)
|
|
122
|
+
|
|
123
|
+
# Capture infrastructure variables
|
|
124
|
+
infra_vars = self.get_env_variables(env_vars, required=False)
|
|
125
|
+
|
|
126
|
+
return InfraContext(
|
|
127
|
+
env=env,
|
|
128
|
+
variables=infra_vars,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
def build_core_config(
|
|
132
|
+
self,
|
|
133
|
+
infra: InfraContext,
|
|
134
|
+
domain: str = "",
|
|
135
|
+
product: str = "",
|
|
136
|
+
table_name: str = "",
|
|
137
|
+
bronze_version: str | None = None,
|
|
138
|
+
silver_version: str | None = None,
|
|
139
|
+
gold_version: str | None = None,
|
|
140
|
+
bronze_processing_method: str | None = None,
|
|
141
|
+
silver_processing_method: str | None = None,
|
|
142
|
+
gold_processing_method: str | None = None,
|
|
143
|
+
) -> CorePipelineConfig:
|
|
144
|
+
"""Compose a CorePipelineConfig from infra plus pipeline-specific overrides."""
|
|
145
|
+
# Resolve defaults if None supplied
|
|
146
|
+
bv = bronze_version or self._get_default_value(CoreParam.BRONZE_VERSION)
|
|
147
|
+
sv = silver_version or self._get_default_value(CoreParam.SILVER_VERSION)
|
|
148
|
+
gv = gold_version or self._get_default_value(CoreParam.GOLD_VERSION)
|
|
149
|
+
|
|
150
|
+
bpm = bronze_processing_method or self._get_default_value(
|
|
151
|
+
CoreParam.BRONZE_PROCESSING_METHOD
|
|
152
|
+
)
|
|
153
|
+
spm = silver_processing_method or self._get_default_value(
|
|
154
|
+
CoreParam.SILVER_PROCESSING_METHOD
|
|
155
|
+
)
|
|
156
|
+
gpm = gold_processing_method or self._get_default_value(
|
|
157
|
+
CoreParam.GOLD_PROCESSING_METHOD
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
config = CorePipelineConfig(
|
|
161
|
+
env=infra.env,
|
|
162
|
+
domain=domain,
|
|
163
|
+
product=product,
|
|
164
|
+
table_name=table_name,
|
|
165
|
+
bronze_version=bv,
|
|
166
|
+
silver_version=sv,
|
|
167
|
+
gold_version=gv,
|
|
168
|
+
bronze_processing_method=bpm,
|
|
169
|
+
silver_processing_method=spm,
|
|
170
|
+
gold_processing_method=gpm,
|
|
171
|
+
env_vars=infra.variables,
|
|
172
|
+
)
|
|
173
|
+
config.validate_rules()
|
|
174
|
+
return config
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Core configuration data classes."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
from .enums import Defaults, Environment
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class InfraContext:
    """Infrastructure-level context captured before pipeline specifics.

    Stable across multiple pipeline jobs; deliberately excludes dataset
    identifiers and per-layer version/processing configuration.

    The `variables` mapping holds the infrastructure environment variables
    (e.g. datalake_name, datalake_container_name, Azure tenant/client IDs)
    that were requested when calling prepare_infrastructure().
    """

    env: Environment
    variables: dict[str, str] = field(default_factory=dict)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True, slots=True)
class CorePipelineConfig:
    """Immutable pipeline configuration snapshot.

    Path pattern: container/{layer}/{domain}/{product}/{table_name}/{version}/output/{processing_method}
    Construct via PipelineParameterManager.build_core_config() in production code.

    The `env_vars` dict holds infrastructure environment variables
    (e.g., datalake_name, datalake_container_name, Azure IDs, etc.) captured during
    prepare_infrastructure().
    """

    # Required execution environment
    env: Environment

    # Structure identifiers
    domain: str = ""
    product: str = ""
    table_name: str = ""

    # Layer versions
    bronze_version: str = Defaults.VERSION
    silver_version: str = Defaults.VERSION
    gold_version: str = Defaults.VERSION

    # Processing methods
    bronze_processing_method: str = Defaults.BRONZE_PROCESSING_METHOD
    silver_processing_method: str = Defaults.SILVER_PROCESSING_METHOD
    gold_processing_method: str = Defaults.GOLD_PROCESSING_METHOD

    # Flexible infrastructure variables (datalake_name, container, Azure IDs, etc.)
    env_vars: dict[str, str] = field(default_factory=dict)

    def get_lake_path(
        self,
        layer: str,
        processing_method_override: str | None = None,
        version_override: str | None = None,
        domain_override: str | None = None,
        product_override: str | None = None,
        table_name_override: str | None = None,
    ) -> str:
        """
        Generate Data Lake path following the standard structure.

        Structure: containername/{layer}/{domain}/{product}/{table_name}/{version}/output/{processing_method}

        Args:
            layer: bronze, silver, or gold
            processing_method_override: override processing method for specific layer
            version_override: override version for specific layer
            domain_override: override the configured domain
            product_override: override the configured product
            table_name_override: override the configured table name

        Returns:
            Full data lake path

        Raises:
            ValueError: If layer is unknown, or if container/domain/product/
                table_name are not all set.
        """
        # Reject unknown layers explicitly instead of silently falling back to
        # bronze defaults (the previous getattr default hid typos like "platnum").
        if layer not in ("bronze", "silver", "gold"):
            raise ValueError(
                f"Unknown layer '{layer}'; expected bronze, silver or gold"
            )

        # Allow callers to override identifiers; fall back to the instance values.
        domain = domain_override or self.domain
        product = product_override or self.product
        table_name = table_name_override or self.table_name
        container = self.env_vars.get("datalake_container_name", "")

        if not all([container, domain, product, table_name]):
            raise ValueError(
                "datalake_container_name, domain, product and table_name must be set to generate lake path"
            )

        # Resolve layer-specific attributes (e.g. bronze_version, bronze_processing_method).
        version = version_override or getattr(self, f"{layer}_version")
        processing_method = processing_method_override or getattr(
            self, f"{layer}_processing_method"
        )

        return f"{container}/{layer}/{domain}/{product}/{table_name}/{version}/output/{processing_method}"

    def get_work_path(
        self,
        layer: str,
        version_override: str | None = None,
        domain_override: str | None = None,
        product_override: str | None = None,
        table_name_override: str | None = None,
    ) -> str:
        """Return the working path for a layer.

        This reuses `get_lake_path(...)` and replaces the trailing
        `/output/{processing_method}` segment with `/work`. If the
        expected `/output/` segment isn't found, `/work` is appended.
        """
        # Reuse get_lake_path to compose the canonical path and then
        # convert it to a work path by replacing the output segment.
        lake_path = self.get_lake_path(
            layer,
            processing_method_override=None,
            version_override=version_override,
            domain_override=domain_override,
            product_override=product_override,
            table_name_override=table_name_override,
        )

        marker = "/output/"
        idx = lake_path.find(marker)
        if idx >= 0:
            return lake_path[:idx] + "/work"
        # Fallback: append /work if format differs
        return lake_path.rstrip("/") + "/work"

    def validate_rules(self, layers: list | None = None) -> bool:
        """Run repository-config rules against this CorePipelineConfig.

        Delegates to `run_rules_checks` and returns True if checks pass or
        raises ValueError if any rule fails.
        """
        # Import locally to avoid circular imports at module import time
        from .rules import run_rules_checks

        return run_rules_checks(self, layers)

    @property
    def bronze_lake_path(self) -> str:
        """Canonical lake path for the bronze layer."""
        return self.get_lake_path("bronze")

    @property
    def silver_lake_path(self) -> str:
        """Canonical lake path for the silver layer."""
        return self.get_lake_path("silver")

    @property
    def gold_lake_path(self) -> str:
        """Canonical lake path for the gold layer."""
        return self.get_lake_path("gold")
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Rule framework for configuration validation.
|
|
2
|
+
|
|
3
|
+
Extensible mechanism for validating `CorePipelineConfig` objects. Each rule is a
|
|
4
|
+
callable taking (config, layer) and returning True or raising ValueError.
|
|
5
|
+
|
|
6
|
+
Built‑in rules:
|
|
7
|
+
- `lowercase_lake_path_rule`: lake paths must not contain uppercase letters.
|
|
8
|
+
|
|
9
|
+
Add new rules by appending to `RULES` or passing a custom list to
|
|
10
|
+
`run_rules_checks`.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from typing import TYPE_CHECKING, Callable, Iterable, List
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
19
|
+
from .models import CorePipelineConfig
|
|
20
|
+
|
|
21
|
+
RuleFunc = Callable[["CorePipelineConfig", str], bool]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def lowercase_lake_path_rule(config: "CorePipelineConfig", layer: str) -> bool:
    """Fail when the layer's lake path contains any uppercase letter."""
    lake_path = config.get_lake_path(layer)
    for ch in lake_path:
        if ch.isalpha() and ch.isupper():
            raise ValueError(f"Lake path contains uppercase letters: '{lake_path}'")
    return True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
_VERSION_PATTERN = re.compile(r"^v[0-9]+$")


def version_format_rule(config: "CorePipelineConfig", layer: str) -> bool:
    """Validate that the layer's version looks like 'v<integer>'.

    Reads the layer-specific version attribute (e.g. bronze_version), checks it
    against the v<integer> pattern, and then confirms the generated lake path
    embeds exactly that version token (defensive consistency check).
    """
    version = getattr(config, f"{layer}_version")
    is_valid = isinstance(version, str) and _VERSION_PATTERN.match(version) is not None
    if not is_valid:
        raise ValueError(
            f"Version for layer '{layer}' must match pattern 'v<integer>' (e.g. v1); got: {version!r}"
        )
    lake_path = config.get_lake_path(layer)
    # Ensure token boundary match in path
    if f"/{version}/" not in lake_path:
        raise ValueError(
            f"Lake path for layer '{layer}' does not include expected version token '{version}': {lake_path}"
        )
    return True
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Default rule set applied when run_rules_checks() is called without an
# explicit `rules` argument; append project-specific rules here.
RULES: List[RuleFunc] = [lowercase_lake_path_rule, version_format_rule]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def run_rules_checks(
    config: "CorePipelineConfig",
    layers: Iterable[str] | None = None,
    rules: Iterable[RuleFunc] | None = None,
) -> bool:
    """Apply every rule to every layer, aggregating all failures.

    Returns True when all checks pass; raises ValueError listing every
    failed (layer, rule) combination otherwise.
    """
    selected_layers = ("bronze", "silver", "gold") if layers is None else layers
    selected_rules = RULES if rules is None else rules

    failures: list[str] = []
    for layer in selected_layers:
        for check in selected_rules:
            try:
                check(config, layer)
            except Exception as exc:  # noqa: BLE001
                failures.append(f"[{layer}] {exc}")

    if failures:
        raise ValueError("Rule checks failed:\n" + "\n".join(failures))
    return True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Public API of this module for `import *` and documentation tooling.
__all__ = [
    "RuleFunc",
    "RULES",
    "lowercase_lake_path_rule",
    "version_format_rule",
    "run_rules_checks",
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Parameter validation logic.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .models import CorePipelineConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def print_config(
    config: CorePipelineConfig, title: str = "Pipeline Configuration"
) -> None:
    """Pretty-print a pipeline configuration for debugging."""
    print(f"📦 {title}:")
    # config.env may be an Environment enum or a plain string.
    env_value = config.env.value if hasattr(config.env, "value") else str(config.env)

    print(f"   Environment: {env_value}")

    print("   🏗️ Data Lake Structure:")
    print(f"   Domain: {config.domain}")
    print(f"   Product: {config.product}")
    print(f"   Table: {config.table_name}")
    print(f"   Bronze Version: {config.bronze_version}")
    print(f"   Silver Version: {config.silver_version}")
    print(f"   Gold Version: {config.gold_version}")

    print("   ⚙️ Processing Methods:")
    print(f"   Bronze: {config.bronze_processing_method}")
    print(f"   Silver: {config.silver_processing_method}")
    print(f"   Gold: {config.gold_processing_method}")

    # Paths can only be generated once the full structure is known.
    if all([config.domain, config.product, config.table_name]):
        print("   📁 Generated Paths:")
        for layer in ("bronze", "silver", "gold"):
            print(f"   {layer.capitalize()} Lake Path: {config.get_lake_path(layer)}")
        print("   📁 Work paths:")
        for layer in ("bronze", "silver", "gold"):
            print(f"   {layer.capitalize()}: {config.get_work_path(layer)}")
    else:
        print("   ⚠️ Data Lake Structure incomplete - paths not generated")

    # Infrastructure variables captured from the environment.
    if config.env_vars:
        print("   🔧 Infrastructure Variables:")
        for key, value in sorted(config.env_vars.items()):
            print(f"   {key}: {value if value else '(empty)'}")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataorc-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Utility functions for ETL operations
|
|
5
|
+
Classifier: Development Status :: 3 - Alpha
|
|
6
|
+
Classifier: Intended Audience :: Developers
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
14
|
+
Requires-Dist: pytest-cov>=7.0; extra == "dev"
|
|
15
|
+
Requires-Dist: ruff>=0.8.0; extra == "dev"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
src/dataorc_utils/__init__.py
|
|
3
|
+
src/dataorc_utils.egg-info/PKG-INFO
|
|
4
|
+
src/dataorc_utils.egg-info/SOURCES.txt
|
|
5
|
+
src/dataorc_utils.egg-info/dependency_links.txt
|
|
6
|
+
src/dataorc_utils.egg-info/requires.txt
|
|
7
|
+
src/dataorc_utils.egg-info/top_level.txt
|
|
8
|
+
src/dataorc_utils/config/__init__.py
|
|
9
|
+
src/dataorc_utils/config/enums.py
|
|
10
|
+
src/dataorc_utils/config/manager.py
|
|
11
|
+
src/dataorc_utils/config/models.py
|
|
12
|
+
src/dataorc_utils/config/rules.py
|
|
13
|
+
src/dataorc_utils/config/validation.py
|
|
14
|
+
tests/test_config_basic.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dataorc_utils
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import os
import sys

import pytest

# Ensure the package's src/ directory is on sys.path before importing it.
# The package lives in <repo>/src/dataorc_utils while this file lives in
# <repo>/tests, so the directory to add is "../src", not "..".
CURRENT_DIR = os.path.dirname(__file__)
SRC_ROOT = os.path.abspath(os.path.join(CURRENT_DIR, "..", "src"))
if SRC_ROOT not in sys.path:
    sys.path.insert(0, SRC_ROOT)

from dataorc_utils.config import (  # noqa: E402
    CoreParam,
    CorePipelineConfig,
    Environment,
    PipelineParameterManager,
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def make_config(**overrides):
    """Build a fully-populated CorePipelineConfig, applying any overrides."""
    kwargs = {
        "env": Environment.DEV,
        "domain": "finance",
        "product": "forecast",
        "table_name": "positions",
        "bronze_version": "v1",
        "silver_version": "v2",
        "gold_version": "v3",
        "bronze_processing_method": "incremental",
        "silver_processing_method": "full",
        "gold_processing_method": "delta",
        "env_vars": {
            "datalake_name": "dlakeacct",
            "datalake_container_name": "raw",
        },
    }
    kwargs.update(overrides)
    return CorePipelineConfig(**kwargs)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_lake_path_bronze():
    """Bronze path embeds v1 and the incremental processing method."""
    expected = "raw/bronze/finance/forecast/positions/v1/output/incremental"
    assert make_config().get_lake_path("bronze") == expected
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_lake_path_silver():
    """Silver path embeds v2 and the full processing method."""
    expected = "raw/silver/finance/forecast/positions/v2/output/full"
    assert make_config().get_lake_path("silver") == expected
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_lake_path_gold():
    """Gold path embeds v3 and the delta processing method."""
    expected = "raw/gold/finance/forecast/positions/v3/output/delta"
    assert make_config().get_lake_path("gold") == expected
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_lake_path_overrides():
    """Per-call overrides replace the configured version and processing method."""
    path = make_config().get_lake_path(
        "gold", processing_method_override="full", version_override="v9"
    )
    assert path == "raw/gold/finance/forecast/positions/v9/output/full"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_validate_rules_pass():
    """A fully valid configuration passes every built-in rule."""
    assert make_config().validate_rules() is True
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_validate_rules_fail_uppercase_domain():
    """An uppercase letter in the domain must fail the lowercase-path rule."""
    cfg = make_config(domain="Finance")
    with pytest.raises(ValueError) as excinfo:
        cfg.validate_rules()
    assert "uppercase" in str(excinfo.value).lower()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_validate_rules_fail_bad_version_pattern():
    """A version without the leading 'v' must fail the format rule."""
    cfg = make_config(bronze_version="version1")
    with pytest.raises(ValueError) as excinfo:
        cfg.validate_rules()
    message = str(excinfo.value)
    assert "pattern" in message.lower() or "v<integer>" in message
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_validate_rules_pass_custom_version_override():
    """A well-formed version override still produces a valid path suffix."""
    bronze_path = make_config().get_lake_path("bronze", version_override="v99")
    assert bronze_path.endswith("/v99/output/incremental")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_case_fallback_env_uppercase_resolution(monkeypatch):
    """Primary fallback behavior: uppercase variant is resolved when case_fallback=True."""
    upper_name = "DATALAKE_NAME"
    lower_name = CoreParam.DATALAKE_NAME.value
    monkeypatch.delenv(upper_name, raising=False)
    monkeypatch.delenv(lower_name, raising=False)

    # On Windows env vars are case-insensitive, but the fallback logic still runs.
    monkeypatch.setenv(upper_name, "LakeAcctFallback")
    monkeypatch.setenv("DATALAKE_CONTAINER_NAME", "container-fb")
    monkeypatch.setenv("env", Environment.DEV.value)

    manager = PipelineParameterManager(case_fallback=True)
    context = manager.prepare_infrastructure(
        ["datalake_name", "datalake_container_name"]
    )
    assert context.variables.get("datalake_name") == "LakeAcctFallback"
    assert context.variables.get("datalake_container_name") == "container-fb"
|