dataorc-utils 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataorc-utils
3
+ Version: 0.1.0
4
+ Summary: Utility functions for ETL operations
5
+ Classifier: Development Status :: 3 - Alpha
6
+ Classifier: Intended Audience :: Developers
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=8.0; extra == "dev"
14
+ Requires-Dist: pytest-cov>=7.0; extra == "dev"
15
+ Requires-Dist: ruff>=0.8.0; extra == "dev"
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dataorc-utils"
7
+ version = "0.1.0"
8
+ description = "Utility functions for ETL operations"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "License :: OSI Approved :: MIT License",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.12",
17
+ ]
18
+ dependencies = []
19
+
20
+ [project.optional-dependencies]
21
+ dev = [
22
+ "pytest>=8.0",
23
+ "pytest-cov>=7.0",
24
+ "ruff>=0.8.0",
25
+ ]
26
+
27
+ [tool.setuptools.packages.find]
28
+ where = ["src"]
29
+
30
+ [tool.ruff]
31
+ # Set the maximum line length to 88.
32
+ line-length = 88
33
+ target-version = "py312"
34
+
35
+ [tool.ruff.lint]
36
+ select = ["E4", "E7", "E9", "F", "B", "I"]
37
+
38
+ [tool.isort]
39
+ profile = "black"
40
+ line_length = 88
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,22 @@
1
"""Dataorc Utils

A collection of utility functions for ETL operations.
"""

__author__ = "Equinor"
__email__ = "toarst@equinor.com"

# Import main functions/classes here to make them available at package level
# Example:
# from .core import some_function
# from .utils import another_function

# Re-export subpackages / common symbols for convenience.
# This allows: `from dataorc_utils import config` or
# `from dataorc_utils.config import CorePipelineConfig`.
from . import config  # convenient access to the config subpackage

# __all__ defines what gets imported with "from dataorc_utils import *";
# only the `config` subpackage is part of the public API for now.
__all__ = [
    "config",
]
@@ -0,0 +1,21 @@
1
"""
Config package public API.

This module re-exports the most commonly used symbols from the
submodules so callers can `from dataorc_utils.config import ...`
without knowing the internal module layout (enums/manager/models/validation).
"""

from .enums import CoreParam, Defaults, Environment
from .manager import PipelineParameterManager
from .models import CorePipelineConfig, InfraContext
from .validation import print_config

# Explicit public API for `from dataorc_utils.config import *`.
__all__ = [
    "Environment",
    "CoreParam",
    "Defaults",
    "InfraContext",
    "CorePipelineConfig",
    "print_config",
    "PipelineParameterManager",
]
@@ -0,0 +1,51 @@
1
+ """
2
+ Core parameter definitions and enums.
3
+ """
4
+
5
+ from enum import Enum
6
+
7
+
8
class Environment(str, Enum):
    """Pipeline execution environments.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. ``Environment.DEV == "dev"``) and can be constructed from
    environment-variable strings via ``Environment(env_value)``.
    """

    DEV = "dev"
    TEST = "test"
    PROD = "prod"
14
+
15
+
16
class CoreParam(str, Enum):
    """Core parameters used across all pipelines.

    Members are str-valued; ``.value`` is the environment-variable /
    parameter name used when reading configuration from the process
    environment (see PipelineParameterManager).
    """

    # Environment and data lake
    DATALAKE_NAME = "datalake_name"
    DATALAKE_CONTAINER_NAME = "datalake_container_name"
    ENV = "env"

    # Data Lake Structure Parameters
    # Following pattern: containername/{layer}/{domain}/{product}/{version}/output/{processing_method}
    DOMAIN = "domain"  # Business domain -> Catalog name
    PRODUCT = "product"  # Product/project -> Database name
    TABLE_NAME = "table_name"  # Table name within the product/database

    # Per-layer version identifiers
    BRONZE_VERSION = "bronze_version"  # Version for bronze layer
    SILVER_VERSION = "silver_version"  # Version for silver layer
    GOLD_VERSION = "gold_version"  # Version for gold layer

    # Processing Methods (layer-specific, user-configurable)
    BRONZE_PROCESSING_METHOD = "bronze_processing_method"  # incremental, full, delta
    SILVER_PROCESSING_METHOD = "silver_processing_method"  # incremental, full, delta
    GOLD_PROCESSING_METHOD = "gold_processing_method"  # incremental, full, delta
39
+
40
+
41
# Default values - co-located with their semantic meaning
class Defaults:
    """Default values for pipeline configuration.

    Plain class attributes (deliberately not an Enum) so the values read
    as ordinary strings wherever they are used as fallbacks.
    """

    # Version default shared by bronze/silver/gold layers
    VERSION = "v1"

    # Processing method defaults (per layer)
    BRONZE_PROCESSING_METHOD = "incremental"
    SILVER_PROCESSING_METHOD = "incremental"
    GOLD_PROCESSING_METHOD = "delta"
@@ -0,0 +1,174 @@
1
+ """
2
+ Parameter management for pipeline configuration.
3
+
4
+ This manager reads configuration from environment variables.
5
+ Cluster environment variables / wheel packaging.
6
+ """
7
+
8
+ import os
9
+
10
+ from .enums import CoreParam, Defaults, Environment
11
+ from .models import CorePipelineConfig, InfraContext
12
+
13
+
14
class PipelineParameterManager:
    """
    General parameter manager for data pipelines.

    This manager is designed to work with repository-specific configurations.
    Most repositories should create their own wrapper that provides the
    repository-specific configuration dictionaries.
    """

    def __init__(
        self,
        environments_config: dict | None = None,
        domain_configs: dict | None = None,
        product_configs: dict | None = None,
        case_fallback: bool = False,
    ):
        """
        Initialize parameter manager.

        Args:
            environments_config: Dictionary of environment configurations
            domain_configs: Dictionary of domain configurations
            product_configs: Dictionary of product configurations
            case_fallback: If True, environment-variable lookups fall back to
                the UPPERCASE and then lowercase variants of a name when the
                exact-case name is not set (see get_env_variables)
        """
        # Wheel-based packaging for configuration delivery.
        self.environments_config = environments_config or {}
        self.domain_configs = domain_configs or {}
        self.product_configs = product_configs or {}
        self.case_fallback = case_fallback
        self._local_environment = Environment.DEV  # Default for local development

    def _get_default_value(self, param: CoreParam) -> str:
        """Get default value for a core parameter (empty string if none defined)."""
        defaults_map = {
            CoreParam.BRONZE_VERSION: Defaults.VERSION,
            CoreParam.SILVER_VERSION: Defaults.VERSION,
            CoreParam.GOLD_VERSION: Defaults.VERSION,
            CoreParam.BRONZE_PROCESSING_METHOD: Defaults.BRONZE_PROCESSING_METHOD,
            CoreParam.SILVER_PROCESSING_METHOD: Defaults.SILVER_PROCESSING_METHOD,
            CoreParam.GOLD_PROCESSING_METHOD: Defaults.GOLD_PROCESSING_METHOD,
        }
        return defaults_map.get(param, "")

    def get_env_variables(
        self, var_names: list[str], required: bool = False
    ) -> dict[str, str]:
        """
        Retrieve environment variables by name.

        Args:
            var_names: List of environment variable names to retrieve
            required: If True, raises ValueError when a variable is missing

        Returns:
            Dictionary mapping variable names to their values (empty string for missing vars)

        Raises:
            ValueError: If required=True and any variable is not set
        """
        result = {}
        missing = []

        for var_name in var_names:
            # Lookup strategy: exact first. If case_fallback enabled, try UPPER then lower.
            env_value = os.getenv(var_name)
            if env_value is None and self.case_fallback:
                if var_name.upper() != var_name:
                    env_value = os.getenv(var_name.upper())
                if env_value is None and var_name.lower() != var_name:
                    env_value = os.getenv(var_name.lower())

            if env_value is not None:
                result[var_name] = env_value
            else:
                if required:
                    missing.append(var_name)
                else:
                    result[var_name] = ""

        if missing:
            raise ValueError(
                f"Required environment variables not set: {', '.join(missing)}"
            )

        return result

    def prepare_infrastructure(self, env_vars: list[str]) -> InfraContext:
        """Read and return infrastructure context (no dataset identifiers).

        Args:
            env_vars: List of infrastructure environment variable names to capture
                (e.g., ["datalake_name", "datalake_container_name", "az_tenant_id"]).
                These will be stored in InfraContext.variables.

        Returns:
            InfraContext with env and requested infrastructure variables

        Raises:
            ValueError: If the ENV environment variable is not set in environment
        """
        # Get the environment (always required)
        env_value = os.getenv(CoreParam.ENV.value)
        if not env_value:
            raise ValueError(
                f"Required environment variable '{CoreParam.ENV.value}' is not set"
            )

        env = Environment(env_value)

        # Capture infrastructure variables (missing ones become empty strings)
        infra_vars = self.get_env_variables(env_vars, required=False)

        return InfraContext(
            env=env,
            variables=infra_vars,
        )

    def build_core_config(
        self,
        infra: InfraContext,
        domain: str = "",
        product: str = "",
        table_name: str = "",
        bronze_version: str | None = None,
        silver_version: str | None = None,
        gold_version: str | None = None,
        bronze_processing_method: str | None = None,
        silver_processing_method: str | None = None,
        gold_processing_method: str | None = None,
    ) -> CorePipelineConfig:
        """Compose a CorePipelineConfig from infra plus pipeline-specific overrides.

        Validation rules are run on the composed config before it is returned,
        so a ValueError may propagate from CorePipelineConfig.validate_rules().
        """
        # Resolve defaults if None supplied.
        # NOTE: `or` also maps empty-string overrides to the defaults.
        bv = bronze_version or self._get_default_value(CoreParam.BRONZE_VERSION)
        sv = silver_version or self._get_default_value(CoreParam.SILVER_VERSION)
        gv = gold_version or self._get_default_value(CoreParam.GOLD_VERSION)

        bpm = bronze_processing_method or self._get_default_value(
            CoreParam.BRONZE_PROCESSING_METHOD
        )
        spm = silver_processing_method or self._get_default_value(
            CoreParam.SILVER_PROCESSING_METHOD
        )
        gpm = gold_processing_method or self._get_default_value(
            CoreParam.GOLD_PROCESSING_METHOD
        )

        config = CorePipelineConfig(
            env=infra.env,
            domain=domain,
            product=product,
            table_name=table_name,
            bronze_version=bv,
            silver_version=sv,
            gold_version=gv,
            bronze_processing_method=bpm,
            silver_processing_method=spm,
            gold_processing_method=gpm,
            env_vars=infra.variables,
        )
        config.validate_rules()
        return config
@@ -0,0 +1,155 @@
1
+ """Core configuration data classes."""
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+ from .enums import Defaults, Environment
6
+
7
+
8
@dataclass
class InfraContext:
    """Infrastructure-level context captured prior to pipeline specifics.

    Stable across multiple pipeline jobs; excludes dataset identifiers and
    per-layer version/processing configuration.

    The `variables` dict holds all infrastructure environment variables
    (e.g., datalake_name, datalake_container_name, Azure tenant/client IDs, etc.)
    that were requested when calling prepare_infrastructure().
    """

    # Resolved execution environment (dev/test/prod)
    env: Environment
    # Raw name -> value mapping of the requested infrastructure env vars
    # (missing variables are stored as empty strings by get_env_variables)
    variables: dict[str, str] = field(default_factory=dict)
22
+
23
+
24
@dataclass(frozen=True, slots=True)
class CorePipelineConfig:
    """Immutable pipeline configuration snapshot.

    Path pattern: container/{layer}/{domain}/{product}/{table_name}/{version}/output/{processing_method}
    Construct via PipelineParameterManager.build_core_config() in production code.

    The `env_vars` dict holds infrastructure environment variables
    (e.g., datalake_name, datalake_container_name, Azure IDs, etc.) captured during
    prepare_infrastructure().
    """

    # Required
    env: Environment

    # Structure identifiers
    domain: str = ""
    product: str = ""
    table_name: str = ""

    # Layer versions
    bronze_version: str = Defaults.VERSION
    silver_version: str = Defaults.VERSION
    gold_version: str = Defaults.VERSION

    # Processing methods
    bronze_processing_method: str = Defaults.BRONZE_PROCESSING_METHOD
    silver_processing_method: str = Defaults.SILVER_PROCESSING_METHOD
    gold_processing_method: str = Defaults.GOLD_PROCESSING_METHOD

    # Flexible infrastructure variables (datalake_name, container, Azure IDs, etc.)
    env_vars: dict[str, str] = field(default_factory=dict)
    # Convenience properties that return the canonical lake path for each layer.

    def get_lake_path(
        self,
        layer: str,
        processing_method_override: str | None = None,
        version_override: str | None = None,
        domain_override: str | None = None,
        product_override: str | None = None,
        table_name_override: str | None = None,
    ) -> str:
        """
        Generate Data Lake path following the standard structure.

        Structure: containername/{layer}/{domain}/{product}/{table_name}/{version}/output/{processing_method}

        Args:
            layer: bronze, silver, or gold
            processing_method_override: override processing method for specific layer
            version_override: override version for specific layer
            domain_override: override the configured domain
            product_override: override the configured product
            table_name_override: override the configured table name

        Returns:
            Full data lake path

        Raises:
            ValueError: if the container name or any structure identifier is empty
        """
        # Allow callers to override identifiers; fall back to the instance values.
        domain = domain_override or self.domain
        product = product_override or self.product
        table_name = table_name_override or self.table_name
        container = self.env_vars.get("datalake_container_name", "")

        if not all([container, domain, product, table_name]):
            raise ValueError(
                "datalake_container_name, domain, product and table_name must be set to generate lake path"
            )

        # Resolve attribute names directly (e.g. bronze_version, bronze_processing_method).
        # NOTE(review): for an unrecognised layer the getattr fallbacks silently
        # apply (Defaults.VERSION / bronze default method) rather than raising.
        v_attr = f"{layer}_version"
        p_attr = f"{layer}_processing_method"

        version = version_override or getattr(self, v_attr, Defaults.VERSION)
        processing_method = processing_method_override or getattr(
            self, p_attr, Defaults.BRONZE_PROCESSING_METHOD
        )

        return f"{container}/{layer}/{domain}/{product}/{table_name}/{version}/output/{processing_method}"

    def get_work_path(
        self,
        layer: str,
        version_override: str | None = None,
        domain_override: str | None = None,
        product_override: str | None = None,
        table_name_override: str | None = None,
    ) -> str:
        """Return the working path for a layer.

        This reuses `get_lake_path(...)` and replaces the trailing
        `/output/{processing_method}` segment with `/work`. If the
        expected `/output/` segment isn't found, `/work` is appended.
        """
        # Reuse get_lake_path to compose the canonical path and then
        # convert it to a work path by replacing the output segment.
        lake_path = self.get_lake_path(
            layer,
            processing_method_override=None,
            version_override=version_override,
            domain_override=domain_override,
            product_override=product_override,
            table_name_override=table_name_override,
        )

        marker = "/output/"
        idx = lake_path.find(marker)
        if idx >= 0:
            return lake_path[:idx] + "/work"
        # Fallback: append /work if format differs
        return lake_path.rstrip("/") + "/work"

    def validate_rules(self, layers: list | None = None) -> bool:
        """Run repository-config rules against this CorePipelineConfig.

        Delegates to `run_rules_checks` and returns True if checks pass or
        raises ValueError if any rule fails.
        """
        # Import locally to avoid circular imports at module import time
        from .rules import run_rules_checks

        return run_rules_checks(self, layers)

    @property
    def bronze_lake_path(self) -> str:
        """Canonical bronze-layer lake path (no overrides)."""
        return self.get_lake_path("bronze")

    @property
    def silver_lake_path(self) -> str:
        """Canonical silver-layer lake path (no overrides)."""
        return self.get_lake_path("silver")

    @property
    def gold_lake_path(self) -> str:
        """Canonical gold-layer lake path (no overrides)."""
        return self.get_lake_path("gold")
@@ -0,0 +1,88 @@
1
+ """Rule framework for configuration validation.
2
+
3
+ Extensible mechanism for validating `CorePipelineConfig` objects. Each rule is a
4
+ callable taking (config, layer) and returning True or raising ValueError.
5
+
6
+ Built‑in rules:
7
+ - `lowercase_lake_path_rule`: lake paths must not contain uppercase letters.
8
+
9
+ Add new rules by appending to `RULES` or passing a custom list to
10
+ `run_rules_checks`.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from typing import TYPE_CHECKING, Callable, Iterable, List
17
+
18
+ if TYPE_CHECKING: # pragma: no cover
19
+ from .models import CorePipelineConfig
20
+
21
+ RuleFunc = Callable[["CorePipelineConfig", str], bool]
22
+
23
+
24
def lowercase_lake_path_rule(config: "CorePipelineConfig", layer: str) -> bool:
    """Reject lake paths that contain any uppercase letter."""
    path = config.get_lake_path(layer)
    for ch in path:
        # Only cased alphabetic characters can violate the rule.
        if ch.isalpha() and ch.isupper():
            raise ValueError(f"Lake path contains uppercase letters: '{path}'")
    return True
29
+
30
+
31
_VERSION_PATTERN = re.compile(r"^v[0-9]+$")


def version_format_rule(config: "CorePipelineConfig", layer: str) -> bool:
    """Ensure the version segment for the layer matches pattern v<integer>.

    Reads the layer-specific version attribute (e.g. bronze_version), checks it
    against the required pattern, then confirms the generated lake path embeds
    that exact version token (defensive consistency check).
    """
    value = getattr(config, f"{layer}_version")
    well_formed = isinstance(value, str) and _VERSION_PATTERN.match(value) is not None
    if not well_formed:
        raise ValueError(
            f"Version for layer '{layer}' must match pattern 'v<integer>' (e.g. v1); got: {value!r}"
        )
    path = config.get_lake_path(layer)
    # Require the version as a full path segment, not a substring.
    token = f"/{value}/"
    if token not in path:
        raise ValueError(
            f"Lake path for layer '{layer}' does not include expected version token '{value}': {path}"
        )
    return True
54
+
55
+
56
RULES: List[RuleFunc] = [lowercase_lake_path_rule, version_format_rule]


def run_rules_checks(
    config: "CorePipelineConfig",
    layers: Iterable[str] | None = None,
    rules: Iterable[RuleFunc] | None = None,
) -> bool:
    """Apply every rule to every layer, aggregating all failures into one error.

    Defaults to the three standard layers and the module-level RULES list.
    Raises ValueError listing every failed check; returns True otherwise.
    """
    selected_layers = ("bronze", "silver", "gold") if layers is None else layers
    selected_rules = RULES if rules is None else rules

    failures: list[str] = []
    for layer in selected_layers:
        for check in selected_rules:
            try:
                check(config, layer)
            except Exception as exc:  # noqa: BLE001
                # Collect rather than fail fast so callers see every violation.
                failures.append(f"[{layer}] {exc}")

    if failures:
        raise ValueError("Rule checks failed:\n" + "\n".join(failures))
    return True


__all__ = [
    "RuleFunc",
    "RULES",
    "lowercase_lake_path_rule",
    "version_format_rule",
    "run_rules_checks",
]
@@ -0,0 +1,49 @@
1
+ """
2
+ Parameter validation logic.
3
+ """
4
+
5
+ from .models import CorePipelineConfig
6
+
7
+
8
def print_config(
    config: CorePipelineConfig, title: str = "Pipeline Configuration"
) -> None:
    """Print a human-readable summary of the configuration for debugging."""
    _layers = ("Bronze", "Silver", "Gold")

    print(f"📦 {title}:")
    # config.env may be an Environment enum (has .value) or a plain string.
    env_value = config.env.value if hasattr(config.env, "value") else str(config.env)

    print(f" Environment: {env_value}")

    print(" 🏗️ Data Lake Structure:")
    print(f" Domain: {config.domain}")
    print(f" Product: {config.product}")
    print(f" Table: {config.table_name}")
    versions = (config.bronze_version, config.silver_version, config.gold_version)
    for label, version in zip(_layers, versions):
        print(f" {label} Version: {version}")

    print(" ⚙️ Processing Methods:")
    methods = (
        config.bronze_processing_method,
        config.silver_processing_method,
        config.gold_processing_method,
    )
    for label, method in zip(_layers, methods):
        print(f" {label}: {method}")

    # Only show generated lake paths if structure is complete
    if all([config.domain, config.product, config.table_name]):
        print(" 📁 Generated Paths:")
        for label in _layers:
            print(f" {label} Lake Path: {config.get_lake_path(label.lower())}")
        print(" 📁 Work paths:")
        for label in _layers:
            print(f" {label}: {config.get_work_path(label.lower())}")
    else:
        print(" ⚠️ Data Lake Structure incomplete - paths not generated")

    # Infrastructure variables
    if config.env_vars:
        print(" 🔧 Infrastructure Variables:")
        for key, value in sorted(config.env_vars.items()):
            display_value = value if value else "(empty)"
            print(f" {key}: {display_value}")
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataorc-utils
3
+ Version: 0.1.0
4
+ Summary: Utility functions for ETL operations
5
+ Classifier: Development Status :: 3 - Alpha
6
+ Classifier: Intended Audience :: Developers
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=8.0; extra == "dev"
14
+ Requires-Dist: pytest-cov>=7.0; extra == "dev"
15
+ Requires-Dist: ruff>=0.8.0; extra == "dev"
@@ -0,0 +1,14 @@
1
+ pyproject.toml
2
+ src/dataorc_utils/__init__.py
3
+ src/dataorc_utils.egg-info/PKG-INFO
4
+ src/dataorc_utils.egg-info/SOURCES.txt
5
+ src/dataorc_utils.egg-info/dependency_links.txt
6
+ src/dataorc_utils.egg-info/requires.txt
7
+ src/dataorc_utils.egg-info/top_level.txt
8
+ src/dataorc_utils/config/__init__.py
9
+ src/dataorc_utils/config/enums.py
10
+ src/dataorc_utils/config/manager.py
11
+ src/dataorc_utils/config/models.py
12
+ src/dataorc_utils/config/rules.py
13
+ src/dataorc_utils/config/validation.py
14
+ tests/test_config_basic.py
@@ -0,0 +1,5 @@
1
+
2
+ [dev]
3
+ pytest>=8.0
4
+ pytest-cov>=7.0
5
+ ruff>=0.8.0
@@ -0,0 +1 @@
1
+ dataorc_utils
@@ -0,0 +1,114 @@
1
+ import os
2
+ import sys
3
+
4
+ import pytest
5
+
6
# Ensure the package src directory is on path before importing package.
# The package lives under src/ (src-layout per pyproject/SOURCES.txt), so the
# path to add is <repo-root>/src, not the repo root itself.
CURRENT_DIR = os.path.dirname(__file__)
SRC_ROOT = os.path.abspath(os.path.join(CURRENT_DIR, "..", "src"))
if SRC_ROOT not in sys.path:
    sys.path.insert(0, SRC_ROOT)
11
+
12
+ from dataorc_utils.config import ( # noqa: E402
13
+ CoreParam,
14
+ CorePipelineConfig,
15
+ Environment,
16
+ PipelineParameterManager,
17
+ )
18
+
19
+
20
def make_config(**overrides):
    """Build a CorePipelineConfig with test-friendly defaults, applying overrides."""
    params = {
        "env": Environment.DEV,
        "domain": "finance",
        "product": "forecast",
        "table_name": "positions",
        "bronze_version": "v1",
        "silver_version": "v2",
        "gold_version": "v3",
        "bronze_processing_method": "incremental",
        "silver_processing_method": "full",
        "gold_processing_method": "delta",
        "env_vars": {
            "datalake_name": "dlakeacct",
            "datalake_container_name": "raw",
        },
    }
    params.update(overrides)
    return CorePipelineConfig(**params)
39
+
40
+
41
def test_lake_path_bronze():
    """Bronze path uses the bronze version and processing method."""
    expected = "raw/bronze/finance/forecast/positions/v1/output/incremental"
    assert make_config().get_lake_path("bronze") == expected
47
+
48
+
49
def test_lake_path_silver():
    """Silver path uses the silver version and processing method."""
    expected = "raw/silver/finance/forecast/positions/v2/output/full"
    assert make_config().get_lake_path("silver") == expected
55
+
56
+
57
def test_lake_path_gold():
    """Gold path uses the gold version and processing method."""
    expected = "raw/gold/finance/forecast/positions/v3/output/delta"
    assert make_config().get_lake_path("gold") == expected
63
+
64
+
65
def test_lake_path_overrides():
    """Per-call version and processing-method overrides replace the configured values."""
    path = make_config().get_lake_path(
        "gold", processing_method_override="full", version_override="v9"
    )
    assert path == "raw/gold/finance/forecast/positions/v9/output/full"
71
+
72
+
73
def test_validate_rules_pass():
    """A well-formed config passes all validation rules."""
    assert make_config().validate_rules() is True
76
+
77
+
78
def test_validate_rules_fail_uppercase_domain():
    """An uppercase domain violates the lowercase-lake-path rule."""
    cfg = make_config(domain="Finance")
    with pytest.raises(ValueError) as exc_info:
        cfg.validate_rules()
    assert "uppercase" in str(exc_info.value).lower()
83
+
84
+
85
def test_validate_rules_fail_bad_version_pattern():
    """A version missing the leading 'v' fails the version-format rule."""
    cfg = make_config(bronze_version="version1")
    with pytest.raises(ValueError) as exc_info:
        cfg.validate_rules()
    message = str(exc_info.value)
    assert "pattern" in message.lower() or "v<integer>" in message
90
+
91
+
92
def test_validate_rules_pass_custom_version_override():
    """A correctly formatted per-call version override produces the expected path."""
    bronze_path = make_config().get_lake_path("bronze", version_override="v99")
    assert bronze_path.endswith("/v99/output/incremental")
97
+
98
+
99
def test_case_fallback_env_uppercase_resolution(monkeypatch):
    """Primary fallback behavior: uppercase variant is resolved when case_fallback=True."""
    upper_key = "DATALAKE_NAME"
    lower_key = CoreParam.DATALAKE_NAME.value
    monkeypatch.delenv(upper_key, raising=False)
    monkeypatch.delenv(lower_key, raising=False)

    # On Windows env vars are case-insensitive, but logic path still runs.
    monkeypatch.setenv(upper_key, "LakeAcctFallback")
    monkeypatch.setenv("DATALAKE_CONTAINER_NAME", "container-fb")
    monkeypatch.setenv("env", Environment.DEV.value)

    manager = PipelineParameterManager(case_fallback=True)
    infra = manager.prepare_infrastructure(
        ["datalake_name", "datalake_container_name"]
    )
    assert infra.variables.get("datalake_name") == "LakeAcctFallback"
    assert infra.variables.get("datalake_container_name") == "container-fb"