PyPI - etlplus - Versions diffs - 0.15.5__py3-none-any.whl → 0.16.2__py3-none-any.whl - Mend

etlplus 0.15.5__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

etlplus/api/types.py +32 -11
etlplus/cli/constants.py +1 -1
etlplus/connector/__init__.py +43 -0
etlplus/connector/api.py +161 -0
etlplus/connector/connector.py +26 -0
etlplus/connector/core.py +132 -0
etlplus/connector/database.py +122 -0
etlplus/connector/enums.py +52 -0
etlplus/connector/file.py +120 -0
etlplus/connector/types.py +40 -0
etlplus/connector/utils.py +122 -0
etlplus/enums.py +0 -32
etlplus/ops/extract.py +210 -23
etlplus/ops/load.py +141 -35
etlplus/ops/run.py +86 -101
etlplus/ops/transform.py +30 -11
etlplus/types.py +3 -2
etlplus/workflow/__init__.py +2 -11
etlplus/workflow/dag.py +23 -1
etlplus/workflow/jobs.py +15 -26
etlplus/workflow/pipeline.py +39 -56
etlplus/workflow/profile.py +4 -2
{etlplus-0.15.5.dist-info → etlplus-0.16.2.dist-info}/METADATA +1 -1
{etlplus-0.15.5.dist-info → etlplus-0.16.2.dist-info}/RECORD +28 -21
etlplus/workflow/connector.py +0 -386
etlplus/workflow/types.py +0 -115
{etlplus-0.15.5.dist-info → etlplus-0.16.2.dist-info}/WHEEL +0 -0
{etlplus-0.15.5.dist-info → etlplus-0.16.2.dist-info}/entry_points.txt +0 -0
{etlplus-0.15.5.dist-info → etlplus-0.16.2.dist-info}/licenses/LICENSE +0 -0
{etlplus-0.15.5.dist-info → etlplus-0.16.2.dist-info}/top_level.txt +0 -0

etlplus/ops/load.py CHANGED Viewed

@@ -8,13 +8,15 @@ from __future__ import annotations
 import json
 import sys
+from collections.abc import Mapping
 from pathlib import Path
 from typing import Any
 from typing import cast
 from ..api import HttpMethod
+from ..api import compose_api_target_env
 from ..api.utils import resolve_request
-from ..enums import DataConnectorType
+from ..connector import DataConnectorType
 from ..file import File
 from ..file import FileFormat
 from ..types import JSONData
@@ -39,6 +41,108 @@ __all__ = [
 # SECTION: INTERNAL FUNCTIONS ============================================== #
+def _load_data_from_str(
+    source: str,
+) -> JSONData:
+    """
+    Load JSON data from a string or file path.
+    Parameters
+    ----------
+    source : str
+        Input string representing a file path or JSON payload.
+    Returns
+    -------
+    JSONData
+        Parsed JSON payload.
+    """
+    # Special case: '-' means read JSON from STDIN (Unix convention).
+    if source == '-':
+        raw = sys.stdin.read()
+        return _parse_json_string(raw)
+    candidate = Path(source)
+    if candidate.exists():
+        try:
+            return File(candidate, FileFormat.JSON).read()
+        except (OSError, json.JSONDecodeError, ValueError):
+            # Fall back to treating the string as raw JSON content.
+            pass
+    return _parse_json_string(source)
+def _load_to_api_env(
+    data: JSONData,
+    env: Mapping[str, Any],
+) -> JSONDict:
+    """
+    Load data to an API target using a normalized environment.
+    Parameters
+    ----------
+    data : JSONData
+        Payload to load.
+    env : Mapping[str, Any]
+        Normalized request environment.
+    Returns
+    -------
+    JSONDict
+        Load result payload.
+    Raises
+    ------
+    ValueError
+        If required parameters are missing.
+    """
+    url = env.get('url')
+    if not url:
+        raise ValueError('API target missing "url"')
+    method = env.get('method') or 'post'
+    kwargs: dict[str, Any] = {}
+    headers = env.get('headers')
+    if headers:
+        kwargs['headers'] = cast(dict[str, str], headers)
+    if env.get('timeout') is not None:
+        kwargs['timeout'] = env.get('timeout')
+    session = env.get('session')
+    if session is not None:
+        kwargs['session'] = session
+    extra_kwargs = env.get('request_kwargs')
+    if isinstance(extra_kwargs, Mapping):
+        kwargs.update(extra_kwargs)
+    timeout = kwargs.pop('timeout', 10.0)
+    session = kwargs.pop('session', None)
+    request_callable, timeout, http_method = resolve_request(
+        method,
+        session=session,
+        timeout=timeout,
+    )
+    response = request_callable(
+        cast(str, url),
+        json=data,
+        timeout=timeout,
+        **kwargs,
+    )
+    response.raise_for_status()
+    # Try JSON first, fall back to text.
+    try:
+        payload: Any = response.json()
+    except ValueError:
+        payload = response.text
+    return {
+        'status': 'success',
+        'status_code': response.status_code,
+        'message': f'Data loaded to {url}',
+        'response': payload,
+        'records': count_records(data),
+        'method': http_method.value.upper(),
+    }
 def _parse_json_string(
     raw: str,
 ) -> JSONData:
@@ -113,18 +217,7 @@ def load_data(
         return File(source, FileFormat.JSON).read()
     if isinstance(source, str):
-        # Special case: '-' means read JSON from STDIN (Unix convention).
-        if source == '-':
-            raw = sys.stdin.read()
-            return _parse_json_string(raw)
-        candidate = Path(source)
-        if candidate.exists():
-            try:
-                return File(candidate, FileFormat.JSON).read()
-            except (OSError, json.JSONDecodeError, ValueError):
-                # Fall back to treating the string as raw JSON content.
-                pass
-        return _parse_json_string(source)
+        return _load_data_from_str(source)
     raise TypeError(
         'source must be a mapping, sequence of mappings, path, or JSON string',
@@ -158,30 +251,43 @@ def load_to_api(
         Result dictionary including response payload or text.
     """
     # Apply a conservative timeout to guard against hanging requests.
-    timeout = kwargs.pop('timeout', 10.0)
-    session = kwargs.pop('session', None)
-    request_callable, timeout, http_method = resolve_request(
-        method,
-        session=session,
-        timeout=timeout,
-    )
-    response = request_callable(url, json=data, timeout=timeout, **kwargs)
-    response.raise_for_status()
+    env = {
+        'url': url,
+        'method': method,
+        'timeout': kwargs.pop('timeout', 10.0),
+        'session': kwargs.pop('session', None),
+        'request_kwargs': kwargs,
+    }
+    return _load_to_api_env(data, env)
-    # Try JSON first, fall back to text.
-    try:
-        payload: Any = response.json()
-    except ValueError:
-        payload = response.text
-    return {
-        'status': 'success',
-        'status_code': response.status_code,
-        'message': f'Data loaded to {url}',
-        'response': payload,
-        'records': count_records(data),
-        'method': http_method.value.upper(),
-    }
+def load_to_api_target(
+    cfg: Any,
+    target_obj: Any,
+    overrides: dict[str, Any],
+    data: JSONData,
+) -> JSONDict:
+    """
+    Load data to an API target connector.
+    Parameters
+    ----------
+    cfg : Any
+        Pipeline configuration.
+    target_obj : Any
+        Connector configuration.
+    overrides : dict[str, Any]
+        Load-time overrides.
+    data : JSONData
+        Payload to load.
+    Returns
+    -------
+    JSONDict
+        Load result.
+    """
+    env = compose_api_target_env(cfg, target_obj, overrides)
+    return _load_to_api_env(data, env)
 def load_to_database(

etlplus/ops/run.py CHANGED Viewed

@@ -6,31 +6,23 @@ A module for running ETL jobs defined in YAML configurations.
 from __future__ import annotations
-from collections.abc import Mapping
 from typing import Any
 from typing import Final
 from typing import cast
-from urllib.parse import urlsplit
-from urllib.parse import urlunsplit
-from ..api import EndpointClient  # noqa: F401 (re-exported for tests)
 from ..api import HttpMethod
-from ..api import PaginationConfigMap
-from ..api import RequestOptions
-from ..api import compose_api_request_env
-from ..api import compose_api_target_env
-from ..api import paginate_with_client
-from ..enums import DataConnectorType
+from ..connector import DataConnectorType
 from ..file import FileFormat
 from ..types import JSONData
 from ..types import JSONDict
 from ..types import PipelineConfig
 from ..types import StrPath
-from ..types import Timeout
 from ..utils import print_json
 from ..workflow import load_pipeline_config
 from .extract import extract
+from .extract import extract_from_api_source
 from .load import load
+from .load import load_to_api_target
 from .transform import transform
 from .utils import maybe_validate
 from .validate import validate
@@ -54,6 +46,75 @@ DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
 # SECTION: INTERNAL FUNCTIONS =============================================== #
+def _index_connectors(
+    connectors: list[Any],
+    *,
+    label: str,
+) -> dict[str, Any]:
+    """
+    Index connectors by name with a helpful error on duplicates.
+    Parameters
+    ----------
+    connectors : list[Any]
+        Connector objects to index.
+    label : str
+        Label used in error messages (e.g., ``"source"``).
+    Returns
+    -------
+    dict[str, Any]
+        Mapping of connector names to connector objects.
+    Raises
+    ------
+    ValueError
+        If duplicate connector names are found.
+    """
+    indexed: dict[str, Any] = {}
+    for connector in connectors:
+        name = getattr(connector, 'name', None)
+        if not isinstance(name, str) or not name:
+            continue
+        if name in indexed:
+            raise ValueError(f'Duplicate {label} connector name: {name}')
+        indexed[name] = connector
+    return indexed
+def _require_named_connector(
+    connectors: dict[str, Any],
+    name: str,
+    *,
+    label: str,
+) -> Any:
+    """
+    Return a connector by name or raise a helpful error.
+    Parameters
+    ----------
+    connectors : dict[str, Any]
+        Mapping of connector names to connector objects.
+    name : str
+        Connector name to retrieve.
+    label : str
+        Label used in error messages (e.g., ``"source"``).
+    Returns
+    -------
+    Any
+        Connector object.
+    Raises
+    ------
+    ValueError
+        If the connector name is not found.
+    """
+    if name not in connectors:
+        raise ValueError(f'Unknown {label}: {name}')
+    return connectors[name]
 def _resolve_validation_config(
     job_obj: Any,
     cfg: Any,
@@ -122,16 +183,18 @@ def run(
         raise ValueError(f'Job not found: {job}')
     # Index sources/targets by name
-    sources_by_name = {getattr(s, 'name', None): s for s in cfg.sources}
-    targets_by_name = {getattr(t, 'name', None): t for t in cfg.targets}
+    sources_by_name = _index_connectors(cfg.sources, label='source')
+    targets_by_name = _index_connectors(cfg.targets, label='target')
     # Extract.
     if not job_obj.extract:
         raise ValueError('Job missing "extract" section')
     source_name = job_obj.extract.source
-    if source_name not in sources_by_name:
-        raise ValueError(f'Unknown source: {source_name}')
-    source_obj = sources_by_name[source_name]
+    source_obj = _require_named_connector(
+        sources_by_name,
+        source_name,
+        label='source',
+    )
     ex_opts: dict[str, Any] = job_obj.extract.options or {}
     data: Any
@@ -151,68 +214,7 @@ def run(
             conn = getattr(source_obj, 'connection_string', '')
             data = extract('database', conn)
         case DataConnectorType.API:
-            env = compose_api_request_env(cfg, source_obj, ex_opts)
-            if (
-                env.get('use_endpoints')
-                and env.get('base_url')
-                and env.get('endpoints_map')
-                and env.get('endpoint_key')
-            ):
-                # Construct client using module-level EndpointClient so tests
-                # can monkeypatch this class on etlplus.ops.run.
-                ClientClass = EndpointClient  # noqa: N806
-                client = ClientClass(
-                    base_url=cast(str, env.get('base_url')),
-                    base_path=cast(str | None, env.get('base_path')),
-                    endpoints=cast(
-                        dict[str, str],
-                        env.get('endpoints_map', {}),
-                    ),
-                    retry=env.get('retry'),
-                    retry_network_errors=bool(
-                        env.get('retry_network_errors', False),
-                    ),
-                    session=env.get('session'),
-                )
-                data = paginate_with_client(
-                    client,
-                    cast(str, env.get('endpoint_key')),
-                    env.get('params'),
-                    env.get('headers'),
-                    env.get('timeout'),
-                    env.get('pagination'),
-                    cast(float | None, env.get('sleep_seconds')),
-                )
-            else:
-                url = env.get('url')
-                if not url:
-                    raise ValueError('API source missing URL')
-                parts = urlsplit(cast(str, url))
-                base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
-                ClientClass = EndpointClient  # noqa: N806
-                client = ClientClass(
-                    base_url=base,
-                    base_path=None,
-                    endpoints={},
-                    retry=env.get('retry'),
-                    retry_network_errors=bool(
-                        env.get('retry_network_errors', False),
-                    ),
-                    session=env.get('session'),
-                )
-                request_options = RequestOptions(
-                    params=cast(Mapping[str, Any] | None, env.get('params')),
-                    headers=cast(Mapping[str, str] | None, env.get('headers')),
-                    timeout=cast(Timeout | None, env.get('timeout')),
-                )
-                data = client.paginate_url(
-                    cast(str, url),
-                    cast(PaginationConfigMap | None, env.get('pagination')),
-                    request=request_options,
-                    sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
-                )
+            data = extract_from_api_source(cfg, source_obj, ex_opts)
         case _:
             # :meth:`coerce` already raises for invalid connector types, but
             # keep explicit guard for defensive programming.
@@ -256,9 +258,11 @@ def run(
     if not job_obj.load:
         raise ValueError('Job missing "load" section')
     target_name = job_obj.load.target
-    if target_name not in targets_by_name:
-        raise ValueError(f'Unknown target: {target_name}')
-    target_obj = targets_by_name[target_name]
+    target_obj = _require_named_connector(
+        targets_by_name,
+        target_name,
+        label='target',
+    )
     overrides = job_obj.load.overrides or {}
     ttype_raw = getattr(target_obj, 'type', None)
@@ -274,26 +278,7 @@ def run(
                 raise ValueError('File target missing "path"')
             result = load(data, 'file', path, file_format=fmt)
         case DataConnectorType.API:
-            env_t = compose_api_target_env(cfg, target_obj, overrides)
-            url_t = env_t.get('url')
-            if not url_t:
-                raise ValueError('API target missing "url"')
-            kwargs_t: dict[str, Any] = {}
-            headers = env_t.get('headers')
-            if headers:
-                kwargs_t['headers'] = cast(dict[str, str], headers)
-            if env_t.get('timeout') is not None:
-                kwargs_t['timeout'] = env_t.get('timeout')
-            session = env_t.get('session')
-            if session is not None:
-                kwargs_t['session'] = session
-            result = load(
-                data,
-                'api',
-                cast(str, url_t),
-                method=cast(str | Any, env_t.get('method') or 'post'),
-                **kwargs_t,
-            )
+            result = load_to_api_target(cfg, target_obj, overrides, data)
         case DataConnectorType.DATABASE:
             conn = overrides.get('connection_string') or getattr(
                 target_obj,

etlplus/ops/transform.py CHANGED Viewed

@@ -206,15 +206,12 @@ def _normalize_specs(
     """
     if config is None:
         return []
-    if isinstance(config, Sequence) and not isinstance(
-        config,
-        (str, bytes, bytearray),
-    ):
+    if _is_sequence_not_text(config):
         # Already a sequence of step specs; normalize to a list.
-        return list(config)  # type: ignore[list-item]
+        return list(cast(Sequence[StepSpec], config))
     # Single spec
-    return [config]
+    return [cast(StepSpec, config)]
 def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
@@ -702,7 +699,31 @@ def _apply_sort_step(
 # -- Helpers -- #
-def _is_plain_fields_list(obj: Any) -> bool:
+def _is_sequence_not_text(
+    obj: Any,
+) -> bool:
+    """
+    Return ``True`` for non-text sequences.
+    Parameters
+    ----------
+    obj : Any
+        The object to check.
+    Returns
+    -------
+    bool
+        ``True`` when *obj* is a non-text sequence.
+    """
+    return isinstance(obj, Sequence) and not isinstance(
+        obj,
+        (str, bytes, bytearray),
+    )
+def _is_plain_fields_list(
+    obj: Any,
+) -> bool:
     """
     Return True if obj is a non-text sequence of non-mapping items.
@@ -719,10 +740,8 @@ def _is_plain_fields_list(obj: Any) -> bool:
         True if obj is a non-text sequence of non-mapping items, False
         otherwise.
     """
-    return (
-        isinstance(obj, Sequence)
-        and not isinstance(obj, (str, bytes, bytearray))
-        and not any(isinstance(x, Mapping) for x in obj)
+    return _is_sequence_not_text(obj) and not any(
+        isinstance(x, Mapping) for x in obj
     )

etlplus/types.py CHANGED Viewed

@@ -12,8 +12,7 @@ Notes
 See Also
 --------
 - :mod:`etlplus.api.types` for HTTP-specific aliases and data classes
-- :mod:`etlplus.workflow.types` for workflow-specific aliases and TypedDict
-    surfaces
+- :mod:`etlplus.connector.types` for connector-specific aliases
 Examples
 --------
@@ -81,6 +80,8 @@ __all__ = [
     # Type Aliases (Networking / Runtime)
     'Sleeper',
     'Timeout',
+    # Type Aliases (Templates)
+    'TemplateKey',
 ]

etlplus/workflow/__init__.py CHANGED Viewed

@@ -6,11 +6,6 @@ Job workflow helpers.
 from __future__ import annotations
-from .connector import Connector
-from .connector import ConnectorApi
-from .connector import ConnectorDb
-from .connector import ConnectorFile
-from .connector import parse_connector
 from .dag import topological_sort_jobs
 from .jobs import ExtractRef
 from .jobs import JobConfig
@@ -19,25 +14,21 @@ from .jobs import TransformRef
 from .jobs import ValidationRef
 from .pipeline import PipelineConfig
 from .pipeline import load_pipeline_config
+from .profile import ProfileConfig
 # SECTION: EXPORTS ========================================================== #
 __all__ = [
     # Data Classes
-    'ConnectorApi',
-    'ConnectorDb',
-    'ConnectorFile',
     'ExtractRef',
     'JobConfig',
     'LoadRef',
     'PipelineConfig',
+    'ProfileConfig',
     'TransformRef',
     'ValidationRef',
     # Functions
     'load_pipeline_config',
-    'parse_connector',
     'topological_sort_jobs',
-    # Type Aliases
-    'Connector',
 ]

etlplus/workflow/dag.py CHANGED Viewed

@@ -47,6 +47,28 @@ class DagError(ValueError):
         return self.message
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+def _ready(
+    indegree: dict[str, int],
+) -> list[str]:
+    """
+    Return a sorted list of nodes with zero indegree.
+    Parameters
+    ----------
+    indegree : dict[str, int]
+        Mapping of node name to indegree.
+    Returns
+    -------
+    list[str]
+        Sorted list of node names ready to process.
+    """
+    return sorted(name for name, deg in indegree.items() if deg == 0)
 # SECTION: FUNCTIONS ======================================================== #
@@ -88,7 +110,7 @@ def topological_sort_jobs(
                 edges[dep].add(job.name)
                 indegree[job.name] += 1
-    queue = deque(sorted(name for name, deg in indegree.items() if deg == 0))
+    queue = deque(_ready(indegree))
     ordered: list[str] = []
     while queue:

etlplus 0.15.5__py3-none-any.whl → 0.16.2__py3-none-any.whl

etlplus 0.15.5__py3-none-any.whl → 0.16.2__py3-none-any.whl