etlplus 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. etlplus/README.md +37 -0
  2. etlplus/api/README.md +20 -3
  3. etlplus/cli/README.md +40 -0
  4. etlplus/cli/commands.py +1 -1
  5. etlplus/cli/constants.py +1 -1
  6. etlplus/cli/handlers.py +1 -1
  7. etlplus/cli/io.py +2 -2
  8. etlplus/config/README.md +52 -0
  9. etlplus/config/pipeline.py +2 -2
  10. etlplus/database/README.md +48 -0
  11. etlplus/database/ddl.py +1 -1
  12. etlplus/database/engine.py +1 -1
  13. etlplus/database/schema.py +1 -1
  14. etlplus/enums.py +2 -270
  15. etlplus/extract.py +5 -7
  16. etlplus/file/README.md +105 -0
  17. etlplus/file/__init__.py +25 -0
  18. etlplus/file/avro.py +198 -0
  19. etlplus/file/core.py +287 -0
  20. etlplus/file/csv.py +91 -0
  21. etlplus/file/enums.py +238 -0
  22. etlplus/file/feather.py +144 -0
  23. etlplus/file/gz.py +123 -0
  24. etlplus/file/json.py +98 -0
  25. etlplus/file/ndjson.py +109 -0
  26. etlplus/file/orc.py +142 -0
  27. etlplus/file/parquet.py +146 -0
  28. etlplus/file/tsv.py +91 -0
  29. etlplus/file/txt.py +99 -0
  30. etlplus/file/xls.py +132 -0
  31. etlplus/file/xlsx.py +142 -0
  32. etlplus/file/xml.py +174 -0
  33. etlplus/file/yaml.py +136 -0
  34. etlplus/file/zip.py +175 -0
  35. etlplus/load.py +9 -12
  36. etlplus/run.py +6 -9
  37. etlplus/templates/README.md +46 -0
  38. etlplus/validation/README.md +50 -0
  39. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
  40. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/RECORD +44 -20
  41. etlplus/file.py +0 -652
  42. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
  43. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
  44. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
  45. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/file/orc.py ADDED
@@ -0,0 +1,142 @@
+ """
+ :mod:`etlplus.file.orc` module.
+
+ Helpers for reading/writing ORC files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'ORC support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError('ORC payloads must contain only objects (dicts)')
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read ORC content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ORC file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the ORC file.
+
+     Raises
+     ------
+     ImportError
+         When optional dependency "pyarrow" is missing.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_orc(path)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'ORC support requires optional dependency "pyarrow".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to ORC at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ORC file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     ImportError
+         When optional dependency "pyarrow" is missing.
+     """
+     records = _normalize_records(data)
+     if not records:
+         return 0
+
+     pandas = _get_pandas()
+     path.parent.mkdir(parents=True, exist_ok=True)
+     frame = pandas.DataFrame.from_records(records)
+     try:
+         frame.to_orc(path, index=False)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'ORC support requires optional dependency "pyarrow".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return len(records)
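For orientation, a minimal round trip with these ORC helpers might look like the sketch below; it assumes the optional pandas and pyarrow dependencies are installed, and the file name and records are illustrative.

    from pathlib import Path

    from etlplus.file import orc

    # Illustrative records; write() wraps a bare dict into a one-item list.
    records = [{'id': 1, 'name': 'ada'}, {'id': 2, 'name': 'bob'}]
    written = orc.write(Path('sample.orc'), records)  # expected: 2
    rows = orc.read(Path('sample.orc'))               # list of dicts again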
etlplus/file/parquet.py ADDED
@@ -0,0 +1,146 @@
+ """
+ :mod:`etlplus.file.parquet` module.
+
+ Helpers for reading/writing Parquet files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Parquet support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError(
+                 'Parquet payloads must contain only objects (dicts)',
+             )
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read Parquet content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Parquet file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the Parquet file.
+
+     Raises
+     ------
+     ImportError
+         If optional dependencies for Parquet support are missing.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_parquet(path)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Parquet support requires optional dependency '
+             '"pyarrow" or "fastparquet".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to Parquet at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Parquet file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     ImportError
+         If optional dependencies for Parquet support are missing.
+     """
+     records = _normalize_records(data)
+     if not records:
+         return 0
+
+     pandas = _get_pandas()
+     path.parent.mkdir(parents=True, exist_ok=True)
+     frame = pandas.DataFrame.from_records(records)
+     try:
+         frame.to_parquet(path, index=False)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Parquet support requires optional dependency '
+             '"pyarrow" or "fastparquet".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return len(records)
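An analogous sketch for the Parquet helpers, again assuming pandas plus pyarrow (or fastparquet) are installed; note that a bare dict is normalized into a single-row table.

    from pathlib import Path

    from etlplus.file import parquet

    # A bare dict goes through _normalize_records() and becomes one row.
    parquet.write(Path('one.parquet'), {'id': 1, 'name': 'ada'})  # expected: 1
    print(parquet.read(Path('one.parquet')))  # [{'id': 1, 'name': 'ada'}]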
etlplus/file/tsv.py ADDED
@@ -0,0 +1,91 @@
+ """
+ :mod:`etlplus.file.tsv` module.
+
+ Helpers for reading/writing TSV files.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read TSV content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TSV file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the TSV file.
+     """
+     with path.open('r', encoding='utf-8', newline='') as handle:
+         reader: csv.DictReader[str] = csv.DictReader(handle, delimiter='\t')
+         rows: JSONList = []
+         for row in reader:
+             if not any(row.values()):
+                 continue
+             rows.append(cast(JSONDict, dict(row)))
+     return rows
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to TSV at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TSV file on disk.
+     data : JSONData
+         Data to write as TSV. Should be a list of dictionaries or a
+         single dictionary.
+
+     Returns
+     -------
+     int
+         The number of rows written to the TSV file.
+     """
+     rows: list[JSONDict]
+     if isinstance(data, list):
+         rows = [row for row in data if isinstance(row, dict)]
+     else:
+         rows = [data]
+
+     if not rows:
+         return 0
+
+     fieldnames = sorted({key for row in rows for key in row})
+     with path.open('w', encoding='utf-8', newline='') as handle:
+         writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter='\t')
+         writer.writeheader()
+         for row in rows:
+             writer.writerow({field: row.get(field) for field in fieldnames})
+
+     return len(rows)
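Because write() takes the sorted union of all row keys and read() goes through csv.DictReader, a TSV round trip returns string values and fills missing fields with empty strings. An illustrative sketch:

    from pathlib import Path

    from etlplus.file import tsv

    rows = [{'b': 2, 'a': 1}, {'a': 3, 'c': 4}]
    tsv.write(Path('sample.tsv'), rows)  # header columns: a, b, c (sorted union)
    print(tsv.read(Path('sample.tsv')))
    # e.g. [{'a': '1', 'b': '2', 'c': ''}, {'a': '3', 'b': '', 'c': '4'}]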
etlplus/file/txt.py ADDED
@@ -0,0 +1,99 @@
+ """
+ :mod:`etlplus.file.txt` module.
+
+ Helpers for reading/writing text files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+ from ..utils import count_records
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read TXT content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TXT file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the TXT file.
+     """
+     rows: JSONList = []
+     with path.open('r', encoding='utf-8') as handle:
+         for line in handle:
+             text = line.rstrip('\n')
+             if text == '':
+                 continue
+             rows.append({'text': text})
+     return rows
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to TXT at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TXT file on disk.
+     data : JSONData
+         Data to write. Expects ``{'text': '...'}`` or a list of those.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     TypeError
+         If any item in ``data`` is not a dictionary or if any dictionary
+         does not contain a ``'text'`` key.
+     """
+     rows: JSONList
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError('TXT payloads must contain only objects (dicts)')
+         rows = cast(JSONList, data)
+     else:
+         rows = [cast(JSONDict, data)]
+
+     if not rows:
+         return 0
+
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open('w', encoding='utf-8') as handle:
+         for row in rows:
+             if 'text' not in row:
+                 raise TypeError('TXT payloads must include a "text" key')
+             handle.write(str(row['text']))
+             handle.write('\n')
+
+     return count_records(rows)
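The TXT helpers round-trip line-oriented text as {'text': ...} records, skipping blank lines on read. A small sketch, assuming etlplus.utils.count_records simply counts the given records:

    from pathlib import Path

    from etlplus.file import txt

    txt.write(Path('notes.txt'), [{'text': 'first line'}, {'text': 'second line'}])  # expected: 2
    print(txt.read(Path('notes.txt')))  # [{'text': 'first line'}, {'text': 'second line'}]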
etlplus/file/xls.py ADDED
@@ -0,0 +1,132 @@
+ """
+ :mod:`etlplus.file.xls` module.
+
+ Helpers for reading/writing Excel XLS files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'XLS support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError('XLS payloads must contain only objects (dicts)')
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read XLS content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the XLS file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the XLS file.
+
+     Raises
+     ------
+     ImportError
+         If the optional dependency "xlrd" is not installed.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_excel(path, engine='xlrd')
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'XLS support requires optional dependency "xlrd".\n'
+             'Install with: pip install xlrd',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to XLS at ``path`` and return record count.
+
+     Notes
+     -----
+     XLS writing is not supported by pandas 2.x. Use XLSX for writes.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the XLS file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     RuntimeError
+         Always; XLS writing is not supported. Use XLSX instead.
+     """
+     raise RuntimeError('XLS write is not supported; use XLSX instead')
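Since write() always raises, the XLS helpers are effectively read-only; writes are expected to go through the sibling xlsx module instead (an assumption based on the file list above). A hedged, read-only sketch, assuming pandas and xlrd are installed and legacy.xls exists:

    from pathlib import Path

    from etlplus.file import xls

    rows = xls.read(Path('legacy.xls'))  # requires pandas + xlrd
    # xls.write(...) raises RuntimeError('XLS write is not supported; use XLSX instead')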