PyPI - etlplus - Versions diffs - 0.11.11__py3-none-any.whl → 0.12.2__py3-none-any.whl - Mend

etlplus 0.11.11__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

etlplus/README.md +37 -0
etlplus/api/README.md +20 -3
etlplus/cli/README.md +40 -0
etlplus/config/README.md +52 -0
etlplus/database/README.md +48 -0
etlplus/file/README.md +105 -0
etlplus/file/avro.py +157 -18
etlplus/file/core.py +3 -3
etlplus/file/csv.py +12 -3
etlplus/file/feather.py +100 -15
etlplus/file/gz.py +80 -16
etlplus/file/json.py +13 -2
etlplus/file/ndjson.py +61 -11
etlplus/file/orc.py +95 -12
etlplus/file/parquet.py +100 -13
etlplus/file/tsv.py +52 -20
etlplus/file/txt.py +56 -16
etlplus/file/xls.py +85 -12
etlplus/file/xlsx.py +95 -12
etlplus/file/xml.py +12 -3
etlplus/file/yaml.py +13 -2
etlplus/file/zip.py +133 -7
etlplus/templates/README.md +46 -0
etlplus/validation/README.md +50 -0
{etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
{etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/RECORD +30 -23
{etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
{etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
{etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
{etlplus-0.11.11.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0

etlplus/file/parquet.py CHANGED Viewed

@@ -1,21 +1,81 @@
 """
 :mod:`etlplus.file.parquet` module.
-Stub helpers for PARQUET read/write.
+Helpers for reading/writing Parquet files.
 """
 from __future__ import annotations
 from pathlib import Path
+from typing import Any
+from typing import cast
 from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
 # SECTION: EXPORTS ========================================================== #
-def read(path: Path) -> JSONData:
+__all__ = [
+    'read',
+    'write',
+]
+# SECTION: INTERNAL CONSTANTS =============================================== #
+_PANDAS_CACHE: dict[str, Any] = {}
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+def _get_pandas() -> Any:
+    """
+    Return the pandas module, importing it on first use.
+    Raises an informative ImportError if the optional dependency is missing.
+    """
+    mod = _PANDAS_CACHE.get('mod')
+    if mod is not None:  # pragma: no cover - tiny branch
+        return mod
+    try:
+        _pd = __import__('pandas')  # type: ignore[assignment]
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'Parquet support requires optional dependency "pandas".\n'
+            'Install with: pip install pandas',
+        ) from e
+    _PANDAS_CACHE['mod'] = _pd
+    return _pd
+def _normalize_records(data: JSONData) -> JSONList:
     """
-    Read PARQUET content from ``path``.
+    Normalize JSON payloads into a list of dictionaries.
+    Raises TypeError when payloads contain non-dict items.
+    """
+    if isinstance(data, list):
+        if not all(isinstance(item, dict) for item in data):
+            raise TypeError(
+                'Parquet payloads must contain only objects (dicts)',
+            )
+        return cast(JSONList, data)
+    return [cast(JSONDict, data)]
+# SECTION: FUNCTIONS ======================================================== #
+def read(
+    path: Path,
+) -> JSONList:
+    """
+    Read Parquet content from ``path``.
     Parameters
     ----------
@@ -24,20 +84,32 @@ def read(path: Path) -> JSONData:
     Returns
     -------
-    JSONData
-        Parsed payload.
+    JSONList
+        The list of dictionaries read from the Parquet file.
     Raises
     ------
-    NotImplementedError
-        PARQUET :func:`read` is not implemented yet.
+    ImportError
+        If optional dependencies for Parquet support are missing.
     """
-    raise NotImplementedError('PARQUET read is not implemented yet')
+    pandas = _get_pandas()
+    try:
+        frame = pandas.read_parquet(path)
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'Parquet support requires optional dependency '
+            '"pyarrow" or "fastparquet".\n'
+            'Install with: pip install pyarrow',
+        ) from e
+    return cast(JSONList, frame.to_dict(orient='records'))
-def write(path: Path, data: JSONData) -> int:
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
     """
-    Write ``data`` to PARQUET at ``path``.
+    Write ``data`` to Parquet at ``path`` and return record count.
     Parameters
     ----------
@@ -53,7 +125,22 @@ def write(path: Path, data: JSONData) -> int:
     Raises
     ------
-    NotImplementedError
-        PARQUET :func:`write` is not implemented yet.
+    ImportError
+        If optional dependencies for Parquet support are missing.
     """
-    raise NotImplementedError('PARQUET write is not implemented yet')
+    records = _normalize_records(data)
+    if not records:
+        return 0
+    pandas = _get_pandas()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    frame = pandas.DataFrame.from_records(records)
+    try:
+        frame.to_parquet(path, index=False)
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'Parquet support requires optional dependency '
+            '"pyarrow" or "fastparquet".\n'
+            'Install with: pip install pyarrow',
+        ) from e
+    return len(records)

etlplus/file/tsv.py CHANGED Viewed

@@ -1,19 +1,34 @@
 """
 :mod:`etlplus.file.tsv` module.
-Stub helpers for TSV read/write.
+Helpers for reading/writing TSV files.
 """
 from __future__ import annotations
+import csv
 from pathlib import Path
+from typing import cast
 from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
 # SECTION: EXPORTS ========================================================== #
-def read(path: Path) -> JSONData:
+__all__ = [
+    'read',
+    'write',
+]
+# SECTION: FUNCTIONS ======================================================== #
+def read(
+    path: Path,
+) -> JSONList:
     """
     Read TSV content from ``path``.
@@ -24,36 +39,53 @@ def read(path: Path) -> JSONData:
     Returns
     -------
-    JSONData
-        Parsed payload.
-    Raises
-    ------
-    NotImplementedError
-        TSV :func:`read` is not implemented yet.
+    JSONList
+        The list of dictionaries read from the TSV file.
     """
-    raise NotImplementedError('TSV read is not implemented yet')
+    with path.open('r', encoding='utf-8', newline='') as handle:
+        reader: csv.DictReader[str] = csv.DictReader(handle, delimiter='\t')
+        rows: JSONList = []
+        for row in reader:
+            if not any(row.values()):
+                continue
+            rows.append(cast(JSONDict, dict(row)))
+    return rows
-def write(path: Path, data: JSONData) -> int:
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
     """
-    Write ``data`` to TSV at ``path``.
+    Write ``data`` to TSV at ``path`` and return record count.
     Parameters
     ----------
     path : Path
         Path to the TSV file on disk.
     data : JSONData
-        Data to write.
+        Data to write as TSV. Should be a list of dictionaries or a
+        single dictionary.
     Returns
     -------
     int
-        Number of records written.
-    Raises
-    ------
-    NotImplementedError
-        TSV :func:`write` is not implemented yet.
+        The number of rows written to the TSV file.
     """
-    raise NotImplementedError('TSV write is not implemented yet')
+    rows: list[JSONDict]
+    if isinstance(data, list):
+        rows = [row for row in data if isinstance(row, dict)]
+    else:
+        rows = [data]
+    if not rows:
+        return 0
+    fieldnames = sorted({key for row in rows for key in row})
+    with path.open('w', encoding='utf-8', newline='') as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter='\t')
+        writer.writeheader()
+        for row in rows:
+            writer.writerow({field: row.get(field) for field in fieldnames})
+    return len(rows)

etlplus/file/txt.py CHANGED Viewed

@@ -1,19 +1,34 @@
 """
 :mod:`etlplus.file.txt` module.
-Stub helpers for TXT read/write.
+Helpers for reading/writing text files.
 """
 from __future__ import annotations
 from pathlib import Path
+from typing import cast
 from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+from ..utils import count_records
 # SECTION: EXPORTS ========================================================== #
-def read(path: Path) -> JSONData:
+__all__ = [
+    'read',
+    'write',
+]
+# SECTION: FUNCTIONS ======================================================== #
+def read(
+    path: Path,
+) -> JSONList:
     """
     Read TXT content from ``path``.
@@ -24,27 +39,32 @@ def read(path: Path) -> JSONData:
     Returns
     -------
-    JSONData
-        Parsed payload.
-    Raises
-    ------
-    NotImplementedError
-        TXT :func:`read` is not implemented yet.
+    JSONList
+        The list of dictionaries read from the TXT file.
     """
-    raise NotImplementedError('TXT read is not implemented yet')
+    rows: JSONList = []
+    with path.open('r', encoding='utf-8') as handle:
+        for line in handle:
+            text = line.rstrip('\n')
+            if text == '':
+                continue
+            rows.append({'text': text})
+    return rows
-def write(path: Path, data: JSONData) -> int:
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
     """
-    Write ``data`` to TXT at ``path``.
+    Write ``data`` to TXT at ``path`` and return record count.
     Parameters
     ----------
     path : Path
         Path to the TXT file on disk.
     data : JSONData
-        Data to write.
+        Data to write. Expects ``{'text': '...'}`` or a list of those.
     Returns
     -------
@@ -53,7 +73,27 @@ def write(path: Path, data: JSONData) -> int:
     Raises
     ------
-    NotImplementedError
-        TXT :func:`write` is not implemented yet.
+    TypeError
+        If any item in ``data`` is not a dictionary or if any dictionary
+        does not contain a ``'text'`` key.
     """
-    raise NotImplementedError('TXT write is not implemented yet')
+    rows: JSONList
+    if isinstance(data, list):
+        if not all(isinstance(item, dict) for item in data):
+            raise TypeError('TXT payloads must contain only objects (dicts)')
+        rows = cast(JSONList, data)
+    else:
+        rows = [cast(JSONDict, data)]
+    if not rows:
+        return 0
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('w', encoding='utf-8') as handle:
+        for row in rows:
+            if 'text' not in row:
+                raise TypeError('TXT payloads must include a "text" key')
+            handle.write(str(row['text']))
+            handle.write('\n')
+    return count_records(rows)

etlplus/file/xls.py CHANGED Viewed

@@ -1,19 +1,77 @@
 """
 :mod:`etlplus.file.xls` module.
-Stub helpers for XLS read/write.
+Helpers for reading/writing Excel XLS files.
 """
 from __future__ import annotations
 from pathlib import Path
+from typing import Any
+from typing import cast
 from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
 # SECTION: EXPORTS ========================================================== #
-def read(path: Path) -> JSONData:
+__all__ = [
+    'read',
+    'write',
+]
+# SECTION: INTERNAL CONSTANTS =============================================== #
+_PANDAS_CACHE: dict[str, Any] = {}
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+def _get_pandas() -> Any:
+    """
+    Return the pandas module, importing it on first use.
+    Raises an informative ImportError if the optional dependency is missing.
+    """
+    mod = _PANDAS_CACHE.get('mod')
+    if mod is not None:  # pragma: no cover - tiny branch
+        return mod
+    try:
+        _pd = __import__('pandas')  # type: ignore[assignment]
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'XLS support requires optional dependency "pandas".\n'
+            'Install with: pip install pandas',
+        ) from e
+    _PANDAS_CACHE['mod'] = _pd
+    return _pd
+def _normalize_records(data: JSONData) -> JSONList:
+    """
+    Normalize JSON payloads into a list of dictionaries.
+    Raises TypeError when payloads contain non-dict items.
+    """
+    if isinstance(data, list):
+        if not all(isinstance(item, dict) for item in data):
+            raise TypeError('XLS payloads must contain only objects (dicts)')
+        return cast(JSONList, data)
+    return [cast(JSONDict, data)]
+# SECTION: FUNCTIONS ======================================================== #
+def read(
+    path: Path,
+) -> JSONList:
     """
     Read XLS content from ``path``.
@@ -24,20 +82,35 @@ def read(path: Path) -> JSONData:
     Returns
     -------
-    JSONData
-        Parsed payload.
+    JSONList
+        The list of dictionaries read from the XLS file.
     Raises
     ------
-    NotImplementedError
-        XLS :func:`read` is not implemented yet.
+    ImportError
+        If the optional dependency "xlrd" is not installed.
     """
-    raise NotImplementedError('XLS read is not implemented yet')
+    pandas = _get_pandas()
+    try:
+        frame = pandas.read_excel(path, engine='xlrd')
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'XLS support requires optional dependency "xlrd".\n'
+            'Install with: pip install xlrd',
+        ) from e
+    return cast(JSONList, frame.to_dict(orient='records'))
-def write(path: Path, data: JSONData) -> int:
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
     """
-    Write ``data`` to XLS at ``path``.
+    Write ``data`` to XLS at ``path`` and return record count.
+    Notes
+    -----
+    XLS writing is not supported by pandas 2.x. Use XLSX for writes.
     Parameters
     ----------
@@ -53,7 +126,7 @@ def write(path: Path, data: JSONData) -> int:
     Raises
     ------
-    NotImplementedError
-        XLS :func:`write` is not implemented yet.
+    RuntimeError
+        Always raised; XLS writing is not supported. Use XLSX instead.
     """
-    raise NotImplementedError('XLS write is not implemented yet')
+    raise RuntimeError('XLS write is not supported; use XLSX instead')

etlplus/file/xlsx.py CHANGED Viewed

@@ -1,19 +1,77 @@
 """
 :mod:`etlplus.file.xlsx` module.
-Stub helpers for XLSX read/write.
+Helpers for reading/writing Excel XLSX files.
 """
 from __future__ import annotations
 from pathlib import Path
+from typing import Any
+from typing import cast
 from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
 # SECTION: EXPORTS ========================================================== #
-def read(path: Path) -> JSONData:
+__all__ = [
+    'read',
+    'write',
+]
+# SECTION: INTERNAL CONSTANTS =============================================== #
+_PANDAS_CACHE: dict[str, Any] = {}
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+def _get_pandas() -> Any:
+    """
+    Return the pandas module, importing it on first use.
+    Raises an informative ImportError if the optional dependency is missing.
+    """
+    mod = _PANDAS_CACHE.get('mod')
+    if mod is not None:  # pragma: no cover - tiny branch
+        return mod
+    try:
+        _pd = __import__('pandas')  # type: ignore[assignment]
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'XLSX support requires optional dependency "pandas".\n'
+            'Install with: pip install pandas',
+        ) from e
+    _PANDAS_CACHE['mod'] = _pd
+    return _pd
+def _normalize_records(data: JSONData) -> JSONList:
+    """
+    Normalize JSON payloads into a list of dictionaries.
+    Raises TypeError when payloads contain non-dict items.
+    """
+    if isinstance(data, list):
+        if not all(isinstance(item, dict) for item in data):
+            raise TypeError('XLSX payloads must contain only objects (dicts)')
+        return cast(JSONList, data)
+    return [cast(JSONDict, data)]
+# SECTION: FUNCTIONS ======================================================== #
+def read(
+    path: Path,
+) -> JSONList:
     """
     Read XLSX content from ``path``.
@@ -24,20 +82,31 @@ def read(path: Path) -> JSONData:
     Returns
     -------
-    JSONData
-        Parsed payload.
+    JSONList
+        The list of dictionaries read from the XLSX file.
     Raises
     ------
-    NotImplementedError
-        XLSX :func:`read` is not implemented yet.
+    ImportError
+        If optional dependencies for XLSX support are missing.
     """
-    raise NotImplementedError('XLSX read is not implemented yet')
+    pandas = _get_pandas()
+    try:
+        frame = pandas.read_excel(path)
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'XLSX support requires optional dependency "openpyxl".\n'
+            'Install with: pip install openpyxl',
+        ) from e
+    return cast(JSONList, frame.to_dict(orient='records'))
-def write(path: Path, data: JSONData) -> int:
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
     """
-    Write ``data`` to XLSX at ``path``.
+    Write ``data`` to XLSX at ``path`` and return record count.
     Parameters
     ----------
@@ -53,7 +122,21 @@ def write(path: Path, data: JSONData) -> int:
     Raises
     ------
-    NotImplementedError
-        XLSX :func:`write` is not implemented yet.
+    ImportError
+        If optional dependencies for XLSX support are missing.
     """
-    raise NotImplementedError('XLSX write is not implemented yet')
+    records = _normalize_records(data)
+    if not records:
+        return 0
+    pandas = _get_pandas()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    frame = pandas.DataFrame.from_records(records)
+    try:
+        frame.to_excel(path, index=False)
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'XLSX support requires optional dependency "openpyxl".\n'
+            'Install with: pip install openpyxl',
+        ) from e
+    return len(records)

etlplus/file/xml.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """
 :mod:`etlplus.file.xml` module.
-XML read/write helpers.
+Helpers for reading/writing XML files.
 """
 from __future__ import annotations
@@ -14,6 +14,15 @@ from ..types import JSONData
 from ..types import JSONDict
 from ..utils import count_records
+# SECTION: EXPORTS ========================================================== #
+__all__ = [
+    'read',
+    'write',
+]
 # SECTION: CONSTANTS ======================================================== #
@@ -117,7 +126,7 @@ def read(
     path: Path,
 ) -> JSONDict:
     """
-    Parse XML document at ``path`` into a nested dictionary.
+    Read XML content from ``path``.
     Parameters
     ----------
@@ -137,7 +146,7 @@ def read(
 def write(path: Path, data: JSONData, *, root_tag: str) -> int:
     """
-    Write ``data`` as XML to ``path`` and return record count.
+    Write ``data`` to XML at ``path`` and return record count.
     Parameters
     ----------

etlplus 0.11.11__py3-none-any.whl → 0.12.2__py3-none-any.whl

etlplus 0.11.11__py3-none-any.whl → 0.12.2__py3-none-any.whl