etlplus 0.16.10__py3-none-any.whl → 0.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/file/README.md +33 -0
- etlplus/file/arrow.py +35 -5
- etlplus/file/bson.py +50 -5
- etlplus/file/cbor.py +35 -11
- etlplus/file/dat.py +44 -6
- etlplus/file/dta.py +46 -11
- etlplus/file/duckdb.py +159 -5
- etlplus/file/enums.py +29 -0
- etlplus/file/fwf.py +37 -5
- etlplus/file/hdf5.py +41 -3
- etlplus/file/ini.py +62 -12
- etlplus/file/msgpack.py +33 -9
- etlplus/file/nc.py +55 -6
- etlplus/file/ods.py +39 -6
- etlplus/file/pb.py +32 -19
- etlplus/file/properties.py +52 -12
- etlplus/file/proto.py +24 -12
- etlplus/file/psv.py +5 -5
- etlplus/file/rda.py +83 -9
- etlplus/file/rds.py +76 -8
- etlplus/file/sas7bdat.py +41 -11
- etlplus/file/sav.py +40 -11
- etlplus/file/sqlite.py +123 -5
- etlplus/file/tab.py +6 -7
- etlplus/file/toml.py +54 -12
- etlplus/file/xlsm.py +39 -5
- etlplus/file/xpt.py +61 -5
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/METADATA +44 -26
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/RECORD +33 -33
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/WHEEL +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/top_level.txt +0 -0
etlplus/file/README.md
CHANGED

@@ -9,6 +9,12 @@ and writing data files.
   types
 - Exposes a `File` class with instance methods for reading and writing data
 
+Some formats require optional dependencies. Install with:
+
+```bash
+pip install -e ".[file]"
+```
+
 Back to project overview: see the top-level [README](../../README.md).
 
 - [`etlplus.file` Subpackage](#etlplusfile-subpackage)
@@ -29,21 +35,48 @@ matrix across all `FileFormat` values, see the top-level [README](../../README.m
 | Format | Description |
 |-----------|---------------------------------------------|
 | avro | Apache Avro binary serialization |
+| arrow | Apache Arrow IPC |
+| bson | Binary JSON (BSON) |
+| cbor | Concise Binary Object Representation |
 | csv | Comma-separated values text files |
+| dat | Generic data files (delimited) |
+| dta | Stata datasets |
+| duckdb | DuckDB database file |
 | feather | Apache Arrow Feather columnar format |
+| fwf | Fixed-width formatted text files |
 | gz | Gzip-compressed files (see Compression) |
+| hdf5 | Hierarchical Data Format |
+| ini | INI config files |
 | json | Standard JSON files |
+| msgpack | MessagePack binary serialization |
+| nc | NetCDF datasets |
 | ndjson | Newline-delimited JSON (JSON Lines) |
+| ods | OpenDocument spreadsheets |
 | orc | Apache ORC columnar format |
 | parquet | Apache Parquet columnar format |
+| pb | Protocol Buffers binary |
+| properties | Java-style properties |
+| proto | Protocol Buffers schema |
+| psv | Pipe-separated values text files |
+| rda | RData workspace bundles |
+| rds | RDS datasets |
+| sas7bdat | SAS datasets |
+| sav | SPSS datasets |
+| sqlite | SQLite database file |
+| tab | Tab-delimited text files |
+| toml | TOML config files |
 | tsv | Tab-separated values text files |
 | txt | Plain text files |
 | xls | Microsoft Excel (legacy .xls; read-only) |
+| xlsm | Microsoft Excel Macro-Enabled (XLSM) |
 | xlsx | Microsoft Excel (modern .xlsx) |
+| xpt | SAS transport files |
 | zip | ZIP-compressed files (see Compression) |
 | xml | XML files |
 | yaml | YAML files |
 
+Note: HDF5 support is read-only; writing is currently disabled.
+
 Compression formats (gz, zip) are also supported as wrappers for other formats. Formats not listed
 here are currently stubbed and will raise `NotImplementedError` on read/write.
 
etlplus/file/arrow.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.arrow` module.
 
-
-yet).
+Helpers for reading/writing Apache Arrow (ARROW) files.
 
 Notes
 -----
@@ -20,10 +19,13 @@ Notes
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_optional_module
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -35,6 +37,20 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _get_pyarrow() -> Any:
+    """Return the pyarrow module, importing it on first use."""
+    return get_optional_module(
+        'pyarrow',
+        error_message=(
+            'ARROW support requires optional dependency "pyarrow".\n'
+            'Install with: pip install pyarrow'
+        ),
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -54,7 +70,11 @@ def read(
     JSONList
         The list of dictionaries read from the Apache Arrow file.
     """
-
+    pyarrow = _get_pyarrow()
+    with pyarrow.memory_map(str(path), 'r') as source:
+        reader = pyarrow.ipc.open_file(source)
+        table = reader.read_all()
+    return cast(JSONList, table.to_pylist())
 
 
 def write(
@@ -77,4 +97,14 @@ def write(
     int
         The number of rows written to the ARROW file.
     """
-
+    records = normalize_records(data, 'ARROW')
+    if not records:
+        return 0
+
+    pyarrow = _get_pyarrow()
+    table = pyarrow.Table.from_pylist(records)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with pyarrow.OSFile(str(path), 'wb') as sink:
+        with pyarrow.ipc.new_file(sink, table.schema) as writer:
+            writer.write_table(table)
+    return len(records)
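For orientation, the new module-level helpers above can be exercised roughly as follows; this is a minimal sketch (not part of the diff), assuming `pyarrow` is installed and that `etlplus.file.arrow` exposes exactly the `read`/`write` signatures shown:

```python
# Hypothetical round-trip with the new arrow helpers.
from pathlib import Path

from etlplus.file import arrow

rows = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
written = arrow.write(Path("out.arrow"), rows)  # returns the number of rows written
assert written == 2
print(arrow.read(Path("out.arrow")))            # -> list of dicts read back from the IPC file
```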
etlplus/file/bson.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.bson` module.
 
-
-yet).
+Helpers for reading/writing Binary JSON (BSON) files.
 
 Notes
 -----
@@ -19,10 +18,13 @@ Notes
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_optional_module
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -34,6 +36,36 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _decode_all(bson_module: Any, payload: bytes) -> list[dict[str, Any]]:
+    if hasattr(bson_module, 'decode_all'):
+        return bson_module.decode_all(payload)
+    if hasattr(bson_module, 'BSON'):
+        return bson_module.BSON.decode_all(payload)
+    raise AttributeError('bson module lacks decode_all()')
+
+
+def _encode_doc(bson_module: Any, doc: dict[str, Any]) -> bytes:
+    if hasattr(bson_module, 'encode'):
+        return bson_module.encode(doc)
+    if hasattr(bson_module, 'BSON'):
+        return bson_module.BSON.encode(doc)
+    raise AttributeError('bson module lacks encode()')
+
+
+def _get_bson() -> Any:
+    """Return the bson module, importing it on first use."""
+    return get_optional_module(
+        'bson',
+        error_message=(
+            'BSON support requires optional dependency "pymongo".\n'
+            'Install with: pip install pymongo'
+        ),
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -53,7 +85,11 @@ def read(
     JSONList
         The list of dictionaries read from the BSON file.
     """
-
+    bson = _get_bson()
+    with path.open('rb') as handle:
+        payload = handle.read()
+    docs = _decode_all(bson, payload)
+    return cast(JSONList, docs)
 
 
 def write(
@@ -76,4 +112,13 @@ def write(
     int
         The number of rows written to the BSON file.
    """
-
+    bson = _get_bson()
+    records = normalize_records(data, 'BSON')
+    if not records:
+        return 0
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('wb') as handle:
+        for record in records:
+            handle.write(_encode_doc(bson, record))
+    return len(records)
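The `_decode_all`/`_encode_doc` shims above accept either the functional API or the `BSON` class, which covers the `bson` module bundled with pymongo as well as layouts that only expose the class. A minimal sketch of the same dispatch, assuming pymongo's `bson` module is installed:

```python
# Sketch of the API dispatch used by the new shims (assumes pymongo's bson module).
import bson

doc = {"id": 1, "name": "a"}

payload = bson.encode(doc) if hasattr(bson, "encode") else bson.BSON.encode(doc)
docs = (
    bson.decode_all(payload)
    if hasattr(bson, "decode_all")
    else bson.BSON.decode_all(payload)
)
assert docs == [doc]
```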
etlplus/file/cbor.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.cbor` module.
 
-
-files (not implemented yet).
+Helpers for reading/writing Concise Binary Object Representation (CBOR) files.
 
 Notes
 -----
@@ -20,10 +19,12 @@ Notes
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
 
 from ..types import JSONData
-from
-from . import
+from ._imports import get_optional_module
+from ._io import coerce_record_payload
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -35,12 +36,26 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _get_cbor() -> Any:
+    """Return the cbor2 module, importing it on first use."""
+    return get_optional_module(
+        'cbor2',
+        error_message=(
+            'CBOR support requires optional dependency "cbor2".\n'
+            'Install with: pip install cbor2'
+        ),
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
 def read(
     path: Path,
-) ->
+) -> JSONData:
     """
     Read CBOR content from *path*.
 
@@ -51,10 +66,13 @@ def read(
 
     Returns
     -------
-
-The
+    JSONData
+        The structured data read from the CBOR file.
     """
-
+    cbor2 = _get_cbor()
+    with path.open('rb') as handle:
+        payload = cbor2.loads(handle.read())
+    return coerce_record_payload(payload, format_name='CBOR')
 
 
 def write(
@@ -62,14 +80,14 @@ def write(
     data: JSONData,
 ) -> int:
     """
-    Write *data* to CBOR at *path* and return record count.
+    Write *data* to CBOR file at *path* and return record count.
 
     Parameters
     ----------
     path : Path
         Path to the CBOR file on disk.
     data : JSONData
-        Data to write as CBOR. Should be a list of dictionaries or a
+        Data to write as CBOR file. Should be a list of dictionaries or a
         single dictionary.
 
     Returns
@@ -77,4 +95,10 @@
     int
         The number of rows written to the CBOR file.
     """
-
+    cbor2 = _get_cbor()
+    records = normalize_records(data, 'CBOR')
+    payload: JSONData = records if isinstance(data, list) else records[0]
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('wb') as handle:
+        handle.write(cbor2.dumps(payload))
+    return len(records)
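Note that the new `write` preserves the shape of the input: a single dictionary is written as one CBOR map, while a list of dictionaries is written as an array. A hypothetical usage sketch (requires `cbor2`; the call sites below are assumptions based on the diff, not documented API):

```python
# Hypothetical usage of the new cbor helpers.
from pathlib import Path

from etlplus.file import cbor

# A single dict is written as one CBOR map ...
cbor.write(Path("config.cbor"), {"debug": True, "retries": 3})
# ... while a list of dicts is written as a CBOR array of maps.
cbor.write(Path("rows.cbor"), [{"id": 1}, {"id": 2}])

print(cbor.read(Path("config.cbor")))  # coerced back into JSON-style data
```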
etlplus/file/dat.py
CHANGED

@@ -1,12 +1,12 @@
 """
 :mod:`etlplus.file.dat` module.
 
-
+Helpers for reading/writing data (DAT) files.
 
 Notes
 -----
-- A
-
+- A DAT file is a generic data file that may use various delimiters or fixed-
+  width formats.
 - Common cases:
   - Delimited text files (e.g., CSV, TSV).
   - Fixed-width formatted files.
@@ -18,11 +18,14 @@ Notes
 
 from __future__ import annotations
 
+import csv
 from pathlib import Path
+from typing import cast
 
 from ..types import JSONData
+from ..types import JSONDict
 from ..types import JSONList
-from . import
+from ._io import write_delimited
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -53,7 +56,42 @@ def read(
     JSONList
         The list of dictionaries read from the DAT file.
     """
-
+    with path.open('r', encoding='utf-8', newline='') as handle:
+        sample = handle.read(4096)
+        handle.seek(0)
+        sniffer = csv.Sniffer()
+        dialect: csv.Dialect
+        try:
+            dialect = cast(
+                csv.Dialect,
+                sniffer.sniff(sample, delimiters=',\t|;'),
+            )
+        except csv.Error:
+            dialect = cast(csv.Dialect, csv.get_dialect('excel'))
+        try:
+            has_header = sniffer.has_header(sample)
+        except csv.Error:
+            has_header = True
+
+        reader = csv.reader(handle, dialect)
+        rows = [row for row in reader if any(field.strip() for field in row)]
+    if not rows:
+        return []
+
+    if has_header:
+        header = rows[0]
+        data_rows = rows[1:]
+    else:
+        header = [f'col_{i + 1}' for i in range(len(rows[0]))]
+        data_rows = rows
+
+    records: JSONList = []
+    for row in data_rows:
+        record: JSONDict = {}
+        for index, name in enumerate(header):
+            record[name] = row[index] if index < len(row) else None
+        records.append(record)
+    return records
 
 
 def write(
@@ -76,4 +114,4 @@ def write(
     int
         The number of rows written to the DAT file.
     """
-    return
+    return write_delimited(path, data, delimiter=',')
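As the implementation above shows, `read` sniffs the delimiter from among comma, tab, pipe, and semicolon, and falls back to synthetic `col_1`, `col_2`, ... keys when no header row is detected. An illustrative sketch (assumed usage, not part of the diff); field values come back as strings because `csv.reader` does not convert types:

```python
# Illustration of the sniffing behaviour of the new dat.read helper.
from pathlib import Path

from etlplus.file import dat

sample = Path("people.dat")
sample.write_text("id|name\n1|a\n2|b\n", encoding="utf-8")

print(dat.read(sample))
# With the header row detected, roughly:
#   [{'id': '1', 'name': 'a'}, {'id': '2', 'name': 'b'}]
# Without a detectable header, keys fall back to 'col_1', 'col_2', ...
```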
etlplus/file/dta.py
CHANGED

@@ -1,27 +1,30 @@
 """
 :mod:`etlplus.file.dta` module.
 
-
+Helpers for reading/writing Stata (DTA) files.
 
 Notes
 -----
--
-
+- A DTA file is a proprietary binary format created by Stata to store datasets
+  with variables, labels, and data types.
 - Common cases:
--
--
+  - Statistical analysis workflows.
+  - Data sharing in research environments.
+  - Interchange between Stata and other analytics tools.
 - Rule of thumb:
-- If
+  - If the file follows the DTA specification, use this module for reading
     and writing.
 """
 
 from __future__ import annotations
 
 from pathlib import Path
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_pandas
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -51,8 +54,21 @@
     -------
     JSONList
         The list of dictionaries read from the DTA file.
+
+    Raises
+    ------
+    ImportError
+        If optional dependencies for DTA support are missing.
     """
-
+    pandas = get_pandas('DTA')
+    try:
+        frame = pandas.read_stata(path)
+    except ImportError as err:  # pragma: no cover
+        raise ImportError(
+            'DTA support may require optional dependency "pyreadstat".\n'
+            'Install with: pip install pyreadstat',
+        ) from err
+    return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
@@ -67,12 +83,31 @@ def write(
     path : Path
         Path to the DTA file on disk.
     data : JSONData
-        Data to write as DTA file. Should be a list of dictionaries or a
-
+        Data to write as DTA file. Should be a list of dictionaries or a single
+        dictionary.
 
     Returns
     -------
     int
         The number of rows written to the DTA file.
+
+    Raises
+    ------
+    ImportError
+        If optional dependencies for DTA support are missing.
     """
-
+    records = normalize_records(data, 'DTA')
+    if not records:
+        return 0
+
+    pandas = get_pandas('DTA')
+    path.parent.mkdir(parents=True, exist_ok=True)
+    frame = pandas.DataFrame.from_records(records)
+    try:
+        frame.to_stata(path, write_index=False)
+    except ImportError as err:  # pragma: no cover
+        raise ImportError(
+            'DTA support may require optional dependency "pyreadstat".\n'
+            'Install with: pip install pyreadstat',
+        ) from err
+    return len(records)
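The DTA helpers delegate to pandas; the rough equivalent of what `write` followed by `read` does for a record set is sketched below (pandas required; `pyreadstat` may additionally be needed depending on the pandas build, as the error hints above suggest):

```python
# Rough pandas equivalent of the new dta.write / dta.read round trip (a sketch).
import pandas as pd

frame = pd.DataFrame.from_records([{"id": 1, "score": 2.5}])
frame.to_stata("scores.dta", write_index=False)               # what write() does with the records
print(pd.read_stata("scores.dta").to_dict(orient="records"))  # what read() returns
```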
etlplus/file/duckdb.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.duckdb` module.
 
-
-implemented yet).
+Helpers for reading/writing DuckDB database (DUCKDB) files.
 
 Notes
 -----
@@ -19,11 +18,14 @@ Notes
 
 from __future__ import annotations
 
+import json
 from pathlib import Path
+from typing import Any
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_optional_module
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -35,6 +37,103 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL CONSTANTS ============================================== #
+
+
+DEFAULT_TABLE = 'data'
+
+
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _coerce_sql_value(
+    value: Any,
+) -> Any:
+    """
+    Normalize values into DuckDB-compatible types.
+
+    Parameters
+    ----------
+    value : Any
+        The value to normalize.
+
+    Returns
+    -------
+    Any
+        The normalized value.
+    """
+    if value is None or isinstance(value, (str, int, float, bool)):
+        return value
+    return json.dumps(value, ensure_ascii=True)
+
+
+def _get_duckdb() -> Any:
+    """
+    Return the duckdb module, importing it on first use.
+
+    Returns
+    -------
+    Any
+        The duckdb module.
+    """
+    return get_optional_module(
+        'duckdb',
+        error_message=(
+            'DUCKDB support requires optional dependency "duckdb".\n'
+            'Install with: pip install duckdb'
+        ),
+    )
+
+
+def _infer_column_type(values: list[Any]) -> str:
+    """Infer a basic DuckDB column type from sample values."""
+    seen_bool = False
+    seen_int = False
+    seen_float = False
+    seen_other = False
+    for value in values:
+        if value is None:
+            continue
+        if isinstance(value, bool):
+            seen_bool = True
+        elif isinstance(value, int):
+            seen_int = True
+        elif isinstance(value, float):
+            seen_float = True
+        else:
+            seen_other = True
+            break
+    if seen_other:
+        return 'VARCHAR'
+    if seen_float:
+        return 'DOUBLE'
+    if seen_int:
+        return 'BIGINT'
+    if seen_bool:
+        return 'BOOLEAN'
+    return 'VARCHAR'
+
+
+def _quote_identifier(value: str) -> str:
+    """Return a safely quoted SQL identifier."""
+    escaped = value.replace('"', '""')
+    return f'"{escaped}"'
+
+
+def _resolve_table(tables: list[str]) -> str | None:
+    """Pick a table name for read operations."""
+    if not tables:
+        return None
+    if DEFAULT_TABLE in tables:
+        return DEFAULT_TABLE
+    if len(tables) == 1:
+        return tables[0]
+    raise ValueError(
+        'Multiple tables found in DuckDB file; expected "data" or a '
+        'single table',
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -54,7 +153,25 @@ def read(
     JSONList
        The list of dictionaries read from the DUCKDB file.
     """
-
+    duckdb = _get_duckdb()
+    conn = duckdb.connect(str(path))
+    try:
+        tables = [row[0] for row in conn.execute('SHOW TABLES').fetchall()]
+        table = _resolve_table(tables)
+        if table is None:
+            return []
+        query = f'SELECT * FROM {_quote_identifier(table)}'
+        cursor = conn.execute(query)
+        rows = cursor.fetchall()
+        columns = [desc[0] for desc in cursor.description or []]
+        if not columns:
+            info = conn.execute(
+                f'PRAGMA table_info({_quote_identifier(table)})',
+            ).fetchall()
+            columns = [row[1] for row in info]
+        return [dict(zip(columns, row, strict=True)) for row in rows]
+    finally:
+        conn.close()
 
 
 def write(
@@ -77,4 +194,41 @@ def write(
     int
         The number of rows written to the DUCKDB file.
     """
-
+    records = normalize_records(data, 'DUCKDB')
+    if not records:
+        return 0
+
+    columns = sorted({key for row in records for key in row})
+    if not columns:
+        return 0
+
+    column_values: dict[str, list[Any]] = {col: [] for col in columns}
+    for row in records:
+        for column in columns:
+            column_values[column].append(row.get(column))
+
+    column_defs = ', '.join(
+        f'{_quote_identifier(column)} {_infer_column_type(values)}'
+        for column, values in column_values.items()
+    )
+    table_ident = _quote_identifier(DEFAULT_TABLE)
+    insert_columns = ', '.join(_quote_identifier(column) for column in columns)
+    placeholders = ', '.join('?' for _ in columns)
+    insert_sql = (
+        f'INSERT INTO {table_ident} ({insert_columns}) VALUES ({placeholders})'
+    )
+
+    duckdb = _get_duckdb()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    conn = duckdb.connect(str(path))
+    try:
+        conn.execute(f'DROP TABLE IF EXISTS {table_ident}')
+        conn.execute(f'CREATE TABLE {table_ident} ({column_defs})')
+        rows = [
+            tuple(_coerce_sql_value(row.get(column)) for column in columns)
+            for row in records
+        ]
+        conn.executemany(insert_sql, rows)
+    finally:
+        conn.close()
+    return len(records)
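Per the implementation above, `write` replaces a table literally named `data` with inferred column types and JSON-encodes any non-scalar values, while `read` resolves `data` (or the only table present) and raises on ambiguity. A hypothetical round-trip sketch (requires the `duckdb` package; usage assumed from the diff, not documented API):

```python
# Hypothetical round-trip with the new duckdb helpers.
from pathlib import Path

# Import under an alias so the etlplus module does not shadow the duckdb package itself.
from etlplus.file import duckdb as duckdb_file

rows = [
    {"id": 1, "tags": ["a", "b"]},  # non-scalar values are stored as JSON text
    {"id": 2, "tags": []},
]
duckdb_file.write(Path("events.duckdb"), rows)  # creates/overwrites the "data" table
print(duckdb_file.read(Path("events.duckdb")))  # reads "data" (or the only table) back
```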