etlplus 0.16.10__py3-none-any.whl → 0.17.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in one of the supported public registries. It is provided for informational purposes only.
- etlplus/file/README.md +33 -0
- etlplus/file/_imports.py +35 -20
- etlplus/file/_io.py +138 -15
- etlplus/file/_r.py +48 -0
- etlplus/file/_sql.py +224 -0
- etlplus/file/accdb.py +7 -6
- etlplus/file/arrow.py +29 -10
- etlplus/file/avro.py +13 -10
- etlplus/file/bson.py +94 -10
- etlplus/file/cbor.py +29 -17
- etlplus/file/cfg.py +7 -6
- etlplus/file/conf.py +7 -6
- etlplus/file/core.py +1 -1
- etlplus/file/csv.py +8 -7
- etlplus/file/dat.py +52 -11
- etlplus/file/dta.py +36 -16
- etlplus/file/duckdb.py +72 -11
- etlplus/file/enums.py +29 -0
- etlplus/file/feather.py +15 -30
- etlplus/file/fwf.py +44 -10
- etlplus/file/gz.py +12 -7
- etlplus/file/hbs.py +7 -6
- etlplus/file/hdf5.py +71 -8
- etlplus/file/ini.py +60 -17
- etlplus/file/ion.py +7 -6
- etlplus/file/jinja2.py +7 -6
- etlplus/file/json.py +10 -11
- etlplus/file/log.py +7 -6
- etlplus/file/mat.py +7 -6
- etlplus/file/mdb.py +7 -6
- etlplus/file/msgpack.py +27 -15
- etlplus/file/mustache.py +7 -6
- etlplus/file/nc.py +69 -11
- etlplus/file/ndjson.py +10 -6
- etlplus/file/numbers.py +7 -6
- etlplus/file/ods.py +48 -11
- etlplus/file/orc.py +15 -30
- etlplus/file/parquet.py +10 -6
- etlplus/file/pb.py +36 -24
- etlplus/file/pbf.py +7 -6
- etlplus/file/properties.py +44 -18
- etlplus/file/proto.py +24 -18
- etlplus/file/psv.py +12 -11
- etlplus/file/rda.py +57 -15
- etlplus/file/rds.py +50 -14
- etlplus/file/sas7bdat.py +26 -16
- etlplus/file/sav.py +34 -16
- etlplus/file/sqlite.py +70 -10
- etlplus/file/stub.py +8 -6
- etlplus/file/sylk.py +7 -6
- etlplus/file/tab.py +13 -13
- etlplus/file/toml.py +56 -17
- etlplus/file/tsv.py +8 -7
- etlplus/file/txt.py +10 -7
- etlplus/file/vm.py +7 -6
- etlplus/file/wks.py +7 -6
- etlplus/file/xls.py +8 -5
- etlplus/file/xlsm.py +48 -10
- etlplus/file/xlsx.py +10 -6
- etlplus/file/xml.py +11 -9
- etlplus/file/xpt.py +46 -10
- etlplus/file/yaml.py +10 -11
- etlplus/file/zip.py +10 -5
- etlplus/file/zsav.py +7 -6
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/METADATA +44 -26
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/RECORD +70 -68
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/WHEEL +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/top_level.txt +0 -0
etlplus/file/dat.py
CHANGED

@@ -1,12 +1,12 @@
 """
 :mod:`etlplus.file.dat` module.
 
-
+Helpers for reading/writing data (DAT) files.
 
 Notes
 -----
-- A
-
+- A DAT file is a generic data file that may use various delimiters or fixed-
+  width formats.
 - Common cases:
     - Delimited text files (e.g., CSV, TSV).
     - Fixed-width formatted files.
@@ -18,11 +18,15 @@ Notes
 
 from __future__ import annotations
 
-
+import csv
+from typing import cast
 
 from ..types import JSONData
+from ..types import JSONDict
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._io import coerce_path
+from ._io import write_delimited
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -38,14 +42,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read DAT content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DAT file on disk.
 
     Returns
@@ -53,11 +57,47 @@ def read(
     JSONList
         The list of dictionaries read from the DAT file.
     """
-
+    path = coerce_path(path)
+    with path.open('r', encoding='utf-8', newline='') as handle:
+        sample = handle.read(4096)
+        handle.seek(0)
+        sniffer = csv.Sniffer()
+        dialect: csv.Dialect
+        try:
+            dialect = cast(
+                csv.Dialect,
+                sniffer.sniff(sample, delimiters=',\t|;'),
+            )
+        except csv.Error:
+            dialect = cast(csv.Dialect, csv.get_dialect('excel'))
+        try:
+            has_header = sniffer.has_header(sample)
+        except csv.Error:
+            has_header = True
+
+        reader = csv.reader(handle, dialect)
+        rows = [row for row in reader if any(field.strip() for field in row)]
+    if not rows:
+        return []
+
+    if has_header:
+        header = rows[0]
+        data_rows = rows[1:]
+    else:
+        header = [f'col_{i + 1}' for i in range(len(rows[0]))]
+        data_rows = rows
+
+    records: JSONList = []
+    for row in data_rows:
+        record: JSONDict = {}
+        for index, name in enumerate(header):
+            record[name] = row[index] if index < len(row) else None
+        records.append(record)
+    return records
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -65,7 +105,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DAT file on disk.
     data : JSONData
         Data to write as DAT file. Should be a list of dictionaries or a
@@ -76,4 +116,5 @@ def write(
     int
         The number of rows written to the DAT file.
     """
-
+    path = coerce_path(path)
+    return write_delimited(path, data, delimiter=',', format_name='DAT')
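The new read() sniffs the delimiter (comma, tab, pipe, or semicolon) and the presence of a header row from the first 4 KiB via csv.Sniffer, falling back to the excel dialect and to an assumed header on sniffing errors, while write() now delegates to the shared write_delimited helper with a comma delimiter. A minimal usage sketch, assuming the module is importable as etlplus.file.dat (the file name below is hypothetical):

    from etlplus.file import dat

    # Writing returns the number of rows written (here: 2).
    dat.write('readings.dat', [
        {'sensor': 'a', 'value': '1.5'},
        {'sensor': 'b', 'value': '2.0'},
    ])

    # Reading sniffs the delimiter and header, returning a list of dicts
    # keyed by the header names; missing trailing fields come back as None.
    records = dat.read('readings.dat')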
etlplus/file/dta.py
CHANGED

@@ -1,27 +1,33 @@
 """
 :mod:`etlplus.file.dta` module.
 
-
+Helpers for reading/writing Stata (DTA) files.
 
 Notes
 -----
--
-
+- A DTA file is a proprietary binary format created by Stata to store datasets
+  with variables, labels, and data types.
 - Common cases:
-    -
-    -
+    - Statistical analysis workflows.
+    - Data sharing in research environments.
+    - Interchange between Stata and other analytics tools.
 - Rule of thumb:
-    - If
+    - If the file follows the DTA specification, use this module for reading
       and writing.
 """
 
 from __future__ import annotations
 
-from
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._imports import get_dependency
+from ._imports import get_pandas
+from ._io import coerce_path
+from ._io import ensure_parent_dir
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -37,14 +43,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read DTA content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DTA file on disk.
 
     Returns
@@ -52,11 +58,15 @@ def read(
     JSONList
         The list of dictionaries read from the DTA file.
     """
-
+    path = coerce_path(path)
+    get_dependency('pyreadstat', format_name='DTA')
+    pandas = get_pandas('DTA')
+    frame = pandas.read_stata(path)
+    return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -64,15 +74,25 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DTA file on disk.
     data : JSONData
-        Data to write as DTA file. Should be a list of dictionaries or a
-
+        Data to write as DTA file. Should be a list of dictionaries or a single
+        dictionary.
 
     Returns
     -------
     int
         The number of rows written to the DTA file.
     """
-
+    path = coerce_path(path)
+    records = normalize_records(data, 'DTA')
+    if not records:
+        return 0
+
+    get_dependency('pyreadstat', format_name='DTA')
+    pandas = get_pandas('DTA')
+    ensure_parent_dir(path)
+    frame = pandas.DataFrame.from_records(records)
+    frame.to_stata(path, write_index=False)
+    return len(records)
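Both functions now route through pandas, with get_dependency('pyreadstat', ...) guarding the optional dependency up front instead of failing mid-call. A minimal sketch, assuming pandas and pyreadstat are installed and the module is importable as etlplus.file.dta (file name hypothetical):

    from etlplus.file import dta

    # write() builds a DataFrame from the records and calls to_stata()
    # without the index column; it returns the record count.
    dta.write('survey.dta', [{'id': 1, 'score': 3.5}, {'id': 2, 'score': 4.0}])

    # read() loads the file with pandas.read_stata() and returns
    # DataFrame.to_dict(orient='records'), i.e. a list of dicts.
    rows = dta.read('survey.dta')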
etlplus/file/duckdb.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.duckdb` module.
 
-
-implemented yet).
+Helpers for reading/writing DuckDB database (DUCKDB) files.
 
 Notes
 -----
@@ -19,11 +18,20 @@ Notes
 
 from __future__ import annotations
 
-from pathlib import Path
-
 from ..types import JSONData
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._imports import get_dependency
+from ._io import coerce_path
+from ._io import ensure_parent_dir
+from ._io import normalize_records
+from ._sql import DEFAULT_TABLE
+from ._sql import DUCKDB_DIALECT
+from ._sql import coerce_sql_value
+from ._sql import collect_column_values
+from ._sql import infer_column_type
+from ._sql import quote_identifier
+from ._sql import resolve_table
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -39,14 +47,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read DUCKDB content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DUCKDB file on disk.
 
     Returns
@@ -54,11 +62,30 @@ def read(
     JSONList
         The list of dictionaries read from the DUCKDB file.
     """
-
+    path = coerce_path(path)
+    duckdb = get_dependency('duckdb', format_name='DUCKDB')
+    conn = duckdb.connect(str(path))
+    try:
+        tables = [row[0] for row in conn.execute('SHOW TABLES').fetchall()]
+        table = resolve_table(tables, engine_name='DuckDB')
+        if table is None:
+            return []
+        query = f'SELECT * FROM {quote_identifier(table)}'
+        cursor = conn.execute(query)
+        rows = cursor.fetchall()
+        columns = [desc[0] for desc in cursor.description or []]
+        if not columns:
+            info = conn.execute(
+                f'PRAGMA table_info({quote_identifier(table)})',
+            ).fetchall()
+            columns = [row[1] for row in info]
+        return [dict(zip(columns, row, strict=True)) for row in rows]
+    finally:
+        conn.close()
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -66,7 +93,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DUCKDB file on disk.
     data : JSONData
         Data to write as DUCKDB. Should be a list of dictionaries or a
@@ -77,4 +104,38 @@ def write(
     int
         The number of rows written to the DUCKDB file.
     """
-
+    path = coerce_path(path)
+    records = normalize_records(data, 'DUCKDB')
+    if not records:
+        return 0
+
+    columns, column_values = collect_column_values(records)
+    if not columns:
+        return 0
+
+    column_defs = ', '.join(
+        f'{quote_identifier(column)} '
+        f'{infer_column_type(values, DUCKDB_DIALECT)}'
+        for column, values in column_values.items()
+    )
+    table_ident = quote_identifier(DEFAULT_TABLE)
+    insert_columns = ', '.join(quote_identifier(column) for column in columns)
+    placeholders = ', '.join('?' for _ in columns)
+    insert_sql = (
+        f'INSERT INTO {table_ident} ({insert_columns}) VALUES ({placeholders})'
+    )
+
+    duckdb = get_dependency('duckdb', format_name='DUCKDB')
+    ensure_parent_dir(path)
+    conn = duckdb.connect(str(path))
+    try:
+        conn.execute(f'DROP TABLE IF EXISTS {table_ident}')
+        conn.execute(f'CREATE TABLE {table_ident} ({column_defs})')
+        rows = [
+            tuple(coerce_sql_value(row.get(column)) for column in columns)
+            for row in records
+        ]
+        conn.executemany(insert_sql, rows)
+    finally:
+        conn.close()
+    return len(records)
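write() now materializes a single table (named by DEFAULT_TABLE from _sql), inferring each column's SQL type from its values and inserting rows with a parameterized executemany; read() picks a table via resolve_table and zips column names onto each fetched row. A minimal round-trip sketch, assuming the duckdb package is installed; the alias avoids shadowing the duckdb library name, and the file name is hypothetical:

    from etlplus.file import duckdb as duckdb_file

    # Any existing table with the default name is dropped and recreated with
    # column types inferred from the data, then the rows are bulk-inserted.
    duckdb_file.write('metrics.duckdb', [
        {'name': 'latency_ms', 'value': 12},
        {'name': 'errors', 'value': 0},
    ])

    # read() selects every row from the resolved table and returns dicts,
    # so the result mirrors what was written.
    rows = duckdb_file.read('metrics.duckdb')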
etlplus/file/enums.py
CHANGED

@@ -199,19 +199,48 @@ class FileFormat(CoercibleStrEnum):
         'yml': 'yaml',
         # File extensions
         '.avro': 'avro',
+        '.arrow': 'arrow',
         '.csv': 'csv',
+        '.duckdb': 'duckdb',
+        '.dat': 'dat',
         '.feather': 'feather',
+        '.fwf': 'fwf',
         '.gz': 'gz',
+        '.hdf': 'hdf5',
+        '.hdf5': 'hdf5',
+        '.h5': 'hdf5',
+        '.ini': 'ini',
         '.json': 'json',
         '.jsonl': 'ndjson',
+        '.bson': 'bson',
+        '.cbor': 'cbor',
+        '.msgpack': 'msgpack',
         '.ndjson': 'ndjson',
+        '.ods': 'ods',
         '.orc': 'orc',
         '.parquet': 'parquet',
         '.pq': 'parquet',
+        '.pb': 'pb',
+        '.proto': 'proto',
+        '.psv': 'psv',
+        '.sqlite': 'sqlite',
+        '.sqlite3': 'sqlite',
         '.stub': 'stub',
+        '.tab': 'tab',
+        '.dta': 'dta',
+        '.sas7bdat': 'sas7bdat',
+        '.xpt': 'xpt',
+        '.rds': 'rds',
+        '.rda': 'rda',
+        '.nc': 'nc',
+        '.sav': 'sav',
+        '.properties': 'properties',
+        '.prop': 'properties',
+        '.toml': 'toml',
         '.tsv': 'tsv',
         '.txt': 'txt',
         '.xls': 'xls',
+        '.xlsm': 'xlsm',
         '.xlsx': 'xlsx',
         '.zip': 'zip',
         '.xml': 'xml',
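The FileFormat alias table now covers many more extensions, so several spellings resolve to one canonical format name (for example .hdf, .hdf5, and .h5 all map to hdf5, and .sqlite/.sqlite3 both map to sqlite). A small stand-alone sketch of the same idea; the helper below is hypothetical and only mirrors the mapping added here, it is not the library's API:

    from pathlib import Path

    # Hypothetical mirror of a few of the new extension aliases.
    EXTENSION_ALIASES = {
        '.hdf': 'hdf5', '.hdf5': 'hdf5', '.h5': 'hdf5',
        '.sqlite': 'sqlite', '.sqlite3': 'sqlite',
        '.properties': 'properties', '.prop': 'properties',
    }

    def format_for(filename: str) -> str | None:
        # Look up the canonical format name for a file's suffix.
        return EXTENSION_ALIASES.get(Path(filename).suffix.lower())

    format_for('app.sqlite3')  # -> 'sqlite'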
etlplus/file/feather.py
CHANGED

@@ -18,12 +18,15 @@ Notes
 
 from __future__ import annotations
 
-from pathlib import Path
 from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
+from ..types import StrPath
+from ._imports import get_dependency
 from ._imports import get_pandas
+from ._io import coerce_path
+from ._io import ensure_parent_dir
 from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
@@ -40,39 +43,30 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read Feather content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the Feather file on disk.
 
     Returns
     -------
     JSONList
         The list of dictionaries read from the Feather file.
-
-    Raises
-    ------
-    ImportError
-        When optional dependency "pyarrow" is missing.
     """
+    path = coerce_path(path)
+    get_dependency('pyarrow', format_name='Feather')
     pandas = get_pandas('Feather')
-
-        frame = pandas.read_feather(path)
-    except ImportError as e:  # pragma: no cover
-        raise ImportError(
-            'Feather support requires optional dependency "pyarrow".\n'
-            'Install with: pip install pyarrow',
-        ) from e
+    frame = pandas.read_feather(path)
     return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -80,7 +74,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the Feather file on disk.
     data : JSONData
         Data to write.
@@ -89,24 +83,15 @@ def write(
     -------
     int
         Number of records written.
-
-    Raises
-    ------
-    ImportError
-        When optional dependency "pyarrow" is missing.
     """
+    path = coerce_path(path)
     records = normalize_records(data, 'Feather')
     if not records:
         return 0
 
+    get_dependency('pyarrow', format_name='Feather')
     pandas = get_pandas('Feather')
-    path
+    ensure_parent_dir(path)
     frame = pandas.DataFrame.from_records(records)
-
-        frame.to_feather(path)
-    except ImportError as e:  # pragma: no cover
-        raise ImportError(
-            'Feather support requires optional dependency "pyarrow".\n'
-            'Install with: pip install pyarrow',
-        ) from e
+    frame.to_feather(path)
    return len(records)
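The inline try/except ImportError blocks (and the Raises sections documenting them) are gone; the pyarrow requirement is now checked once via get_dependency before any pandas call, so the missing-dependency message comes from one central place. The calling convention is unchanged; a minimal sketch, assuming pandas and pyarrow are installed and a hypothetical file name:

    from etlplus.file import feather

    # Returns 0 early for empty input, otherwise writes the records and
    # returns how many were written.
    feather.write('cache.feather', [{'k': 'a', 'v': 1}])
    records = feather.read('cache.feather')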
etlplus/file/fwf.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.fwf` module.
 
-
-implemented yet).
+Helpers for reading/writing Fixed-Width Fields (FWF) files.
 
 Notes
 -----
@@ -18,11 +17,16 @@ Notes
 
 from __future__ import annotations
 
-from
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._imports import get_pandas
+from ._io import coerce_path
+from ._io import ensure_parent_dir
+from ._io import normalize_records
+from ._io import stringify_value
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -38,14 +42,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read FWF content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the FWF file on disk.
 
     Returns
@@ -53,11 +57,14 @@ def read(
     JSONList
         The list of dictionaries read from the FWF file.
     """
-
+    path = coerce_path(path)
+    pandas = get_pandas('FWF')
+    frame = pandas.read_fwf(path)
+    return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -65,7 +72,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the FWF file on disk.
     data : JSONData
         Data to write as FWF file. Should be a list of dictionaries or a
@@ -76,4 +83,31 @@ def write(
     int
         The number of rows written to the FWF file.
     """
-
+    path = coerce_path(path)
+    records = normalize_records(data, 'FWF')
+    if not records:
+        return 0
+
+    fieldnames = sorted({key for row in records for key in row})
+    if not fieldnames:
+        return 0
+
+    widths: dict[str, int] = {name: len(name) for name in fieldnames}
+    for row in records:
+        for name in fieldnames:
+            widths[name] = max(
+                widths[name],
+                len(stringify_value(row.get(name))),
+            )
+
+    ensure_parent_dir(path)
+    with path.open('w', encoding='utf-8', newline='') as handle:
+        header = ' '.join(name.ljust(widths[name]) for name in fieldnames)
+        handle.write(header + '\n')
+        for row in records:
+            line = ' '.join(
+                stringify_value(row.get(name)).ljust(widths[name])
+                for name in fieldnames
+            )
+            handle.write(line + '\n')
+    return len(records)
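write() sizes each column as the maximum of the header length and the longest stringified value, left-justifies every field to that width, and joins fields with a single space; read() simply defers to pandas.read_fwf. A minimal sketch of the resulting layout (column order is the sorted union of keys; the file name is hypothetical):

    from etlplus.file import fwf

    fwf.write('inventory.fwf', [
        {'item': 'widget', 'qty': '7'},
        {'item': 'bolt', 'qty': '1200'},
    ])
    # The file contains a header row followed by space-separated,
    # left-justified columns, roughly:
    #   item   qty
    #   widget 7
    #   bolt   1200
    rows = fwf.read('inventory.fwf')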
etlplus/file/gz.py
CHANGED

@@ -11,6 +11,9 @@ import tempfile
 from pathlib import Path
 
 from ..types import JSONData
+from ..types import StrPath
+from ._io import coerce_path
+from ._io import ensure_parent_dir
 from .enums import CompressionFormat
 from .enums import FileFormat
 from .enums import infer_file_format_and_compression
@@ -29,14 +32,14 @@ __all__ = [
 
 
 def _resolve_format(
-    path:
+    path: StrPath,
 ) -> FileFormat:
     """
     Resolve the inner file format from a .gz filename.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the GZ file on disk.
 
     Returns
@@ -63,14 +66,14 @@ def _resolve_format(
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONData:
     """
     Read GZ content from *path* and parse the inner payload.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the GZ file on disk.
 
     Returns
@@ -78,6 +81,7 @@ def read(
     JSONData
         Parsed payload.
     """
+    path = coerce_path(path)
     fmt = _resolve_format(path)
     with gzip.open(path, 'rb') as handle:
         payload = handle.read()
@@ -91,7 +95,7 @@ def read(
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -99,7 +103,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the GZ file on disk.
     data : JSONData
         Data to write.
@@ -109,6 +113,7 @@ def write(
     int
         Number of records written.
     """
+    path = coerce_path(path)
     fmt = _resolve_format(path)
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
@@ -117,7 +122,7 @@ def write(
         count = File(tmp_path, fmt).write(data)
         payload = tmp_path.read_bytes()
 
-        path
+        ensure_parent_dir(path)
         with gzip.open(path, 'wb') as handle:
             handle.write(payload)
 
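Both entry points now accept str or Path (via coerce_path), and the parent directory is created with ensure_parent_dir before the compressed payload is written. The inner payload format is still resolved from the filename, so the extension before .gz selects which codec handles the decompressed bytes. A minimal sketch, assuming a JSON inner payload and a hypothetical file name:

    from etlplus.file import gz

    # The ".json" before ".gz" selects the inner format; the payload is
    # serialized to a temporary file, then gzip-compressed into place.
    gz.write('events.json.gz', [{'event': 'start'}, {'event': 'stop'}])

    # read() gunzips the payload and parses it with the matching handler.
    data = gz.read('events.json.gz')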