PyPI - cds-pyde-toolkit - Versions diffs - 1.1.0__py3-none-any.whl - Mend

cds-pyde-toolkit 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

cds_pyde_toolkit/__init__.py +44 -0
cds_pyde_toolkit/cli.py +62 -0
cds_pyde_toolkit/py.typed +0 -0
cds_pyde_toolkit/schema_inferencer/__init__.py +38 -0
cds_pyde_toolkit/schema_inferencer/cli.py +268 -0
cds_pyde_toolkit/schema_inferencer/core.py +1550 -0
cds_pyde_toolkit-1.1.0.dist-info/METADATA +156 -0
cds_pyde_toolkit-1.1.0.dist-info/RECORD +12 -0
cds_pyde_toolkit-1.1.0.dist-info/WHEEL +5 -0
cds_pyde_toolkit-1.1.0.dist-info/entry_points.txt +2 -0
cds_pyde_toolkit-1.1.0.dist-info/licenses/LICENSE +21 -0
cds_pyde_toolkit-1.1.0.dist-info/top_level.txt +1 -0

cds_pyde_toolkit/__init__.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""
+cds_pyde_toolkit
+=================
+A growing toolkit of data-engineering helper functions and CLI commands.
+Currently included
+-------------------
+schema_inferencer   Infer column names, data types, schema definitions, and
+                     CREATE TABLE / VIEW DDL from a file or a pandas DataFrame.
+                     (Pandas/ANSI SQL or PySpark/Spark SQL.)
+Usage
+-----
+Namespaced (recommended as the toolkit grows, to avoid name clashes between
+tools)::
+    from cds_pyde_toolkit.schema_inferencer import infer_file
+    result = infer_file(my_dataframe, pyspark=True, casing="snake")
+Top-level convenience re-exports are also provided for the most commonly
+used function of each tool — currently just `infer_file`::
+    from cds_pyde_toolkit import infer_file
+CLI
+---
+    cds-pyde-toolkit schema-infer Sales1.csv --pyspark true
+    cds-pyde-toolkit --help
+"""
+from importlib.metadata import PackageNotFoundError
+from importlib.metadata import version as _installed_version
+from .schema_inferencer import infer_file
+# Names exported by ``from cds_pyde_toolkit import *``.
+__all__ = ["infer_file", "__version__"]
+# Take the version from the installed distribution's metadata; use a
+# placeholder when no installed distribution can be found.
+try:
+    __version__ = _installed_version("cds-pyde-toolkit")
+except PackageNotFoundError:  # pragma: no cover — running from source without an install
+    __version__ = "0.0.0+unknown"

cds_pyde_toolkit/cli.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""
+cds_pyde_toolkit.cli
+======================
+Top-level command-line entry point for the whole toolkit, installed as the
+`cds-pyde-toolkit` console script. Each tool in the package contributes one
+subcommand here.
+Currently registered subcommands:
+    schema-infer    → cds_pyde_toolkit.schema_inferencer
+Adding a new tool later
+------------------------
+1. Create a new submodule, e.g. `cds_pyde_toolkit/data_profiler/` with its
+   own `core.py` and a `cli.py` that exposes `add_arguments(parser)` and
+   `run(args)` (see `schema_inferencer/cli.py` for the pattern).
+2. Register it below with one call to `subparsers.add_parser(...)` +
+   `<tool>.cli.add_arguments(...)`.
+That's it — no other changes needed; dispatch is generic via `args._run`.
+"""
+from __future__ import annotations
+import argparse
+from typing import Optional
+from . import __version__
+from .schema_inferencer import cli as schema_inferencer_cli
+def build_parser() -> argparse.ArgumentParser:
+    """Assemble the top-level ``cds-pyde-toolkit`` parser with its subcommands.
+    Returns:
+        A parser exposing ``--version`` plus one mandatory subcommand
+        (currently only ``schema-infer``).
+    """
+    root = argparse.ArgumentParser(
+        prog='cds-pyde-toolkit',
+        description='A growing toolkit of data-engineering helper commands.',
+    )
+    version_banner = f'cds-pyde-toolkit {__version__}'
+    root.add_argument('--version', action='version', version=version_banner)
+    commands = root.add_subparsers(dest='command', required=True)
+    # schema-infer: the tool's own cli module registers its flags and run hook.
+    schema_infer_help = (
+        'Infer column names, data types, schema, and CREATE TABLE/VIEW DDL '
+        'from a file or DataFrame.'
+    )
+    schema_inferencer_cli.add_arguments(
+        commands.add_parser('schema-infer', help=schema_infer_help)
+    )
+    # ── Future subcommands get registered here, e.g.: ─────────────────────────
+    # profile_parser = commands.add_parser('profile', help='...')
+    # data_profiler_cli.add_arguments(profile_parser)
+    return root
+def main(argv: Optional[list] = None) -> None:
+    """Parse *argv* and dispatch to the handler of the selected subcommand.
+    Args:
+        argv: Argument strings to parse; ``None`` lets argparse read ``sys.argv[1:]``.
+    """
+    namespace = build_parser().parse_args(argv)
+    # The chosen subcommand's handler is stored on the namespace as ``_run``.
+    handler = namespace._run
+    handler(namespace)
+if __name__ == '__main__':  # pragma: no cover
+    main()

cds_pyde_toolkit/py.typed ADDED Viewed

File without changes

cds_pyde_toolkit/schema_inferencer/__init__.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""
+cds_pyde_toolkit.schema_inferencer
+===============================
+Infer column names, data types, schema definitions, and CREATE TABLE / VIEW
+DDL from a CSV/TSV/Excel file — or directly from a pandas DataFrame already
+in memory (e.g. a Spark DataFrame converted via `.toPandas()`).
+Quick start
+-----------
+    from cds_pyde_toolkit.schema_inferencer import infer_file
+    result = infer_file(my_dataframe, pyspark=True, casing="snake")
+    print(result["schema"])
+    print(result["create_table"])
+See `cds_pyde_toolkit.schema_inferencer.core.infer_file` for the full parameter
+reference, or run `cds-pyde-toolkit schema-infer --help` for the CLI.
+"""
+from .core import (
+    VALID_CASINGS,
+    VALID_LAYERS,
+    VALID_TABLE_TYPES,
+    format_column_name,
+    infer_file,
+    standardise_columns,
+    to_camel_case,
+)
+# Public API of this subpackage: every name listed here is imported from .core.
+__all__ = [
+    "infer_file",
+    "standardise_columns",
+    "format_column_name",
+    "to_camel_case",
+    "VALID_CASINGS",
+    "VALID_LAYERS",
+    "VALID_TABLE_TYPES",
+]

cds_pyde_toolkit/schema_inferencer/cli.py ADDED Viewed

@@ -0,0 +1,268 @@
+"""
+cds_pyde_toolkit.schema_inferencer.cli
+====================================
+Argument definitions and run logic for the `schema-infer` subcommand.
+This module is designed to be mounted as a subcommand of the top-level
+`cds-pyde-toolkit` CLI (see `cds_pyde_toolkit.cli`), but can also be run standalone:
+    python -m cds_pyde_toolkit.schema_inferencer.cli Sales1.csv --pyspark true
+Two entry points are exposed for the top-level CLI to use:
+    add_arguments(parser)  -- registers all flags onto a given parser/subparser
+    run(args)               -- validates flags and executes the inference
+"""
+from __future__ import annotations
+import argparse
+import os
+import re
+import textwrap
+from typing import Optional, Union
+from .core import (
+    VALID_CASINGS,
+    VALID_LAYERS,
+    VALID_TABLE_TYPES,
+    _DEFAULT_SAMPLE,
+    _TYPE_CONFORMANCE_THRESHOLD,
+    infer_file,
+)
+def _str_to_bool(value: str) -> bool:
+    """Translate a textual true/false flag value into a ``bool`` (argparse ``type=`` hook).
+    Matching ignores case and surrounding whitespace.
+    Args:
+        value: Raw string from the command line.
+    Returns:
+        ``True`` for true/yes/1, ``False`` for false/no/0.
+    Raises:
+        argparse.ArgumentTypeError: If *value* is none of the accepted spellings.
+    """
+    spellings = {
+        'true': True, 'yes': True, '1': True,
+        'false': False, 'no': False, '0': False,
+    }
+    token = value.strip().lower()
+    if token in spellings:
+        return spellings[token]
+    raise argparse.ArgumentTypeError(
+        f"Expected true/false (got {value!r}).  Use --pyspark true  or  --pyspark false"
+    )
+# Help-screen epilog for the schema-infer parser: a hand-aligned reference of the
+# casing / layer / table-type / sample options, per-flag defaults, and examples.
+# It is an f-string, so the sample-size and type-threshold defaults shown are
+# interpolated from the constants imported from .core when this module loads.
+_EPILOG = textwrap.dedent(f"""\
+    ─── casing options ────────────────────────────────────────────────────────
+      camel      plantDescription    (default)
+      pascal     PlantDescription    every word Title-cased
+      snake      plant_description
+      screaming  PLANT_DESCRIPTION
+      kebab      plant-description
+      skip       original name preserved (quoted in DDL)
+    ─── layer options  (--pyspark true only) ─────────────────────────────────
+      bronze         all columns → STRING  +  column mapping + rename + DDL
+      parquet_bronze typed from DataFrame  +  column mapping + rename + DDL
+      silver         inferred types        +  schema + DDL
+      gold           inferred types        +  schema + DDL
+      gold_vw        CREATE VIEW from silver layer
+      all            all five layers in one report
+    ─── table-type options  (--pyspark true only) ────────────────────────────
+      delta            CREATE TABLE … USING DELTA          (default)
+      external         CREATE EXTERNAL TABLE … USING <fmt> LOCATION '<path>'
+      external_delta   CREATE EXTERNAL TABLE … USING DELTA LOCATION '<path>'
+    ─── sample rows ───────────────────────────────────────────────────────────
+      omitted    auto: min({_DEFAULT_SAMPLE:,}, total file rows)
+      0          full file — memory-safety check first
+      N          use min(N, total rows); warns if N > row count
+    ─── defaults when a flag is omitted ──────────────────────────────────────
+      --pyspark        false       Pandas dtype dict + ANSI SQL
+      --table          <file stem> e.g. 'Sales1' for Sales1.csv
+      --sample         auto        min({_DEFAULT_SAMPLE:,}, file row count)
+      --type-threshold {_TYPE_CONFORMANCE_THRESHOLD}        min fraction of values that must match a type
+      --case           camel       camelCase column names
+      --layer          (omitted)   single output, no layer prefix
+      --catalog        (omitted)   two-part name: layer.table
+      --table-type     delta       USING DELTA
+      --output         (omitted)   print to terminal, no file saved
+      --no-print       (omitted)   terminal output is shown
+      --sheet          0           first sheet  (Excel only)
+      --header-row     0           first row/line is the column header
+    ─── examples ──────────────────────────────────────────────────────────────
+      cds-pyde-toolkit schema-infer Sales1.csv
+      cds-pyde-toolkit schema-infer Sales1.csv  --pyspark true  --case pascal
+      cds-pyde-toolkit schema-infer Sales1.csv  --pyspark true  --layer bronze  --catalog prod
+      cds-pyde-toolkit schema-infer Sales1.csv  --pyspark true  --layer all  --catalog prod
+      cds-pyde-toolkit schema-infer Sales1.csv  --pyspark true  --layer bronze  \\
+          --table-type external_delta  --location /mnt/bronze/sales/
+      cds-pyde-toolkit schema-infer sales.xlsx  --pyspark true  --sheet Sheet2  --layer silver
+      cds-pyde-toolkit schema-infer Sales1.csv  --pyspark true  --layer all  --output txt --no-print
+      cds-pyde-toolkit schema-infer messy.xlsx  --sheet Sheet1  --header-row 4
+      cds-pyde-toolkit schema-infer noisy.csv    --type-threshold 0.80
+""")
+def add_arguments(parser: argparse.ArgumentParser) -> None:
+    """Attach the schema-infer positional, flags, help text and run hook to *parser*.
+    Usable on a subparser of the top-level CLI or on a standalone parser.
+    Args:
+        parser: Parser configured in place; its description, formatter class,
+            epilog and the ``_run`` / ``_parser_error`` defaults are set here.
+    """
+    add = parser.add_argument
+    # Raw formatter keeps the hand-aligned description and epilog layout intact.
+    parser.formatter_class = argparse.RawDescriptionHelpFormatter
+    parser.epilog = _EPILOG
+    parser.description = (
+        'Infer column names, data types, schema, and CREATE TABLE / VIEW DDL\n'
+        'from a CSV / TSV or Excel file.  Delimiter is auto-detected.\n'
+        'Supports Databricks Unity Catalog three-part naming via --catalog.'
+    )
+    # ── Input file and engine ────────────────────────────────────────────────
+    add(
+        'file',
+        help='CSV/TSV (.csv) or Excel (.xlsx .xls .xlsm .xlsb .ods)',
+    )
+    add(
+        '--pyspark',
+        type=_str_to_bool,
+        default=False,
+        metavar='true|false',
+        help='true → PySpark/Spark SQL  |  false → Pandas/ANSI SQL  (default: false)',
+    )
+    add(
+        '--table',
+        default=None,
+        metavar='NAME',
+        help='Override table name (default: derived from file stem)',
+    )
+    # ── Sampling and type inference ──────────────────────────────────────────
+    add(
+        '--sample',
+        type=int,
+        default=None,
+        metavar='N',
+        help=f'Rows to sample (default auto min({_DEFAULT_SAMPLE:,},total); 0=full file)',
+    )
+    threshold_help = (
+        f'Min fraction of values that must match a type before it is assigned '
+        f'(default {_TYPE_CONFORMANCE_THRESHOLD}); guards against a few dirty '
+        f'values flipping a numeric/date column to string. Use 1.0 for strict matching.'
+    )
+    add(
+        '--type-threshold',
+        type=float,
+        default=_TYPE_CONFORMANCE_THRESHOLD,
+        metavar='0.0-1.0',
+        dest='type_threshold',
+        help=threshold_help,
+    )
+    # ── Naming, layer and table options ──────────────────────────────────────
+    add(
+        '--case',
+        choices=list(VALID_CASINGS),
+        default='camel',
+        metavar='CASING',
+        help='Column naming: camel|pascal|snake|screaming|kebab|skip  (default: camel)',
+    )
+    add(
+        '--layer',
+        choices=list(VALID_LAYERS),
+        default=None,
+        metavar='LAYER',
+        help='Layer: bronze|parquet_bronze|silver|gold|gold_vw|all  (requires --pyspark true)',
+    )
+    add(
+        '--catalog',
+        default=None,
+        metavar='NAME',
+        help='Unity Catalog name for three-part table naming  (pyspark only)',
+    )
+    add(
+        '--table-type',
+        choices=list(VALID_TABLE_TYPES),
+        default='delta',
+        metavar='TYPE',
+        dest='table_type',
+        help='delta|external|external_delta  (pyspark only, default: delta)',
+    )
+    add(
+        '--location',
+        default='',
+        metavar='PATH',
+        help='Storage path for external / external_delta tables',
+    )
+    add(
+        '--file-format',
+        default='parquet',
+        metavar='FMT',
+        dest='file_format',
+        help="File format for external tables, e.g. parquet|csv|orc  (default: parquet)",
+    )
+    # ── File-reading options ─────────────────────────────────────────────────
+    add(
+        '--sheet',
+        default=None,
+        metavar='SHEET',
+        help='Excel sheet name or 0-based index (default: first sheet). ERROR with .csv',
+    )
+    header_row_help = (
+        'Row containing column headers, 0-based (default: 0, i.e. first row/line). '
+        'Use e.g. 4 when rows 1-4 are junk/title rows and the real header is on row 5. '
+        'Applies to both .csv and Excel files.'
+    )
+    add(
+        '--header-row',
+        type=int,
+        default=None,
+        metavar='N',
+        dest='header_row',
+        help=header_row_help,
+    )
+    # ── Output options ───────────────────────────────────────────────────────
+    add(
+        '--output',
+        choices=['txt'],
+        default=None,
+        metavar='FORMAT',
+        help="'txt' → save {table}_schema.txt in current directory",
+    )
+    add(
+        '--no-print',
+        action='store_true',
+        help='Suppress terminal output (combine with --output txt for file-only mode)',
+    )
+    # Hooks read back later: ``_run`` by the dispatcher, ``_parser_error`` by run().
+    parser.set_defaults(_run=run, _parser_error=parser.error)
+def run(args: argparse.Namespace) -> None:
+    """Check flag combinations, then pass the resolved options to ``infer_file``.
+    Every validation failure, and any ValueError / TypeError / FileNotFoundError /
+    MemoryError raised by ``infer_file``, is reported through ``args._parser_error``.
+    Args:
+        args: Namespace carrying the attributes registered by ``add_arguments``.
+    """
+    fail = args._parser_error
+    suffix = os.path.splitext(args.file)[1].lower()
+    # ── Flag validation ──────────────────────────────────────────────────────
+    if args.sheet is not None and suffix == '.csv':
+        fail('--sheet is not valid for .csv files.  Remove --sheet or use an Excel file.')
+    if not args.pyspark:
+        # These three flags are only meaningful for PySpark output.
+        if args.layer is not None:
+            fail('--layer requires --pyspark true.')
+        if args.catalog is not None:
+            fail('--catalog requires --pyspark true.')
+        if args.table_type != 'delta':
+            fail('--table-type requires --pyspark true.')
+    if not args.location and args.table_type in ('external', 'external_delta'):
+        fail(
+            f'--location is required when --table-type={args.table_type}.  '
+            'Provide the storage path, e.g. --location /mnt/data/sales/'
+        )
+    if args.sample is not None and args.sample < 0:
+        fail('--sample must be ≥ 0 (0 = full file, omit = auto).')
+    # Written as "not (in range)" so that NaN is rejected as well.
+    if not (0.0 < args.type_threshold <= 1.0):
+        fail(
+            f'--type-threshold must be in the range (0.0, 1.0]; got {args.type_threshold}. '
+            'Use 1.0 for strict all-or-nothing matching.'
+        )
+    # ── Sheet: an integer-looking value is an index, anything else a name ────
+    sheet_val: Union[str, int]
+    if args.sheet is None:
+        sheet_val = 0
+    else:
+        try:
+            sheet_val = int(args.sheet)
+        except ValueError:
+            sheet_val = args.sheet
+    # ── Header row: omitted means row 0 ──────────────────────────────────────
+    header_row_val = 0 if args.header_row is None else args.header_row
+    if header_row_val < 0:
+        fail('--header-row must be ≥ 0 (0 = first row/line is the header).')
+    # ── Output file: <sanitised table name or file stem>_schema.txt ──────────
+    output_path: Optional[str] = None
+    if args.output == 'txt':
+        file_stem = os.path.splitext(os.path.basename(args.file))[0]
+        label = args.table or file_stem
+        cleaned = re.sub(r'\W+', '_', label).strip('_')
+        output_path = f"{cleaned or 'output'}_schema.txt"
+    options = dict(
+        source=args.file,
+        pyspark=args.pyspark,
+        table_name=args.table,
+        sample_rows=args.sample,
+        sheet=sheet_val,
+        header_row=header_row_val,
+        casing=args.case,
+        print_output=not args.no_print,
+        output_path=output_path,
+        type_threshold=args.type_threshold,
+        layer=args.layer,
+        catalog=args.catalog,
+        table_type=args.table_type,
+        location=args.location,
+        file_format=args.file_format,
+    )
+    try:
+        infer_file(**options)
+    except (ValueError, TypeError, FileNotFoundError, MemoryError) as exc:
+        fail(str(exc))
+def main(argv: Optional[list] = None) -> None:
+    """Run schema-infer on its own: `python -m cds_pyde_toolkit.schema_inferencer.cli ...`
+    Args:
+        argv: Argument strings to parse; ``None`` lets argparse read ``sys.argv[1:]``.
+    """
+    standalone = argparse.ArgumentParser(prog='schema-infer')
+    add_arguments(standalone)
+    parsed = standalone.parse_args(argv)
+    # run() reports failures through this hook; point it at this parser.
+    setattr(parsed, '_parser_error', standalone.error)
+    run(parsed)
+if __name__ == '__main__':  # pragma: no cover
+    main()