cds-pyde-toolkit 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Your Name
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: cds-pyde-toolkit
3
+ Version: 1.1.0
4
+ Summary: A growing toolkit of data-engineering helper functions and CLI commands — starting with schema inference (column standardisation, type inference, schema + DDL generation for Pandas/ANSI SQL or PySpark/Spark SQL).
5
+ Author: Your Name
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/your-org/cds-pyde-toolkit
8
+ Project-URL: Issues, https://github.com/your-org/cds-pyde-toolkit/issues
9
+ Keywords: pandas,pyspark,schema,ddl,data-engineering,delta-lake,databricks,toolkit
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Database
13
+ Classifier: Topic :: Software Development :: Libraries
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pandas>=1.3
24
+ Requires-Dist: numpy>=1.21
25
+ Provides-Extra: excel
26
+ Requires-Dist: openpyxl>=3.0; extra == "excel"
27
+ Requires-Dist: xlrd>=2.0; extra == "excel"
28
+ Requires-Dist: odfpy>=1.4; extra == "excel"
29
+ Provides-Extra: memcheck
30
+ Requires-Dist: psutil>=5.9; extra == "memcheck"
31
+ Provides-Extra: all
32
+ Requires-Dist: openpyxl>=3.0; extra == "all"
33
+ Requires-Dist: xlrd>=2.0; extra == "all"
34
+ Requires-Dist: odfpy>=1.4; extra == "all"
35
+ Requires-Dist: psutil>=5.9; extra == "all"
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest>=7.0; extra == "dev"
38
+ Requires-Dist: build>=1.0; extra == "dev"
39
+ Requires-Dist: twine>=4.0; extra == "dev"
40
+ Dynamic: license-file
41
+
42
+ # cds-pyde-toolkit
43
+
44
+ A growing toolkit of data-engineering helper functions and CLI commands.
45
+ The first tool is **schema inference**: standardise column names, infer
46
+ data types from sample data, and emit ready-to-use schema definitions and
47
+ `CREATE TABLE` DDL — either Pandas/ANSI SQL or PySpark/Spark SQL (with
48
+ bronze/silver/gold layer support for Databricks / Unity Catalog workflows).
49
+
50
+ ## Install
51
+
52
+ ```bash
53
+ pip install cds-pyde-toolkit
54
+
55
+ # with Excel support (.xlsx, .xls, .xlsm, .xlsb, .ods)
56
+ pip install "cds-pyde-toolkit[excel]"
57
+
58
+ # with the pre-flight memory check for large full-file reads
59
+ pip install "cds-pyde-toolkit[memcheck]"
60
+
61
+ # everything
62
+ pip install "cds-pyde-toolkit[all]"
63
+ ```
64
+
65
+ Already have it installed and want the latest release?
66
+
67
+ ```bash
68
+ pip install --upgrade cds-pyde-toolkit
69
+ ```
70
+
71
+ ## Quick start — Python
72
+
73
+ ```python
74
+ from cds_pyde_toolkit import infer_file # top-level convenience re-export
75
+ # or, namespaced (recommended as the toolkit grows):
76
+ from cds_pyde_toolkit.schema_inferencer import infer_file
77
+
78
+ result = infer_file(
79
+ "sales.csv",
80
+ pyspark=True,
81
+ casing="pascal",
82
+ table_name="sales_fact",
83
+ header_row=0, # skip junk title rows if needed, e.g. header_row=4
84
+ type_threshold=0.95, # tolerate a few dirty values before falling back to string
85
+ )
86
+
87
+ print(result["schema"]) # PySpark StructType or pandas dtype dict
88
+ print(result["create_table"]) # SQL DDL
89
+ print(result["rename_code"]) # copy-paste column rename snippet
90
+ print(result["report"]) # full formatted text report
91
+ ```
92
+
93
+ ## Quick start — CLI
94
+
95
+ ```bash
96
+ cds-pyde-toolkit schema-infer sales.csv
97
+ cds-pyde-toolkit schema-infer sales.csv --pyspark true --case pascal --layer bronze --catalog prod
98
+ cds-pyde-toolkit schema-infer sales.xlsx --sheet Sheet2 --layer silver
99
+ cds-pyde-toolkit schema-infer messy.csv --header-row 4 --type-threshold 0.80
100
+ cds-pyde-toolkit --version
101
+ ```
102
+
103
+ Run `cds-pyde-toolkit schema-infer --help` for the full flag reference, or see
104
+ the module docstring in `cds_pyde_toolkit/schema_inferencer/core.py`.
105
+
106
+ ## Features
107
+
108
+ - **Column standardisation** — camel, pascal, snake, screaming, kebab, or
109
+ skip casing, with symbol expansion (`/` → `or`, `%` → `pct`, etc.)
110
+ - **Type inference** — bool, int32/int64, float, date, datetime, string,
111
+ with a configurable conformance threshold (`--type-threshold`) to tolerate
112
+ dirty data
113
+ - **Header offset** — `--header-row` to skip junk/title rows above the real
114
+ header, for both CSV and Excel
115
+ - **Dual output modes** — Pandas dtypes + ANSI SQL, or PySpark StructType +
116
+ Spark SQL
117
+ - **Layered outputs** — bronze, parquet_bronze, silver, gold, gold_vw (view),
118
+ or all five at once
119
+ - **Table types** — managed Delta, external, or external Delta tables
120
+ - **Flexible input** — CSV/TSV (delimiter auto-detected), Excel
121
+ (`.xlsx .xls .xlsm .xlsb .ods`), or a pandas DataFrame directly
122
+
123
+ ## Project structure (for contributors)
124
+
125
+ ```
126
+ src/cds_pyde_toolkit/
127
+ ├── __init__.py # top-level re-exports + __version__
128
+ ├── cli.py # top-level CLI dispatcher (registers subcommands)
129
+ └── schema_inferencer/ # one subpackage per feature
130
+ ├── __init__.py # public API for this feature
131
+ ├── core.py # logic only, no argparse
132
+ └── cli.py # add_arguments(parser) + run(args) for this feature
133
+ ```
134
+
135
+ **Adding a new feature later:** create `cds_pyde_toolkit/<your_feature>/` with
136
+ the same three-file shape, then register it with one line in
137
+ `cds_pyde_toolkit/cli.py`'s `build_parser()`. No other files need to change.
138
+
139
+ ## Releasing a new version
140
+
141
+ Version lives in one place (`pyproject.toml`); the installed package's
142
+ `__version__` is read live from package metadata, so there's nothing else
143
+ to keep in sync.
144
+
145
+ ```bash
146
+ python scripts/bump_version.py patch # or minor / major / an exact X.Y.Z
147
+ python -m build
148
+ twine upload dist/*
149
+ ```
150
+
151
+ Anyone with it already installed just runs `pip install --upgrade cds-pyde-toolkit`
152
+ — no need to uninstall first.
153
+
154
+ ## License
155
+
156
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,115 @@
1
+ # cds-pyde-toolkit
2
+
3
+ A growing toolkit of data-engineering helper functions and CLI commands.
4
+ The first tool is **schema inference**: standardise column names, infer
5
+ data types from sample data, and emit ready-to-use schema definitions and
6
+ `CREATE TABLE` DDL — either Pandas/ANSI SQL or PySpark/Spark SQL (with
7
+ bronze/silver/gold layer support for Databricks / Unity Catalog workflows).
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install cds-pyde-toolkit
13
+
14
+ # with Excel support (.xlsx, .xls, .xlsm, .xlsb, .ods)
15
+ pip install "cds-pyde-toolkit[excel]"
16
+
17
+ # with the pre-flight memory check for large full-file reads
18
+ pip install "cds-pyde-toolkit[memcheck]"
19
+
20
+ # everything
21
+ pip install "cds-pyde-toolkit[all]"
22
+ ```
23
+
24
+ Already have it installed and want the latest release?
25
+
26
+ ```bash
27
+ pip install --upgrade cds-pyde-toolkit
28
+ ```
29
+
30
+ ## Quick start — Python
31
+
32
+ ```python
33
+ from cds_pyde_toolkit import infer_file # top-level convenience re-export
34
+ # or, namespaced (recommended as the toolkit grows):
35
+ from cds_pyde_toolkit.schema_inferencer import infer_file
36
+
37
+ result = infer_file(
38
+ "sales.csv",
39
+ pyspark=True,
40
+ casing="pascal",
41
+ table_name="sales_fact",
42
+ header_row=0, # skip junk title rows if needed, e.g. header_row=4
43
+ type_threshold=0.95, # tolerate a few dirty values before falling back to string
44
+ )
45
+
46
+ print(result["schema"]) # PySpark StructType or pandas dtype dict
47
+ print(result["create_table"]) # SQL DDL
48
+ print(result["rename_code"]) # copy-paste column rename snippet
49
+ print(result["report"]) # full formatted text report
50
+ ```
51
+
52
+ ## Quick start — CLI
53
+
54
+ ```bash
55
+ cds-pyde-toolkit schema-infer sales.csv
56
+ cds-pyde-toolkit schema-infer sales.csv --pyspark true --case pascal --layer bronze --catalog prod
57
+ cds-pyde-toolkit schema-infer sales.xlsx --sheet Sheet2 --layer silver
58
+ cds-pyde-toolkit schema-infer messy.csv --header-row 4 --type-threshold 0.80
59
+ cds-pyde-toolkit --version
60
+ ```
61
+
62
+ Run `cds-pyde-toolkit schema-infer --help` for the full flag reference, or see
63
+ the module docstring in `cds_pyde_toolkit/schema_inferencer/core.py`.
64
+
65
+ ## Features
66
+
67
+ - **Column standardisation** — camel, pascal, snake, screaming, kebab, or
68
+ skip casing, with symbol expansion (`/` → `or`, `%` → `pct`, etc.)
69
+ - **Type inference** — bool, int32/int64, float, date, datetime, string,
70
+ with a configurable conformance threshold (`--type-threshold`) to tolerate
71
+ dirty data
72
+ - **Header offset** — `--header-row` to skip junk/title rows above the real
73
+ header, for both CSV and Excel
74
+ - **Dual output modes** — Pandas dtypes + ANSI SQL, or PySpark StructType +
75
+ Spark SQL
76
+ - **Layered outputs** — bronze, parquet_bronze, silver, gold, gold_vw (view),
77
+ or all five at once
78
+ - **Table types** — managed Delta, external, or external Delta tables
79
+ - **Flexible input** — CSV/TSV (delimiter auto-detected), Excel
80
+ (`.xlsx .xls .xlsm .xlsb .ods`), or a pandas DataFrame directly
81
+
82
+ ## Project structure (for contributors)
83
+
84
+ ```
85
+ src/cds_pyde_toolkit/
86
+ ├── __init__.py # top-level re-exports + __version__
87
+ ├── cli.py # top-level CLI dispatcher (registers subcommands)
88
+ └── schema_inferencer/ # one subpackage per feature
89
+ ├── __init__.py # public API for this feature
90
+ ├── core.py # logic only, no argparse
91
+ └── cli.py # add_arguments(parser) + run(args) for this feature
92
+ ```
93
+
94
+ **Adding a new feature later:** create `cds_pyde_toolkit/<your_feature>/` with
95
+ the same three-file shape, then register it with one line in
96
+ `cds_pyde_toolkit/cli.py`'s `build_parser()`. No other files need to change.
97
+
98
+ ## Releasing a new version
99
+
100
+ Version lives in one place (`pyproject.toml`); the installed package's
101
+ `__version__` is read live from package metadata, so there's nothing else
102
+ to keep in sync.
103
+
104
+ ```bash
105
+ python scripts/bump_version.py patch # or minor / major / an exact X.Y.Z
106
+ python -m build
107
+ twine upload dist/*
108
+ ```
109
+
110
+ Anyone with it already installed just runs `pip install --upgrade cds-pyde-toolkit`
111
+ — no need to uninstall first.
112
+
113
+ ## License
114
+
115
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "cds-pyde-toolkit"
7
+ version = "1.1.0"
8
+ description = "A growing toolkit of data-engineering helper functions and CLI commands — starting with schema inference (column standardisation, type inference, schema + DDL generation for Pandas/ANSI SQL or PySpark/Spark SQL)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Your Name" }
14
+ ]
15
+ keywords = ["pandas", "pyspark", "schema", "ddl", "data-engineering", "delta-lake", "databricks", "toolkit"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "Topic :: Database",
20
+ "Topic :: Software Development :: Libraries",
21
+ "License :: OSI Approved :: MIT License",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.9",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ ]
28
+
29
+ dependencies = [
30
+ "pandas>=1.3",
31
+ "numpy>=1.21",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ excel = ["openpyxl>=3.0", "xlrd>=2.0", "odfpy>=1.4"]
36
+ memcheck = ["psutil>=5.9"]
37
+ all = ["openpyxl>=3.0", "xlrd>=2.0", "odfpy>=1.4", "psutil>=5.9"]
38
+ dev = ["pytest>=7.0", "build>=1.0", "twine>=4.0"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/your-org/cds-pyde-toolkit"
42
+ Issues = "https://github.com/your-org/cds-pyde-toolkit/issues"
43
+
44
+ [project.scripts]
45
+ cds-pyde-toolkit = "cds_pyde_toolkit.cli:main"
46
+
47
+ [tool.setuptools.packages.find]
48
+ where = ["src"]
49
+
50
+ [tool.setuptools.package-data]
51
+ cds_pyde_toolkit = ["py.typed"]
52
+
53
+ [tool.pytest.ini_options]
54
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,44 @@
1
+ """
2
+ cds_pyde_toolkit
3
+ =================
4
+ A growing toolkit of data-engineering helper functions and CLI commands.
5
+
6
+ Currently included
7
+ -------------------
8
+ schema_inferencer Infer column names, data types, schema definitions, and
9
+ CREATE TABLE / VIEW DDL from a file or a pandas DataFrame.
10
+ (Pandas/ANSI SQL or PySpark/Spark SQL.)
11
+
12
+ Usage
13
+ -----
14
+ Namespaced (recommended as the toolkit grows, to avoid name clashes between
15
+ tools)::
16
+
17
+ from cds_pyde_toolkit.schema_inferencer import infer_file
18
+ result = infer_file(my_dataframe, pyspark=True, casing="snake")
19
+
20
+ Top-level convenience re-exports are also provided for the most commonly
21
+ used function of each tool — currently just `infer_file`::
22
+
23
+ from cds_pyde_toolkit import infer_file
24
+
25
+ CLI
26
+ ---
27
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true
28
+ cds-pyde-toolkit --help
29
+ """
30
+
31
+ from importlib.metadata import PackageNotFoundError
32
+ from importlib.metadata import version as _installed_version
33
+
34
+ from .schema_inferencer import infer_file
35
+
36
+ try:
37
+ __version__ = _installed_version("cds-pyde-toolkit")
38
+ except PackageNotFoundError: # pragma: no cover — running from source without an install
39
+ __version__ = "0.0.0+unknown"
40
+
41
+ __all__ = [
42
+ "infer_file",
43
+ "__version__",
44
+ ]
@@ -0,0 +1,62 @@
1
+ """
2
+ cds_pyde_toolkit.cli
3
+ ======================
4
+ Top-level command-line entry point for the whole toolkit, installed as the
5
+ `cds-pyde-toolkit` console script. Each tool in the package contributes one
6
+ subcommand here.
7
+
8
+ Currently registered subcommands:
9
+ schema-infer → cds_pyde_toolkit.schema_inferencer
10
+
11
+ Adding a new tool later
12
+ ------------------------
13
+ 1. Create a new subpackage, e.g. `cds_pyde_toolkit/data_profiler/` with its
14
+ own `core.py` and a `cli.py` that exposes `add_arguments(parser)` and
15
+ `run(args)` (see `schema_inferencer/cli.py` for the pattern).
16
+ 2. Register it below with one call to `subparsers.add_parser(...)` +
17
+ `<tool>.cli.add_arguments(...)`.
18
+ That's it — no other changes needed; dispatch is generic via `args._run`.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ from typing import Optional
25
+
26
+ from . import __version__
27
+ from .schema_inferencer import cli as schema_inferencer_cli
28
+
29
+
30
+ def build_parser() -> argparse.ArgumentParser:
31
+ parser = argparse.ArgumentParser(
32
+ prog='cds-pyde-toolkit',
33
+ description='A growing toolkit of data-engineering helper commands.',
34
+ )
35
+ parser.add_argument(
36
+ '--version', action='version', version=f'cds-pyde-toolkit {__version__}'
37
+ )
38
+
39
+ subparsers = parser.add_subparsers(dest='command', required=True)
40
+
41
+ schema_infer_parser = subparsers.add_parser(
42
+ 'schema-infer',
43
+ help='Infer column names, data types, schema, and CREATE TABLE/VIEW DDL '
44
+ 'from a file or DataFrame.',
45
+ )
46
+ schema_inferencer_cli.add_arguments(schema_infer_parser)
47
+
48
+ # ── Future subcommands get registered here, e.g.: ─────────────────────────
49
+ # profile_parser = subparsers.add_parser('profile', help='...')
50
+ # data_profiler_cli.add_arguments(profile_parser)
51
+
52
+ return parser
53
+
54
+
55
+ def main(argv: Optional[list] = None) -> None:
56
+ parser = build_parser()
57
+ args = parser.parse_args(argv)
58
+ args._run(args)
59
+
60
+
61
+ if __name__ == '__main__': # pragma: no cover
62
+ main()
File without changes
@@ -0,0 +1,38 @@
1
+ """
2
+ cds_pyde_toolkit.schema_inferencer
3
+ ==================================
4
+ Infer column names, data types, schema definitions, and CREATE TABLE / VIEW
5
+ DDL from a CSV/TSV/Excel file — or directly from a pandas DataFrame already
6
+ in memory (e.g. a Spark DataFrame converted via `.toPandas()`).
7
+
8
+ Quick start
9
+ -----------
10
+ from cds_pyde_toolkit.schema_inferencer import infer_file
11
+
12
+ result = infer_file(my_dataframe, pyspark=True, casing="snake")
13
+ print(result["schema"])
14
+ print(result["create_table"])
15
+
16
+ See `cds_pyde_toolkit.schema_inferencer.core.infer_file` for the full parameter
17
+ reference, or run `cds-pyde-toolkit schema-infer --help` for the CLI.
18
+ """
19
+
20
+ from .core import (
21
+ VALID_CASINGS,
22
+ VALID_LAYERS,
23
+ VALID_TABLE_TYPES,
24
+ format_column_name,
25
+ infer_file,
26
+ standardise_columns,
27
+ to_camel_case,
28
+ )
29
+
30
+ __all__ = [
31
+ "infer_file",
32
+ "standardise_columns",
33
+ "format_column_name",
34
+ "to_camel_case",
35
+ "VALID_CASINGS",
36
+ "VALID_LAYERS",
37
+ "VALID_TABLE_TYPES",
38
+ ]