cds-pyde-toolkit 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cds_pyde_toolkit-1.1.0/LICENSE +21 -0
- cds_pyde_toolkit-1.1.0/PKG-INFO +156 -0
- cds_pyde_toolkit-1.1.0/README.md +115 -0
- cds_pyde_toolkit-1.1.0/pyproject.toml +54 -0
- cds_pyde_toolkit-1.1.0/setup.cfg +4 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit/__init__.py +44 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit/cli.py +62 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit/py.typed +0 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit/schema_inferencer/__init__.py +38 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit/schema_inferencer/cli.py +268 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit/schema_inferencer/core.py +1550 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit.egg-info/PKG-INFO +156 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit.egg-info/SOURCES.txt +17 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit.egg-info/dependency_links.txt +1 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit.egg-info/entry_points.txt +2 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit.egg-info/requires.txt +21 -0
- cds_pyde_toolkit-1.1.0/src/cds_pyde_toolkit.egg-info/top_level.txt +1 -0
- cds_pyde_toolkit-1.1.0/tests/test_cli.py +62 -0
- cds_pyde_toolkit-1.1.0/tests/test_schema_inferencer.py +61 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Your Name
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cds-pyde-toolkit
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: A growing toolkit of data-engineering helper functions and CLI commands — starting with schema inference (column standardisation, type inference, schema + DDL generation for Pandas/ANSI SQL or PySpark/Spark SQL).
|
|
5
|
+
Author: Your Name
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/your-org/cds-pyde-toolkit
|
|
8
|
+
Project-URL: Issues, https://github.com/your-org/cds-pyde-toolkit/issues
|
|
9
|
+
Keywords: pandas,pyspark,schema,ddl,data-engineering,delta-lake,databricks,toolkit
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Database
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pandas>=1.3
|
|
24
|
+
Requires-Dist: numpy>=1.21
|
|
25
|
+
Provides-Extra: excel
|
|
26
|
+
Requires-Dist: openpyxl>=3.0; extra == "excel"
|
|
27
|
+
Requires-Dist: xlrd>=2.0; extra == "excel"
|
|
28
|
+
Requires-Dist: odfpy>=1.4; extra == "excel"
|
|
29
|
+
Provides-Extra: memcheck
|
|
30
|
+
Requires-Dist: psutil>=5.9; extra == "memcheck"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: openpyxl>=3.0; extra == "all"
|
|
33
|
+
Requires-Dist: xlrd>=2.0; extra == "all"
|
|
34
|
+
Requires-Dist: odfpy>=1.4; extra == "all"
|
|
35
|
+
Requires-Dist: psutil>=5.9; extra == "all"
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
38
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
39
|
+
Requires-Dist: twine>=4.0; extra == "dev"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# cds-pyde-toolkit
|
|
43
|
+
|
|
44
|
+
A growing toolkit of data-engineering helper functions and CLI commands.
|
|
45
|
+
The first tool is **schema inference**: standardise column names, infer
|
|
46
|
+
data types from sample data, and emit ready-to-use schema definitions and
|
|
47
|
+
`CREATE TABLE` DDL — either Pandas/ANSI SQL or PySpark/Spark SQL (with
|
|
48
|
+
bronze/silver/gold layer support for Databricks / Unity Catalog workflows).
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install cds-pyde-toolkit
|
|
54
|
+
|
|
55
|
+
# with Excel support (.xlsx, .xls, .xlsm, .xlsb, .ods)
|
|
56
|
+
pip install "cds-pyde-toolkit[excel]"
|
|
57
|
+
|
|
58
|
+
# with the pre-flight memory check for large full-file reads
|
|
59
|
+
pip install "cds-pyde-toolkit[memcheck]"
|
|
60
|
+
|
|
61
|
+
# everything
|
|
62
|
+
pip install "cds-pyde-toolkit[all]"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Already have it installed and want the latest release?
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install --upgrade cds-pyde-toolkit
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Quick start — Python
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from cds_pyde_toolkit import infer_file # top-level convenience re-export
|
|
75
|
+
# or, namespaced (recommended as the toolkit grows):
|
|
76
|
+
from cds_pyde_toolkit.schema_inferencer import infer_file
|
|
77
|
+
|
|
78
|
+
result = infer_file(
|
|
79
|
+
"sales.csv",
|
|
80
|
+
pyspark=True,
|
|
81
|
+
casing="pascal",
|
|
82
|
+
table_name="sales_fact",
|
|
83
|
+
header_row=0, # skip junk title rows if needed, e.g. header_row=4
|
|
84
|
+
type_threshold=0.95, # tolerate a few dirty values before falling back to string
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
print(result["schema"]) # PySpark StructType or pandas dtype dict
|
|
88
|
+
print(result["create_table"]) # SQL DDL
|
|
89
|
+
print(result["rename_code"]) # copy-paste column rename snippet
|
|
90
|
+
print(result["report"]) # full formatted text report
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Quick start — CLI
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
cds-pyde-toolkit schema-infer sales.csv
|
|
97
|
+
cds-pyde-toolkit schema-infer sales.csv --pyspark true --case pascal --layer bronze --catalog prod
|
|
98
|
+
cds-pyde-toolkit schema-infer sales.xlsx --sheet Sheet2 --layer silver
|
|
99
|
+
cds-pyde-toolkit schema-infer messy.csv --header-row 4 --type-threshold 0.80
|
|
100
|
+
cds-pyde-toolkit --version
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Run `cds-pyde-toolkit schema-infer --help` for the full flag reference, or see
|
|
104
|
+
the module docstring in `cds_pyde_toolkit/schema_inferencer/core.py`.
|
|
105
|
+
|
|
106
|
+
## Features
|
|
107
|
+
|
|
108
|
+
- **Column standardisation** — camel, pascal, snake, screaming, kebab, or
|
|
109
|
+
skip casing, with symbol expansion (`/` → `or`, `%` → `pct`, etc.)
|
|
110
|
+
- **Type inference** — bool, int32/int64, float, date, datetime, string,
|
|
111
|
+
with a configurable conformance threshold (`--type-threshold`) to tolerate
|
|
112
|
+
dirty data
|
|
113
|
+
- **Header offset** — `--header-row` to skip junk/title rows above the real
|
|
114
|
+
header, for both CSV and Excel
|
|
115
|
+
- **Dual output modes** — Pandas dtypes + ANSI SQL, or PySpark StructType +
|
|
116
|
+
Spark SQL
|
|
117
|
+
- **Layered outputs** — bronze, parquet_bronze, silver, gold, gold_vw (view),
|
|
118
|
+
or all five at once
|
|
119
|
+
- **Table types** — managed Delta, external, or external Delta tables
|
|
120
|
+
- **Flexible input** — CSV/TSV (delimiter auto-detected), Excel
|
|
121
|
+
(`.xlsx .xls .xlsm .xlsb .ods`), or a pandas DataFrame directly
|
|
122
|
+
|
|
123
|
+
## Project structure (for contributors)
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
src/cds_pyde_toolkit/
|
|
127
|
+
├── __init__.py # top-level re-exports + __version__
|
|
128
|
+
├── cli.py # top-level CLI dispatcher (registers subcommands)
|
|
129
|
+
└── schema_inferencer/ # one subpackage per feature
|
|
130
|
+
├── __init__.py # public API for this feature
|
|
131
|
+
├── core.py # logic only, no argparse
|
|
132
|
+
└── cli.py # add_arguments(parser) + run(args) for this feature
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**Adding a new feature later:** create `cds_pyde_toolkit/<your_feature>/` with
|
|
136
|
+
the same three-file shape, then register it with one line in
|
|
137
|
+
`cds_pyde_toolkit/cli.py`'s `build_parser()`. No other files need to change.
|
|
138
|
+
|
|
139
|
+
## Releasing a new version
|
|
140
|
+
|
|
141
|
+
Version lives in one place (`pyproject.toml`); the installed package's
|
|
142
|
+
`__version__` is read live from package metadata, so there's nothing else
|
|
143
|
+
to keep in sync.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
python scripts/bump_version.py patch # or minor / major / an exact X.Y.Z
|
|
147
|
+
python -m build
|
|
148
|
+
twine upload dist/*
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Anyone with it already installed just runs `pip install --upgrade cds-pyde-toolkit`
|
|
152
|
+
— no need to uninstall first.
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# cds-pyde-toolkit
|
|
2
|
+
|
|
3
|
+
A growing toolkit of data-engineering helper functions and CLI commands.
|
|
4
|
+
The first tool is **schema inference**: standardise column names, infer
|
|
5
|
+
data types from sample data, and emit ready-to-use schema definitions and
|
|
6
|
+
`CREATE TABLE` DDL — either Pandas/ANSI SQL or PySpark/Spark SQL (with
|
|
7
|
+
bronze/silver/gold layer support for Databricks / Unity Catalog workflows).
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install cds-pyde-toolkit
|
|
13
|
+
|
|
14
|
+
# with Excel support (.xlsx, .xls, .xlsm, .xlsb, .ods)
|
|
15
|
+
pip install "cds-pyde-toolkit[excel]"
|
|
16
|
+
|
|
17
|
+
# with the pre-flight memory check for large full-file reads
|
|
18
|
+
pip install "cds-pyde-toolkit[memcheck]"
|
|
19
|
+
|
|
20
|
+
# everything
|
|
21
|
+
pip install "cds-pyde-toolkit[all]"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Already have it installed and want the latest release?
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install --upgrade cds-pyde-toolkit
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quick start — Python
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from cds_pyde_toolkit import infer_file # top-level convenience re-export
|
|
34
|
+
# or, namespaced (recommended as the toolkit grows):
|
|
35
|
+
from cds_pyde_toolkit.schema_inferencer import infer_file
|
|
36
|
+
|
|
37
|
+
result = infer_file(
|
|
38
|
+
"sales.csv",
|
|
39
|
+
pyspark=True,
|
|
40
|
+
casing="pascal",
|
|
41
|
+
table_name="sales_fact",
|
|
42
|
+
header_row=0, # skip junk title rows if needed, e.g. header_row=4
|
|
43
|
+
type_threshold=0.95, # tolerate a few dirty values before falling back to string
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
print(result["schema"]) # PySpark StructType or pandas dtype dict
|
|
47
|
+
print(result["create_table"]) # SQL DDL
|
|
48
|
+
print(result["rename_code"]) # copy-paste column rename snippet
|
|
49
|
+
print(result["report"]) # full formatted text report
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick start — CLI
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
cds-pyde-toolkit schema-infer sales.csv
|
|
56
|
+
cds-pyde-toolkit schema-infer sales.csv --pyspark true --case pascal --layer bronze --catalog prod
|
|
57
|
+
cds-pyde-toolkit schema-infer sales.xlsx --sheet Sheet2 --layer silver
|
|
58
|
+
cds-pyde-toolkit schema-infer messy.csv --header-row 4 --type-threshold 0.80
|
|
59
|
+
cds-pyde-toolkit --version
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Run `cds-pyde-toolkit schema-infer --help` for the full flag reference, or see
|
|
63
|
+
the module docstring in `cds_pyde_toolkit/schema_inferencer/core.py`.
|
|
64
|
+
|
|
65
|
+
## Features
|
|
66
|
+
|
|
67
|
+
- **Column standardisation** — camel, pascal, snake, screaming, kebab, or
|
|
68
|
+
skip casing, with symbol expansion (`/` → `or`, `%` → `pct`, etc.)
|
|
69
|
+
- **Type inference** — bool, int32/int64, float, date, datetime, string,
|
|
70
|
+
with a configurable conformance threshold (`--type-threshold`) to tolerate
|
|
71
|
+
dirty data
|
|
72
|
+
- **Header offset** — `--header-row` to skip junk/title rows above the real
|
|
73
|
+
header, for both CSV and Excel
|
|
74
|
+
- **Dual output modes** — Pandas dtypes + ANSI SQL, or PySpark StructType +
|
|
75
|
+
Spark SQL
|
|
76
|
+
- **Layered outputs** — bronze, parquet_bronze, silver, gold, gold_vw (view),
|
|
77
|
+
or all five at once
|
|
78
|
+
- **Table types** — managed Delta, external, or external Delta tables
|
|
79
|
+
- **Flexible input** — CSV/TSV (delimiter auto-detected), Excel
|
|
80
|
+
(`.xlsx .xls .xlsm .xlsb .ods`), or a pandas DataFrame directly
|
|
81
|
+
|
|
82
|
+
## Project structure (for contributors)
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
src/cds_pyde_toolkit/
|
|
86
|
+
├── __init__.py # top-level re-exports + __version__
|
|
87
|
+
├── cli.py # top-level CLI dispatcher (registers subcommands)
|
|
88
|
+
└── schema_inferencer/ # one subpackage per feature
|
|
89
|
+
├── __init__.py # public API for this feature
|
|
90
|
+
├── core.py # logic only, no argparse
|
|
91
|
+
└── cli.py # add_arguments(parser) + run(args) for this feature
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Adding a new feature later:** create `cds_pyde_toolkit/<your_feature>/` with
|
|
95
|
+
the same three-file shape, then register it with one line in
|
|
96
|
+
`cds_pyde_toolkit/cli.py`'s `build_parser()`. No other files need to change.
|
|
97
|
+
|
|
98
|
+
## Releasing a new version
|
|
99
|
+
|
|
100
|
+
Version lives in one place (`pyproject.toml`); the installed package's
|
|
101
|
+
`__version__` is read live from package metadata, so there's nothing else
|
|
102
|
+
to keep in sync.
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
python scripts/bump_version.py patch # or minor / major / an exact X.Y.Z
|
|
106
|
+
python -m build
|
|
107
|
+
twine upload dist/*
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Anyone with it already installed just runs `pip install --upgrade cds-pyde-toolkit`
|
|
111
|
+
— no need to uninstall first.
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "cds-pyde-toolkit"
|
|
7
|
+
version = "1.1.0"
|
|
8
|
+
description = "A growing toolkit of data-engineering helper functions and CLI commands — starting with schema inference (column standardisation, type inference, schema + DDL generation for Pandas/ANSI SQL or PySpark/Spark SQL)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Your Name" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["pandas", "pyspark", "schema", "ddl", "data-engineering", "delta-lake", "databricks", "toolkit"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Topic :: Database",
|
|
20
|
+
"Topic :: Software Development :: Libraries",
|
|
21
|
+
"License :: OSI Approved :: MIT License",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
dependencies = [
|
|
30
|
+
"pandas>=1.3",
|
|
31
|
+
"numpy>=1.21",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
excel = ["openpyxl>=3.0", "xlrd>=2.0", "odfpy>=1.4"]
|
|
36
|
+
memcheck = ["psutil>=5.9"]
|
|
37
|
+
all = ["openpyxl>=3.0", "xlrd>=2.0", "odfpy>=1.4", "psutil>=5.9"]
|
|
38
|
+
dev = ["pytest>=7.0", "build>=1.0", "twine>=4.0"]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/your-org/cds-pyde-toolkit"
|
|
42
|
+
Issues = "https://github.com/your-org/cds-pyde-toolkit/issues"
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
cds-pyde-toolkit = "cds_pyde_toolkit.cli:main"
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.package-data]
|
|
51
|
+
cds_pyde_toolkit = ["py.typed"]
|
|
52
|
+
|
|
53
|
+
[tool.pytest.ini_options]
|
|
54
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cds_pyde_toolkit
|
|
3
|
+
=================
|
|
4
|
+
A growing toolkit of data-engineering helper functions and CLI commands.
|
|
5
|
+
|
|
6
|
+
Currently included
|
|
7
|
+
-------------------
|
|
8
|
+
schema_inferencer Infer column names, data types, schema definitions, and
|
|
9
|
+
CREATE TABLE / VIEW DDL from a file or a pandas DataFrame.
|
|
10
|
+
(Pandas/ANSI SQL or PySpark/Spark SQL.)
|
|
11
|
+
|
|
12
|
+
Usage
|
|
13
|
+
-----
|
|
14
|
+
Namespaced (recommended as the toolkit grows, to avoid name clashes between
|
|
15
|
+
tools)::
|
|
16
|
+
|
|
17
|
+
from cds_pyde_toolkit.schema_inferencer import infer_file
|
|
18
|
+
result = infer_file(my_dataframe, pyspark=True, casing="snake")
|
|
19
|
+
|
|
20
|
+
Top-level convenience re-exports are also provided for the most commonly
|
|
21
|
+
used function of each tool — currently just `infer_file`::
|
|
22
|
+
|
|
23
|
+
from cds_pyde_toolkit import infer_file
|
|
24
|
+
|
|
25
|
+
CLI
|
|
26
|
+
---
|
|
27
|
+
cds-pyde-toolkit schema-infer Sales1.csv --pyspark true
|
|
28
|
+
cds-pyde-toolkit --help
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from importlib.metadata import PackageNotFoundError
|
|
32
|
+
from importlib.metadata import version as _installed_version
|
|
33
|
+
|
|
34
|
+
from .schema_inferencer import infer_file
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
__version__ = _installed_version("cds-pyde-toolkit")
|
|
38
|
+
except PackageNotFoundError: # pragma: no cover — running from source without an install
|
|
39
|
+
__version__ = "0.0.0+unknown"
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
"infer_file",
|
|
43
|
+
"__version__",
|
|
44
|
+
]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cds_pyde_toolkit.cli
|
|
3
|
+
======================
|
|
4
|
+
Top-level command-line entry point for the whole toolkit, installed as the
|
|
5
|
+
`cds-pyde-toolkit` console script. Each tool in the package contributes one
|
|
6
|
+
subcommand here.
|
|
7
|
+
|
|
8
|
+
Currently registered subcommands:
|
|
9
|
+
schema-infer → cds_pyde_toolkit.schema_inferencer
|
|
10
|
+
|
|
11
|
+
Adding a new tool later
|
|
12
|
+
------------------------
|
|
13
|
+
1. Create a new subpackage, e.g. `cds_pyde_toolkit/data_profiler/` with its
|
|
14
|
+
own `core.py` and a `cli.py` that exposes `add_arguments(parser)` and
|
|
15
|
+
`run(args)` (see `schema_inferencer/cli.py` for the pattern).
|
|
16
|
+
2. Register it below with one call to `subparsers.add_parser(...)` +
|
|
17
|
+
`<tool>.cli.add_arguments(...)`.
|
|
18
|
+
That's it — no other changes needed; dispatch is generic via `args._run`.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
from . import __version__
|
|
27
|
+
from .schema_inferencer import cli as schema_inferencer_cli
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``cds-pyde-toolkit`` argument parser.

    The parser carries a ``--version`` flag and one required subcommand,
    stored on the parsed namespace as ``command``.  Each tool declares its
    own options by receiving its subparser through that tool's
    ``add_arguments`` function.

    Returns:
        The fully configured parser, ready for ``parse_args``.
    """
    root = argparse.ArgumentParser(
        prog='cds-pyde-toolkit',
        description='A growing toolkit of data-engineering helper commands.',
    )
    version_banner = f'cds-pyde-toolkit {__version__}'
    root.add_argument('--version', action='version', version=version_banner)

    commands = root.add_subparsers(dest='command', required=True)

    # schema-infer: its arguments are declared by the schema_inferencer package.
    schema_infer_help = (
        'Infer column names, data types, schema, and CREATE TABLE/VIEW DDL '
        'from a file or DataFrame.'
    )
    schema_inferencer_cli.add_arguments(
        commands.add_parser('schema-infer', help=schema_infer_help)
    )

    # Register future subcommands here, following the same pattern:
    #   profile_parser = commands.add_parser('profile', help='...')
    #   data_profiler_cli.add_arguments(profile_parser)

    return root
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def main(argv: Optional[list] = None) -> None:
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Args:
        argv: Argument strings to parse.  ``None`` (the default) lets
            argparse fall back to ``sys.argv[1:]``.

    The handler is read from the parsed namespace as ``_run`` and called
    with that namespace.  Presumably each tool's ``add_arguments`` sets it
    via ``set_defaults`` (not visible here; confirm against
    ``schema_inferencer/cli.py``).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # A subcommand registered without a handler would otherwise surface as a
    # raw AttributeError traceback; report it as a usage error instead.
    handler = getattr(args, '_run', None)
    if handler is None:
        parser.error(f'subcommand {args.command!r} has no registered handler')
    handler(args)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
if __name__ == '__main__': # pragma: no cover
|
|
62
|
+
main()
|
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cds_pyde_toolkit.schema_inferencer
|
|
3
|
+
==================================
|
|
4
|
+
Infer column names, data types, schema definitions, and CREATE TABLE / VIEW
|
|
5
|
+
DDL from a CSV/TSV/Excel file — or directly from a pandas DataFrame already
|
|
6
|
+
in memory (e.g. a Spark DataFrame converted via `.toPandas()`).
|
|
7
|
+
|
|
8
|
+
Quick start
|
|
9
|
+
-----------
|
|
10
|
+
from cds_pyde_toolkit.schema_inferencer import infer_file
|
|
11
|
+
|
|
12
|
+
result = infer_file(my_dataframe, pyspark=True, casing="snake")
|
|
13
|
+
print(result["schema"])
|
|
14
|
+
print(result["create_table"])
|
|
15
|
+
|
|
16
|
+
See `cds_pyde_toolkit.schema_inferencer.core.infer_file` for the full parameter
|
|
17
|
+
reference, or run `cds-pyde-toolkit schema-infer --help` for the CLI.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from .core import (
|
|
21
|
+
VALID_CASINGS,
|
|
22
|
+
VALID_LAYERS,
|
|
23
|
+
VALID_TABLE_TYPES,
|
|
24
|
+
format_column_name,
|
|
25
|
+
infer_file,
|
|
26
|
+
standardise_columns,
|
|
27
|
+
to_camel_case,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"infer_file",
|
|
32
|
+
"standardise_columns",
|
|
33
|
+
"format_column_name",
|
|
34
|
+
"to_camel_case",
|
|
35
|
+
"VALID_CASINGS",
|
|
36
|
+
"VALID_LAYERS",
|
|
37
|
+
"VALID_TABLE_TYPES",
|
|
38
|
+
]
|