cds-pyde-toolkit 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ """
2
+ cds_pyde_toolkit
3
+ =================
4
+ A growing toolkit of data-engineering helper functions and CLI commands.
5
+
6
+ Currently included
7
+ -------------------
8
+ schema_inferencer Infer column names, data types, schema definitions, and
9
+ CREATE TABLE / VIEW DDL from a file or a pandas DataFrame.
10
+ (Pandas/ANSI SQL or PySpark/Spark SQL.)
11
+
12
+ Usage
13
+ -----
14
+ Namespaced (recommended as the toolkit grows, to avoid name clashes between
15
+ tools)::
16
+
17
+ from cds_pyde_toolkit.schema_inferencer import infer_file
18
+ result = infer_file(my_dataframe, pyspark=True, casing="snake")
19
+
20
+ Top-level convenience re-exports are also provided for the most commonly
21
+ used function of each tool — currently just `infer_file`::
22
+
23
+ from cds_pyde_toolkit import infer_file
24
+
25
+ CLI
26
+ ---
27
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true
28
+ cds-pyde-toolkit --help
29
+ """
30
+
31
+ from importlib.metadata import PackageNotFoundError
32
+ from importlib.metadata import version as _installed_version
33
+
34
+ from .schema_inferencer import infer_file
35
+
36
def _resolve_version() -> str:
    """Return the version string of the installed distribution.

    Falls back to a placeholder when no distribution metadata can be found,
    e.g. when the package is run from source without an install.
    """
    try:
        return _installed_version("cds-pyde-toolkit")
    except PackageNotFoundError:  # pragma: no cover — running from source without an install
        return "0.0.0+unknown"


__version__ = _resolve_version()
40
+
41
+ __all__ = [
42
+ "infer_file",
43
+ "__version__",
44
+ ]
@@ -0,0 +1,62 @@
1
+ """
2
+ cds_pyde_toolkit.cli
3
+ ======================
4
+ Top-level command-line entry point for the whole toolkit, installed as the
5
+ `cds-pyde-toolkit` console script. Each tool in the package contributes one
6
+ subcommand here.
7
+
8
+ Currently registered subcommands:
9
+ schema-infer → cds_pyde_toolkit.schema_inferencer
10
+
11
+ Adding a new tool later
12
+ ------------------------
13
+ 1. Create a new submodule, e.g. `cds_pyde_toolkit/data_profiler/` with its
14
+ own `core.py` and a `cli.py` that exposes `add_arguments(parser)` and
15
+ `run(args)` (see `schema_inferencer/cli.py` for the pattern).
16
+ 2. Register it below with one call to `subparsers.add_parser(...)` +
17
+ `<tool>.cli.add_arguments(...)`.
18
+ That's it — no other changes needed; dispatch is generic via `args._run`.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ from typing import Optional
25
+
26
+ from . import __version__
27
+ from .schema_inferencer import cli as schema_inferencer_cli
28
+
29
+
30
def build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level ``cds-pyde-toolkit`` argument parser.

    The parser carries a ``--version`` flag and one mandatory subcommand slot
    (stored on the namespace as ``command``). Each tool attaches its own
    flags to its subparser through its ``add_arguments`` hook.

    Returns:
        The configured parser with the ``schema-infer`` subcommand registered.
    """
    root = argparse.ArgumentParser(
        prog='cds-pyde-toolkit',
        description='A growing toolkit of data-engineering helper commands.',
    )
    version_banner = f'cds-pyde-toolkit {__version__}'
    root.add_argument('--version', action='version', version=version_banner)

    commands = root.add_subparsers(dest='command', required=True)

    # schema-infer: flags are contributed by the schema_inferencer CLI module.
    schema_infer_help = (
        'Infer column names, data types, schema, and CREATE TABLE/VIEW DDL '
        'from a file or DataFrame.'
    )
    schema_inferencer_cli.add_arguments(
        commands.add_parser('schema-infer', help=schema_infer_help)
    )

    # Future subcommands are registered here with the same two steps:
    # commands.add_parser(<name>, help=...) followed by <tool>_cli.add_arguments(...).

    return root
53
+
54
+
55
def main(argv: Optional[list] = None) -> None:
    """Parse *argv* and hand control to the selected subcommand.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` makes argparse
            fall back to ``sys.argv[1:]``.
    """
    namespace = build_parser().parse_args(argv)
    # The chosen subcommand's handler is read from the namespace as ``_run``.
    handler = namespace._run
    handler(namespace)
59
+
60
+
61
+ if __name__ == '__main__': # pragma: no cover
62
+ main()
File without changes
@@ -0,0 +1,38 @@
1
+ """
2
+ cds_pyde_toolkit.schema_inferencer
3
+ ==================================
4
+ Infer column names, data types, schema definitions, and CREATE TABLE / VIEW
5
+ DDL from a CSV/TSV/Excel file — or directly from a pandas DataFrame already
6
+ in memory (e.g. a Spark DataFrame converted via `.toPandas()`).
7
+
8
+ Quick start
9
+ -----------
10
+ from cds_pyde_toolkit.schema_inferencer import infer_file
11
+
12
+ result = infer_file(my_dataframe, pyspark=True, casing="snake")
13
+ print(result["schema"])
14
+ print(result["create_table"])
15
+
16
+ See `cds_pyde_toolkit.schema_inferencer.core.infer_file` for the full parameter
17
+ reference, or run `cds-pyde-toolkit schema-infer --help` for the CLI.
18
+ """
19
+
20
+ from .core import (
21
+ VALID_CASINGS,
22
+ VALID_LAYERS,
23
+ VALID_TABLE_TYPES,
24
+ format_column_name,
25
+ infer_file,
26
+ standardise_columns,
27
+ to_camel_case,
28
+ )
29
+
30
+ __all__ = [
31
+ "infer_file",
32
+ "standardise_columns",
33
+ "format_column_name",
34
+ "to_camel_case",
35
+ "VALID_CASINGS",
36
+ "VALID_LAYERS",
37
+ "VALID_TABLE_TYPES",
38
+ ]
@@ -0,0 +1,268 @@
1
+ """
2
+ cds_pyde_toolkit.schema_inferencer.cli
3
+ ======================================
4
+ Argument definitions and run logic for the `schema-infer` subcommand.
5
+
6
+ This module is designed to be mounted as a subcommand of the top-level
7
+ `cds-pyde-toolkit` CLI (see `cds_pyde_toolkit.cli`), but can also be run standalone:
8
+
9
+ python -m cds_pyde_toolkit.schema_inferencer.cli Sales1.csv --pyspark true
10
+
11
+ Two entry points are exposed for the top-level CLI to use:
12
+ add_arguments(parser) -- registers all flags onto a given parser/subparser
13
+ run(args) -- validates flags and executes the inference
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import os
20
+ import re
21
+ import textwrap
22
+ from typing import Optional, Union
23
+
24
+ from .core import (
25
+ VALID_CASINGS,
26
+ VALID_LAYERS,
27
+ VALID_TABLE_TYPES,
28
+ _DEFAULT_SAMPLE,
29
+ _TYPE_CONFORMANCE_THRESHOLD,
30
+ infer_file,
31
+ )
32
+
33
+
34
def _str_to_bool(value: str) -> bool:
    """Convert a command-line boolean spelling into a ``bool``.

    Meant to be used as an argparse ``type=`` callable. Matching ignores
    letter case and surrounding whitespace.

    Args:
        value: Raw option text, e.g. ``"true"``, ``"No"`` or ``" 1 "``.

    Returns:
        ``True`` for true/yes/y/t/on/1 and ``False`` for false/no/n/f/off/0.

    Raises:
        argparse.ArgumentTypeError: If *value* is not one of the recognised
            spellings.
    """
    truthy = ('true', 'yes', 'y', 't', 'on', '1')
    falsy = ('false', 'no', 'n', 'f', 'off', '0')

    normalised = value.strip().lower()
    if normalised in truthy:
        return True
    if normalised in falsy:
        return False
    # Report the caller's original text (not the normalised form) so the
    # message shows exactly what was typed.
    raise argparse.ArgumentTypeError(
        f"Expected true/false (got {value!r}). Use --pyspark true or --pyspark false"
    )
43
+
44
+
45
+ _EPILOG = textwrap.dedent(f"""\
46
+ ─── casing options ────────────────────────────────────────────────────────
47
+ camel plantDescription (default)
48
+ pascal PlantDescription every word Title-cased
49
+ snake plant_description
50
+ screaming PLANT_DESCRIPTION
51
+ kebab plant-description
52
+ skip original name preserved (quoted in DDL)
53
+
54
+ ─── layer options (--pyspark true only) ─────────────────────────────────
55
+ bronze all columns → STRING + column mapping + rename + DDL
56
+ parquet_bronze typed from DataFrame + column mapping + rename + DDL
57
+ silver inferred types + schema + DDL
58
+ gold inferred types + schema + DDL
59
+ gold_vw CREATE VIEW from silver layer
60
+ all all five layers in one report
61
+
62
+ ─── table-type options (--pyspark true only) ────────────────────────────
63
+ delta CREATE TABLE … USING DELTA (default)
64
+ external CREATE EXTERNAL TABLE … USING <fmt> LOCATION '<path>'
65
+ external_delta CREATE EXTERNAL TABLE … USING DELTA LOCATION '<path>'
66
+
67
+ ─── sample rows ───────────────────────────────────────────────────────────
68
+ omitted auto: min({_DEFAULT_SAMPLE:,}, total file rows)
69
+ 0 full file — memory-safety check first
70
+ N use min(N, total rows); warns if N > row count
71
+
72
+ ─── defaults when a flag is omitted ──────────────────────────────────────
73
+ --pyspark false Pandas dtype dict + ANSI SQL
74
+ --table <file stem> e.g. 'Sales1' for Sales1.csv
75
+ --sample auto min({_DEFAULT_SAMPLE:,}, file row count)
76
+ --type-threshold {_TYPE_CONFORMANCE_THRESHOLD} min fraction of values that must match a type
77
+ --case camel camelCase column names
78
+ --layer (omitted) single output, no layer prefix
79
+ --catalog (omitted) two-part name: layer.table
80
+ --table-type delta USING DELTA
81
+ --output (omitted) print to terminal, no file saved
82
+ --no-print (omitted) terminal output is shown
83
+ --sheet 0 first sheet (Excel only)
84
+ --header-row 0 first row/line is the column header
85
+
86
+ ─── examples ──────────────────────────────────────────────────────────────
87
+ cds-pyde-toolkit schema-infer Sales1.csv
88
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true --case pascal
89
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true --layer bronze --catalog prod
90
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true --layer all --catalog prod
91
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true --layer bronze \\
92
+ --table-type external_delta --location /mnt/bronze/sales/
93
+ cds-pyde-toolkit schema-infer sales.xlsx --pyspark true --sheet Sheet2 --layer silver
94
+ cds-pyde-toolkit schema-infer Sales1.csv --pyspark true --layer all --output txt --no-print
95
+ cds-pyde-toolkit schema-infer messy.xlsx --sheet Sheet1 --header-row 4
96
+ cds-pyde-toolkit schema-infer noisy.csv --type-threshold 0.80
97
+ """)
98
+
99
+
100
def add_arguments(parser: argparse.ArgumentParser) -> None:
    """Attach the schema-infer description, epilog and flags to *parser*.

    Works for a subparser of the top-level CLI as well as for a standalone
    parser. Besides the flags, it stores ``run`` and the parser's ``error``
    method as the namespace defaults ``_run`` and ``_parser_error``.
    """
    parser.formatter_class = argparse.RawDescriptionHelpFormatter
    parser.epilog = _EPILOG
    parser.description = (
        'Infer column names, data types, schema, and CREATE TABLE / VIEW DDL\n'
        'from a CSV / TSV or Excel file. Delimiter is auto-detected.\n'
        'Supports Databricks Unity Catalog three-part naming via --catalog.'
    )

    threshold_help = (
        f'Min fraction of values that must match a type before it is assigned '
        f'(default {_TYPE_CONFORMANCE_THRESHOLD}); guards against a few dirty '
        f'values flipping a numeric/date column to string. Use 1.0 for strict matching.'
    )
    header_row_help = (
        'Row containing column headers, 0-based (default: 0, i.e. first row/line). '
        'Use e.g. 4 when rows 1-4 are junk/title rows and the real header is on row 5. '
        'Applies to both .csv and Excel files.'
    )

    # (flag, add_argument keyword options), listed in the order they should
    # appear in --help.
    flag_specs = (
        ('file', {
            'help': 'CSV/TSV (.csv) or Excel (.xlsx .xls .xlsm .xlsb .ods)',
        }),
        ('--pyspark', {
            'type': _str_to_bool, 'default': False, 'metavar': 'true|false',
            'help': 'true → PySpark/Spark SQL | false → Pandas/ANSI SQL (default: false)',
        }),
        ('--table', {
            'default': None, 'metavar': 'NAME',
            'help': 'Override table name (default: derived from file stem)',
        }),
        ('--sample', {
            'type': int, 'default': None, 'metavar': 'N',
            'help': f'Rows to sample (default auto min({_DEFAULT_SAMPLE:,},total); 0=full file)',
        }),
        ('--type-threshold', {
            'type': float, 'default': _TYPE_CONFORMANCE_THRESHOLD, 'metavar': '0.0-1.0',
            'dest': 'type_threshold',
            'help': threshold_help,
        }),
        ('--case', {
            'choices': list(VALID_CASINGS), 'default': 'camel', 'metavar': 'CASING',
            'help': 'Column naming: camel|pascal|snake|screaming|kebab|skip (default: camel)',
        }),
        ('--layer', {
            'choices': list(VALID_LAYERS), 'default': None, 'metavar': 'LAYER',
            'help': 'Layer: bronze|parquet_bronze|silver|gold|gold_vw|all (requires --pyspark true)',
        }),
        ('--catalog', {
            'default': None, 'metavar': 'NAME',
            'help': 'Unity Catalog name for three-part table naming (pyspark only)',
        }),
        ('--table-type', {
            'choices': list(VALID_TABLE_TYPES), 'default': 'delta', 'metavar': 'TYPE',
            'dest': 'table_type',
            'help': 'delta|external|external_delta (pyspark only, default: delta)',
        }),
        ('--location', {
            'default': '', 'metavar': 'PATH',
            'help': 'Storage path for external / external_delta tables',
        }),
        ('--file-format', {
            'default': 'parquet', 'metavar': 'FMT',
            'dest': 'file_format',
            'help': "File format for external tables, e.g. parquet|csv|orc (default: parquet)",
        }),
        ('--sheet', {
            'default': None, 'metavar': 'SHEET',
            'help': 'Excel sheet name or 0-based index (default: first sheet). ERROR with .csv',
        }),
        ('--header-row', {
            'type': int, 'default': None, 'metavar': 'N',
            'dest': 'header_row',
            'help': header_row_help,
        }),
        ('--output', {
            'choices': ['txt'], 'default': None, 'metavar': 'FORMAT',
            'help': "'txt' → save {table}_schema.txt in current directory",
        }),
        ('--no-print', {
            'action': 'store_true',
            'help': 'Suppress terminal output (combine with --output txt for file-only mode)',
        }),
    )
    for flag, options in flag_specs:
        parser.add_argument(flag, **options)

    # Generic dispatch: the top-level CLI calls args._run(args), and run()
    # reports bad flags through args._parser_error.
    parser.set_defaults(_run=run, _parser_error=parser.error)
182
+
183
+
184
def run(args: argparse.Namespace) -> None:
    """Validate parsed *args* and execute the inference.

    Invalid flag combinations, and errors raised while running
    ``infer_file``, are reported by calling ``args._parser_error`` with a
    message (``add_arguments`` wires this to the parser's ``error`` method).

    Args:
        args: Namespace carrying the schema-infer flags plus the
            ``_parser_error`` callable.
    """
    error = args._parser_error

    # --- flag validation -------------------------------------------------
    ext = os.path.splitext(args.file)[1].lower()
    if args.sheet is not None and ext == '.csv':
        error('--sheet is not valid for .csv files. Remove --sheet or use an Excel file.')

    if args.layer is not None and not args.pyspark:
        error('--layer requires --pyspark true.')

    if args.catalog is not None and not args.pyspark:
        error('--catalog requires --pyspark true.')

    # 'delta' is the flag's default, so only a non-default value counts as
    # the user having asked for a Spark-only table type.
    if args.table_type != 'delta' and not args.pyspark:
        error('--table-type requires --pyspark true.')

    if args.table_type in ('external', 'external_delta') and not args.location:
        error(
            f'--location is required when --table-type={args.table_type}. '
            'Provide the storage path, e.g. --location /mnt/data/sales/'
        )

    if args.sample is not None and args.sample < 0:
        error('--sample must be ≥ 0 (0 = full file, omit = auto).')

    if not (0.0 < args.type_threshold <= 1.0):
        error(
            f'--type-threshold must be in the range (0.0, 1.0]; got {args.type_threshold}. '
            'Use 1.0 for strict all-or-nothing matching.'
        )

    # --- resolve sheet: purely numeric text is a 0-based index, anything
    # else is a sheet name ------------------------------------------------
    sheet_val: Union[str, int] = 0
    if args.sheet is not None:
        try:
            sheet_val = int(args.sheet)
        except ValueError:
            sheet_val = args.sheet

    # --- resolve header row (default 0 when omitted) ---------------------
    header_row_val = args.header_row if args.header_row is not None else 0
    if header_row_val < 0:
        error('--header-row must be ≥ 0 (0 = first row/line is the header).')

    # --- resolve output path ---------------------------------------------
    output_path: Optional[str] = None
    if args.output == 'txt':
        stem = args.table or os.path.splitext(os.path.basename(args.file))[0]
        # Collapse anything that is not a word character so the stem is a
        # safe file name; fall back to 'output' if nothing is left.
        safe_stem = re.sub(r'\W+', '_', stem).strip('_') or 'output'
        output_path = f'{safe_stem}_schema.txt'

    try:
        infer_file(
            source=args.file,
            pyspark=args.pyspark,
            table_name=args.table,
            sample_rows=args.sample,
            sheet=sheet_val,
            header_row=header_row_val,
            casing=args.case,
            print_output=not args.no_print,
            output_path=output_path,
            type_threshold=args.type_threshold,
            layer=args.layer,
            catalog=args.catalog,
            table_type=args.table_type,
            location=args.location,
            file_format=args.file_format,
        )
    except (ValueError, TypeError, OSError, MemoryError) as e:
        # OSError instead of only FileNotFoundError: permission problems,
        # directories passed as files and other OS-level I/O failures are
        # then reported as a CLI error rather than a raw traceback.
        error(str(e))
256
+
257
+
258
def main(argv: Optional[list] = None) -> None:
    """Run schema-infer on its own, outside the toolkit-wide CLI.

    Backs ``python -m cds_pyde_toolkit.schema_inferencer.cli ...``.

    Args:
        argv: Argument list handed straight to ``parse_args``.
    """
    standalone = argparse.ArgumentParser(prog='schema-infer')
    add_arguments(standalone)
    namespace = standalone.parse_args(argv)
    # Bind error reporting to this parser explicitly before dispatching.
    namespace._parser_error = standalone.error
    run(namespace)
265
+
266
+
267
+ if __name__ == '__main__': # pragma: no cover
268
+ main()