datablade-0.0.0-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- datablade/__init__.py +49 -1
- datablade/blade.py +322 -0
- datablade/core/__init__.py +28 -7
- datablade/core/frames.py +23 -236
- datablade/core/json.py +5 -10
- datablade/core/lists.py +5 -10
- datablade/core/messages.py +23 -11
- datablade/core/strings.py +5 -43
- datablade/core/zip.py +5 -24
- datablade/dataframes/__init__.py +51 -0
- datablade/dataframes/frames.py +585 -0
- datablade/dataframes/readers.py +1367 -0
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/__init__.py +15 -0
- datablade/io/json.py +70 -0
- datablade/io/zip.py +111 -0
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +56 -0
- datablade/sql/bulk_load.py +665 -0
- datablade/sql/ddl.py +402 -0
- datablade/sql/ddl_pyarrow.py +411 -0
- datablade/sql/dialects.py +12 -0
- datablade/sql/quoting.py +44 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +38 -0
- datablade/utils/lists.py +32 -0
- datablade/utils/logging.py +204 -0
- datablade/utils/messages.py +29 -0
- datablade/utils/strings.py +249 -0
- datablade-0.0.6.dist-info/METADATA +406 -0
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
- datablade-0.0.0.dist-info/METADATA +0 -13
- datablade-0.0.0.dist-info/RECORD +0 -13
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/datablade/utils/strings.py
@@ -0,0 +1,249 @@
+"""String and path helpers used across datablade."""
+
+import os
+import pathlib
+from functools import singledispatch
+from typing import Optional, Union
+
+from .logging import log_warning
+from .messages import print_verbose
+
+PathInput = Union[str, pathlib.Path, os.PathLike]
+_PATH_STRICT_DEFAULT = False
+
+
+def configure_paths(*, path_strict: bool = False) -> None:
+    """Configure global path handling behavior."""
+    global _PATH_STRICT_DEFAULT
+    _PATH_STRICT_DEFAULT = bool(path_strict)
+
+
+def _resolve_path_strict(path_strict: Optional[bool]) -> bool:
+    if path_strict is None:
+        return _PATH_STRICT_DEFAULT
+    return bool(path_strict)
+
+
+@singledispatch
+def _coerce_path_input(value: object, type_label: str) -> str:
+    raise TypeError(f"{type_label} must be a string or pathlib.Path")
+
+
+@_coerce_path_input.register
+def _(value: str, type_label: str) -> str:
+    return value
+
+
+@_coerce_path_input.register
+def _(value: pathlib.Path, type_label: str) -> str:
+    return str(value)
+
+
+@_coerce_path_input.register
+def _(value: os.PathLike, type_label: str) -> str:
+    path_value = os.fspath(value)
+    if isinstance(path_value, bytes):
+        raise TypeError(f"{type_label} must be a string or pathlib.Path")
+    return path_value
+
+
+@_coerce_path_input.register
+def _(value: bytes, type_label: str) -> str:
+    raise TypeError(f"{type_label} must be a string or pathlib.Path")
+
+
+def _normalize_path_value(path_value: str) -> str:
+    if os.name == "nt":
+        return path_value.replace("\\", "/")
+    return path_value
+
+
+def _find_case_conflicts(
+    path_obj: pathlib.Path,
+    raw_path: str,
+) -> list[tuple[str, str, str]]:
+    """Return case mismatches as (provided, actual, parent) tuples."""
+    if not raw_path:
+        return []
+
+    try:
+        input_path = pathlib.Path(raw_path)
+    except Exception:
+        input_path = path_obj
+
+    if input_path.is_absolute():
+        anchor = input_path.anchor
+        if anchor:
+            current = pathlib.Path(anchor)
+            anchor_parts = pathlib.Path(anchor).parts
+            remaining_parts = input_path.parts[len(anchor_parts) :]
+        else:
+            current = pathlib.Path(anchor)
+            remaining_parts = input_path.parts
+    else:
+        current = pathlib.Path.cwd()
+        remaining_parts = input_path.parts
+
+    mismatches: list[tuple[str, str, str]] = []
+    for part in remaining_parts:
+        if part in ("", "."):
+            continue
+        if part == "..":
+            current = current.parent
+            continue
+        try:
+            with os.scandir(current) as entries:
+                actual = None
+                for entry in entries:
+                    if entry.name.casefold() == part.casefold():
+                        actual = entry.name
+                        break
+        except OSError:
+            break
+
+        if actual is None:
+            current = current / part
+            continue
+        if actual != part:
+            mismatches.append((part, actual, str(current)))
+        current = current / actual
+
+    return mismatches
+
+
+def coerce_path(
+    input: Optional[PathInput],
+    *,
+    must_exist: bool = False,
+    verbose: bool = False,
+    label: str = "path",
+    path_strict: Optional[bool] = None,
+    type_label: Optional[str] = None,
+) -> pathlib.Path:
+    """Normalize a path-like input and optionally validate existence and case."""
+    if input is None:
+        print_verbose(f"No {label} provided; exiting.", verbose)
+        raise ValueError(f"{label} must be provided")
+
+    type_label = type_label or label
+    path_value = _coerce_path_input(input, type_label)
+
+    if not path_value.strip():
+        print_verbose(f"No {label} provided; exiting.", verbose)
+        raise ValueError(f"{label} must be provided")
+
+    normalized = _normalize_path_value(path_value)
+    path_obj = pathlib.Path(normalized)
+    exists = path_obj.exists()
+
+    if must_exist and not exists:
+        print_verbose(f"Path {path_obj} does not exist; exiting.", verbose)
+        raise ValueError(f"Path does not exist: {path_obj}")
+
+    if exists:
+        strict = _resolve_path_strict(path_strict)
+        conflicts = _find_case_conflicts(path_obj, path_value)
+        if conflicts:
+            details = "; ".join(
+                f"{provided} -> {actual} in {parent}"
+                for provided, actual, parent in conflicts
+            )
+            message = f"Path case mismatch for {label}: {details}"
+            if strict:
+                raise ValueError(message)
+            log_warning(message, verbose)
+
+    return path_obj
+
+
+def ensure_directory(
+    input: Optional[PathInput],
+    *,
+    verbose: bool = False,
+    label: str = "path",
+    path_strict: Optional[bool] = None,
+    type_label: Optional[str] = None,
+) -> pathlib.Path:
+    """Ensure a directory exists and return the resolved path."""
+    path_obj = coerce_path(
+        input,
+        must_exist=False,
+        verbose=verbose,
+        label=label,
+        path_strict=path_strict,
+        type_label=type_label,
+    )
+    path_obj.mkdir(parents=True, exist_ok=True)
+    return path_obj
+
+
+def sql_quotename(
+    name: Optional[str] = None,
+    brackets: bool = True,
+    ticks: bool = False,
+    verbose: bool = False,
+) -> str:
+    """
+    Quote a SQL Server name string with brackets or ticks.
+
+    Args:
+        name: The name to quote. Must be a non-empty string.
+        brackets: If True, wraps the name in square brackets [name].
+        ticks: If True, wraps the name in single quotes 'name'.
+            Takes precedence over brackets if both are True.
+        verbose: If True, prints error messages.
+
+    Returns:
+        The quoted name string.
+
+    Raises:
+        ValueError: If name is None or empty after stripping.
+        TypeError: If name is not a string.
+
+    Examples:
+        >>> sql_quotename('table_name')
+        '[table_name]'
+        >>> sql_quotename('table_name', brackets=False, ticks=True)
+        "'table_name'"
+    """
+    if name is None:
+        print_verbose("No name provided; exiting sql_quotename.", verbose)
+        raise ValueError("name must be provided")
+    if not isinstance(name, str):
+        raise TypeError("name must be a string")
+    cleaned = name.strip()
+    if not cleaned:
+        raise ValueError("name must be a non-empty string")
+
+    return_value = cleaned.replace("[", "").replace("]", "")
+    if brackets:
+        return_value = f"[{return_value}]"
+    if ticks or not brackets:
+        return_value = f"'{return_value}'"
+    return return_value
+
+
+def pathing(
+    input: Optional[Union[str, pathlib.Path]], verbose: bool = False
+) -> pathlib.Path:
+    """
+    Standardize and validate a path string or Path object.
+
+    Args:
+        input: The path to standardize (string or pathlib.Path). Must not be None.
+        verbose: If True, prints error messages.
+
+    Returns:
+        A pathlib.Path object if the path exists.
+
+    Raises:
+        ValueError: If input is None or the path does not exist.
+        TypeError: If input is not a string or pathlib.Path.
+    """
+    return coerce_path(
+        input,
+        must_exist=True,
+        verbose=verbose,
+        label="path input",
+        type_label="input",
+    )
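For orientation, a minimal usage sketch of the helpers added in `datablade/utils/strings.py` above. Behavior is inferred from the diffed source (docstrings and code paths), not from separate documentation; the paths below are hypothetical:

```python
import pathlib

from datablade.utils.strings import (
    configure_paths,
    ensure_directory,
    pathing,
    sql_quotename,
)

# sql_quotename strips any existing brackets before re-quoting (per its docstring).
assert sql_quotename("table_name") == "[table_name]"
assert sql_quotename("[table_name]") == "[table_name]"
assert sql_quotename("table_name", brackets=False, ticks=True) == "'table_name'"

# ensure_directory coerces str/Path input and creates the directory if missing.
out = ensure_directory("build/output")  # hypothetical directory
assert isinstance(out, pathlib.Path)

# pathing calls coerce_path(must_exist=True), so a missing path raises ValueError.
try:
    pathing("no/such/path")  # hypothetical missing path
except ValueError:
    pass

# By default, case mismatches on existing paths are only logged as warnings;
# configure_paths makes them hard errors process-wide.
configure_paths(path_strict=True)
```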
--- /dev/null
+++ b/datablade-0.0.6.dist-info/METADATA
@@ -0,0 +1,406 @@
+Metadata-Version: 2.4
+Name: datablade
+Version: 0.0.6
+Summary: datablade is a suite of functions to provide standard syntax across data engineering projects.
+Author-email: Brent Carpenetti <brentwc.git@pm.me>
+License: MIT License
+
+Copyright (c) 2024 Brent Carpenetti
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas
+Requires-Dist: pyarrow
+Requires-Dist: numpy
+Requires-Dist: openpyxl
+Requires-Dist: requests
+Provides-Extra: performance
+Requires-Dist: polars; extra == "performance"
+Requires-Dist: psutil; extra == "performance"
+Provides-Extra: test
+Requires-Dist: pytest>=7.0.0; extra == "test"
+Requires-Dist: pytest-cov>=4.0.0; extra == "test"
+Requires-Dist: pytest-mock>=3.10.0; extra == "test"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
+Requires-Dist: polars; extra == "dev"
+Requires-Dist: psutil; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Provides-Extra: all
+Requires-Dist: polars; extra == "all"
+Requires-Dist: psutil; extra == "all"
+Requires-Dist: pytest>=7.0.0; extra == "all"
+Requires-Dist: pytest-cov>=4.0.0; extra == "all"
+Requires-Dist: pytest-mock>=3.10.0; extra == "all"
+Requires-Dist: black; extra == "all"
+Requires-Dist: flake8; extra == "all"
+Requires-Dist: mypy; extra == "all"
+Requires-Dist: isort; extra == "all"
+Dynamic: license-file
+
+# datablade
+
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/MIT)
+
+**datablade** is a small, single-machine Python toolkit for data engineers who need reliable “file → DataFrame/Parquet → SQL DDL” workflows.
+
+It focuses on:
+
+- Reading common file formats with memory-aware heuristics
+- Streaming large files in chunks (without concatenating)
+- Normalizing DataFrame columns for downstream systems
+- Generating `CREATE TABLE` DDL across a small set of SQL dialects
+- Producing bulk-load commands (and executing BCP for SQL Server)
+
+## What datablade Does
+
+datablade helps data engineers:
+
+- **Load data efficiently** from common file formats with automatic memory heuristics
+- **Standardize data cleaning** with consistent column naming and type inference
+- **Generate database schemas** for multiple SQL dialects from DataFrames or Parquet schemas
+- **Handle datasets that don't fit in memory** using chunked iteration and optional Polars acceleration
+- **Work across databases** with cross-dialect DDL and bulk-load command generation
+- **Maintain data quality** with built-in validation and logging
+
+## When to Use datablade
+
+datablade is ideal for:
+
+✅ **ETL/ELT Pipelines** - Building reproducible data ingestion workflows across multiple source formats
+
+✅ **Multi-Database Projects** - Deploying the same schema to SQL Server, PostgreSQL, MySQL, or DuckDB
+
+✅ **Large File Processing** - Streaming CSV/TSV/TXT/Parquet without concatenating
+
+✅ **Data Lake to Warehouse** - Converting raw files to Parquet with optimized schemas
+
+✅ **Ad-hoc Data Analysis** - Quickly exploring and preparing datasets with consistent patterns
+
+✅ **Legacy System Integration** - Standardizing messy column names and data types from external sources
+
+## When datablade is not the right tool
+
+- Real-time streaming ingestion (Kafka, Spark Structured Streaming)
+- Distributed compute / cluster execution (Spark, Dask)
+- Warehouse-native transformations and modeling (dbt)
+- A full-featured schema migration tool (Alembic, Flyway)
+- Direct database connectivity/transactions (datablade generates SQL; it does not manage connections)
+
+## Installation
+
+```bash
+pip install datablade
+```
+
+**Optional dependencies:**
+
+```bash
+# For high-performance file reading with Polars
+pip install "datablade[performance]"
+
+# For testing
+pip install "datablade[test]"
+
+# For development (includes testing + lint/format tooling)
+pip install "datablade[dev]"
+
+# All optional dependencies
+pip install "datablade[all]"
+```
+
+## Features
+
+datablade provides four main modules:
+
+### 📊 `datablade.dataframes`
+
+DataFrame operations and transformations:
+
+- Clean and normalize DataFrame columns
+- Auto-detect and convert data types
+- Generate optimized Parquet schemas
+- Convert pandas DataFrames to PyArrow tables
+- Generate multi-dialect SQL DDL statements
+- **Memory-aware file reading** with automatic chunking
+- **Polars integration** for high-performance large file processing
+- Partitioned Parquet writing for datasets that don't fit in memory
+
+### 🌐 `datablade.io`
+
+Input/output operations for external data:
+
+- Fetch JSON data from URLs
+- Download and extract ZIP files
+
+### 🛠️ `datablade.utils`
+
+General utility functions:
+
+- SQL name quoting
+- Path standardization
+- List flattening
+- **Configurable logging** with Python logging module
+
+### 🗄️ `datablade.sql`
+
+Multi-dialect SQL utilities:
+
+- **Multi-dialect support**: SQL Server, PostgreSQL, MySQL, DuckDB
+- Dialect-aware identifier quoting
+- CREATE TABLE generation for all dialects (from pandas DataFrames)
+- CREATE TABLE generation from Parquet schemas (schema-only, via PyArrow)
+- Optional `schema_spec` overrides for column types, nullability, and string sizing
+- Bulk loading helpers:
+  - SQL Server: executes `bcp` via subprocess
+  - PostgreSQL/MySQL/DuckDB: returns command strings you can run in your environment
+
+## Quick Start
+
+```python
+import pandas as pd
+from datablade import configure_logging, read_file_smart
+from datablade.dataframes import clean_dataframe_columns, pandas_to_parquet_table
+from datablade.io import get_json
+from datablade.utils import sql_quotename
+from datablade.sql import Dialect, generate_create_table, generate_create_table_from_parquet
+
+# Configure logging
+import logging
+configure_logging(level=logging.INFO, log_file="datablade.log")
+
+# Read a file into a single DataFrame (materializes)
+df = read_file_smart('large_dataset.csv', verbose=True)
+
+# Clean DataFrame
+df = clean_dataframe_columns(df, verbose=True)
+
+# Convert to Parquet
+table = pandas_to_parquet_table(df, convert=True)
+
+# Generate SQL DDL for multiple dialects
+sql_sqlserver = generate_create_table(df, table='my_table', dialect=Dialect.SQLSERVER)
+sql_postgres = generate_create_table(df, table='my_table', dialect=Dialect.POSTGRES)
+
+# Generate SQL DDL directly from an existing Parquet schema (no data materialization)
+# Note: nested Parquet types (struct/list/map/union) are dropped with a warning.
+ddl_from_parquet = generate_create_table_from_parquet(
+    "events.parquet",
+    table="events",
+    dialect=Dialect.POSTGRES,
+)
+
+# Fetch JSON data
+data = get_json('https://api.example.com/data.json')
+```
+
+Most file path parameters accept `str` or `pathlib.Path`. To treat case mismatches
+as errors on case-insensitive filesystems, use `configure_paths(path_strict=True)`.
+
+### Memory-Aware File Reading
+
+See the file format support matrix in the bundled USAGE doc:
+
+```bash
+python -m datablade.docs --show USAGE
+```
+
+```python
+from datablade.dataframes import (
+    excel_to_parquets,
+    read_file_chunked,
+    read_file_iter,
+    read_file_to_parquets,
+    stream_to_parquets,
+)
+
+# Read large files in chunks
+for chunk in read_file_chunked('huge_file.csv', memory_fraction=0.5):
+    process(chunk)
+
+# Stream without ever concatenating/materializing
+for chunk in read_file_iter('huge_file.csv', memory_fraction=0.3, verbose=True):
+    process(chunk)
+
+# Parquet is also supported for streaming (single .parquet files)
+for chunk in read_file_iter('huge_file.parquet', memory_fraction=0.3, verbose=True):
+    process(chunk)
+
+# Excel streaming is available with openpyxl installed (read-only mode)
+for chunk in read_file_iter('large.xlsx', chunksize=25_000, verbose=True):
+    process(chunk)
+
+# Partition large files to multiple Parquets
+files = read_file_to_parquets(
+    'large_file.csv',
+    output_dir='partitioned/',
+    convert_types=True,
+    verbose=True
+)
+
+# Stream to Parquet partitions without materializing
+files = stream_to_parquets(
+    'large_file.csv',
+    output_dir='partitioned_streamed/',
+    rows_per_file=200_000,
+    convert_types=True,
+    verbose=True,
+)
+
+# Excel streaming to Parquet partitions
+files = excel_to_parquets(
+    'large.xlsx',
+    output_dir='partitioned_excel/',
+    rows_per_file=200_000,
+    convert_types=True,
+    verbose=True,
+)
+```
+
+## Blade (Optional Facade)
+
+The canonical API is module-level functions (for example, `datablade.dataframes.read_file_iter`).
+
+If you prefer an object-style entrypoint with shared defaults, you can use the optional `Blade` facade:
+
+```python
+from datablade import Blade
+from datablade.sql import Dialect
+
+blade = Blade(memory_fraction=0.3, verbose=True, convert_types=True)
+
+for chunk in blade.iter("huge.csv"):
+    process(chunk)
+
+files = blade.stream_to_parquets("huge.csv", output_dir="partitioned/")
+
+# Generate DDL (CREATE TABLE)
+ddl = blade.create_table_sql(
+    df,
+    table="my_table",
+    dialect=Dialect.POSTGRES,
+)
+
+# Generate DDL from an existing Parquet file (schema-only)
+ddl2 = blade.create_table_sql_from_parquet(
+    "events.parquet",
+    table="events",
+    dialect=Dialect.POSTGRES,
+)
+```
+
+## Documentation
+
+Docs are bundled with the installed package:
+
+```bash
+python -m datablade.docs --list
+python -m datablade.docs --show USAGE
+python -m datablade.docs --write-dir .\datablade-docs
+```
+
+After writing docs to disk, open the markdown files locally:
+
+- README (docs landing page)
+- USAGE (file reading, streaming, SQL, IO, logging)
+- TESTING (how to run tests locally)
+- ARCHITECTURE (pipeline overview)
+- OBJECT_REGISTRY (registry reference)
+
+## Testing
+
+Run the test suite:
+
+```bash
+# Install with test dependencies
+pip install -e ".[test]"
+
+# Run all tests
+pytest
+
+# Run with coverage report
+pytest --cov=datablade --cov-report=html
+```
+
+For detailed testing documentation, use the bundled TESTING doc:
+
+```bash
+python -m datablade.docs --show TESTING
+```
+
+## Backward Compatibility
+
+All functions are available through the legacy `datablade.core` module for backward compatibility:
+
+```python
+# Legacy imports (still supported)
+from datablade.core.frames import clean_dataframe_columns
+from datablade.core.json import get
+```
+
+## Requirements
+
+**Core dependencies:**
+
+- pandas
+- pyarrow
+- numpy
+- openpyxl
+- requests
+
+## Design choices and limitations
+
+- **Single-machine focus**: datablade is designed for laptop/VM/server execution, not clusters.
+- **Streaming vs materializing**:
+  - Use `read_file_iter()` to process arbitrarily large files chunk-by-chunk.
+  - `read_file_smart()` returns a single DataFrame and may still be memory-intensive.
+- **Chunk concatenation**: the large-file pandas fallback in `read_file_smart()` can
+  temporarily spike memory usage during concat. Use `read_file_iter()` or
+  `return_type="iterator"` to avoid concatenation.
+- **Polars materialization**: when returning a pandas DataFrame, Polars still
+  collects into memory; use `return_type="polars"` or `"polars_lazy"` to keep
+  Polars frames.
+- **Parquet support**:
+  - Streaming reads support single `.parquet` files.
+  - Parquet “dataset directories” (Hive partitions / directory-of-parquets) are not a primary target API.
+- **Parquet → SQL DDL**:
+  - Uses the Parquet schema (PyArrow) without scanning data.
+  - Complex/nested columns (struct/list/map/union) are dropped and logged as warnings.
+- **DDL scope**: `CREATE TABLE` generation is column/type oriented (no indexes/constraints).
+- **SQL Server bulk load**: the SQL Server helpers use the `bcp` CLI and require it
+  to be installed and available on PATH. When using `-U`/`-P`, credentials are
+  passed via process args (logs are redacted); prefer `-T` or `-G` where possible.
+
+**Optional dependencies:**
+
+- polars (for high-performance file reading)
+- psutil (for memory-aware operations)
+- pytest (for testing)
+
+## License
+
+MIT
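The limitations notes above state that Parquet-based DDL generation reads only the schema and drops nested columns with a warning. A hedged sketch of that behavior, using only the functions named in this METADATA (the file and column names are hypothetical):

```python
import pyarrow as pa
import pyarrow.parquet as pq

from datablade.sql import Dialect, generate_create_table_from_parquet

# Write a small Parquet file whose schema mixes flat and nested columns.
table = pa.Table.from_pylist(
    [{"id": 1, "name": "a", "attrs": {"k": "x", "v": "y"}}],
    schema=pa.schema(
        [
            ("id", pa.int64()),
            ("name", pa.string()),
            ("attrs", pa.struct([("k", pa.string()), ("v", pa.string())])),
        ]
    ),
)
pq.write_table(table, "events.parquet")

# Schema-only DDL: per the notes above, the nested "attrs" struct column should
# be dropped from the CREATE TABLE (and logged); "id" and "name" remain.
ddl = generate_create_table_from_parquet(
    "events.parquet",
    table="events",
    dialect=Dialect.POSTGRES,
)
print(ddl)
```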
--- /dev/null
+++ b/datablade-0.0.6.dist-info/RECORD
@@ -0,0 +1,41 @@
+datablade/__init__.py,sha256=T1FBNEZfjdIFwyfRLNCQBroMXDfaYai1sQTaRzhA1Qg,1487
+datablade/blade.py,sha256=iKQtHhH-xs3EDH9ZNryheRGzE2OBkaxjZsA99iAG-p0,10330
+datablade/registry.py,sha256=zmfbt3-LthxV2lmwNAmt4KN-EnLRKAguMnYnk5xxzjg,20323
+datablade/core/__init__.py,sha256=VJ2lFumv-7CFKjIN_i4wPEaAGFEb-UhHPzsM2IyTUUM,871
+datablade/core/frames.py,sha256=iiTQPkWKQHslwkz_pu6bHVvbnisWSZF_b3Hgy1SvE-Q,662
+datablade/core/json.py,sha256=_0RoK_OXkOpwNG5Y7vdORQOCfs0ZVJAO0qeQqjxzpyE,121
+datablade/core/lists.py,sha256=OVj-KLyeOlbOxALyxvBLQpB0s78RO2QxmM4OxbI70Cc,132
+datablade/core/messages.py,sha256=_2CpeaDJIwmRlTm4UMrDTKnew1wZdyai6QeKFjN3rno,404
+datablade/core/strings.py,sha256=vwx0NW-m-2nOYW1aisRXUjrd4wERe2Y5Ka64wAyF0Mo,173
+datablade/core/zip.py,sha256=fmp6VTKH3JAaRWTKhWGHN-frO7Vr4EHlEYtUOlomNuc,119
+datablade/dataframes/__init__.py,sha256=blPM2EkNI6FA8unnsGpTDP4A12G23ChXy4djyhpgHMk,1316
+datablade/dataframes/frames.py,sha256=VaMgrY5F8uuWw6adx9g10sBlkW4TfZhH5AEo2TgD5N4,20993
+datablade/dataframes/readers.py,sha256=1RFW0J1iKGvmgrRjDMB8PoHcMhQhyfAlUh1h6_aF_2E,47469
+datablade/docs/ARCHITECTURE.md,sha256=ON65KUk3-EdwfnMglOT1CVZqyJNUXVbAUff803d7XDU,3290
+datablade/docs/OBJECT_REGISTRY.md,sha256=00wkTuo6Pnhl9kgRDu7mJFO3yxSBedQQS7r-ktnsqSs,5386
+datablade/docs/README.md,sha256=2uH6siz5ABEd6ZDjPYbXpH-zaOgyHzB3ik2e2eUj8Ks,1587
+datablade/docs/TESTING.md,sha256=zc1Qz3ZRGH9CZqWfzArDmv3wRfS8-346EiXhE_fO0co,463
+datablade/docs/USAGE.md,sha256=vJRmCF-dUeXAiWvvgUwc6fO3gpYNBZtwV-v2gmfoeeY,11866
+datablade/docs/__init__.py,sha256=t4KRQOU653CVdkFD00Aj0hdjrlcmx70D2W2QnybJDnc,2388
+datablade/docs/__main__.py,sha256=V-GK7vuR1aEvT_TgJPAZnapjLhwDXGOCSEseX4R6fts,112
+datablade/io/__init__.py,sha256=jDenqP4biI1gnRdoO9X5C489cBlHrLuR3xp_TIOljp8,295
+datablade/io/json.py,sha256=3Oao1lP8Dat3DT7MYyCes7feqPVFDHAPpFDemHuKVVw,2384
+datablade/io/zip.py,sha256=Ppl8CvGhWIYFcBelpSkinmT_oFDmQmqNiHuFHicSoQI,4268
+datablade/sql/__init__.py,sha256=Fs77XFT_hJcdxBZxLD9-6fgD2QEVtfp58LlZaEMFEJA,1466
+datablade/sql/bulk_load.py,sha256=7SniXGHMIvwg9P_r95Nt7Z72jkQnu-LuQtPvv_Wmq2I,19992
+datablade/sql/ddl.py,sha256=4xJ98LJmnD7PqloU8Ewa7J9iy2L6n9iPRsM5WtBKA0U,14958
+datablade/sql/ddl_pyarrow.py,sha256=yBfTbDAA7dE3BXrCSPqEFogOYohAb3p0v3AIgBRxNqc,15344
+datablade/sql/dialects.py,sha256=rM5qw6xdIuCSx81b6_TKYnsDhKeVpTdMDb_62HhtksA,266
+datablade/sql/quoting.py,sha256=Y8yn1GAsaWSIenQDidwU4j2YsZmYE9ppOifAPiTbStk,1345
+datablade/sql/schema_spec.py,sha256=HdEA5RS_yj8WNVH8wroWDyIK-8K4_fgMZC7rZBswkcs,2067
+datablade/sql/sqlserver.py,sha256=ASLhtHIslMePRi9pUY6vWTaebQ7TM4kO60N9QsXwhok,13726
+datablade/utils/__init__.py,sha256=KbvUZVWg2u53oUlfEIbDndwgz6f_j7N23Dl8Oz-vWXk,746
+datablade/utils/lists.py,sha256=h3AXo2SSE2iaR_rJfoCgfNRFV9AOWerL-hHFYo6S4n8,798
+datablade/utils/logging.py,sha256=kqM1389Wgzml69aja6g7yj8biNSi9AOPZlxr3YTnyp0,5491
+datablade/utils/messages.py,sha256=yZZTGTS_eD0PBZFCdzz51fqBFWgq9Sdq-GMR1a0FclY,537
+datablade/utils/strings.py,sha256=GLvYi7FPUHZ2nqNsZ0-mGbc3gu70UvVzrIy4bQWRHOo,7196
+datablade-0.0.6.dist-info/licenses/LICENSE,sha256=QyU-OkETSZ-L0Rltu-SAMQtQU_UXjwLnb_QlA_AXm0U,1072
+datablade-0.0.6.dist-info/METADATA,sha256=ou_j_HZwHrdNaZb26JhW_m3oRhILZ4tMFIPla9Sm8LM,13261
+datablade-0.0.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+datablade-0.0.6.dist-info/top_level.txt,sha256=AwA5QxmfuaAs9XeXw1tCsboPsoffnMU-6CeLWMMUoUA,10
+datablade-0.0.6.dist-info/RECORD,,
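Each RECORD row above is `path,sha256=<digest>,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (the standard wheel RECORD convention). A small sketch for checking one entry against an installed file:

```python
import base64
import hashlib
import pathlib


def record_hash(path: str) -> str:
    """Wheel-RECORD-style hash: urlsafe base64 of the SHA-256 digest, unpadded."""
    digest = hashlib.sha256(pathlib.Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Against an installed copy of the wheel, this should reproduce the entry above:
# record_hash(".../site-packages/datablade/utils/strings.py")
# == "GLvYi7FPUHZ2nqNsZ0-mGbc3gu70UvVzrIy4bQWRHOo"
```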