datablade 0.0.0-py3-none-any.whl → 0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. datablade/__init__.py +49 -1
  2. datablade/blade.py +322 -0
  3. datablade/core/__init__.py +28 -7
  4. datablade/core/frames.py +23 -236
  5. datablade/core/json.py +5 -10
  6. datablade/core/lists.py +5 -10
  7. datablade/core/messages.py +23 -11
  8. datablade/core/strings.py +5 -43
  9. datablade/core/zip.py +5 -24
  10. datablade/dataframes/__init__.py +51 -0
  11. datablade/dataframes/frames.py +585 -0
  12. datablade/dataframes/readers.py +1367 -0
  13. datablade/docs/ARCHITECTURE.md +102 -0
  14. datablade/docs/OBJECT_REGISTRY.md +194 -0
  15. datablade/docs/README.md +57 -0
  16. datablade/docs/TESTING.md +37 -0
  17. datablade/docs/USAGE.md +409 -0
  18. datablade/docs/__init__.py +87 -0
  19. datablade/docs/__main__.py +6 -0
  20. datablade/io/__init__.py +15 -0
  21. datablade/io/json.py +70 -0
  22. datablade/io/zip.py +111 -0
  23. datablade/registry.py +581 -0
  24. datablade/sql/__init__.py +56 -0
  25. datablade/sql/bulk_load.py +665 -0
  26. datablade/sql/ddl.py +402 -0
  27. datablade/sql/ddl_pyarrow.py +411 -0
  28. datablade/sql/dialects.py +12 -0
  29. datablade/sql/quoting.py +44 -0
  30. datablade/sql/schema_spec.py +65 -0
  31. datablade/sql/sqlserver.py +390 -0
  32. datablade/utils/__init__.py +38 -0
  33. datablade/utils/lists.py +32 -0
  34. datablade/utils/logging.py +204 -0
  35. datablade/utils/messages.py +29 -0
  36. datablade/utils/strings.py +249 -0
  37. datablade-0.0.6.dist-info/METADATA +406 -0
  38. datablade-0.0.6.dist-info/RECORD +41 -0
  39. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
  40. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
  41. datablade-0.0.0.dist-info/METADATA +0 -13
  42. datablade-0.0.0.dist-info/RECORD +0 -13
  43. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/utils/strings.py (new file)
@@ -0,0 +1,249 @@
+ """String and path helpers used across datablade."""
+
+ import os
+ import pathlib
+ from functools import singledispatch
+ from typing import Optional, Union
+
+ from .logging import log_warning
+ from .messages import print_verbose
+
+ PathInput = Union[str, pathlib.Path, os.PathLike]
+ _PATH_STRICT_DEFAULT = False
+
+
+ def configure_paths(*, path_strict: bool = False) -> None:
+     """Configure global path handling behavior."""
+     global _PATH_STRICT_DEFAULT
+     _PATH_STRICT_DEFAULT = bool(path_strict)
+
+
+ def _resolve_path_strict(path_strict: Optional[bool]) -> bool:
+     if path_strict is None:
+         return _PATH_STRICT_DEFAULT
+     return bool(path_strict)
+
+
+ @singledispatch
+ def _coerce_path_input(value: object, type_label: str) -> str:
+     raise TypeError(f"{type_label} must be a string or pathlib.Path")
+
+
+ @_coerce_path_input.register
+ def _(value: str, type_label: str) -> str:
+     return value
+
+
+ @_coerce_path_input.register
+ def _(value: pathlib.Path, type_label: str) -> str:
+     return str(value)
+
+
+ @_coerce_path_input.register
+ def _(value: os.PathLike, type_label: str) -> str:
+     path_value = os.fspath(value)
+     if isinstance(path_value, bytes):
+         raise TypeError(f"{type_label} must be a string or pathlib.Path")
+     return path_value
+
+
+ @_coerce_path_input.register
+ def _(value: bytes, type_label: str) -> str:
+     raise TypeError(f"{type_label} must be a string or pathlib.Path")
+
+
+ def _normalize_path_value(path_value: str) -> str:
+     if os.name == "nt":
+         return path_value.replace("\\", "/")
+     return path_value
+
+
+ def _find_case_conflicts(
+     path_obj: pathlib.Path,
+     raw_path: str,
+ ) -> list[tuple[str, str, str]]:
+     """Return case mismatches as (provided, actual, parent) tuples."""
+     if not raw_path:
+         return []
+
+     try:
+         input_path = pathlib.Path(raw_path)
+     except Exception:
+         input_path = path_obj
+
+     if input_path.is_absolute():
+         anchor = input_path.anchor
+         if anchor:
+             current = pathlib.Path(anchor)
+             anchor_parts = pathlib.Path(anchor).parts
+             remaining_parts = input_path.parts[len(anchor_parts) :]
+         else:
+             current = pathlib.Path(anchor)
+             remaining_parts = input_path.parts
+     else:
+         current = pathlib.Path.cwd()
+         remaining_parts = input_path.parts
+
+     mismatches: list[tuple[str, str, str]] = []
+     for part in remaining_parts:
+         if part in ("", "."):
+             continue
+         if part == "..":
+             current = current.parent
+             continue
+         try:
+             with os.scandir(current) as entries:
+                 actual = None
+                 for entry in entries:
+                     if entry.name.casefold() == part.casefold():
+                         actual = entry.name
+                         break
+         except OSError:
+             break
+
+         if actual is None:
+             current = current / part
+             continue
+         if actual != part:
+             mismatches.append((part, actual, str(current)))
+         current = current / actual
+
+     return mismatches
+
+
+ def coerce_path(
+     input: Optional[PathInput],
+     *,
+     must_exist: bool = False,
+     verbose: bool = False,
+     label: str = "path",
+     path_strict: Optional[bool] = None,
+     type_label: Optional[str] = None,
+ ) -> pathlib.Path:
+     """Normalize a path-like input and optionally validate existence and case."""
+     if input is None:
+         print_verbose(f"No {label} provided; exiting.", verbose)
+         raise ValueError(f"{label} must be provided")
+
+     type_label = type_label or label
+     path_value = _coerce_path_input(input, type_label)
+
+     if not path_value.strip():
+         print_verbose(f"No {label} provided; exiting.", verbose)
+         raise ValueError(f"{label} must be provided")
+
+     normalized = _normalize_path_value(path_value)
+     path_obj = pathlib.Path(normalized)
+     exists = path_obj.exists()
+
+     if must_exist and not exists:
+         print_verbose(f"Path {path_obj} does not exist; exiting.", verbose)
+         raise ValueError(f"Path does not exist: {path_obj}")
+
+     if exists:
+         strict = _resolve_path_strict(path_strict)
+         conflicts = _find_case_conflicts(path_obj, path_value)
+         if conflicts:
+             details = "; ".join(
+                 f"{provided} -> {actual} in {parent}"
+                 for provided, actual, parent in conflicts
+             )
+             message = f"Path case mismatch for {label}: {details}"
+             if strict:
+                 raise ValueError(message)
+             log_warning(message, verbose)
+
+     return path_obj
+
+
+ def ensure_directory(
+     input: Optional[PathInput],
+     *,
+     verbose: bool = False,
+     label: str = "path",
+     path_strict: Optional[bool] = None,
+     type_label: Optional[str] = None,
+ ) -> pathlib.Path:
+     """Ensure a directory exists and return the resolved path."""
+     path_obj = coerce_path(
+         input,
+         must_exist=False,
+         verbose=verbose,
+         label=label,
+         path_strict=path_strict,
+         type_label=type_label,
+     )
+     path_obj.mkdir(parents=True, exist_ok=True)
+     return path_obj
+
+
+ def sql_quotename(
+     name: Optional[str] = None,
+     brackets: bool = True,
+     ticks: bool = False,
+     verbose: bool = False,
+ ) -> str:
+     """
+     Quote a SQL Server name string with brackets or ticks.
+
+     Args:
+         name: The name to quote. Must be a non-empty string.
+         brackets: If True, wraps the name in square brackets [name].
+         ticks: If True, wraps the name in single quotes 'name'.
+             Applied around the brackets if both are True.
+         verbose: If True, prints error messages.
+
+     Returns:
+         The quoted name string.
+
+     Raises:
+         ValueError: If name is None or empty after stripping.
+         TypeError: If name is not a string.
+
+     Examples:
+         >>> sql_quotename('table_name')
+         '[table_name]'
+         >>> sql_quotename('table_name', brackets=False, ticks=True)
+         "'table_name'"
+     """
+     if name is None:
+         print_verbose("No name provided; exiting sql_quotename.", verbose)
+         raise ValueError("name must be provided")
+     if not isinstance(name, str):
+         raise TypeError("name must be a string")
+     cleaned = name.strip()
+     if not cleaned:
+         raise ValueError("name must be a non-empty string")
+
+     return_value = cleaned.replace("[", "").replace("]", "")
+     if brackets:
+         return_value = f"[{return_value}]"
+     if ticks or not brackets:
+         return_value = f"'{return_value}'"
+     return return_value
+
+
+ def pathing(
+     input: Optional[Union[str, pathlib.Path]], verbose: bool = False
+ ) -> pathlib.Path:
+     """
+     Standardize and validate a path string or Path object.
+
+     Args:
+         input: The path to standardize (string or pathlib.Path). Must not be None.
+         verbose: If True, prints error messages.
+
+     Returns:
+         A pathlib.Path object if the path exists.
+
+     Raises:
+         ValueError: If input is None or the path does not exist.
+         TypeError: If input is not a string or pathlib.Path.
+     """
+     return coerce_path(
+         input,
+         must_exist=True,
+         verbose=verbose,
+         label="path input",
+         type_label="input",
+     )
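
Taken together, the helpers in this new module are designed to compose. A minimal sketch, assuming only the definitions shown above (the directory, file, and table names are illustrative, not taken from the package):

```python
import pathlib

from datablade.utils.strings import (
    coerce_path,
    configure_paths,
    ensure_directory,
    sql_quotename,
)

# Opt in to strict case checking: mismatches raise instead of warn.
configure_paths(path_strict=True)

# Create ./data/raw if missing; returns a pathlib.Path.
raw_dir = ensure_directory("data/raw", label="raw directory")

# Accepts str or pathlib.Path; must_exist=True would raise if absent.
csv_path = coerce_path(raw_dir / "input.csv", label="input file")

# SQL Server-style quoting: brackets by default, ticks on request.
assert sql_quotename("orders") == "[orders]"
assert sql_quotename("orders", brackets=False, ticks=True) == "'orders'"
```

Note that `coerce_path` only warns on a case mismatch unless strict mode is enabled, either globally via `configure_paths` or per call via `path_strict=True`.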
datablade-0.0.6.dist-info/METADATA (new file)
@@ -0,0 +1,406 @@
+ Metadata-Version: 2.4
+ Name: datablade
+ Version: 0.0.6
+ Summary: datablade is a suite of functions to provide standard syntax across data engineering projects.
+ Author-email: Brent Carpenetti <brentwc.git@pm.me>
+ License: MIT License
+
+ Copyright (c) 2024 Brent Carpenetti
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: pandas
+ Requires-Dist: pyarrow
+ Requires-Dist: numpy
+ Requires-Dist: openpyxl
+ Requires-Dist: requests
+ Provides-Extra: performance
+ Requires-Dist: polars; extra == "performance"
+ Requires-Dist: psutil; extra == "performance"
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.0.0; extra == "test"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
+ Requires-Dist: pytest-mock>=3.10.0; extra == "test"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
+ Requires-Dist: polars; extra == "dev"
+ Requires-Dist: psutil; extra == "dev"
+ Requires-Dist: black; extra == "dev"
+ Requires-Dist: flake8; extra == "dev"
+ Requires-Dist: mypy; extra == "dev"
+ Requires-Dist: isort; extra == "dev"
+ Provides-Extra: all
+ Requires-Dist: polars; extra == "all"
+ Requires-Dist: psutil; extra == "all"
+ Requires-Dist: pytest>=7.0.0; extra == "all"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "all"
+ Requires-Dist: pytest-mock>=3.10.0; extra == "all"
+ Requires-Dist: black; extra == "all"
+ Requires-Dist: flake8; extra == "all"
+ Requires-Dist: mypy; extra == "all"
+ Requires-Dist: isort; extra == "all"
+ Dynamic: license-file
+
+ # datablade
+
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ **datablade** is a small, single-machine Python toolkit for data engineers who need reliable “file → DataFrame/Parquet → SQL DDL” workflows.
+
+ It focuses on:
+
+ - Reading common file formats with memory-aware heuristics
+ - Streaming large files in chunks (without concatenating)
+ - Normalizing DataFrame columns for downstream systems
+ - Generating `CREATE TABLE` DDL across a small set of SQL dialects
+ - Producing bulk-load commands (and executing BCP for SQL Server)
+
+ ## What datablade Does
+
+ datablade helps data engineers:
+
+ - **Load data efficiently** from common file formats with automatic memory heuristics
+ - **Standardize data cleaning** with consistent column naming and type inference
+ - **Generate database schemas** for multiple SQL dialects from DataFrames or Parquet schemas
+ - **Handle datasets that don't fit in memory** using chunked iteration and optional Polars acceleration
+ - **Work across databases** with cross-dialect DDL and bulk-load command generation
+ - **Maintain data quality** with built-in validation and logging
+
+ ## When to Use datablade
+
+ datablade is ideal for:
+
+ ✅ **ETL/ELT Pipelines** - Building reproducible data ingestion workflows across multiple source formats
+
+ ✅ **Multi-Database Projects** - Deploying the same schema to SQL Server, PostgreSQL, MySQL, or DuckDB
+
+ ✅ **Large File Processing** - Streaming CSV/TSV/TXT/Parquet without concatenating
+
+ ✅ **Data Lake to Warehouse** - Converting raw files to Parquet with optimized schemas
+
+ ✅ **Ad-hoc Data Analysis** - Quickly exploring and preparing datasets with consistent patterns
+
+ ✅ **Legacy System Integration** - Standardizing messy column names and data types from external sources
+
+ ## When datablade is not the right tool
+
+ - Real-time streaming ingestion (Kafka, Spark Structured Streaming)
+ - Distributed compute / cluster execution (Spark, Dask)
+ - Warehouse-native transformations and modeling (dbt)
+ - A full-featured schema migration tool (Alembic, Flyway)
+ - Direct database connectivity/transactions (datablade generates SQL; it does not manage connections)
+
+ ## Installation
+
+ ```bash
+ pip install datablade
+ ```
+
+ **Optional dependencies:**
+
+ ```bash
+ # For high-performance file reading with Polars
+ pip install "datablade[performance]"
+
+ # For testing
+ pip install "datablade[test]"
+
+ # For development (includes testing + lint/format tooling)
+ pip install "datablade[dev]"
+
+ # All optional dependencies
+ pip install "datablade[all]"
+ ```
+
+ ## Features
+
+ datablade provides four main modules:
+
+ ### 📊 `datablade.dataframes`
+
+ DataFrame operations and transformations:
+
+ - Clean and normalize DataFrame columns
+ - Auto-detect and convert data types
+ - Generate optimized Parquet schemas
+ - Convert pandas DataFrames to PyArrow tables
+ - Generate multi-dialect SQL DDL statements
+ - **Memory-aware file reading** with automatic chunking
+ - **Polars integration** for high-performance large file processing
+ - Partitioned Parquet writing for datasets that don't fit in memory
+
+ ### 🌐 `datablade.io`
+
+ Input/output operations for external data:
+
+ - Fetch JSON data from URLs
+ - Download and extract ZIP files
+
+ ### 🛠️ `datablade.utils`
+
+ General utility functions:
+
+ - SQL name quoting
+ - Path standardization
+ - List flattening
+ - **Configurable logging** with Python logging module
+
+ ### 🗄️ `datablade.sql`
+
+ Multi-dialect SQL utilities:
+
+ - **Multi-dialect support**: SQL Server, PostgreSQL, MySQL, DuckDB
+ - Dialect-aware identifier quoting
+ - CREATE TABLE generation for all dialects (from pandas DataFrames)
+ - CREATE TABLE generation from Parquet schemas (schema-only, via PyArrow)
+ - Optional `schema_spec` overrides for column types, nullability, and string sizing
+ - Bulk loading helpers:
+   - SQL Server: executes `bcp` via subprocess
+   - PostgreSQL/MySQL/DuckDB: returns command strings you can run in your environment
+
+ ## Quick Start
+
+ ```python
+ import pandas as pd
+ from datablade import configure_logging, read_file_smart
+ from datablade.dataframes import clean_dataframe_columns, pandas_to_parquet_table
+ from datablade.io import get_json
+ from datablade.utils import sql_quotename
+ from datablade.sql import Dialect, generate_create_table, generate_create_table_from_parquet
+
+ # Configure logging
+ import logging
+ configure_logging(level=logging.INFO, log_file="datablade.log")
+
+ # Read a file into a single DataFrame (materializes)
+ df = read_file_smart('large_dataset.csv', verbose=True)
+
+ # Clean DataFrame
+ df = clean_dataframe_columns(df, verbose=True)
+
+ # Convert to Parquet
+ table = pandas_to_parquet_table(df, convert=True)
+
+ # Generate SQL DDL for multiple dialects
+ sql_sqlserver = generate_create_table(df, table='my_table', dialect=Dialect.SQLSERVER)
+ sql_postgres = generate_create_table(df, table='my_table', dialect=Dialect.POSTGRES)
+
+ # Generate SQL DDL directly from an existing Parquet schema (no data materialization)
+ # Note: nested Parquet types (struct/list/map/union) are dropped with a warning.
+ ddl_from_parquet = generate_create_table_from_parquet(
+     "events.parquet",
+     table="events",
+     dialect=Dialect.POSTGRES,
+ )
+
+ # Fetch JSON data
+ data = get_json('https://api.example.com/data.json')
+ ```
+
+ Most file path parameters accept `str` or `pathlib.Path`. To treat case mismatches
+ as errors on case-insensitive filesystems, use `configure_paths(path_strict=True)`.
+
+ ### Memory-Aware File Reading
+
+ See the file format support matrix in the bundled USAGE doc:
+
+ ```bash
+ python -m datablade.docs --show USAGE
+ ```
+
+ ```python
+ from datablade.dataframes import (
+     excel_to_parquets,
+     read_file_chunked,
+     read_file_iter,
+     read_file_to_parquets,
+     stream_to_parquets,
+ )
+
+ # Read large files in chunks
+ for chunk in read_file_chunked('huge_file.csv', memory_fraction=0.5):
+     process(chunk)
+
+ # Stream without ever concatenating/materializing
+ for chunk in read_file_iter('huge_file.csv', memory_fraction=0.3, verbose=True):
+     process(chunk)
+
+ # Parquet is also supported for streaming (single .parquet files)
+ for chunk in read_file_iter('huge_file.parquet', memory_fraction=0.3, verbose=True):
+     process(chunk)
+
+ # Excel streaming is available with openpyxl installed (read-only mode)
+ for chunk in read_file_iter('large.xlsx', chunksize=25_000, verbose=True):
+     process(chunk)
+
+ # Partition large files to multiple Parquets
+ files = read_file_to_parquets(
+     'large_file.csv',
+     output_dir='partitioned/',
+     convert_types=True,
+     verbose=True
+ )
+
+ # Stream to Parquet partitions without materializing
+ files = stream_to_parquets(
+     'large_file.csv',
+     output_dir='partitioned_streamed/',
+     rows_per_file=200_000,
+     convert_types=True,
+     verbose=True,
+ )
+
+ # Excel streaming to Parquet partitions
+ files = excel_to_parquets(
+     'large.xlsx',
+     output_dir='partitioned_excel/',
+     rows_per_file=200_000,
+     convert_types=True,
+     verbose=True,
+ )
+ ```
+
+ ## Blade (Optional Facade)
+
+ The canonical API consists of module-level functions (for example, `datablade.dataframes.read_file_iter`).
+
+ If you prefer an object-style entrypoint with shared defaults, you can use the optional `Blade` facade:
+
+ ```python
+ from datablade import Blade
+ from datablade.sql import Dialect
+
+ blade = Blade(memory_fraction=0.3, verbose=True, convert_types=True)
+
+ for chunk in blade.iter("huge.csv"):
+     process(chunk)
+
+ files = blade.stream_to_parquets("huge.csv", output_dir="partitioned/")
+
+ # Generate DDL (CREATE TABLE)
+ ddl = blade.create_table_sql(
+     df,
+     table="my_table",
+     dialect=Dialect.POSTGRES,
+ )
+
+ # Generate DDL from an existing Parquet file (schema-only)
+ ddl2 = blade.create_table_sql_from_parquet(
+     "events.parquet",
+     table="events",
+     dialect=Dialect.POSTGRES,
+ )
+ ```
+
+ ## Documentation
+
+ Docs are bundled with the installed package:
+
+ ```bash
+ python -m datablade.docs --list
+ python -m datablade.docs --show USAGE
+ python -m datablade.docs --write-dir ./datablade-docs
+ ```
+
+ After writing docs to disk, open the markdown files locally:
+
+ - README (docs landing page)
+ - USAGE (file reading, streaming, SQL, IO, logging)
+ - TESTING (how to run tests locally)
+ - ARCHITECTURE (pipeline overview)
+ - OBJECT_REGISTRY (registry reference)
+
+ ## Testing
+
+ Run the test suite:
+
+ ```bash
+ # Install with test dependencies
+ pip install -e ".[test]"
+
+ # Run all tests
+ pytest
+
+ # Run with coverage report
+ pytest --cov=datablade --cov-report=html
+ ```
+
+ For detailed testing documentation, use the bundled TESTING doc:
+
+ ```bash
+ python -m datablade.docs --show TESTING
+ ```
+
+ ## Backward Compatibility
+
+ All functions are available through the legacy `datablade.core` module for backward compatibility:
+
+ ```python
+ # Legacy imports (still supported)
+ from datablade.core.frames import clean_dataframe_columns
+ from datablade.core.json import get
+ ```
+
+ ## Requirements
+
+ **Core dependencies:**
+
+ - pandas
+ - pyarrow
+ - numpy
+ - openpyxl
+ - requests
+
+ ## Design choices and limitations
+
+ - **Single-machine focus**: datablade is designed for laptop/VM/server execution, not clusters.
+ - **Streaming vs materializing**:
+   - Use `read_file_iter()` to process arbitrarily large files chunk-by-chunk.
+   - `read_file_smart()` returns a single DataFrame and may still be memory-intensive.
+ - **Chunk concatenation**: the large-file pandas fallback in `read_file_smart()` can
+   temporarily spike memory usage during concat. Use `read_file_iter()` or
+   `return_type="iterator"` to avoid concatenation.
+ - **Polars materialization**: when returning a pandas DataFrame, Polars still
+   collects into memory; use `return_type="polars"` or `"polars_lazy"` to keep
+   Polars frames.
+ - **Parquet support**:
+   - Streaming reads support single `.parquet` files.
+   - Parquet “dataset directories” (Hive partitions / directory-of-parquets) are not a primary target API.
+ - **Parquet → SQL DDL**:
+   - Uses the Parquet schema (PyArrow) without scanning data.
+   - Complex/nested columns (struct/list/map/union) are dropped and logged as warnings.
+ - **DDL scope**: `CREATE TABLE` generation is column/type oriented (no indexes/constraints).
+ - **SQL Server bulk load**: the SQL Server helpers use the `bcp` CLI and require it
+   to be installed and available on PATH. When using `-U`/`-P`, credentials are
+   passed via process args (logs are redacted); prefer `-T` or `-G` where possible.
+
+ **Optional dependencies:**
+
+ - polars (for high-performance file reading)
+ - psutil (for memory-aware operations)
+ - pytest (for testing)
+
+ ## License
+
+ MIT
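
The “Parquet → SQL DDL” design note above (schema-only, nested columns dropped) can be illustrated independently of the package internals, which this diff view does not expand. A rough sketch of the PyArrow pattern the README describes; this is not datablade's actual implementation, and the type mapping here is deliberately minimal:

```python
import pyarrow as pa
import pyarrow.parquet as pq

def sketch_create_table(parquet_path: str, table: str) -> str:
    # read_schema reads only the Parquet footer metadata; no row groups are scanned.
    schema = pq.read_schema(parquet_path)
    columns = []
    for field in schema:
        if pa.types.is_nested(field.type):
            # Mirrors the documented behavior: nested columns are dropped.
            print(f"warning: dropping nested column {field.name!r}")
            continue
        if pa.types.is_integer(field.type):
            sql_type = "BIGINT"
        elif pa.types.is_floating(field.type):
            sql_type = "DOUBLE PRECISION"
        elif pa.types.is_timestamp(field.type):
            sql_type = "TIMESTAMP"
        else:
            sql_type = "TEXT"  # crude catch-all; the package's mapping is richer
        nullable = "" if field.nullable else " NOT NULL"
        columns.append(f'    "{field.name}" {sql_type}{nullable}')
    return f'CREATE TABLE "{table}" (\n' + ",\n".join(columns) + "\n);"
```

Because `pq.read_schema` touches only file metadata, no data is materialized, which is the property the README advertises.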
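Similarly, the SQL Server bulk-load note says the helpers execute `bcp` via subprocess and recommends `-T`/`-G` over `-U`/`-P`. The real implementation lives in `datablade/sql/bulk_load.py`, which this view does not expand; the sketch below only shows the general shape of such a call, using standard `bcp` flags and hypothetical table, file, and server names:

```python
import subprocess

def sketch_bcp_load(table: str, data_file: str, server: str) -> None:
    # -T (integrated authentication) keeps credentials out of the argument
    # list, which is what the README recommends over -U/-P.
    cmd = [
        "bcp", table, "in", data_file,
        "-S", server,
        "-T",           # trusted connection
        "-c",           # character-mode data file
        "-t", ",",      # field terminator
        "-r", "\n",     # row terminator
        "-F", "2",      # first data row (skip one header line)
    ]
    # A list argv (no shell=True) sidesteps shell quoting issues.
    subprocess.run(cmd, check=True)

# Hypothetical usage:
# sketch_bcp_load("MyDb.dbo.events", "events.csv", "localhost,1433")
```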
datablade-0.0.6.dist-info/RECORD (new file)
@@ -0,0 +1,41 @@
+ datablade/__init__.py,sha256=T1FBNEZfjdIFwyfRLNCQBroMXDfaYai1sQTaRzhA1Qg,1487
+ datablade/blade.py,sha256=iKQtHhH-xs3EDH9ZNryheRGzE2OBkaxjZsA99iAG-p0,10330
+ datablade/registry.py,sha256=zmfbt3-LthxV2lmwNAmt4KN-EnLRKAguMnYnk5xxzjg,20323
+ datablade/core/__init__.py,sha256=VJ2lFumv-7CFKjIN_i4wPEaAGFEb-UhHPzsM2IyTUUM,871
+ datablade/core/frames.py,sha256=iiTQPkWKQHslwkz_pu6bHVvbnisWSZF_b3Hgy1SvE-Q,662
+ datablade/core/json.py,sha256=_0RoK_OXkOpwNG5Y7vdORQOCfs0ZVJAO0qeQqjxzpyE,121
+ datablade/core/lists.py,sha256=OVj-KLyeOlbOxALyxvBLQpB0s78RO2QxmM4OxbI70Cc,132
+ datablade/core/messages.py,sha256=_2CpeaDJIwmRlTm4UMrDTKnew1wZdyai6QeKFjN3rno,404
+ datablade/core/strings.py,sha256=vwx0NW-m-2nOYW1aisRXUjrd4wERe2Y5Ka64wAyF0Mo,173
+ datablade/core/zip.py,sha256=fmp6VTKH3JAaRWTKhWGHN-frO7Vr4EHlEYtUOlomNuc,119
+ datablade/dataframes/__init__.py,sha256=blPM2EkNI6FA8unnsGpTDP4A12G23ChXy4djyhpgHMk,1316
+ datablade/dataframes/frames.py,sha256=VaMgrY5F8uuWw6adx9g10sBlkW4TfZhH5AEo2TgD5N4,20993
+ datablade/dataframes/readers.py,sha256=1RFW0J1iKGvmgrRjDMB8PoHcMhQhyfAlUh1h6_aF_2E,47469
+ datablade/docs/ARCHITECTURE.md,sha256=ON65KUk3-EdwfnMglOT1CVZqyJNUXVbAUff803d7XDU,3290
+ datablade/docs/OBJECT_REGISTRY.md,sha256=00wkTuo6Pnhl9kgRDu7mJFO3yxSBedQQS7r-ktnsqSs,5386
+ datablade/docs/README.md,sha256=2uH6siz5ABEd6ZDjPYbXpH-zaOgyHzB3ik2e2eUj8Ks,1587
+ datablade/docs/TESTING.md,sha256=zc1Qz3ZRGH9CZqWfzArDmv3wRfS8-346EiXhE_fO0co,463
+ datablade/docs/USAGE.md,sha256=vJRmCF-dUeXAiWvvgUwc6fO3gpYNBZtwV-v2gmfoeeY,11866
+ datablade/docs/__init__.py,sha256=t4KRQOU653CVdkFD00Aj0hdjrlcmx70D2W2QnybJDnc,2388
+ datablade/docs/__main__.py,sha256=V-GK7vuR1aEvT_TgJPAZnapjLhwDXGOCSEseX4R6fts,112
+ datablade/io/__init__.py,sha256=jDenqP4biI1gnRdoO9X5C489cBlHrLuR3xp_TIOljp8,295
+ datablade/io/json.py,sha256=3Oao1lP8Dat3DT7MYyCes7feqPVFDHAPpFDemHuKVVw,2384
+ datablade/io/zip.py,sha256=Ppl8CvGhWIYFcBelpSkinmT_oFDmQmqNiHuFHicSoQI,4268
+ datablade/sql/__init__.py,sha256=Fs77XFT_hJcdxBZxLD9-6fgD2QEVtfp58LlZaEMFEJA,1466
+ datablade/sql/bulk_load.py,sha256=7SniXGHMIvwg9P_r95Nt7Z72jkQnu-LuQtPvv_Wmq2I,19992
+ datablade/sql/ddl.py,sha256=4xJ98LJmnD7PqloU8Ewa7J9iy2L6n9iPRsM5WtBKA0U,14958
+ datablade/sql/ddl_pyarrow.py,sha256=yBfTbDAA7dE3BXrCSPqEFogOYohAb3p0v3AIgBRxNqc,15344
+ datablade/sql/dialects.py,sha256=rM5qw6xdIuCSx81b6_TKYnsDhKeVpTdMDb_62HhtksA,266
+ datablade/sql/quoting.py,sha256=Y8yn1GAsaWSIenQDidwU4j2YsZmYE9ppOifAPiTbStk,1345
+ datablade/sql/schema_spec.py,sha256=HdEA5RS_yj8WNVH8wroWDyIK-8K4_fgMZC7rZBswkcs,2067
+ datablade/sql/sqlserver.py,sha256=ASLhtHIslMePRi9pUY6vWTaebQ7TM4kO60N9QsXwhok,13726
+ datablade/utils/__init__.py,sha256=KbvUZVWg2u53oUlfEIbDndwgz6f_j7N23Dl8Oz-vWXk,746
+ datablade/utils/lists.py,sha256=h3AXo2SSE2iaR_rJfoCgfNRFV9AOWerL-hHFYo6S4n8,798
+ datablade/utils/logging.py,sha256=kqM1389Wgzml69aja6g7yj8biNSi9AOPZlxr3YTnyp0,5491
+ datablade/utils/messages.py,sha256=yZZTGTS_eD0PBZFCdzz51fqBFWgq9Sdq-GMR1a0FclY,537
+ datablade/utils/strings.py,sha256=GLvYi7FPUHZ2nqNsZ0-mGbc3gu70UvVzrIy4bQWRHOo,7196
+ datablade-0.0.6.dist-info/licenses/LICENSE,sha256=QyU-OkETSZ-L0Rltu-SAMQtQU_UXjwLnb_QlA_AXm0U,1072
+ datablade-0.0.6.dist-info/METADATA,sha256=ou_j_HZwHrdNaZb26JhW_m3oRhILZ4tMFIPla9Sm8LM,13261
+ datablade-0.0.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ datablade-0.0.6.dist-info/top_level.txt,sha256=AwA5QxmfuaAs9XeXw1tCsboPsoffnMU-6CeLWMMUoUA,10
+ datablade-0.0.6.dist-info/RECORD,,
{datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.6.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any