datablade 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +10 -2
- datablade/blade.py +174 -5
- datablade/dataframes/__init__.py +8 -0
- datablade/dataframes/frames.py +127 -27
- datablade/dataframes/readers.py +988 -161
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/json.py +45 -8
- datablade/io/zip.py +68 -30
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +25 -1
- datablade/sql/bulk_load.py +309 -49
- datablade/sql/ddl.py +201 -26
- datablade/sql/ddl_pyarrow.py +150 -26
- datablade/sql/dialects.py +2 -0
- datablade/sql/quoting.py +2 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +2 -1
- datablade/utils/lists.py +3 -0
- datablade/utils/logging.py +46 -1
- datablade/utils/strings.py +180 -17
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/METADATA +68 -13
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- datablade-0.0.5.dist-info/RECORD +0 -31
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/licenses/LICENSE +0 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datablade
|
|
3
|
-
Version: 0.0.5
|
|
3
|
+
Version: 0.0.6
|
|
4
4
|
Summary: datablade is a suite of functions to provide standard syntax across data engineering projects.
|
|
5
5
|
Author-email: Brent Carpenetti <brentwc.git@pm.me>
|
|
6
6
|
License: MIT License
|
|
@@ -63,7 +63,6 @@ Dynamic: license-file
|
|
|
63
63
|
|
|
64
64
|
# datablade
|
|
65
65
|
|
|
66
|
-
[Tests](https://github.com/brentwc/data-prep/actions/workflows/test.yml)
|
|
67
66
|
[Python](https://www.python.org/downloads/)
|
|
68
67
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
69
68
|
|
|
@@ -115,20 +114,23 @@ datablade is ideal for:
|
|
|
115
114
|
## Installation
|
|
116
115
|
|
|
117
116
|
```bash
|
|
118
|
-
pip install
|
|
117
|
+
pip install datablade
|
|
119
118
|
```
|
|
120
119
|
|
|
121
120
|
**Optional dependencies:**
|
|
122
121
|
|
|
123
122
|
```bash
|
|
124
123
|
# For high-performance file reading with Polars
|
|
125
|
-
pip install
|
|
124
|
+
pip install "datablade[performance]"
|
|
126
125
|
|
|
127
|
-
# For
|
|
128
|
-
pip install
|
|
126
|
+
# For testing
|
|
127
|
+
pip install "datablade[test]"
|
|
128
|
+
|
|
129
|
+
# For development (includes testing + lint/format tooling)
|
|
130
|
+
pip install "datablade[dev]"
|
|
129
131
|
|
|
130
132
|
# All optional dependencies
|
|
131
|
-
pip install
|
|
133
|
+
pip install "datablade[all]"
|
|
132
134
|
```
|
|
133
135
|
|
|
134
136
|
## Features
|
|
@@ -172,6 +174,7 @@ Multi-dialect SQL utilities:
|
|
|
172
174
|
- Dialect-aware identifier quoting
|
|
173
175
|
- CREATE TABLE generation for all dialects (from pandas DataFrames)
|
|
174
176
|
- CREATE TABLE generation from Parquet schemas (schema-only, via PyArrow)
|
|
177
|
+
- Optional `schema_spec` overrides for column types, nullability, and string sizing
|
|
175
178
|
- Bulk loading helpers:
|
|
176
179
|
- SQL Server: executes `bcp` via subprocess
|
|
177
180
|
- PostgreSQL/MySQL/DuckDB: returns command strings you can run in your environment
|
|
@@ -215,10 +218,25 @@ ddl_from_parquet = generate_create_table_from_parquet(
|
|
|
215
218
|
data = get_json('https://api.example.com/data.json')
|
|
216
219
|
```
|
|
217
220
|
|
|
221
|
+
Most file path parameters accept `str` or `pathlib.Path`. To treat case mismatches
|
|
222
|
+
as errors on case-insensitive filesystems, use `configure_paths(path_strict=True)`.
|
|
223
|
+
|
|
218
224
|
### Memory-Aware File Reading
|
|
219
225
|
|
|
226
|
+
See the file format support matrix in the bundled USAGE doc:
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
python -m datablade.docs --show USAGE
|
|
230
|
+
```
|
|
231
|
+
|
|
220
232
|
```python
|
|
221
|
-
from datablade.dataframes import
|
|
233
|
+
from datablade.dataframes import (
|
|
234
|
+
excel_to_parquets,
|
|
235
|
+
read_file_chunked,
|
|
236
|
+
read_file_iter,
|
|
237
|
+
read_file_to_parquets,
|
|
238
|
+
stream_to_parquets,
|
|
239
|
+
)
|
|
222
240
|
|
|
223
241
|
# Read large files in chunks
|
|
224
242
|
for chunk in read_file_chunked('huge_file.csv', memory_fraction=0.5):
|
|
@@ -232,6 +250,10 @@ for chunk in read_file_iter('huge_file.csv', memory_fraction=0.3, verbose=True):
|
|
|
232
250
|
for chunk in read_file_iter('huge_file.parquet', memory_fraction=0.3, verbose=True):
|
|
233
251
|
process(chunk)
|
|
234
252
|
|
|
253
|
+
# Excel streaming is available with openpyxl installed (read-only mode)
|
|
254
|
+
for chunk in read_file_iter('large.xlsx', chunksize=25_000, verbose=True):
|
|
255
|
+
process(chunk)
|
|
256
|
+
|
|
235
257
|
# Partition large files to multiple Parquets
|
|
236
258
|
files = read_file_to_parquets(
|
|
237
259
|
'large_file.csv',
|
|
@@ -248,6 +270,15 @@ files = stream_to_parquets(
|
|
|
248
270
|
convert_types=True,
|
|
249
271
|
verbose=True,
|
|
250
272
|
)
|
|
273
|
+
|
|
274
|
+
# Excel streaming to Parquet partitions
|
|
275
|
+
files = excel_to_parquets(
|
|
276
|
+
'large.xlsx',
|
|
277
|
+
output_dir='partitioned_excel/',
|
|
278
|
+
rows_per_file=200_000,
|
|
279
|
+
convert_types=True,
|
|
280
|
+
verbose=True,
|
|
281
|
+
)
|
|
251
282
|
```
|
|
252
283
|
|
|
253
284
|
## Blade (Optional Facade)
|
|
@@ -284,10 +315,21 @@ ddl2 = blade.create_table_sql_from_parquet(
|
|
|
284
315
|
|
|
285
316
|
## Documentation
|
|
286
317
|
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
-
|
|
318
|
+
Docs are bundled with the installed package:
|
|
319
|
+
|
|
320
|
+
```bash
|
|
321
|
+
python -m datablade.docs --list
|
|
322
|
+
python -m datablade.docs --show USAGE
|
|
323
|
+
python -m datablade.docs --write-dir ./datablade-docs
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
After writing docs to disk, open the markdown files locally:
|
|
327
|
+
|
|
328
|
+
- README (docs landing page)
|
|
329
|
+
- USAGE (file reading, streaming, SQL, IO, logging)
|
|
330
|
+
- TESTING (how to run tests locally)
|
|
331
|
+
- ARCHITECTURE (pipeline overview)
|
|
332
|
+
- OBJECT_REGISTRY (registry reference)
|
|
291
333
|
|
|
292
334
|
## Testing
|
|
293
335
|
|
|
@@ -304,7 +346,11 @@ pytest
|
|
|
304
346
|
pytest --cov=datablade --cov-report=html
|
|
305
347
|
```
|
|
306
348
|
|
|
307
|
-
|
|
349
|
+
For detailed testing documentation, use the bundled TESTING doc:
|
|
350
|
+
|
|
351
|
+
```bash
|
|
352
|
+
python -m datablade.docs --show TESTING
|
|
353
|
+
```
|
|
308
354
|
|
|
309
355
|
## Backward Compatibility
|
|
310
356
|
|
|
@@ -332,6 +378,12 @@ from datablade.core.json import get
|
|
|
332
378
|
- **Streaming vs materializing**:
|
|
333
379
|
- Use `read_file_iter()` to process arbitrarily large files chunk-by-chunk.
|
|
334
380
|
- `read_file_smart()` returns a single DataFrame and may still be memory-intensive.
|
|
381
|
+
- **Chunk concatenation**: the large-file pandas fallback in `read_file_smart()` can
|
|
382
|
+
temporarily spike memory usage during concat. Use `read_file_iter()` or
|
|
383
|
+
`return_type="iterator"` to avoid concatenation.
|
|
384
|
+
- **Polars materialization**: when returning a pandas DataFrame, Polars still
|
|
385
|
+
collects into memory; use `return_type="polars"` or `"polars_lazy"` to keep
|
|
386
|
+
Polars frames.
|
|
335
387
|
- **Parquet support**:
|
|
336
388
|
- Streaming reads support single `.parquet` files.
|
|
337
389
|
- Parquet “dataset directories” (Hive partitions / directory-of-parquets) are not a primary target API.
|
|
@@ -339,6 +391,9 @@ from datablade.core.json import get
|
|
|
339
391
|
- Uses the Parquet schema (PyArrow) without scanning data.
|
|
340
392
|
- Complex/nested columns (struct/list/map/union) are dropped and logged as warnings.
|
|
341
393
|
- **DDL scope**: `CREATE TABLE` generation is column/type oriented (no indexes/constraints).
|
|
394
|
+
- **SQL Server bulk load**: the SQL Server helpers use the `bcp` CLI and require it
|
|
395
|
+
to be installed and available on PATH. When using `-U`/`-P`, credentials are
|
|
396
|
+
passed via process args (logs are redacted); prefer `-T` or `-G` where possible.
|
|
342
397
|
|
|
343
398
|
**Optional dependencies:**
|
|
344
399
|
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
datablade/__init__.py,sha256=T1FBNEZfjdIFwyfRLNCQBroMXDfaYai1sQTaRzhA1Qg,1487
|
|
2
|
+
datablade/blade.py,sha256=iKQtHhH-xs3EDH9ZNryheRGzE2OBkaxjZsA99iAG-p0,10330
|
|
3
|
+
datablade/registry.py,sha256=zmfbt3-LthxV2lmwNAmt4KN-EnLRKAguMnYnk5xxzjg,20323
|
|
4
|
+
datablade/core/__init__.py,sha256=VJ2lFumv-7CFKjIN_i4wPEaAGFEb-UhHPzsM2IyTUUM,871
|
|
5
|
+
datablade/core/frames.py,sha256=iiTQPkWKQHslwkz_pu6bHVvbnisWSZF_b3Hgy1SvE-Q,662
|
|
6
|
+
datablade/core/json.py,sha256=_0RoK_OXkOpwNG5Y7vdORQOCfs0ZVJAO0qeQqjxzpyE,121
|
|
7
|
+
datablade/core/lists.py,sha256=OVj-KLyeOlbOxALyxvBLQpB0s78RO2QxmM4OxbI70Cc,132
|
|
8
|
+
datablade/core/messages.py,sha256=_2CpeaDJIwmRlTm4UMrDTKnew1wZdyai6QeKFjN3rno,404
|
|
9
|
+
datablade/core/strings.py,sha256=vwx0NW-m-2nOYW1aisRXUjrd4wERe2Y5Ka64wAyF0Mo,173
|
|
10
|
+
datablade/core/zip.py,sha256=fmp6VTKH3JAaRWTKhWGHN-frO7Vr4EHlEYtUOlomNuc,119
|
|
11
|
+
datablade/dataframes/__init__.py,sha256=blPM2EkNI6FA8unnsGpTDP4A12G23ChXy4djyhpgHMk,1316
|
|
12
|
+
datablade/dataframes/frames.py,sha256=VaMgrY5F8uuWw6adx9g10sBlkW4TfZhH5AEo2TgD5N4,20993
|
|
13
|
+
datablade/dataframes/readers.py,sha256=1RFW0J1iKGvmgrRjDMB8PoHcMhQhyfAlUh1h6_aF_2E,47469
|
|
14
|
+
datablade/docs/ARCHITECTURE.md,sha256=ON65KUk3-EdwfnMglOT1CVZqyJNUXVbAUff803d7XDU,3290
|
|
15
|
+
datablade/docs/OBJECT_REGISTRY.md,sha256=00wkTuo6Pnhl9kgRDu7mJFO3yxSBedQQS7r-ktnsqSs,5386
|
|
16
|
+
datablade/docs/README.md,sha256=2uH6siz5ABEd6ZDjPYbXpH-zaOgyHzB3ik2e2eUj8Ks,1587
|
|
17
|
+
datablade/docs/TESTING.md,sha256=zc1Qz3ZRGH9CZqWfzArDmv3wRfS8-346EiXhE_fO0co,463
|
|
18
|
+
datablade/docs/USAGE.md,sha256=vJRmCF-dUeXAiWvvgUwc6fO3gpYNBZtwV-v2gmfoeeY,11866
|
|
19
|
+
datablade/docs/__init__.py,sha256=t4KRQOU653CVdkFD00Aj0hdjrlcmx70D2W2QnybJDnc,2388
|
|
20
|
+
datablade/docs/__main__.py,sha256=V-GK7vuR1aEvT_TgJPAZnapjLhwDXGOCSEseX4R6fts,112
|
|
21
|
+
datablade/io/__init__.py,sha256=jDenqP4biI1gnRdoO9X5C489cBlHrLuR3xp_TIOljp8,295
|
|
22
|
+
datablade/io/json.py,sha256=3Oao1lP8Dat3DT7MYyCes7feqPVFDHAPpFDemHuKVVw,2384
|
|
23
|
+
datablade/io/zip.py,sha256=Ppl8CvGhWIYFcBelpSkinmT_oFDmQmqNiHuFHicSoQI,4268
|
|
24
|
+
datablade/sql/__init__.py,sha256=Fs77XFT_hJcdxBZxLD9-6fgD2QEVtfp58LlZaEMFEJA,1466
|
|
25
|
+
datablade/sql/bulk_load.py,sha256=7SniXGHMIvwg9P_r95Nt7Z72jkQnu-LuQtPvv_Wmq2I,19992
|
|
26
|
+
datablade/sql/ddl.py,sha256=4xJ98LJmnD7PqloU8Ewa7J9iy2L6n9iPRsM5WtBKA0U,14958
|
|
27
|
+
datablade/sql/ddl_pyarrow.py,sha256=yBfTbDAA7dE3BXrCSPqEFogOYohAb3p0v3AIgBRxNqc,15344
|
|
28
|
+
datablade/sql/dialects.py,sha256=rM5qw6xdIuCSx81b6_TKYnsDhKeVpTdMDb_62HhtksA,266
|
|
29
|
+
datablade/sql/quoting.py,sha256=Y8yn1GAsaWSIenQDidwU4j2YsZmYE9ppOifAPiTbStk,1345
|
|
30
|
+
datablade/sql/schema_spec.py,sha256=HdEA5RS_yj8WNVH8wroWDyIK-8K4_fgMZC7rZBswkcs,2067
|
|
31
|
+
datablade/sql/sqlserver.py,sha256=ASLhtHIslMePRi9pUY6vWTaebQ7TM4kO60N9QsXwhok,13726
|
|
32
|
+
datablade/utils/__init__.py,sha256=KbvUZVWg2u53oUlfEIbDndwgz6f_j7N23Dl8Oz-vWXk,746
|
|
33
|
+
datablade/utils/lists.py,sha256=h3AXo2SSE2iaR_rJfoCgfNRFV9AOWerL-hHFYo6S4n8,798
|
|
34
|
+
datablade/utils/logging.py,sha256=kqM1389Wgzml69aja6g7yj8biNSi9AOPZlxr3YTnyp0,5491
|
|
35
|
+
datablade/utils/messages.py,sha256=yZZTGTS_eD0PBZFCdzz51fqBFWgq9Sdq-GMR1a0FclY,537
|
|
36
|
+
datablade/utils/strings.py,sha256=GLvYi7FPUHZ2nqNsZ0-mGbc3gu70UvVzrIy4bQWRHOo,7196
|
|
37
|
+
datablade-0.0.6.dist-info/licenses/LICENSE,sha256=QyU-OkETSZ-L0Rltu-SAMQtQU_UXjwLnb_QlA_AXm0U,1072
|
|
38
|
+
datablade-0.0.6.dist-info/METADATA,sha256=ou_j_HZwHrdNaZb26JhW_m3oRhILZ4tMFIPla9Sm8LM,13261
|
|
39
|
+
datablade-0.0.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
40
|
+
datablade-0.0.6.dist-info/top_level.txt,sha256=AwA5QxmfuaAs9XeXw1tCsboPsoffnMU-6CeLWMMUoUA,10
|
|
41
|
+
datablade-0.0.6.dist-info/RECORD,,
|
datablade-0.0.5.dist-info/RECORD
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
datablade/__init__.py,sha256=HGsVFJ4wSvSBXzKywpRpJweze_upiJbw-7yZ0JIlslY,1246
|
|
2
|
-
datablade/blade.py,sha256=pT6NYKhrKal_0c2m1QKur2jZRWkJC1fE7sTO4ldiWcU,4442
|
|
3
|
-
datablade/core/__init__.py,sha256=VJ2lFumv-7CFKjIN_i4wPEaAGFEb-UhHPzsM2IyTUUM,871
|
|
4
|
-
datablade/core/frames.py,sha256=iiTQPkWKQHslwkz_pu6bHVvbnisWSZF_b3Hgy1SvE-Q,662
|
|
5
|
-
datablade/core/json.py,sha256=_0RoK_OXkOpwNG5Y7vdORQOCfs0ZVJAO0qeQqjxzpyE,121
|
|
6
|
-
datablade/core/lists.py,sha256=OVj-KLyeOlbOxALyxvBLQpB0s78RO2QxmM4OxbI70Cc,132
|
|
7
|
-
datablade/core/messages.py,sha256=_2CpeaDJIwmRlTm4UMrDTKnew1wZdyai6QeKFjN3rno,404
|
|
8
|
-
datablade/core/strings.py,sha256=vwx0NW-m-2nOYW1aisRXUjrd4wERe2Y5Ka64wAyF0Mo,173
|
|
9
|
-
datablade/core/zip.py,sha256=fmp6VTKH3JAaRWTKhWGHN-frO7Vr4EHlEYtUOlomNuc,119
|
|
10
|
-
datablade/dataframes/__init__.py,sha256=5nu_xgxIrL5GlX84mIkh66rCg5bUKor7bgP3EQwihBY,1122
|
|
11
|
-
datablade/dataframes/frames.py,sha256=prbsOrq3rwLm-xPq2VrWVYyrdQLbjsuWsh_OQWq6Stw,16742
|
|
12
|
-
datablade/dataframes/readers.py,sha256=_Ititzvk4hLRmITxDQIS-Ji6t9rc-wTBIEcjuPSNQ8U,19378
|
|
13
|
-
datablade/io/__init__.py,sha256=jDenqP4biI1gnRdoO9X5C489cBlHrLuR3xp_TIOljp8,295
|
|
14
|
-
datablade/io/json.py,sha256=3VSyNiDkLYA5YihlOdUyf2Z83iDxB5ztjc4vDtwvu9Q,1055
|
|
15
|
-
datablade/io/zip.py,sha256=GaWCRNidUyeGJ4OBkImxIjtOV10mJaly9yLzyyBIE20,2716
|
|
16
|
-
datablade/sql/__init__.py,sha256=5Cef9DA3ifgPD6WNcGQ_ncDloWLxESc7Cxd8PHybfnU,762
|
|
17
|
-
datablade/sql/bulk_load.py,sha256=ugJMtzGc7Cb1FcuR7UhBaTPD76dqWj3oYOVwQkugeFw,12068
|
|
18
|
-
datablade/sql/ddl.py,sha256=WmvjVLdMkPoMDZkM0CbsV5TJAaMihBWZRyDkURj5wAY,8373
|
|
19
|
-
datablade/sql/ddl_pyarrow.py,sha256=QNdw4YN4CwUCvU2tsKHkCBCfFPFlYsUV5fUUC_MXpbs,10869
|
|
20
|
-
datablade/sql/dialects.py,sha256=eishu7E5CaZXc6Jae7hn87qDbgyBuC_cSey-IR6SGKQ,207
|
|
21
|
-
datablade/sql/quoting.py,sha256=l2pCcr55TyuPkpiUp3IFN8fmkp16fQ7ynvonTjd_OKQ,1291
|
|
22
|
-
datablade/utils/__init__.py,sha256=HzIUWIBZpqNKvYVeVZfhqwBCNOE9b3-IdUoCwslCk1Y,706
|
|
23
|
-
datablade/utils/lists.py,sha256=U2cX9bj8NfmL39sAdPPWis2xQ6CpC9mtc2I0LExQBOM,691
|
|
24
|
-
datablade/utils/logging.py,sha256=SSzoRgBZiiKWrN_dsvHpqhqkgWiDqP0BGxfgr94QyyE,4227
|
|
25
|
-
datablade/utils/messages.py,sha256=yZZTGTS_eD0PBZFCdzz51fqBFWgq9Sdq-GMR1a0FclY,537
|
|
26
|
-
datablade/utils/strings.py,sha256=173z_TM6d5zojD3d3lGiM8tyU0z6IMmIqIn3E9zWdVw,2633
|
|
27
|
-
datablade-0.0.5.dist-info/licenses/LICENSE,sha256=QyU-OkETSZ-L0Rltu-SAMQtQU_UXjwLnb_QlA_AXm0U,1072
|
|
28
|
-
datablade-0.0.5.dist-info/METADATA,sha256=4u9PckSw3RVizI3uB2rGfKSR_2Vo2WmB4lk1CWuIP0k,11911
|
|
29
|
-
datablade-0.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
30
|
-
datablade-0.0.5.dist-info/top_level.txt,sha256=AwA5QxmfuaAs9XeXw1tCsboPsoffnMU-6CeLWMMUoUA,10
|
|
31
|
-
datablade-0.0.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|