datablade 0.0.5.tar.gz → 0.0.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {datablade-0.0.5/src/datablade.egg-info → datablade-0.0.6}/PKG-INFO +68 -13
  2. {datablade-0.0.5 → datablade-0.0.6}/pyproject.toml +5 -1
  3. {datablade-0.0.5 → datablade-0.0.6}/readme.md +67 -12
  4. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/__init__.py +10 -2
  5. datablade-0.0.6/src/datablade/blade.py +322 -0
  6. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/dataframes/__init__.py +8 -0
  7. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/dataframes/frames.py +127 -27
  8. datablade-0.0.6/src/datablade/dataframes/readers.py +1367 -0
  9. datablade-0.0.6/src/datablade/docs/ARCHITECTURE.md +102 -0
  10. datablade-0.0.6/src/datablade/docs/OBJECT_REGISTRY.md +194 -0
  11. datablade-0.0.6/src/datablade/docs/README.md +57 -0
  12. datablade-0.0.6/src/datablade/docs/TESTING.md +37 -0
  13. datablade-0.0.6/src/datablade/docs/USAGE.md +409 -0
  14. datablade-0.0.6/src/datablade/docs/__init__.py +87 -0
  15. datablade-0.0.6/src/datablade/docs/__main__.py +6 -0
  16. datablade-0.0.6/src/datablade/io/json.py +70 -0
  17. datablade-0.0.6/src/datablade/io/zip.py +111 -0
  18. datablade-0.0.6/src/datablade/registry.py +581 -0
  19. datablade-0.0.6/src/datablade/sql/__init__.py +56 -0
  20. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/sql/bulk_load.py +309 -49
  21. datablade-0.0.6/src/datablade/sql/ddl.py +402 -0
  22. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/sql/ddl_pyarrow.py +150 -26
  23. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/sql/dialects.py +2 -0
  24. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/sql/quoting.py +2 -0
  25. datablade-0.0.6/src/datablade/sql/schema_spec.py +65 -0
  26. datablade-0.0.6/src/datablade/sql/sqlserver.py +390 -0
  27. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/utils/__init__.py +2 -1
  28. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/utils/lists.py +3 -0
  29. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/utils/logging.py +46 -1
  30. datablade-0.0.6/src/datablade/utils/strings.py +249 -0
  31. {datablade-0.0.5 → datablade-0.0.6/src/datablade.egg-info}/PKG-INFO +68 -13
  32. {datablade-0.0.5 → datablade-0.0.6}/src/datablade.egg-info/SOURCES.txt +11 -0
  33. {datablade-0.0.5 → datablade-0.0.6}/tests/test_dataframes.py +28 -4
  34. {datablade-0.0.5 → datablade-0.0.6}/tests/test_integration.py +9 -0
  35. {datablade-0.0.5 → datablade-0.0.6}/tests/test_io.py +1 -1
  36. {datablade-0.0.5 → datablade-0.0.6}/tests/test_readers.py +122 -6
  37. datablade-0.0.6/tests/test_registry.py +118 -0
  38. {datablade-0.0.5 → datablade-0.0.6}/tests/test_sql.py +397 -10
  39. {datablade-0.0.5 → datablade-0.0.6}/tests/test_utils.py +36 -0
  40. datablade-0.0.5/src/datablade/blade.py +0 -153
  41. datablade-0.0.5/src/datablade/dataframes/readers.py +0 -540
  42. datablade-0.0.5/src/datablade/io/json.py +0 -33
  43. datablade-0.0.5/src/datablade/io/zip.py +0 -73
  44. datablade-0.0.5/src/datablade/sql/__init__.py +0 -32
  45. datablade-0.0.5/src/datablade/sql/ddl.py +0 -227
  46. datablade-0.0.5/src/datablade/utils/strings.py +0 -86
  47. {datablade-0.0.5 → datablade-0.0.6}/LICENSE +0 -0
  48. {datablade-0.0.5 → datablade-0.0.6}/setup.cfg +0 -0
  49. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/__init__.py +0 -0
  50. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/frames.py +0 -0
  51. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/json.py +0 -0
  52. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/lists.py +0 -0
  53. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/messages.py +0 -0
  54. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/strings.py +0 -0
  55. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/core/zip.py +0 -0
  56. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/io/__init__.py +0 -0
  57. {datablade-0.0.5 → datablade-0.0.6}/src/datablade/utils/messages.py +0 -0
  58. {datablade-0.0.5 → datablade-0.0.6}/src/datablade.egg-info/dependency_links.txt +0 -0
  59. {datablade-0.0.5 → datablade-0.0.6}/src/datablade.egg-info/requires.txt +0 -0
  60. {datablade-0.0.5 → datablade-0.0.6}/src/datablade.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datablade
- Version: 0.0.5
+ Version: 0.0.6
  Summary: datablade is a suite of functions to provide standard syntax across data engineering projects.
  Author-email: Brent Carpenetti <brentwc.git@pm.me>
  License: MIT License
@@ -63,7 +63,6 @@ Dynamic: license-file

  # datablade

- [![Tests](https://github.com/brentwc/data-prep/actions/workflows/test.yml/badge.svg)](https://github.com/brentwc/data-prep/actions/workflows/test.yml)
  [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

@@ -115,20 +114,23 @@ datablade is ideal for:
  ## Installation

  ```bash
- pip install git+https://github.com/brentwc/data-prep.git
+ pip install datablade
  ```

  **Optional dependencies:**

  ```bash
  # For high-performance file reading with Polars
- pip install git+https://github.com/brentwc/data-prep.git#egg=datablade[performance]
+ pip install "datablade[performance]"

- # For development and testing
- pip install git+https://github.com/brentwc/data-prep.git#egg=datablade[dev]
+ # For testing
+ pip install "datablade[test]"
+
+ # For development (includes testing + lint/format tooling)
+ pip install "datablade[dev]"

  # All optional dependencies
- pip install git+https://github.com/brentwc/data-prep.git#egg=datablade[all]
+ pip install "datablade[all]"
  ```

  ## Features
@@ -172,6 +174,7 @@ Multi-dialect SQL utilities:
  - Dialect-aware identifier quoting
  - CREATE TABLE generation for all dialects (from pandas DataFrames)
  - CREATE TABLE generation from Parquet schemas (schema-only, via PyArrow)
+ - Optional `schema_spec` overrides for column types, nullability, and string sizing
  - Bulk loading helpers:
    - SQL Server: executes `bcp` via subprocess
    - PostgreSQL/MySQL/DuckDB: returns command strings you can run in your environment
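As context for the new `schema_spec` bullet above, here is a minimal sketch of how such an override might be passed to `generate_create_table`. The keyword name, the override shape, and the `table_name`/`dialect` arguments are assumptions for illustration only; the actual definitions live in the new `src/datablade/sql/schema_spec.py`, which is not shown in this diff.

```python
# Hypothetical sketch -- keyword names and the override shape are assumptions,
# not the confirmed datablade 0.0.6 API.
import pandas as pd

from datablade.sql import Dialect, generate_create_table

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# Assumed shape: per-column overrides for type, nullability, and string sizing.
schema_spec = {
    "id": {"type": "BIGINT", "nullable": False},
    "name": {"type": "VARCHAR", "length": 100},
}

ddl = generate_create_table(
    df,
    table_name="sales",        # assumed keyword
    dialect=Dialect.POSTGRES,  # assumed enum member name
    schema_spec=schema_spec,   # new in 0.0.6 per the feature list; shape assumed
)
print(ddl)
```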
@@ -215,10 +218,25 @@ ddl_from_parquet = generate_create_table_from_parquet(
  data = get_json('https://api.example.com/data.json')
  ```

+ Most file path parameters accept `str` or `pathlib.Path`. To treat case mismatches
+ as errors on case-insensitive filesystems, use `configure_paths(path_strict=True)`.
+
  ### Memory-Aware File Reading

+ See the file format support matrix in the bundled USAGE doc:
+
+ ```bash
+ python -m datablade.docs --show USAGE
+ ```
+
  ```python
- from datablade.dataframes import read_file_chunked, read_file_iter, read_file_to_parquets, stream_to_parquets
+ from datablade.dataframes import (
+     excel_to_parquets,
+     read_file_chunked,
+     read_file_iter,
+     read_file_to_parquets,
+     stream_to_parquets,
+ )

  # Read large files in chunks
  for chunk in read_file_chunked('huge_file.csv', memory_fraction=0.5):
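The path-handling note added above maps to a short, mostly grounded sketch: `configure_paths(path_strict=True)` and the `read_file_iter` keywords come from the readme text itself, while the exception raised on a case mismatch is not specified in this diff.

```python
from pathlib import Path

from datablade import configure_paths
from datablade.dataframes import read_file_iter

# Opt in to strict case checking on case-insensitive filesystems
# (macOS/Windows); the error raised on a mismatch is not documented here.
configure_paths(path_strict=True)

# Path parameters accept str or pathlib.Path, per the readme.
source = Path("data") / "huge_file.csv"

for chunk in read_file_iter(source, memory_fraction=0.3, verbose=True):
    print(len(chunk))
```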
@@ -232,6 +250,10 @@ for chunk in read_file_iter('huge_file.csv', memory_fraction=0.3, verbose=True):
  for chunk in read_file_iter('huge_file.parquet', memory_fraction=0.3, verbose=True):
      process(chunk)

+ # Excel streaming is available with openpyxl installed (read-only mode)
+ for chunk in read_file_iter('large.xlsx', chunksize=25_000, verbose=True):
+     process(chunk)
+
  # Partition large files to multiple Parquets
  files = read_file_to_parquets(
      'large_file.csv',
@@ -248,6 +270,15 @@ files = stream_to_parquets(
      convert_types=True,
      verbose=True,
  )
+
+ # Excel streaming to Parquet partitions
+ files = excel_to_parquets(
+     'large.xlsx',
+     output_dir='partitioned_excel/',
+     rows_per_file=200_000,
+     convert_types=True,
+     verbose=True,
+ )
  ```

  ## Blade (Optional Facade)
@@ -284,10 +315,21 @@ ddl2 = blade.create_table_sql_from_parquet(

  ## Documentation

- - [Docs Home](docs/README.md) - Documentation landing page
- - [Usage Guide](docs/USAGE.md) - File reading (including streaming), SQL, IO, logging
- - [Testing Guide](docs/TESTING.md) - How to run tests locally
- - [Test Suite](tests/README.md) - Testing documentation and coverage
+ Docs are bundled with the installed package:
+
+ ```bash
+ python -m datablade.docs --list
+ python -m datablade.docs --show USAGE
+ python -m datablade.docs --write-dir .\datablade-docs
+ ```
+
+ After writing docs to disk, open the markdown files locally:
+
+ - README (docs landing page)
+ - USAGE (file reading, streaming, SQL, IO, logging)
+ - TESTING (how to run tests locally)
+ - ARCHITECTURE (pipeline overview)
+ - OBJECT_REGISTRY (registry reference)

  ## Testing

@@ -304,7 +346,11 @@ pytest
  pytest --cov=datablade --cov-report=html
  ```

- See [tests/README.md](tests/README.md) for detailed testing documentation.
+ For detailed testing documentation, use the bundled TESTING doc:
+
+ ```bash
+ python -m datablade.docs --show TESTING
+ ```

  ## Backward Compatibility

@@ -332,6 +378,12 @@ from datablade.core.json import get
  - **Streaming vs materializing**:
    - Use `read_file_iter()` to process arbitrarily large files chunk-by-chunk.
    - `read_file_smart()` returns a single DataFrame and may still be memory-intensive.
+ - **Chunk concatenation**: the large-file pandas fallback in `read_file_smart()` can
+   temporarily spike memory usage during concat. Use `read_file_iter()` or
+   `return_type="iterator"` to avoid concatenation.
+ - **Polars materialization**: when returning a pandas DataFrame, Polars still
+   collects into memory; use `return_type="polars"` or `"polars_lazy"` to keep
+   Polars frames.
  - **Parquet support**:
    - Streaming reads support single `.parquet` files.
    - Parquet “dataset directories” (Hive partitions / directory-of-parquets) are not a primary target API.
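The new memory notes above reference a `return_type` option; a small sketch follows. It assumes `return_type` is a keyword of `read_file_smart` (the notes imply this, but the signature itself is not part of the diff).

```python
# Sketch only: return_type as a read_file_smart keyword is inferred from the
# notes above, not from a signature shown in this diff.
from datablade import read_file_smart

# Default: one pandas DataFrame (the large-file fallback may spike memory on concat).
df = read_file_smart("large_file.csv")

# Avoid the concat spike by requesting an iterator of chunks instead.
for chunk in read_file_smart("large_file.csv", return_type="iterator"):
    ...

# Keep data in Polars (requires the "performance" extra); "polars_lazy" would
# return a lazy frame per the notes.
pl_df = read_file_smart("large_file.csv", return_type="polars")
```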
@@ -339,6 +391,9 @@ from datablade.core.json import get
    - Uses the Parquet schema (PyArrow) without scanning data.
    - Complex/nested columns (struct/list/map/union) are dropped and logged as warnings.
  - **DDL scope**: `CREATE TABLE` generation is column/type oriented (no indexes/constraints).
+ - **SQL Server bulk load**: the SQL Server helpers use the `bcp` CLI and require it
+   to be installed and available on PATH. When using `-U`/`-P`, credentials are
+   passed via process args (logs are redacted); prefer `-T` or `-G` where possible.

  **Optional dependencies:**

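For the SQL Server bulk-load note above, a heavily hedged sketch of `bulk_load`: only the behavior (SQL Server runs `bcp`, other dialects return a command string) and the `-T`/`-G` versus `-U`/`-P` guidance come from this diff; every keyword below is an assumption.

```python
# Hedged sketch: bulk_load's parameters are assumptions; only the documented
# behaviour (SQL Server -> runs bcp, others -> returns a command string) and
# the bcp auth flags are taken from the notes above.
from datablade.sql import Dialect, bulk_load

# SQL Server: requires the bcp CLI on PATH; prefer -T (trusted connection) or
# -G (Azure AD) so credentials never appear in process arguments.
bulk_load(
    "staging.sales",            # assumed: target table
    "sales.csv",                # assumed: source data file
    dialect=Dialect.SQLSERVER,  # assumed enum member name
    server="myserver",          # assumed keyword
    trusted_connection=True,    # assumed keyword standing in for bcp -T
)

# PostgreSQL/MySQL/DuckDB: per the feature list, a command string is returned
# for you to run in your own environment.
cmd = bulk_load("staging.sales", "sales.csv", dialect=Dialect.POSTGRES)
print(cmd)
```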
pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
  name = "datablade"
  dynamic = ["version"]
  description = "datablade is a suite of functions to provide standard syntax across data engineering projects."
- readme = "readme.md"
+ readme = { file = "readme.md", content-type = "text/markdown" }
  requires-python = ">=3.12"
  license = { file = "LICENSE" }
  authors = [
@@ -56,6 +56,10 @@ all = [
  [tool.setuptools]
  include-package-data = true

+ [tool.setuptools.package-data]
+ datablade = ["docs/*.md"]
+ "datablade.docs" = ["*.md"]
+
  [tool.setuptools.packages.find]
  where = ["src"]

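Because the package-data stanza above ships the markdown files inside the `datablade.docs` package, they can also be read with the standard library, independent of the documented `python -m datablade.docs` CLI (a sketch that assumes only the declared `docs/*.md` layout):

```python
# Reads a bundled doc straight from package data using only the stdlib; this
# relies on the package-data declaration above, not on any datablade.docs helper.
from importlib.resources import files

usage_md = files("datablade.docs").joinpath("USAGE.md").read_text(encoding="utf-8")
print(usage_md.splitlines()[0])
```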
readme.md

@@ -1,6 +1,5 @@
  # datablade

- [![Tests](https://github.com/brentwc/data-prep/actions/workflows/test.yml/badge.svg)](https://github.com/brentwc/data-prep/actions/workflows/test.yml)
  [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

@@ -52,20 +51,23 @@ datablade is ideal for:
  ## Installation

  ```bash
- pip install git+https://github.com/brentwc/data-prep.git
+ pip install datablade
  ```

  **Optional dependencies:**

  ```bash
  # For high-performance file reading with Polars
- pip install git+https://github.com/brentwc/data-prep.git#egg=datablade[performance]
+ pip install "datablade[performance]"

- # For development and testing
- pip install git+https://github.com/brentwc/data-prep.git#egg=datablade[dev]
+ # For testing
+ pip install "datablade[test]"
+
+ # For development (includes testing + lint/format tooling)
+ pip install "datablade[dev]"

  # All optional dependencies
- pip install git+https://github.com/brentwc/data-prep.git#egg=datablade[all]
+ pip install "datablade[all]"
  ```

  ## Features
@@ -109,6 +111,7 @@ Multi-dialect SQL utilities:
  - Dialect-aware identifier quoting
  - CREATE TABLE generation for all dialects (from pandas DataFrames)
  - CREATE TABLE generation from Parquet schemas (schema-only, via PyArrow)
+ - Optional `schema_spec` overrides for column types, nullability, and string sizing
  - Bulk loading helpers:
    - SQL Server: executes `bcp` via subprocess
    - PostgreSQL/MySQL/DuckDB: returns command strings you can run in your environment
@@ -152,10 +155,25 @@ ddl_from_parquet = generate_create_table_from_parquet(
  data = get_json('https://api.example.com/data.json')
  ```

+ Most file path parameters accept `str` or `pathlib.Path`. To treat case mismatches
+ as errors on case-insensitive filesystems, use `configure_paths(path_strict=True)`.
+
  ### Memory-Aware File Reading

+ See the file format support matrix in the bundled USAGE doc:
+
+ ```bash
+ python -m datablade.docs --show USAGE
+ ```
+
  ```python
- from datablade.dataframes import read_file_chunked, read_file_iter, read_file_to_parquets, stream_to_parquets
+ from datablade.dataframes import (
+     excel_to_parquets,
+     read_file_chunked,
+     read_file_iter,
+     read_file_to_parquets,
+     stream_to_parquets,
+ )

  # Read large files in chunks
  for chunk in read_file_chunked('huge_file.csv', memory_fraction=0.5):
@@ -169,6 +187,10 @@ for chunk in read_file_iter('huge_file.csv', memory_fraction=0.3, verbose=True):
  for chunk in read_file_iter('huge_file.parquet', memory_fraction=0.3, verbose=True):
      process(chunk)

+ # Excel streaming is available with openpyxl installed (read-only mode)
+ for chunk in read_file_iter('large.xlsx', chunksize=25_000, verbose=True):
+     process(chunk)
+
  # Partition large files to multiple Parquets
  files = read_file_to_parquets(
      'large_file.csv',
@@ -185,6 +207,15 @@ files = stream_to_parquets(
      convert_types=True,
      verbose=True,
  )
+
+ # Excel streaming to Parquet partitions
+ files = excel_to_parquets(
+     'large.xlsx',
+     output_dir='partitioned_excel/',
+     rows_per_file=200_000,
+     convert_types=True,
+     verbose=True,
+ )
  ```

  ## Blade (Optional Facade)
@@ -221,10 +252,21 @@ ddl2 = blade.create_table_sql_from_parquet(

  ## Documentation

- - [Docs Home](docs/README.md) - Documentation landing page
- - [Usage Guide](docs/USAGE.md) - File reading (including streaming), SQL, IO, logging
- - [Testing Guide](docs/TESTING.md) - How to run tests locally
- - [Test Suite](tests/README.md) - Testing documentation and coverage
+ Docs are bundled with the installed package:
+
+ ```bash
+ python -m datablade.docs --list
+ python -m datablade.docs --show USAGE
+ python -m datablade.docs --write-dir .\datablade-docs
+ ```
+
+ After writing docs to disk, open the markdown files locally:
+
+ - README (docs landing page)
+ - USAGE (file reading, streaming, SQL, IO, logging)
+ - TESTING (how to run tests locally)
+ - ARCHITECTURE (pipeline overview)
+ - OBJECT_REGISTRY (registry reference)

  ## Testing

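For the Blade facade referenced in the hunk above, a speculative sketch: only the `Blade` class and the `create_table_sql_from_parquet` method name appear in this diff, so the constructor and keyword arguments below are assumptions.

```python
# Speculative sketch -- constructor and keywords are assumptions; only the class
# and method name are taken from this diff.
from datablade import Blade

blade = Blade()  # assumed no-argument construction
ddl2 = blade.create_table_sql_from_parquet(
    "large_file.parquet",  # assumed: source Parquet path
    table_name="staging",  # assumed keyword
)
print(ddl2)
```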
@@ -241,7 +283,11 @@ pytest
  pytest --cov=datablade --cov-report=html
  ```

- See [tests/README.md](tests/README.md) for detailed testing documentation.
+ For detailed testing documentation, use the bundled TESTING doc:
+
+ ```bash
+ python -m datablade.docs --show TESTING
+ ```

  ## Backward Compatibility

@@ -269,6 +315,12 @@ from datablade.core.json import get
  - **Streaming vs materializing**:
    - Use `read_file_iter()` to process arbitrarily large files chunk-by-chunk.
    - `read_file_smart()` returns a single DataFrame and may still be memory-intensive.
+ - **Chunk concatenation**: the large-file pandas fallback in `read_file_smart()` can
+   temporarily spike memory usage during concat. Use `read_file_iter()` or
+   `return_type="iterator"` to avoid concatenation.
+ - **Polars materialization**: when returning a pandas DataFrame, Polars still
+   collects into memory; use `return_type="polars"` or `"polars_lazy"` to keep
+   Polars frames.
  - **Parquet support**:
    - Streaming reads support single `.parquet` files.
    - Parquet “dataset directories” (Hive partitions / directory-of-parquets) are not a primary target API.
@@ -276,6 +328,9 @@ from datablade.core.json import get
    - Uses the Parquet schema (PyArrow) without scanning data.
    - Complex/nested columns (struct/list/map/union) are dropped and logged as warnings.
  - **DDL scope**: `CREATE TABLE` generation is column/type oriented (no indexes/constraints).
+ - **SQL Server bulk load**: the SQL Server helpers use the `bcp` CLI and require it
+   to be installed and available on PATH. When using `-U`/`-P`, credentials are
+   passed via process args (logs are redacted); prefer `-T` or `-G` where possible.

  **Optional dependencies:**

src/datablade/__init__.py

@@ -12,24 +12,28 @@ For backward compatibility, all functions are also available from datablade.core

  # Also maintain core for backward compatibility
  # Import from new organized structure
- from . import core, dataframes, io, sql, utils
+ from . import core, dataframes, io, registry, sql, utils
  from .blade import Blade
  from .dataframes import read_file_chunked, read_file_smart, read_file_to_parquets
+ from .registry import DialectSpec, ObjectNode, ObjectRef, ObjectRegistry
  from .sql import Dialect, bulk_load, generate_create_table

  # Convenience re-exports for commonly used functions
  from .utils.logging import configure_logging, get_logger
+ from .utils.strings import configure_paths

- __version__ = "0.0.5"
+ __version__ = "0.0.6"

  __all__ = [
      "dataframes",
      "io",
      "utils",
      "sql",
+     "registry",
      "core",  # Maintain backward compatibility
      # Convenience re-exports
      "configure_logging",
+     "configure_paths",
      "get_logger",
      "read_file_smart",
      "read_file_chunked",
@@ -38,4 +42,8 @@ __all__ = [
      "generate_create_table",
      "bulk_load",
      "Blade",
+     "DialectSpec",
+     "ObjectRef",
+     "ObjectNode",
+     "ObjectRegistry",
  ]
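A quick smoke check of the 0.0.6 top-level surface shown in this hunk; it exercises only names that the `__init__.py` diff itself re-exports and makes no calls into the new registry API, whose methods are not shown here.

```python
import datablade
from datablade import (
    DialectSpec,
    ObjectNode,
    ObjectRef,
    ObjectRegistry,
    configure_paths,
)

assert datablade.__version__ == "0.0.6"
print(ObjectRegistry, ObjectRef, ObjectNode, DialectSpec, configure_paths)
```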