datablade 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +10 -2
- datablade/blade.py +174 -5
- datablade/dataframes/__init__.py +8 -0
- datablade/dataframes/frames.py +127 -27
- datablade/dataframes/readers.py +988 -161
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/json.py +45 -8
- datablade/io/zip.py +68 -30
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +25 -1
- datablade/sql/bulk_load.py +309 -49
- datablade/sql/ddl.py +201 -26
- datablade/sql/ddl_pyarrow.py +150 -26
- datablade/sql/dialects.py +2 -0
- datablade/sql/quoting.py +2 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +2 -1
- datablade/utils/lists.py +3 -0
- datablade/utils/logging.py +46 -1
- datablade/utils/strings.py +180 -17
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/METADATA +68 -13
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- datablade-0.0.5.dist-info/RECORD +0 -31
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/licenses/LICENSE +0 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# datablade architecture overview
|
|
2
|
+
|
|
3
|
+
This document explains how the `datablade` modules fit together and highlights the
|
|
4
|
+
main decision points in the data-reading and SQL DDL workflows.
|
|
5
|
+
|
|
6
|
+
## Package layout
|
|
7
|
+
|
|
8
|
+
`datablade` is organized into a few focused namespaces:
|
|
9
|
+
|
|
10
|
+
- `datablade.dataframes`: file readers, DataFrame cleanup, Parquet helpers
|
|
11
|
+
- `datablade.sql`: dialect-aware quoting, DDL generation, bulk loading
|
|
12
|
+
- `datablade.io`: HTTP JSON + ZIP helpers
|
|
13
|
+
- `datablade.utils`: logging, string/list helpers
|
|
14
|
+
- `datablade.core`: backward-compatible re-exports of the above modules
|
|
15
|
+
|
|
16
|
+
::: mermaid
|
|
17
|
+
flowchart TB
|
|
18
|
+
root[datablade] --> dataframes[datablade.dataframes]
|
|
19
|
+
root --> sql[datablade.sql]
|
|
20
|
+
root --> io[datablade.io]
|
|
21
|
+
root --> utils[datablade.utils]
|
|
22
|
+
root --> core[datablade.core - compat]
|
|
23
|
+
|
|
24
|
+
dataframes --> readers[readers.py<br/>read_file_smart/read_file_iter]
|
|
25
|
+
dataframes --> frames[frames.py<br/>cleaning + schema]
|
|
26
|
+
|
|
27
|
+
sql --> ddl[ddl.py<br/>DataFrame -> DDL]
|
|
28
|
+
sql --> ddl_arrow[ddl_pyarrow.py<br/>Parquet schema -> DDL]
|
|
29
|
+
sql --> bulk[bulk_load.py]
|
|
30
|
+
sql --> quoting[quoting.py]
|
|
31
|
+
|
|
32
|
+
io --> json_io[json.py]
|
|
33
|
+
io --> zip_io[zip.py]
|
|
34
|
+
|
|
35
|
+
utils --> logging_utils[logging.py]
|
|
36
|
+
utils --> strings_utils[strings.py]
|
|
37
|
+
utils --> lists_utils[lists.py]
|
|
38
|
+
:::
|
|
39
|
+
|
|
40
|
+
## Data reading pipeline (memory-aware)
|
|
41
|
+
|
|
42
|
+
`read_file_smart` (and its chunked/streaming helpers) chooses the loading strategy
|
|
43
|
+
based on the file type, the estimated memory footprint, and whether the optional Polars dependency is available.
|
|
44
|
+
|
|
45
|
+
::: mermaid
|
|
46
|
+
flowchart TD
|
|
47
|
+
start([read_file_smart]) --> exists{file exists?}
|
|
48
|
+
exists -- no --> err_missing[error]
|
|
49
|
+
exists -- yes --> est[estimate memory]
|
|
50
|
+
est --> fits{fits in memory?}
|
|
51
|
+
|
|
52
|
+
fits -- yes --> direct[direct pandas read<br/>csv/xlsx/json/parquet]
|
|
53
|
+
fits -- no --> polars{use_polars?}
|
|
54
|
+
|
|
55
|
+
polars -- yes --> polars_ok{polars available?}
|
|
56
|
+
polars_ok -- yes --> polars_read[polars scan + collect]
|
|
57
|
+
polars_ok -- no --> chunked
|
|
58
|
+
|
|
59
|
+
polars -- no --> chunked[chunked pandas read]
|
|
60
|
+
chunked --> concat[concat chunks]
|
|
61
|
+
|
|
62
|
+
direct --> done([DataFrame])
|
|
63
|
+
polars_read --> done
|
|
64
|
+
concat --> done
|
|
65
|
+
:::
|
|
66
|
+
|
|
67
|
+
## Parquet partitioning pipeline
|
|
68
|
+
|
|
69
|
+
Both `read_file_to_parquets` and `stream_to_parquets` chunk through a file, clean
|
|
70
|
+
columns, optionally coerce numeric values, and write Parquet partitions.
|
|
71
|
+
|
|
72
|
+
::: mermaid
|
|
73
|
+
flowchart LR
|
|
74
|
+
input[(input file)] --> reader[read_file_chunked/read_file_iter]
|
|
75
|
+
reader --> clean[clean_dataframe_columns]
|
|
76
|
+
clean --> cast{convert_types?}
|
|
77
|
+
cast -- yes --> numeric[try_cast_string_columns_to_numeric]
|
|
78
|
+
cast -- no --> write
|
|
79
|
+
numeric --> write[write parquet partition]
|
|
80
|
+
write --> more{more chunks?}
|
|
81
|
+
more -- yes --> reader
|
|
82
|
+
more -- no --> done([parquet files])
|
|
83
|
+
:::
|
|
84
|
+
|
|
85
|
+
## SQL DDL generation
|
|
86
|
+
|
|
87
|
+
DDL generation uses two entry points depending on source schema.
|
|
88
|
+
|
|
89
|
+
::: mermaid
|
|
90
|
+
flowchart TB
|
|
91
|
+
df[df: pandas DataFrame] --> ddl_df[ddl.generate_create_table]
|
|
92
|
+
parquet[parquet path] --> ddl_parquet[ddl_pyarrow.generate_create_table_from_parquet]
|
|
93
|
+
|
|
94
|
+
ddl_df --> qualify[_qualify_name + quote_identifier]
|
|
95
|
+
ddl_parquet --> qualify
|
|
96
|
+
qualify --> sql_stmt([CREATE TABLE statement])
|
|
97
|
+
:::
|
|
98
|
+
|
|
99
|
+
## Legacy `core` namespace
|
|
100
|
+
|
|
101
|
+
`datablade.core` mirrors the organized modules and re-exports them for
|
|
102
|
+
backward compatibility. New code should import from the newer modules.
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# Object Registry (draft)
|
|
2
|
+
|
|
3
|
+
This document defines a draft specification for an in-memory object registry that
|
|
4
|
+
provides SQL-like dot notation for database objects while keeping names and layout
|
|
5
|
+
externalized in configuration.
|
|
6
|
+
|
|
7
|
+
The registry is metadata-first. It does not connect to databases, perform I/O, or
|
|
8
|
+
decide how content is produced. It simply gives a stable, structured namespace for
|
|
9
|
+
object references and their associated content.
|
|
10
|
+
|
|
11
|
+
## Goals
|
|
12
|
+
|
|
13
|
+
- Provide dot-notation access to objects (catalog.schema.object.content).
|
|
14
|
+
- Externalize names and layout in a config file to reduce hard-coded strings.
|
|
15
|
+
- Allow host to be optional so connections can supply it at runtime.
|
|
16
|
+
- Make content developer-managed (lazy by default, no implied loading).
|
|
17
|
+
- Keep the model extensible across SQL dialects and non-SQL backends.
|
|
18
|
+
|
|
19
|
+
## Non-goals
|
|
20
|
+
|
|
21
|
+
- No database connections or live introspection.
|
|
22
|
+
- No migrations, transactional DDL, or ORM behavior.
|
|
23
|
+
- No enforcement of how content is loaded or computed.
|
|
24
|
+
|
|
25
|
+
## Core concepts
|
|
26
|
+
|
|
27
|
+
- ObjectRef: immutable reference to an object, with metadata and content.
|
|
28
|
+
- ObjectNode: namespace container supporting dot and key access.
|
|
29
|
+
- ObjectRegistry: root registry, config loading, and object iteration.
|
|
30
|
+
- DialectAdapter: formatter for qualified names based on dialect rules.
|
|
31
|
+
|
|
32
|
+
## Config schema (v1)
|
|
33
|
+
|
|
34
|
+
Top-level keys:
|
|
35
|
+
|
|
36
|
+
- version (int, required)
|
|
37
|
+
- defaults (object, optional)
|
|
38
|
+
- dialects (object, optional)
|
|
39
|
+
- catalogs (object, optional)
|
|
40
|
+
- hosts (object, optional)
|
|
41
|
+
- metadata (object, optional)
|
|
42
|
+
|
|
43
|
+
defaults:
|
|
44
|
+
|
|
45
|
+
- dialect (string, optional; default "sqlserver")
|
|
46
|
+
- host (string|null, optional)
|
|
47
|
+
- catalog (string|null, optional)
|
|
48
|
+
- schema (string|null, optional; default "dbo")
|
|
49
|
+
- object_type (string, optional; default "table")
|
|
50
|
+
- name_policy (string, optional; "preserve" | "lower" | "upper" | "normalize")
|
|
51
|
+
|
|
52
|
+
dialects:
|
|
53
|
+
|
|
54
|
+
- <dialect_name>:
|
|
55
|
+
- qualifier (string, required): dot-separated segments
|
|
56
|
+
- allowed: host, catalog, schema, object
|
|
57
|
+
- quote_style (string, optional):
|
|
58
|
+
- "sqlserver" | "postgres" | "mysql" | "duckdb" | "none"
|
|
59
|
+
|
|
60
|
+
hosts:
|
|
61
|
+
|
|
62
|
+
- <host_key>:
|
|
63
|
+
- host (string, optional)
|
|
64
|
+
- catalogs (map, required)
|
|
65
|
+
|
|
66
|
+
catalogs (hostless or under host):
|
|
67
|
+
|
|
68
|
+
- <catalog_key>:
|
|
69
|
+
- catalog (string, optional)
|
|
70
|
+
- schemas (map, optional)
|
|
71
|
+
- objects (map, optional) # allow catalog-level objects
|
|
72
|
+
|
|
73
|
+
schemas:
|
|
74
|
+
|
|
75
|
+
- <schema_key>:
|
|
76
|
+
- schema (string, optional)
|
|
77
|
+
- objects (map, required)
|
|
78
|
+
|
|
79
|
+
objects:
|
|
80
|
+
|
|
81
|
+
- <object_key>:
|
|
82
|
+
- name (string, optional; defaults to key)
|
|
83
|
+
- object_type (string, optional; defaults to defaults.object_type)
|
|
84
|
+
- aliases (list[string], optional)
|
|
85
|
+
- content (any, optional)
|
|
86
|
+
- tags (map[string,string], optional)
|
|
87
|
+
|
|
88
|
+
## Name policy and lookup
|
|
89
|
+
|
|
90
|
+
Keys are the stable identifiers used for dot-notation. The name policy determines
|
|
91
|
+
how keys and aliases are normalized for lookup.
|
|
92
|
+
|
|
93
|
+
- preserve: do not transform keys; lookups are case-sensitive.
|
|
94
|
+
- lower / upper: normalize keys and aliases to the chosen case.
|
|
95
|
+
- normalize: transform keys into safe identifiers:
|
|
96
|
+
- lower-case
|
|
97
|
+
- replace invalid characters with "_"
|
|
98
|
+
- collapse multiple "_"
|
|
99
|
+
- prefix "_" if the key starts with a digit
|
|
100
|
+
- keep the original key as an implicit alias
|
|
101
|
+
|
|
102
|
+
Lookup behavior:
|
|
103
|
+
|
|
104
|
+
- Dot access uses normalized keys.
|
|
105
|
+
- Bracket access checks raw key, then aliases, then normalized keys.
|
|
106
|
+
- Aliases are only for lookup; they do not change the stored name.
|
|
107
|
+
|
|
108
|
+
Collision rules:
|
|
109
|
+
|
|
110
|
+
- Two siblings that normalize to the same key are errors.
|
|
111
|
+
- An alias that collides with a sibling key or alias is an error in strict mode.
|
|
112
|
+
|
|
113
|
+
## Host optionality
|
|
114
|
+
|
|
115
|
+
Host is optional. If host is missing, qualified names use only the available
|
|
116
|
+
segments. If a runtime connection provides host/catalog/schema, it can override
|
|
117
|
+
missing fields.
|
|
118
|
+
|
|
119
|
+
## Content semantics
|
|
120
|
+
|
|
121
|
+
Content is developer-managed. The registry stores it but does not interpret it.
|
|
122
|
+
Content can be a DataFrame, SQL text, a lazy loader, or a computed result.
|
|
123
|
+
|
|
124
|
+
## Dialect qualification
|
|
125
|
+
|
|
126
|
+
Dialect qualification uses `dialects.<name>.qualifier` to decide which segments
|
|
127
|
+
to include and in what order. Missing segments are skipped.
|
|
128
|
+
|
|
129
|
+
Example qualifiers:
|
|
130
|
+
|
|
131
|
+
- sqlserver: catalog.schema.object
|
|
132
|
+
- postgres: schema.object
|
|
133
|
+
- nosql: collection
|
|
134
|
+
|
|
135
|
+
## Validation rules (draft)
|
|
136
|
+
|
|
137
|
+
Errors:
|
|
138
|
+
|
|
139
|
+
- version missing or not an integer
|
|
140
|
+
- neither catalogs nor hosts provided
|
|
141
|
+
- unknown fields in strict mode
|
|
142
|
+
- duplicate keys after normalization within a parent
|
|
143
|
+
- alias collisions with sibling keys or aliases
|
|
144
|
+
- dialect qualifier uses unknown segments
|
|
145
|
+
|
|
146
|
+
Warnings:
|
|
147
|
+
|
|
148
|
+
- an object's `name` differs from its key only by letter case while `name_policy` is `preserve`
|
|
149
|
+
- host provided under hostless catalogs root
|
|
150
|
+
- objects without content (informational)
|
|
151
|
+
|
|
152
|
+
## Example config
|
|
153
|
+
|
|
154
|
+
```yaml
|
|
155
|
+
version: 1
|
|
156
|
+
|
|
157
|
+
defaults:
|
|
158
|
+
dialect: sqlserver
|
|
159
|
+
schema: dbo
|
|
160
|
+
name_policy: normalize
|
|
161
|
+
|
|
162
|
+
dialects:
|
|
163
|
+
sqlserver:
|
|
164
|
+
qualifier: catalog.schema.object
|
|
165
|
+
postgres:
|
|
166
|
+
qualifier: schema.object
|
|
167
|
+
|
|
168
|
+
catalogs:
|
|
169
|
+
sales:
|
|
170
|
+
catalog: SalesDW
|
|
171
|
+
schemas:
|
|
172
|
+
reporting:
|
|
173
|
+
schema: rpt
|
|
174
|
+
objects:
|
|
175
|
+
orders:
|
|
176
|
+
name: Orders
|
|
177
|
+
object_type: table
|
|
178
|
+
aliases: [orders_current]
|
|
179
|
+
content: null
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Usage sketch
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
registry = ObjectRegistry.from_yaml("layout.yaml")
|
|
186
|
+
|
|
187
|
+
df = registry.catalogs.sales.reporting.orders.content
|
|
188
|
+
df2 = registry.catalogs.sales.reporting.orders_current.content
|
|
189
|
+
|
|
190
|
+
df3 = registry.catalogs["sales"].schemas["rpt"].objects["Order Details"].content
|
|
191
|
+
|
|
192
|
+
qualified = registry.catalogs.sales.reporting.orders.qualified(dialect="sqlserver")
|
|
193
|
+
```
|
|
194
|
+
|
datablade/docs/README.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# datablade Documentation
|
|
2
|
+
|
|
3
|
+
This folder contains the human-facing documentation for **datablade**.
|
|
4
|
+
|
|
5
|
+
## Quick start
|
|
6
|
+
|
|
7
|
+
Install:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install datablade
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Access docs after installation:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
python -m datablade.docs --list
|
|
17
|
+
python -m datablade.docs --show USAGE
|
|
18
|
+
python -m datablade.docs --write-dir ./datablade-docs
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Basic usage:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import pandas as pd
|
|
25
|
+
from datablade.dataframes import read_file_smart, clean_dataframe_columns
|
|
26
|
+
from datablade.sql import Dialect, generate_create_table, generate_create_table_from_parquet
|
|
27
|
+
|
|
28
|
+
df = read_file_smart("data.csv", verbose=True)
|
|
29
|
+
df = clean_dataframe_columns(df)
|
|
30
|
+
|
|
31
|
+
ddl = generate_create_table(df, table="my_table", dialect=Dialect.POSTGRES)
|
|
32
|
+
print(ddl)
|
|
33
|
+
|
|
34
|
+
# Or: generate DDL from a Parquet file schema without materializing rows
|
|
35
|
+
# Note: nested Parquet types (struct/list/map/union) are dropped with a warning.
|
|
36
|
+
ddl2 = generate_create_table_from_parquet(
|
|
37
|
+
"events.parquet",
|
|
38
|
+
table="events",
|
|
39
|
+
dialect=Dialect.POSTGRES,
|
|
40
|
+
)
|
|
41
|
+
print(ddl2)
|
|
42
|
+
|
|
43
|
+
# Optional schema_spec overrides (type/nullability/string sizing)
|
|
44
|
+
schema_spec = {
|
|
45
|
+
"columns": {"notes": {"sql_type": "text", "nullable": True}},
|
|
46
|
+
}
|
|
47
|
+
ddl3 = generate_create_table(df, table="my_table", dialect=Dialect.POSTGRES, schema_spec=schema_spec)
|
|
48
|
+
print(ddl3)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Most file path parameters accept `str` or `pathlib.Path`.
|
|
52
|
+
|
|
53
|
+
## Guides
|
|
54
|
+
|
|
55
|
+
- See [docs/USAGE.md](USAGE.md) for the main usage guide (file reading, streaming, SQL, IO, logging).
|
|
56
|
+
- See [docs/ARCHITECTURE.md](ARCHITECTURE.md) for an architecture overview with pipeline diagrams.
|
|
57
|
+
- See [docs/TESTING.md](TESTING.md) for running tests locally.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Testing
|
|
2
|
+
|
|
3
|
+
## Install test dependencies
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install -e ".[test]"
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Run tests
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pytest
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Coverage
|
|
16
|
+
|
|
17
|
+
If you have `pytest-cov` available:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pytest --cov=datablade --cov-report=term-missing
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
If `pytest-cov` is problematic in your environment, you can use `coverage` directly:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
coverage run -m pytest
|
|
27
|
+
coverage report -m
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Lint (optional, for contributors)
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install -e ".[dev]"
|
|
34
|
+
black .
|
|
35
|
+
isort .
|
|
36
|
+
flake8
|
|
37
|
+
```
|