datablade-0.0.0-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. datablade/__init__.py +49 -1
  2. datablade/blade.py +322 -0
  3. datablade/core/__init__.py +28 -7
  4. datablade/core/frames.py +23 -236
  5. datablade/core/json.py +5 -10
  6. datablade/core/lists.py +5 -10
  7. datablade/core/messages.py +23 -11
  8. datablade/core/strings.py +5 -43
  9. datablade/core/zip.py +5 -24
  10. datablade/dataframes/__init__.py +51 -0
  11. datablade/dataframes/frames.py +585 -0
  12. datablade/dataframes/readers.py +1367 -0
  13. datablade/docs/ARCHITECTURE.md +102 -0
  14. datablade/docs/OBJECT_REGISTRY.md +194 -0
  15. datablade/docs/README.md +57 -0
  16. datablade/docs/TESTING.md +37 -0
  17. datablade/docs/USAGE.md +409 -0
  18. datablade/docs/__init__.py +87 -0
  19. datablade/docs/__main__.py +6 -0
  20. datablade/io/__init__.py +15 -0
  21. datablade/io/json.py +70 -0
  22. datablade/io/zip.py +111 -0
  23. datablade/registry.py +581 -0
  24. datablade/sql/__init__.py +56 -0
  25. datablade/sql/bulk_load.py +665 -0
  26. datablade/sql/ddl.py +402 -0
  27. datablade/sql/ddl_pyarrow.py +411 -0
  28. datablade/sql/dialects.py +12 -0
  29. datablade/sql/quoting.py +44 -0
  30. datablade/sql/schema_spec.py +65 -0
  31. datablade/sql/sqlserver.py +390 -0
  32. datablade/utils/__init__.py +38 -0
  33. datablade/utils/lists.py +32 -0
  34. datablade/utils/logging.py +204 -0
  35. datablade/utils/messages.py +29 -0
  36. datablade/utils/strings.py +249 -0
  37. datablade-0.0.6.dist-info/METADATA +406 -0
  38. datablade-0.0.6.dist-info/RECORD +41 -0
  39. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
  40. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
  41. datablade-0.0.0.dist-info/METADATA +0 -13
  42. datablade-0.0.0.dist-info/RECORD +0 -13
  43. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/docs/ARCHITECTURE.md
@@ -0,0 +1,102 @@
# datablade architecture overview

This document explains how the `datablade` modules fit together and highlights the
main decision points in the data-reading and SQL DDL workflows.

## Package layout

`datablade` is organized into a few focused namespaces:

- `datablade.dataframes`: file readers, DataFrame cleanup, Parquet helpers
- `datablade.sql`: dialect-aware quoting, DDL generation, bulk loading
- `datablade.io`: HTTP JSON + ZIP helpers
- `datablade.utils`: logging, string/list helpers
- `datablade.core`: backward-compatible re-exports of the above modules

::: mermaid
flowchart TB
    root[datablade] --> dataframes[datablade.dataframes]
    root --> sql[datablade.sql]
    root --> io[datablade.io]
    root --> utils[datablade.utils]
    root --> core[datablade.core - compat]

    dataframes --> readers[readers.py<br/>read_file_smart/read_file_iter]
    dataframes --> frames[frames.py<br/>cleaning + schema]

    sql --> ddl[ddl.py<br/>DataFrame -> DDL]
    sql --> ddl_arrow[ddl_pyarrow.py<br/>Parquet schema -> DDL]
    sql --> bulk[bulk_load.py]
    sql --> quoting[quoting.py]

    io --> json_io[json.py]
    io --> zip_io[zip.py]

    utils --> logging_utils[logging.py]
    utils --> strings_utils[strings.py]
    utils --> lists_utils[lists.py]
:::

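For orientation, the sketch below imports one name from each namespace in the diagram. The `dataframes` and `sql` symbols are the ones used in the README quick start; the `io` and `utils` imports are module-level, since this document does not list specific functions there.

```python
# Orientation sketch only: one import per namespace in the diagram above.
from datablade.dataframes import read_file_smart, clean_dataframe_columns
from datablade.sql import Dialect, generate_create_table
from datablade.io import json as db_json, zip as db_zip      # HTTP JSON + ZIP helpers
from datablade.utils import logging as db_logging, strings   # logging and string helpers
```
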
## Data reading pipeline (memory-aware)

`read_file_smart` (and its chunked/streaming helpers) chooses the loading strategy
based on file type, estimated memory, and optional Polars availability.

::: mermaid
flowchart TD
    start([read_file_smart]) --> exists{file exists?}
    exists -- no --> err_missing[error]
    exists -- yes --> est[estimate memory]
    est --> fits{fits in memory?}

    fits -- yes --> direct[direct pandas read<br/>csv/xlsx/json/parquet]
    fits -- no --> polars{use_polars?}

    polars -- yes --> polars_ok{polars available?}
    polars_ok -- yes --> polars_read[polars scan + collect]
    polars_ok -- no --> chunked

    polars -- no --> chunked[chunked pandas read]
    chunked --> concat[concat chunks]

    direct --> done([DataFrame])
    polars_read --> done
    concat --> done
:::

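The decision flow above can be restated as a small dispatcher. The sketch below is illustrative rather than datablade's actual implementation: the `use_polars` flag and the memory check come from the diagram, while the memory-budget heuristic, CSV-only handling, and chunk size are assumptions.

```python
from pathlib import Path

import pandas as pd


def read_file_smart_sketch(path, use_polars=False, chunksize=100_000,
                           memory_budget_bytes=1_000_000_000):
    """Illustrative restatement of the flowchart above; not datablade's real reader."""
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)

    # Rough heuristic: assume the in-memory frame is a few times the on-disk size.
    estimated_bytes = path.stat().st_size * 3

    if estimated_bytes <= memory_budget_bytes:
        return pd.read_csv(path)  # direct read (CSV shown; xlsx/json/parquet are analogous)

    if use_polars:
        try:
            import polars as pl
            return pl.scan_csv(path).collect().to_pandas()  # polars scan + collect
        except ImportError:
            pass  # polars not installed: fall back to chunked pandas

    chunks = pd.read_csv(path, chunksize=chunksize)  # chunked pandas read
    return pd.concat(chunks, ignore_index=True)      # concat chunks
```
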
## Parquet partitioning pipeline

Both `read_file_to_parquets` and `stream_to_parquets` chunk through a file, clean
columns, optionally coerce numeric values, and write Parquet partitions.

::: mermaid
flowchart LR
    input[(input file)] --> reader[read_file_chunked/read_file_iter]
    reader --> clean[clean_dataframe_columns]
    clean --> cast{convert_types?}
    cast -- yes --> numeric[try_cast_string_columns_to_numeric]
    cast -- no --> write
    numeric --> write[write parquet partition]
    write --> more{more chunks?}
    more -- yes --> reader
    more -- no --> done([parquet files])
:::

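A generic restatement of that loop, using plain pandas rather than the datablade helpers named in the diagram; the column-cleaning and numeric-coercion steps below only approximate `clean_dataframe_columns` and `try_cast_string_columns_to_numeric`, and writing Parquet this way requires `pyarrow` or `fastparquet`.

```python
from pathlib import Path

import pandas as pd


def stream_csv_to_parquets(path, out_dir, chunksize=250_000, convert_types=True):
    """Generic restatement of the partitioning loop above; not datablade's implementation."""
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for i, chunk in enumerate(pd.read_csv(path, chunksize=chunksize)):
        # Roughly what a column-cleaning step does: trim and snake_case the headers.
        chunk.columns = [str(c).strip().lower().replace(" ", "_") for c in chunk.columns]

        if convert_types:
            # Cast string columns to numeric only when every non-null value parses.
            for col in chunk.select_dtypes(include="object").columns:
                converted = pd.to_numeric(chunk[col], errors="coerce")
                if converted.notna().sum() == chunk[col].notna().sum():
                    chunk[col] = converted

        # Write one Parquet partition per chunk.
        chunk.to_parquet(out_dir / f"part_{i:05d}.parquet", index=False)
```
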
## SQL DDL generation

DDL generation has two entry points, depending on whether the schema comes from an
in-memory DataFrame or a Parquet file.

::: mermaid
flowchart TB
    df[df: pandas DataFrame] --> ddl_df[ddl.generate_create_table]
    parquet[parquet path] --> ddl_parquet[ddl_pyarrow.generate_create_table_from_parquet]

    ddl_df --> qualify[_qualify_name + quote_identifier]
    ddl_parquet --> qualify
    qualify --> sql_stmt([CREATE TABLE statement])
:::

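Both entry points produce the same kind of statement. The calls below follow the README quick start; the `items.parquet` path is just a placeholder.

```python
import pandas as pd

from datablade.sql import Dialect, generate_create_table, generate_create_table_from_parquet

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# DataFrame entry point (ddl.py)
ddl_from_df = generate_create_table(df, table="items", dialect=Dialect.POSTGRES)

# Parquet entry point (ddl_pyarrow.py): reads only the file's schema, not its rows
ddl_from_parquet = generate_create_table_from_parquet(
    "items.parquet", table="items", dialect=Dialect.POSTGRES
)
```
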
## Legacy `core` namespace

`datablade.core` mirrors the reorganized namespaces above and re-exports them for
backward compatibility. New code should import from the newer modules.
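
As a hypothetical before/after (both module paths appear in the file list above; `clean_dataframe_columns` is the symbol used in the README quick start):

```python
# Older, still-supported import through the compatibility namespace
from datablade.core import frames

# Preferred import from the reorganized namespaces
from datablade.dataframes import clean_dataframe_columns
```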
datablade/docs/OBJECT_REGISTRY.md
@@ -0,0 +1,194 @@
# Object Registry (draft)

This document defines a draft specification for an in-memory object registry that
provides SQL-like dot notation for database objects while keeping names and layout
externalized in configuration.

The registry is metadata-first. It does not connect to databases, perform I/O, or
decide how content is produced. It simply gives a stable, structured namespace for
object references and their associated content.

## Goals

- Provide dot-notation access to objects (catalog.schema.object.content).
- Externalize names and layout in a config file to reduce hard-coded strings.
- Allow host to be optional so connections can supply it at runtime.
- Make content developer-managed (lazy by default, no implied loading).
- Keep the model extensible across SQL dialects and non-SQL backends.

## Non-goals

- No database connections or live introspection.
- No migrations, transactional DDL, or ORM behavior.
- No enforcement of how content is loaded or computed.

## Core concepts

- ObjectRef: immutable reference to an object, with metadata and content (a possible shape is sketched after this list).
- ObjectNode: namespace container supporting dot and key access.
- ObjectRegistry: root registry, config loading, and object iteration.
- DialectAdapter: formatter for qualified names based on dialect rules.

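Since this is a draft, the class below is only one possible shape for `ObjectRef`; the field names simply mirror the config schema in the next section.

```python
from dataclasses import dataclass, field
from typing import Any, Optional


@dataclass(frozen=True)
class ObjectRef:
    """One possible shape for the draft ObjectRef: an immutable reference plus content."""
    key: str
    name: str
    object_type: str = "table"
    host: Optional[str] = None
    catalog: Optional[str] = None
    schema: Optional[str] = None
    aliases: tuple[str, ...] = ()
    tags: dict[str, str] = field(default_factory=dict)
    content: Any = None  # developer-managed; the registry does not interpret it
```
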
## Config schema (v1)

Top-level keys:

- version (int, required)
- defaults (object, optional)
- dialects (object, optional)
- catalogs (object, optional)
- hosts (object, optional)
- metadata (object, optional)

defaults:

- dialect (string, optional; default "sqlserver")
- host (string|null, optional)
- catalog (string|null, optional)
- schema (string|null, optional; default "dbo")
- object_type (string, optional; default "table")
- name_policy (string, optional; "preserve" | "lower" | "upper" | "normalize")

dialects:

- <dialect_name>:
  - qualifier (string, required): dot-separated segments
    - allowed: host, catalog, schema, object
  - quote_style (string, optional):
    - "sqlserver" | "postgres" | "mysql" | "duckdb" | "none"

hosts:

- <host_key>:
  - host (string, optional)
  - catalogs (map, required)

catalogs (hostless or under host):

- <catalog_key>:
  - catalog (string, optional)
  - schemas (map, optional)
  - objects (map, optional)  # allow catalog-level objects

schemas:

- <schema_key>:
  - schema (string, optional)
  - objects (map, required)

objects:

- <object_key>:
  - name (string, optional; defaults to key)
  - object_type (string, optional; defaults to defaults.object_type)
  - aliases (list[string], optional)
  - content (any, optional)
  - tags (map[string,string], optional)

## Name policy and lookup

Keys are the stable identifiers used for dot-notation. The name policy determines
how keys and aliases are normalized for lookup; a sketch of the `normalize` policy
follows this list.

- preserve: do not transform keys; lookups are case-sensitive.
- lower / upper: normalize keys and aliases to the chosen case.
- normalize: transform keys into safe identifiers:
  - lower-case
  - replace invalid characters with "_"
  - collapse multiple "_"
  - prefix "_" if the key starts with a digit
  - keep the original key as an implicit alias

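A minimal sketch of the `normalize` policy described above; the exact character class treated as "invalid" is an assumption, since the draft only states the bullets.

```python
import re


def normalize_key(key: str) -> str:
    """Normalize a registry key per the draft rules above."""
    normalized = key.lower()
    normalized = re.sub(r"[^a-z0-9_]", "_", normalized)  # replace invalid characters with "_"
    normalized = re.sub(r"_+", "_", normalized)           # collapse multiple "_"
    if normalized and normalized[0].isdigit():
        normalized = "_" + normalized                      # prefix "_" if it starts with a digit
    return normalized


assert normalize_key("Order Details") == "order_details"
assert normalize_key("2024 Sales") == "_2024_sales"
```
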
Lookup behavior:

- Dot access uses normalized keys.
- Bracket access checks raw key, then aliases, then normalized keys.
- Aliases are only for lookup; they do not change the stored name.

Collision rules:

- Two siblings that normalize to the same key are errors.
- An alias that collides with a sibling key or alias is an error in strict mode.

## Host optionality

Host is optional. If host is missing, qualified names use only the available
segments. If a runtime connection provides host/catalog/schema, it can override
missing fields.

## Content semantics

Content is developer-managed. The registry stores it but does not interpret it.
Content can be a DataFrame, SQL text, a lazy loader, or a computed result.

## Dialect qualification

Dialect qualification uses `dialects.<name>.qualifier` to decide which segments
to include and in what order. Missing segments are skipped (see the sketch after
the examples).

Example qualifiers:

- sqlserver: catalog.schema.object
- postgres: schema.object
- nosql: collection

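A minimal sketch of qualification along these lines; the function name and the quoting table are illustrative, and only a few quote styles are shown.

```python
def qualified_name(segments: dict, qualifier: str, quote_style: str = "none") -> str:
    """Join the configured segments in qualifier order, skipping any that are missing."""
    quote = {
        "sqlserver": lambda s: f"[{s}]",
        "postgres": lambda s: f'"{s}"',
        "mysql": lambda s: f"`{s}`",
        "none": lambda s: s,
    }[quote_style]
    parts = [segments[seg] for seg in qualifier.split(".") if segments.get(seg)]
    return ".".join(quote(p) for p in parts)


# Example: SQL Server qualifier with no host configured
print(qualified_name(
    {"catalog": "SalesDW", "schema": "rpt", "object": "Orders"},
    qualifier="catalog.schema.object",
    quote_style="sqlserver",
))  # -> [SalesDW].[rpt].[Orders]
```
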
## Validation rules (draft)

Errors:

- version missing or not an integer
- neither catalogs nor hosts provided
- unknown fields in strict mode
- duplicate keys after normalization within a parent
- alias collisions with sibling keys or aliases
- dialect qualifier uses unknown segments

Warnings:

- name differs only by case from key under preserve
- host provided under hostless catalogs root
- objects without content (informational)

## Example config

```yaml
version: 1

defaults:
  dialect: sqlserver
  schema: dbo
  name_policy: normalize

dialects:
  sqlserver:
    qualifier: catalog.schema.object
  postgres:
    qualifier: schema.object

catalogs:
  sales:
    catalog: SalesDW
    schemas:
      reporting:
        schema: rpt
        objects:
          orders:
            name: Orders
            object_type: table
            aliases: [orders_current]
            content: null
```

## Usage sketch

```python
registry = ObjectRegistry.from_yaml("layout.yaml")

df = registry.catalogs.sales.reporting.orders.content
df2 = registry.catalogs.sales.reporting.orders_current.content

df3 = registry.catalogs["sales"].schemas["rpt"].objects["Order Details"].content

qualified = registry.catalogs.sales.reporting.orders.qualified(dialect="sqlserver")
```

datablade/docs/README.md
@@ -0,0 +1,57 @@
# datablade Documentation

This folder contains the human-facing documentation for **datablade**.

## Quick start

Install:

```bash
pip install datablade
```

Access docs after installation:

```bash
python -m datablade.docs --list
python -m datablade.docs --show USAGE
python -m datablade.docs --write-dir .\\datablade-docs
```

Basic usage:

```python
import pandas as pd
from datablade.dataframes import read_file_smart, clean_dataframe_columns
from datablade.sql import Dialect, generate_create_table, generate_create_table_from_parquet

df = read_file_smart("data.csv", verbose=True)
df = clean_dataframe_columns(df)

ddl = generate_create_table(df, table="my_table", dialect=Dialect.POSTGRES)
print(ddl)

# Or: generate DDL from a Parquet file schema without materializing rows
# Note: nested Parquet types (struct/list/map/union) are dropped with a warning.
ddl2 = generate_create_table_from_parquet(
    "events.parquet",
    table="events",
    dialect=Dialect.POSTGRES,
)
print(ddl2)

# Optional schema_spec overrides (type/nullability/string sizing)
schema_spec = {
    "columns": {"notes": {"sql_type": "text", "nullable": True}},
}
ddl3 = generate_create_table(df, table="my_table", dialect=Dialect.POSTGRES, schema_spec=schema_spec)
print(ddl3)
```

Most file path parameters accept `str` or `pathlib.Path`.

## Guides

- See [docs/USAGE.md](USAGE.md) for the main usage guide (file reading, streaming, SQL, IO, logging).
- See [docs/ARCHITECTURE.md](ARCHITECTURE.md) for an architecture overview with pipeline diagrams.
- See [docs/TESTING.md](TESTING.md) for running tests locally.
datablade/docs/TESTING.md
@@ -0,0 +1,37 @@
# Testing

## Install test dependencies

```bash
pip install -e ".[test]"
```

## Run tests

```bash
pytest
```

## Coverage

If you have `pytest-cov` available:

```bash
pytest --cov=datablade --cov-report=term-missing
```

If `pytest-cov` is problematic in your environment, you can use `coverage` directly:

```bash
coverage run -m pytest
coverage report -m
```

## Lint (optional, for contributors)

```bash
pip install -e ".[dev]"
black .
isort .
flake8
```