datablade-0.0.0-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. datablade/__init__.py +49 -1
  2. datablade/blade.py +322 -0
  3. datablade/core/__init__.py +28 -7
  4. datablade/core/frames.py +23 -236
  5. datablade/core/json.py +5 -10
  6. datablade/core/lists.py +5 -10
  7. datablade/core/messages.py +23 -11
  8. datablade/core/strings.py +5 -43
  9. datablade/core/zip.py +5 -24
  10. datablade/dataframes/__init__.py +51 -0
  11. datablade/dataframes/frames.py +585 -0
  12. datablade/dataframes/readers.py +1367 -0
  13. datablade/docs/ARCHITECTURE.md +102 -0
  14. datablade/docs/OBJECT_REGISTRY.md +194 -0
  15. datablade/docs/README.md +57 -0
  16. datablade/docs/TESTING.md +37 -0
  17. datablade/docs/USAGE.md +409 -0
  18. datablade/docs/__init__.py +87 -0
  19. datablade/docs/__main__.py +6 -0
  20. datablade/io/__init__.py +15 -0
  21. datablade/io/json.py +70 -0
  22. datablade/io/zip.py +111 -0
  23. datablade/registry.py +581 -0
  24. datablade/sql/__init__.py +56 -0
  25. datablade/sql/bulk_load.py +665 -0
  26. datablade/sql/ddl.py +402 -0
  27. datablade/sql/ddl_pyarrow.py +411 -0
  28. datablade/sql/dialects.py +12 -0
  29. datablade/sql/quoting.py +44 -0
  30. datablade/sql/schema_spec.py +65 -0
  31. datablade/sql/sqlserver.py +390 -0
  32. datablade/utils/__init__.py +38 -0
  33. datablade/utils/lists.py +32 -0
  34. datablade/utils/logging.py +204 -0
  35. datablade/utils/messages.py +29 -0
  36. datablade/utils/strings.py +249 -0
  37. datablade-0.0.6.dist-info/METADATA +406 -0
  38. datablade-0.0.6.dist-info/RECORD +41 -0
  39. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
  40. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
  41. datablade-0.0.0.dist-info/METADATA +0 -13
  42. datablade-0.0.0.dist-info/RECORD +0 -13
  43. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/sql/ddl_pyarrow.py
@@ -0,0 +1,411 @@
+ """Parquet schema-driven DDL generation using PyArrow."""
+
+ from __future__ import annotations
+
+ import logging
+ import pathlib
+ from dataclasses import dataclass
+ from typing import Any, List, Mapping, Optional, Union
+
+ from ..utils.messages import print_verbose
+ from ..utils.strings import coerce_path
+ from .ddl import _qualify_name
+ from .dialects import Dialect
+ from .quoting import quote_identifier
+ from .schema_spec import resolve_column_spec
+
+ logger = logging.getLogger("datablade")
+
+
+ @dataclass(frozen=True)
+ class DroppedColumn:
+     """Metadata about a dropped column during Parquet DDL generation."""
+
+     name: str
+     arrow_type: str
+     reason: str
+
+
+ @dataclass(frozen=True)
+ class FallbackColumn:
+     """Metadata about a column handled via JSON fallback."""
+
+     name: str
+     arrow_type: str
+     sql_type: str
+
+
+ @dataclass(frozen=True)
+ class ParquetDDLMetadata:
+     """Details about columns dropped or handled via fallback."""
+
+     dropped_columns: List[DroppedColumn]
+     fallback_columns: List[FallbackColumn]
+
+
+ def _require_pyarrow():
+     """Import pyarrow lazily to keep core dependencies light."""
+     try:
+         import pyarrow as pa  # type: ignore
+         import pyarrow.parquet as pq  # type: ignore
+     except ImportError as exc:  # pragma: no cover
+         raise ImportError(
+             "Parquet DDL generation requires 'pyarrow'. Install with: pip install pyarrow"
+         ) from exc
+
+     return pa, pq
+
+
+ def _is_complex_arrow_type(data_type) -> bool:
+     pa, _ = _require_pyarrow()
+     return (
+         pa.types.is_struct(data_type)
+         or pa.types.is_list(data_type)
+         or pa.types.is_large_list(data_type)
+         or pa.types.is_fixed_size_list(data_type)
+         or pa.types.is_map(data_type)
+         or pa.types.is_union(data_type)
+     )
+
+
+ def _json_fallback_sql_type(dialect: Dialect) -> str:
+     if dialect == Dialect.SQLSERVER:
+         return "nvarchar(max)"
+     if dialect == Dialect.POSTGRES:
+         return "text"
+     if dialect == Dialect.MYSQL:
+         return "TEXT"
+     if dialect == Dialect.DUCKDB:
+         return "VARCHAR"
+     raise NotImplementedError(f"Dialect not supported: {dialect}")
+
+
+ def _sql_type_from_arrow(data_type, dialect: Dialect) -> Optional[str]:  # noqa: C901
+     """Map a pyarrow.DataType to a SQL type string.
+
+     Returns None when there is no clean mapping and the caller should drop the column.
+     """
+
+     pa, _ = _require_pyarrow()
+
+     # Dictionary-encoded columns behave like their value type for DDL purposes.
+     if pa.types.is_dictionary(data_type):
+         return _sql_type_from_arrow(data_type.value_type, dialect)
+
+     # Nested/complex types: no clean general mapping across dialects.
+     if (
+         pa.types.is_struct(data_type)
+         or pa.types.is_list(data_type)
+         or pa.types.is_large_list(data_type)
+         or pa.types.is_fixed_size_list(data_type)
+         or pa.types.is_map(data_type)
+         or pa.types.is_union(data_type)
+     ):
+         return None
+
+     if dialect == Dialect.SQLSERVER:
+         if pa.types.is_boolean(data_type):
+             return "bit"
+         if pa.types.is_int8(data_type) or pa.types.is_int16(data_type):
+             return "smallint"
+         if pa.types.is_int32(data_type):
+             return "int"
+         if pa.types.is_int64(data_type):
+             return "bigint"
+         if pa.types.is_uint8(data_type) or pa.types.is_uint16(data_type):
+             return "int"
+         if pa.types.is_uint32(data_type):
+             return "bigint"
+         if pa.types.is_uint64(data_type):
+             return "decimal(20, 0)"
+         if pa.types.is_float16(data_type) or pa.types.is_float32(data_type):
+             return "real"
+         if pa.types.is_float64(data_type):
+             return "float"
+         if pa.types.is_decimal(data_type):
+             precision = min(int(data_type.precision), 38)
+             scale = int(data_type.scale)
+             return f"decimal({precision}, {scale})"
+         if pa.types.is_date(data_type):
+             return "date"
+         if pa.types.is_time(data_type):
+             return "time"
+         if pa.types.is_timestamp(data_type):
+             # SQL Server has datetimeoffset for tz-aware values.
+             return "datetimeoffset" if data_type.tz is not None else "datetime2"
+         if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
+             return "varbinary(max)"
+         if pa.types.is_fixed_size_binary(data_type):
+             return (
+                 f"varbinary({int(data_type.byte_width)})"
+                 if int(data_type.byte_width) <= 8000
+                 else "varbinary(max)"
+             )
+         if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
+             return "nvarchar(max)"
+
+         # Anything else (including null) is not reliably representable.
+         return None
+
+     if dialect == Dialect.POSTGRES:
+         if pa.types.is_boolean(data_type):
+             return "boolean"
+         if pa.types.is_int8(data_type) or pa.types.is_int16(data_type):
+             return "smallint"
+         if pa.types.is_int32(data_type):
+             return "integer"
+         if pa.types.is_int64(data_type):
+             return "bigint"
+         if pa.types.is_unsigned_integer(data_type):
+             # Postgres has no unsigned ints; use a wider signed or numeric.
+             if pa.types.is_uint8(data_type) or pa.types.is_uint16(data_type):
+                 return "integer"
+             if pa.types.is_uint32(data_type):
+                 return "bigint"
+             if pa.types.is_uint64(data_type):
+                 return "numeric(20, 0)"
+         if pa.types.is_float16(data_type) or pa.types.is_float32(data_type):
+             return "real"
+         if pa.types.is_float64(data_type):
+             return "double precision"
+         if pa.types.is_decimal(data_type):
+             precision = int(data_type.precision)
+             scale = int(data_type.scale)
+             return f"numeric({precision}, {scale})"
+         if pa.types.is_date(data_type):
+             return "date"
+         if pa.types.is_time(data_type):
+             return "time"
+         if pa.types.is_timestamp(data_type):
+             return "timestamptz" if data_type.tz is not None else "timestamp"
+         if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
+             return "bytea"
+         if pa.types.is_fixed_size_binary(data_type):
+             return "bytea"
+         if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
+             return "text"
+
+         return None
+
+     if dialect == Dialect.MYSQL:
+         if pa.types.is_boolean(data_type):
+             return "TINYINT(1)"
+         if pa.types.is_int8(data_type) or pa.types.is_int16(data_type):
+             return "SMALLINT"
+         if pa.types.is_int32(data_type):
+             return "INT"
+         if pa.types.is_int64(data_type):
+             return "BIGINT"
+         if pa.types.is_unsigned_integer(data_type):
+             # MySQL supports UNSIGNED, but we keep mappings consistent with the existing
+             # pandas-based DDL generator (signed types).
+             if pa.types.is_uint8(data_type) or pa.types.is_uint16(data_type):
+                 return "INT"
+             if pa.types.is_uint32(data_type):
+                 return "BIGINT"
+             if pa.types.is_uint64(data_type):
+                 return "DECIMAL(20, 0)"
+         if pa.types.is_float16(data_type) or pa.types.is_float32(data_type):
+             return "FLOAT"
+         if pa.types.is_float64(data_type):
+             return "DOUBLE"
+         if pa.types.is_decimal(data_type):
+             precision = min(int(data_type.precision), 65)
+             scale = min(int(data_type.scale), 30, precision)
+             return f"DECIMAL({precision}, {scale})"
+         if pa.types.is_date(data_type):
+             return "DATE"
+         if pa.types.is_time(data_type):
+             return "TIME"
+         if pa.types.is_timestamp(data_type):
+             return "DATETIME"
+         if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
+             return "LONGBLOB"
+         if pa.types.is_fixed_size_binary(data_type):
+             width = int(data_type.byte_width)
+             return f"VARBINARY({width})" if width <= 65535 else "LONGBLOB"
+         if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
+             return "TEXT"
+
+         return None
+
+     if dialect == Dialect.DUCKDB:
+         if pa.types.is_boolean(data_type):
+             return "BOOLEAN"
+         if pa.types.is_signed_integer(data_type):
+             return "BIGINT"
+         if pa.types.is_unsigned_integer(data_type):
+             return "UBIGINT"
+         if pa.types.is_floating(data_type):
+             return "DOUBLE"
+         if pa.types.is_decimal(data_type):
+             precision = int(data_type.precision)
+             scale = int(data_type.scale)
+             return f"DECIMAL({precision}, {scale})"
+         if pa.types.is_date(data_type):
+             return "DATE"
+         if pa.types.is_time(data_type):
+             return "TIME"
+         if pa.types.is_timestamp(data_type):
+             return "TIMESTAMPTZ" if data_type.tz is not None else "TIMESTAMP"
+         if pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type):
+             return "BLOB"
+         if pa.types.is_fixed_size_binary(data_type):
+             return "BLOB"
+         if pa.types.is_string(data_type) or pa.types.is_large_string(data_type):
+             return "VARCHAR"
+
+         return None
+
+     raise NotImplementedError(f"Dialect not supported: {dialect}")
+
+
+ def generate_create_table_from_parquet(
+     parquet_path: str | pathlib.Path,
+     catalog: Optional[str] = None,
+     schema: Optional[str] = None,
+     table: str = "table",
+     drop_existing: bool = True,
+     dialect: Dialect = Dialect.SQLSERVER,
+     use_go: bool = False,
+     schema_spec: Optional[Mapping[str, Any]] = None,
+     verbose: bool = False,
+     fallback_to_json: bool = False,
+     return_metadata: bool = False,
+ ) -> Union[str, tuple[str, ParquetDDLMetadata]]:
+     """Generate a CREATE TABLE statement from a Parquet file schema.
+
+     This reads the Parquet schema only (via PyArrow) and does not materialize data.
+
+     Columns whose Parquet types have no clean mapping for the chosen dialect are
+     dropped, and a warning is logged under logger name 'datablade'. If
+     fallback_to_json is enabled, complex types are instead mapped to a text
+     column intended to store JSON-encoded values. Use return_metadata to receive
+     details about dropped and fallback-mapped columns.
+
+     When dialect is SQL Server and use_go is True, a GO batch separator is
+     inserted after a USE statement when a catalog is provided.
+
+     schema_spec may provide per-column sql_type/nullable overrides.
+     """
+
+     path_obj = coerce_path(
+         parquet_path,
+         must_exist=True,
+         verbose=verbose,
+         label="parquet_path",
+     )
+     if not isinstance(table, str) or not table.strip():
+         raise ValueError("table must be a non-empty string")
+     if catalog is not None and (not isinstance(catalog, str) or not catalog.strip()):
+         raise ValueError("catalog, if provided, must be a non-empty string")
+     if schema is not None and (not isinstance(schema, str) or not schema.strip()):
+         raise ValueError("schema, if provided, must be a non-empty string")
+     if not isinstance(use_go, bool):
+         raise TypeError("use_go must be a boolean")
+
+     _, pq = _require_pyarrow()
+
+     # Read Parquet metadata only; this does not load row data.
+     arrow_schema = pq.ParquetFile(path_obj).schema_arrow
+
+     qualified_name = _qualify_name(catalog, schema, table, dialect)
+     lines: List[str] = []
+     dropped_columns: List[DroppedColumn] = []
+     fallback_columns: List[FallbackColumn] = []
+
+     for field in arrow_schema:
+         column_name = str(field.name)
+         defaults, column_spec = resolve_column_spec(column_name, schema_spec)
+         sql_type_override = column_spec.get("sql_type")
+         if sql_type_override is not None:
+             if not isinstance(sql_type_override, str) or not sql_type_override.strip():
+                 raise ValueError(
+                     f"schema_spec.columns['{column_name}'].sql_type must be a non-empty string"
+                 )
+             sql_type = sql_type_override.strip()
+         else:
+             sql_type = _sql_type_from_arrow(field.type, dialect)
+
+         if sql_type is None:
+             if fallback_to_json and _is_complex_arrow_type(field.type):
+                 fallback_sql_type = _json_fallback_sql_type(dialect)
+                 fallback_columns.append(
+                     FallbackColumn(
+                         name=str(field.name),
+                         arrow_type=str(field.type),
+                         sql_type=fallback_sql_type,
+                     )
+                 )
+                 sql_type = fallback_sql_type
+             else:
+                 dropped_columns.append(
+                     DroppedColumn(
+                         name=str(field.name),
+                         arrow_type=str(field.type),
+                         reason="unsupported type",
+                     )
+                 )
+                 logger.warning(
+                     "Dropping Parquet column %r (type=%s) for dialect=%s: unsupported type",
+                     field.name,
+                     str(field.type),
+                     dialect.value,
+                 )
+                 continue
+
+         nullable = field.nullable
+         for label, value in (
+             ("nullable", column_spec.get("nullable")),
+             ("allow_null", column_spec.get("allow_null")),
+             ("defaults.nullable", defaults.get("nullable")),
+             ("defaults.allow_null", defaults.get("allow_null")),
+         ):
+             if value is None:
+                 continue
+             if not isinstance(value, bool):
+                 raise TypeError(f"{label} must be a boolean")
+             nullable = value
+             break
+
+         null_str = "NULL" if nullable else "NOT NULL"
+         lines.append(
+             f" {quote_identifier(column_name, dialect)} {sql_type} {null_str}"
+         )
+
+     if not lines:
+         raise ValueError(
+             "No supported columns found in Parquet schema for the selected dialect"
+         )
+
+     body = ",\n".join(lines)
+
+     drop_clause = ""
+     if drop_existing:
+         if dialect == Dialect.SQLSERVER:
+             object_id_name = qualified_name.replace("'", "''")
+             if catalog:
+                 batch_sep = "GO\n" if use_go else ""
+                 drop_clause = (
+                     f"USE {quote_identifier(catalog, dialect)};\n"
+                     f"{batch_sep}IF OBJECT_ID('{object_id_name}') IS NOT NULL "
+                     f"DROP TABLE {qualified_name};\n"
+                 )
+             else:
+                 drop_clause = (
+                     f"IF OBJECT_ID('{object_id_name}') IS NOT NULL "
+                     f"DROP TABLE {qualified_name};\n"
+                 )
+         else:
+             drop_clause = f"DROP TABLE IF EXISTS {qualified_name};\n"
+
+     statement = f"{drop_clause}CREATE TABLE {qualified_name} (\n{body}\n);"
+     print_verbose(
+         f"Generated CREATE TABLE from Parquet schema for {qualified_name}", verbose
+     )
+     if return_metadata:
+         metadata = ParquetDDLMetadata(
+             dropped_columns=dropped_columns, fallback_columns=fallback_columns
+         )
+         return statement, metadata
+     return statement
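
The new generator reads only the Parquet footer schema, so it can be exercised end to end on a tiny file. A minimal usage sketch, assuming datablade 0.0.6 and pyarrow are installed and the module layout shown in this diff; the file name, table name, and columns below are illustrative:

import pyarrow as pa
import pyarrow.parquet as pq

from datablade.sql.ddl_pyarrow import generate_create_table_from_parquet
from datablade.sql.dialects import Dialect

# Write a small Parquet file whose schema includes a nested (list) column.
pq.write_table(
    pa.table(
        {
            "id": pa.array([1, 2], type=pa.int64()),
            "name": pa.array(["a", "b"], type=pa.string()),
            "tags": pa.array([["x"], ["y", "z"]], type=pa.list_(pa.string())),
        }
    ),
    "example.parquet",
)

# The nested 'tags' column has no clean SQL mapping; with fallback_to_json=True
# it is mapped to a text column (nvarchar(max) on SQL Server) instead of dropped.
ddl, meta = generate_create_table_from_parquet(
    "example.parquet",
    schema="dbo",
    table="example",
    dialect=Dialect.SQLSERVER,
    fallback_to_json=True,
    return_metadata=True,
)
print(ddl)
print(meta.fallback_columns)  # [FallbackColumn(name='tags', ...)]

With return_metadata left at its default of False, only the DDL string is returned.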
datablade/sql/dialects.py
@@ -0,0 +1,12 @@
+ """Enumeration of SQL dialects supported by datablade."""
+
+ from enum import Enum
+
+
+ class Dialect(str, Enum):
+     """Supported SQL dialects for datablade DDL helpers."""
+
+     SQLSERVER = "sqlserver"
+     POSTGRES = "postgres"
+     MYSQL = "mysql"
+     DUCKDB = "duckdb"
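
Because Dialect subclasses str, its members compare equal to their lowercase string values and can be constructed from them (standard str-Enum behavior); a quick illustrative check:

from datablade.sql.dialects import Dialect

assert Dialect.POSTGRES == "postgres"
assert Dialect("duckdb") is Dialect.DUCKDB
print([d.value for d in Dialect])  # ['sqlserver', 'postgres', 'mysql', 'duckdb']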
datablade/sql/quoting.py
@@ -0,0 +1,44 @@
+ """Identifier quoting for supported SQL dialects."""
+
+ from typing import Optional
+
+ from .dialects import Dialect
+
+
+ def quote_identifier(name: Optional[str], dialect: Dialect = Dialect.SQLSERVER) -> str:
+     """
+     Quote an identifier for the given SQL dialect.
+
+     Args:
+         name: Identifier to quote; must be non-empty string.
+         dialect: Target SQL dialect.
+
+     Returns:
+         Quoted identifier string.
+
+     Raises:
+         ValueError: If name is missing/empty.
+         TypeError: If name is not a string.
+         NotImplementedError: If dialect is unsupported.
+     """
+     if name is None:
+         raise ValueError("name must be provided")
+     if not isinstance(name, str):
+         raise TypeError("name must be a string")
+     cleaned = name.strip()
+     if not cleaned:
+         raise ValueError("name must be a non-empty string")
+
+     if dialect == Dialect.SQLSERVER:
+         return f"[{cleaned.replace('[', '').replace(']', '')}]"
+     if dialect == Dialect.POSTGRES:
+         escaped = cleaned.replace('"', '""')
+         return f'"{escaped}"'
+     if dialect == Dialect.MYSQL:
+         escaped = cleaned.replace("`", "``")
+         return f"`{escaped}`"
+     if dialect == Dialect.DUCKDB:
+         escaped = cleaned.replace('"', '""')
+         return f'"{escaped}"'
+
+     raise NotImplementedError(f"Dialect not supported: {dialect}")
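
A short sketch of the per-dialect quoting behavior with illustrative identifiers (note that SQL Server brackets are stripped rather than escaped, while Postgres, MySQL, and DuckDB double their quote characters):

from datablade.sql.dialects import Dialect
from datablade.sql.quoting import quote_identifier

print(quote_identifier("order details", Dialect.SQLSERVER))  # [order details]
print(quote_identifier('weird"name', Dialect.POSTGRES))      # "weird""name"
print(quote_identifier("select", Dialect.MYSQL))             # `select`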
datablade/sql/schema_spec.py
@@ -0,0 +1,65 @@
+ """Schema specification helpers for DDL generation."""
+
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from typing import Any, Optional, Tuple
+
+
+ def _as_mapping(value: Any, label: str) -> dict:
+     if value is None:
+         return {}
+     if not isinstance(value, Mapping):
+         raise TypeError(f"{label} must be a mapping")
+     return dict(value)
+
+
+ def resolve_schema_spec(
+     schema_spec: Optional[Mapping[str, Any]],
+ ) -> Tuple[dict, dict]:
+     """Return (defaults, columns) mappings for a schema spec."""
+     if schema_spec is None:
+         return {}, {}
+     if not isinstance(schema_spec, Mapping):
+         raise TypeError("schema_spec must be a mapping")
+
+     defaults = _as_mapping(schema_spec.get("defaults"), "schema_spec.defaults")
+     columns = _as_mapping(schema_spec.get("columns"), "schema_spec.columns")
+     return defaults, columns
+
+
+ def resolve_column_spec(
+     column_name: str,
+     schema_spec: Optional[Mapping[str, Any]],
+ ) -> Tuple[dict, dict]:
+     """Return (defaults, column_spec) for a column name."""
+     defaults, columns = resolve_schema_spec(schema_spec)
+     if not columns:
+         return defaults, {}
+
+     column_spec = columns.get(column_name)
+     if column_spec is None:
+         column_spec = columns.get(str(column_name))
+
+     if column_spec is None:
+         return defaults, {}
+     if not isinstance(column_spec, Mapping):
+         raise TypeError(f"schema_spec.columns['{column_name}'] must be a mapping")
+     return defaults, dict(column_spec)
+
+
+ def resolve_string_policy(
+     column_name: str,
+     defaults: dict,
+     column_spec: dict,
+ ) -> dict:
+     """Merge defaults + column string policy overrides."""
+     string_defaults = _as_mapping(defaults.get("string"), "schema_spec.defaults.string")
+     string_overrides = _as_mapping(
+         column_spec.get("string"),
+         f"schema_spec.columns['{column_name}'].string",
+     )
+     policy = {**string_defaults, **string_overrides}
+     if "defined_pad" in policy and "pad" not in policy:
+         policy["pad"] = policy["defined_pad"]
+     return policy
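
These helpers expect a schema_spec shaped as {"defaults": {...}, "columns": {...}}, as used by the DDL generators above. A hypothetical spec illustrating the merge behavior (the value chosen for defined_pad is made up; only the defined_pad/pad aliasing is taken from the code above):

from datablade.sql.schema_spec import resolve_column_spec, resolve_string_policy

spec = {
    "defaults": {"nullable": True, "string": {"defined_pad": 2}},
    "columns": {"id": {"sql_type": "bigint", "nullable": False}},
}

defaults, column_spec = resolve_column_spec("id", spec)
print(column_spec)  # {'sql_type': 'bigint', 'nullable': False}

policy = resolve_string_policy("id", defaults, column_spec)
print(policy)  # {'defined_pad': 2, 'pad': 2}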