datablade-0.0.0-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. datablade/__init__.py +49 -1
  2. datablade/blade.py +322 -0
  3. datablade/core/__init__.py +28 -7
  4. datablade/core/frames.py +23 -236
  5. datablade/core/json.py +5 -10
  6. datablade/core/lists.py +5 -10
  7. datablade/core/messages.py +23 -11
  8. datablade/core/strings.py +5 -43
  9. datablade/core/zip.py +5 -24
  10. datablade/dataframes/__init__.py +51 -0
  11. datablade/dataframes/frames.py +585 -0
  12. datablade/dataframes/readers.py +1367 -0
  13. datablade/docs/ARCHITECTURE.md +102 -0
  14. datablade/docs/OBJECT_REGISTRY.md +194 -0
  15. datablade/docs/README.md +57 -0
  16. datablade/docs/TESTING.md +37 -0
  17. datablade/docs/USAGE.md +409 -0
  18. datablade/docs/__init__.py +87 -0
  19. datablade/docs/__main__.py +6 -0
  20. datablade/io/__init__.py +15 -0
  21. datablade/io/json.py +70 -0
  22. datablade/io/zip.py +111 -0
  23. datablade/registry.py +581 -0
  24. datablade/sql/__init__.py +56 -0
  25. datablade/sql/bulk_load.py +665 -0
  26. datablade/sql/ddl.py +402 -0
  27. datablade/sql/ddl_pyarrow.py +411 -0
  28. datablade/sql/dialects.py +12 -0
  29. datablade/sql/quoting.py +44 -0
  30. datablade/sql/schema_spec.py +65 -0
  31. datablade/sql/sqlserver.py +390 -0
  32. datablade/utils/__init__.py +38 -0
  33. datablade/utils/lists.py +32 -0
  34. datablade/utils/logging.py +204 -0
  35. datablade/utils/messages.py +29 -0
  36. datablade/utils/strings.py +249 -0
  37. datablade-0.0.6.dist-info/METADATA +406 -0
  38. datablade-0.0.6.dist-info/RECORD +41 -0
  39. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
  40. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
  41. datablade-0.0.0.dist-info/METADATA +0 -13
  42. datablade-0.0.0.dist-info/RECORD +0 -13
  43. {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/sql/ddl.py ADDED
@@ -0,0 +1,402 @@
"""Pandas-driven DDL generation for multiple SQL dialects."""

from typing import Any, List, Mapping, Optional

import pandas as pd

from ..utils.messages import print_verbose
from .dialects import Dialect
from .quoting import quote_identifier
from .schema_spec import resolve_column_spec, resolve_string_policy

_VALID_PREFER_LENGTH = {"estimate", "minimum", "maximum"}


def _coerce_positive_int(value: Any, label: str) -> Optional[int]:
    if value is None:
        return None
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"{label} must be a positive integer")
    return int(value)


def _coerce_non_negative_int(value: Any, label: str) -> int:
    if value is None:
        return 0
    if isinstance(value, bool) or not isinstance(value, int) or value < 0:
        raise ValueError(f"{label} must be a non-negative integer")
    return int(value)


def _coerce_optional_bool(value: Any, label: str) -> Optional[bool]:
    if value is None:
        return None
    if not isinstance(value, bool):
        raise TypeError(f"{label} must be a boolean")
    return value


def _normalize_string_policy(policy: Optional[Mapping[str, Any]]) -> dict:
    policy = {} if policy is None else dict(policy)
    if "defined_pad" in policy and "pad" not in policy:
        policy["pad"] = policy["defined_pad"]

    prefer_length = policy.get("prefer_length", "estimate")
    if prefer_length not in _VALID_PREFER_LENGTH:
        raise ValueError(
            "prefer_length must be one of 'estimate', 'minimum', or 'maximum'"
        )

    min_length = _coerce_positive_int(policy.get("min_length"), "min_length")
    max_length = _coerce_positive_int(policy.get("max_length"), "max_length")
    pad = _coerce_non_negative_int(policy.get("pad"), "pad")
    empty_as_null = (
        _coerce_optional_bool(policy.get("empty_as_null"), "empty_as_null") or False
    )
    allow_null = _coerce_optional_bool(policy.get("allow_null"), "allow_null")

    return {
        "prefer_length": prefer_length,
        "min_length": min_length,
        "max_length": max_length,
        "pad": pad,
        "empty_as_null": empty_as_null,
        "allow_null": allow_null,
    }


def _string_series_stats(series: pd.Series, empty_as_null: bool) -> tuple[int, bool]:
    non_null = series.dropna()
    if non_null.empty:
        return 0, False

    as_str = non_null.astype(str)
    empty_mask = as_str == ""
    any_empty = bool(empty_mask.any())
    if empty_as_null:
        as_str = as_str[~empty_mask]
        if as_str.empty:
            return 0, any_empty

    lengths = as_str.map(len)
    max_length = int(lengths.max()) if not lengths.empty else 0
    return max_length, any_empty


def _select_string_length(
    max_length: int,
    *,
    prefer_length: str,
    pad: int,
    min_length: Optional[int],
    max_length_bound: Optional[int],
) -> int:
    if prefer_length == "minimum" and min_length is not None:
        length = min_length
    elif prefer_length == "maximum" and max_length_bound is not None:
        length = max_length_bound
    else:
        length = max_length + pad

    if min_length is not None:
        length = max(length, min_length)
    if max_length_bound is not None:
        length = min(length, max_length_bound)

    return max(1, int(length))


def _infer_sql_type(  # noqa: C901
    series: pd.Series,
    dialect: Dialect,
    *,
    string_policy: Optional[Mapping[str, Any]] = None,
) -> str:
    """Infer a SQL column type for a pandas Series given a dialect."""
    dtype = series.dtype

    def _is_bytes_like(value: Any) -> bool:
        return isinstance(value, (bytes, bytearray, memoryview))

    def _is_bytes_like_series(s: pd.Series) -> bool:
        if not pd.api.types.is_object_dtype(s.dtype):
            return False
        non_null = s.dropna()
        if non_null.empty:
            return False
        sample = non_null.iloc[:100]
        # require all sampled values to be bytes-like
        return bool(sample.map(_is_bytes_like).all())

    if dialect == Dialect.SQLSERVER:
        # Use SQL Server's tiered integer sizes for best-fit types.
        if pd.api.types.is_integer_dtype(dtype):
            non_null = series.dropna()
            if non_null.empty:
                return "bigint"
            min_value = non_null.min()
            max_value = non_null.max()
            if min_value >= 0 and max_value <= 255:
                return "tinyint"
            if min_value >= -32768 and max_value <= 32767:
                return "smallint"
            if min_value >= -2147483648 and max_value <= 2147483647:
                return "int"
            return "bigint"
        if pd.api.types.is_float_dtype(dtype):
            return "float"
        if pd.api.types.is_bool_dtype(dtype):
            return "bit"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "datetime2"
        if _is_bytes_like_series(series):
            return "varbinary(max)"
        # strings / objects
        policy = _normalize_string_policy(string_policy)
        max_length, _ = _string_series_stats(series, policy["empty_as_null"])
        max_length = _select_string_length(
            max_length,
            prefer_length=policy["prefer_length"],
            pad=policy["pad"],
            min_length=policy["min_length"],
            max_length_bound=policy["max_length"],
        )
        if (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            return f"nvarchar({max_length if max_length <= 4000 else 'max'})"
        return "nvarchar(max)"

    if dialect == Dialect.POSTGRES:
        # PostgreSQL integer sizes are narrower than SQL Server's tinyint.
        if pd.api.types.is_integer_dtype(dtype):
            non_null = series.dropna()
            if non_null.empty:
                return "bigint"
            min_value = non_null.min()
            max_value = non_null.max()
            if min_value >= -32768 and max_value <= 32767:
                return "smallint"
            if min_value >= -2147483648 and max_value <= 2147483647:
                return "integer"
            return "bigint"
        if pd.api.types.is_float_dtype(dtype):
            return "double precision"
        if pd.api.types.is_bool_dtype(dtype):
            return "boolean"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "timestamp"
        if _is_bytes_like_series(series):
            return "bytea"
        policy = _normalize_string_policy(string_policy)
        max_length, _ = _string_series_stats(series, policy["empty_as_null"])
        max_length = _select_string_length(
            max_length,
            prefer_length=policy["prefer_length"],
            pad=policy["pad"],
            min_length=policy["min_length"],
            max_length_bound=policy["max_length"],
        )
        return f"varchar({max_length})" if max_length <= 65535 else "text"

    if dialect == Dialect.MYSQL:
        # Keep MySQL type names consistent with the existing DDL outputs.
        if pd.api.types.is_integer_dtype(dtype):
            non_null = series.dropna()
            if non_null.empty:
                return "BIGINT"
            min_value = non_null.min()
            max_value = non_null.max()
            if min_value >= -32768 and max_value <= 32767:
                return "SMALLINT"
            if min_value >= -2147483648 and max_value <= 2147483647:
                return "INT"
            return "BIGINT"
        if pd.api.types.is_float_dtype(dtype):
            return "DOUBLE"
        if pd.api.types.is_bool_dtype(dtype):
            return "TINYINT(1)"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "DATETIME"
        if _is_bytes_like_series(series):
            return "LONGBLOB"
        policy = _normalize_string_policy(string_policy)
        max_length, _ = _string_series_stats(series, policy["empty_as_null"])
        max_length = _select_string_length(
            max_length,
            prefer_length=policy["prefer_length"],
            pad=policy["pad"],
            min_length=policy["min_length"],
            max_length_bound=policy["max_length"],
        )
        return f"VARCHAR({max_length})" if max_length <= 65535 else "TEXT"

    if dialect == Dialect.DUCKDB:
        # DuckDB has simplified type names and distinguishes signed/unsigned.
        if pd.api.types.is_integer_dtype(dtype):
            return (
                "BIGINT" if pd.api.types.is_signed_integer_dtype(dtype) else "UBIGINT"
            )
        if pd.api.types.is_float_dtype(dtype):
            return "DOUBLE"
        if pd.api.types.is_bool_dtype(dtype):
            return "BOOLEAN"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "TIMESTAMP"
        if _is_bytes_like_series(series):
            return "BLOB"
        return "VARCHAR"

    raise NotImplementedError(f"Dialect not supported: {dialect}")


def _qualify_name(
    catalog: Optional[str], schema: Optional[str], table: str, dialect: Dialect
) -> str:
    """Build a fully-qualified table name for the selected dialect."""
    if dialect == Dialect.SQLSERVER:
        # catalog and schema are both used when provided
        if catalog:
            return (
                f"{quote_identifier(catalog, dialect)}."
                f"{quote_identifier(schema or 'dbo', dialect)}."
                f"{quote_identifier(table, dialect)}"
            )
        return (
            f"{quote_identifier(schema or 'dbo', dialect)}."
            f"{quote_identifier(table, dialect)}"
        )

    if schema:
        return f"{quote_identifier(schema, dialect)}.{quote_identifier(table, dialect)}"
    return quote_identifier(table, dialect)


def generate_create_table(
    df: pd.DataFrame,
    catalog: Optional[str] = None,
    schema: Optional[str] = None,
    table: str = "table",
    drop_existing: bool = True,
    dialect: Dialect = Dialect.SQLSERVER,
    use_go: bool = False,
    schema_spec: Optional[Mapping[str, Any]] = None,
    verbose: bool = False,
) -> str:
    """
    Generate a CREATE TABLE statement for the given dialect.

    Args:
        df: Source DataFrame.
        catalog: Optional catalog/database name.
        schema: Optional schema name (defaults per dialect).
        table: Target table name.
        drop_existing: If True, include a DROP TABLE IF EXISTS stanza.
        dialect: SQL dialect.
        use_go: If True and dialect is SQL Server, insert a GO batch separator
            after USE when a catalog is provided.
        schema_spec: Optional schema overrides for column types and string sizing.
        verbose: If True, prints progress messages.

    Returns:
        CREATE TABLE statement as string.

    Raises:
        ValueError: On missing/invalid inputs.
        TypeError: If df is not a DataFrame.
        NotImplementedError: If dialect unsupported.
    """
    if df is None:
        raise ValueError("df must be provided")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(table, str) or not table.strip():
        raise ValueError("table must be a non-empty string")
    if catalog is not None and (not isinstance(catalog, str) or not catalog.strip()):
        raise ValueError("catalog, if provided, must be a non-empty string")
    if schema is not None and (not isinstance(schema, str) or not schema.strip()):
        raise ValueError("schema, if provided, must be a non-empty string")
    if not isinstance(use_go, bool):
        raise TypeError("use_go must be a boolean")

    qualified_name = _qualify_name(catalog, schema, table, dialect)
    lines: List[str] = []

    for column in df.columns:
        series = df[column]
        column_name = str(column)
        defaults, column_spec = resolve_column_spec(column_name, schema_spec)
        string_policy = resolve_string_policy(column_name, defaults, column_spec)
        normalized_policy = _normalize_string_policy(string_policy)

        nullable = series.isnull().any()
        if normalized_policy["empty_as_null"] and (
            pd.api.types.is_object_dtype(series.dtype)
            or isinstance(series.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(series.dtype)
        ):
            _, any_empty = _string_series_stats(series, True)
            if any_empty:
                nullable = True

        nullable_override = _coerce_optional_bool(
            column_spec.get("nullable"), "nullable"
        )
        if nullable_override is None:
            nullable_override = _coerce_optional_bool(
                column_spec.get("allow_null"), "allow_null"
            )
        if nullable_override is None:
            nullable_override = normalized_policy["allow_null"]
        if nullable_override is None:
            nullable_override = _coerce_optional_bool(
                defaults.get("nullable"), "defaults.nullable"
            )
        if nullable_override is None:
            nullable_override = _coerce_optional_bool(
                defaults.get("allow_null"), "defaults.allow_null"
            )
        if nullable_override is not None:
            nullable = nullable_override

        sql_type_override = column_spec.get("sql_type")
        if sql_type_override is not None:
            if not isinstance(sql_type_override, str) or not sql_type_override.strip():
                raise ValueError(
                    f"schema_spec.columns['{column_name}'].sql_type must be a non-empty string"
                )
            sql_type = sql_type_override.strip()
        else:
            sql_type = _infer_sql_type(series, dialect, string_policy=normalized_policy)

        null_str = "NULL" if nullable else "NOT NULL"
        lines.append(
            f"    {quote_identifier(column_name, dialect)} {sql_type} {null_str}"
        )

    body = ",\n".join(lines)

    drop_clause = ""
    if drop_existing:
        if dialect == Dialect.SQLSERVER:
            object_id_name = qualified_name.replace("'", "''")
            if catalog:
                batch_sep = "GO\n" if use_go else ""
                drop_clause = (
                    f"USE {quote_identifier(catalog, dialect)};\n"
                    f"{batch_sep}IF OBJECT_ID('{object_id_name}') IS NOT NULL "
                    f"DROP TABLE {qualified_name};\n"
                )
            else:
                drop_clause = (
                    f"IF OBJECT_ID('{object_id_name}') IS NOT NULL "
                    f"DROP TABLE {qualified_name};\n"
                )
        else:
            drop_clause = f"DROP TABLE IF EXISTS {qualified_name};\n"

    statement = f"{drop_clause}CREATE TABLE {qualified_name} (\n{body}\n);"
    print_verbose(f"Generated CREATE TABLE for {qualified_name}", verbose)
    return statement
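
For orientation, a minimal usage sketch for the module above. It is not part of the package diff: the import paths follow the file list (datablade.sql.ddl, datablade.sql.dialects), bracket-style quoting of SQL Server identifiers is an assumption about quote_identifier, and the schema_spec shape is inferred from the ValueError message in generate_create_table, since schema_spec.py's resolvers are not shown here.

import pandas as pd

from datablade.sql.ddl import generate_create_table
from datablade.sql.dialects import Dialect

df = pd.DataFrame(
    {
        "id": [1, 2, 3],  # small non-negative ints: tinyint on SQL Server
        "score": [0.5, 1.25, None],  # float dtype with a null: float NULL
        "name": ["ada", "grace", "katherine"],  # longest value is 9 chars
    }
)

ddl = generate_create_table(
    df,
    schema="dbo",
    table="people",
    dialect=Dialect.SQLSERVER,
    # Per-column overrides appear to use this shape (an assumption):
    # schema_spec={"columns": {"name": {"sql_type": "nvarchar(200)"}}},
)
print(ddl)

Assuming bracket quoting, the output would look roughly like:

IF OBJECT_ID('[dbo].[people]') IS NOT NULL DROP TABLE [dbo].[people];
CREATE TABLE [dbo].[people] (
    [id] tinyint NOT NULL,
    [score] float NULL,
    [name] nvarchar(9) NOT NULL
);

The nvarchar(9) follows from the default string policy: prefer_length="estimate" with pad=0 sizes a column to its longest observed value, min_length/max_length clamp that estimate, and prefer_length="minimum" or "maximum" pins the length to the corresponding bound instead.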