datablade 0.0.0-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +41 -1
- datablade/blade.py +153 -0
- datablade/core/__init__.py +28 -7
- datablade/core/frames.py +23 -236
- datablade/core/json.py +5 -10
- datablade/core/lists.py +5 -10
- datablade/core/messages.py +23 -11
- datablade/core/strings.py +5 -43
- datablade/core/zip.py +5 -24
- datablade/dataframes/__init__.py +43 -0
- datablade/dataframes/frames.py +485 -0
- datablade/dataframes/readers.py +540 -0
- datablade/io/__init__.py +15 -0
- datablade/io/json.py +33 -0
- datablade/io/zip.py +73 -0
- datablade/sql/__init__.py +32 -0
- datablade/sql/bulk_load.py +405 -0
- datablade/sql/ddl.py +227 -0
- datablade/sql/ddl_pyarrow.py +287 -0
- datablade/sql/dialects.py +10 -0
- datablade/sql/quoting.py +42 -0
- datablade/utils/__init__.py +37 -0
- datablade/utils/lists.py +29 -0
- datablade/utils/logging.py +159 -0
- datablade/utils/messages.py +29 -0
- datablade/utils/strings.py +86 -0
- datablade-0.0.5.dist-info/METADATA +351 -0
- datablade-0.0.5.dist-info/RECORD +31 -0
- {datablade-0.0.0.dist-info → datablade-0.0.5.dist-info}/WHEEL +1 -1
- {datablade-0.0.0.dist-info → datablade-0.0.5.dist-info/licenses}/LICENSE +20 -20
- datablade-0.0.0.dist-info/METADATA +0 -13
- datablade-0.0.0.dist-info/RECORD +0 -13
- {datablade-0.0.0.dist-info → datablade-0.0.5.dist-info}/top_level.txt +0 -0
datablade/sql/bulk_load.py
ADDED

@@ -0,0 +1,405 @@
+"""
+Bulk loading utilities for SQL databases.
+
+Provides dialect-aware bulk loading from files to database tables.
+Supports SQL Server (BCP), PostgreSQL (COPY), MySQL (LOAD DATA), and DuckDB.
+"""
+
+import pathlib
+import subprocess
+from typing import Optional, Union
+
+import pandas as pd
+
+from ..utils.logging import log_debug, log_error, log_info
+from .dialects import Dialect
+from .quoting import quote_identifier
+
+
+def _validate_bulk_load_params(
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str,
+    server: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    dialect: Dialect = Dialect.SQLSERVER,
+) -> pathlib.Path:
+    """Validate bulk load parameters and return resolved path."""
+    if not file_path:
+        raise ValueError("file_path must be provided")
+
+    path_obj = pathlib.Path(file_path)
+    if not path_obj.exists():
+        raise ValueError(f"File does not exist: {path_obj}")
+
+    if not isinstance(table_name, str) or not table_name.strip():
+        raise ValueError("table_name must be a non-empty string")
+    if not isinstance(database, str) or not database.strip():
+        raise ValueError("database must be a non-empty string")
+
+    if dialect == Dialect.SQLSERVER:
+        if not server:
+            raise ValueError("server is required for SQL Server")
+        if not username:
+            raise ValueError("username is required for SQL Server")
+        if not password:
+            raise ValueError("password is required for SQL Server")
+
+    return path_obj
+
+
+def bulk_load_sqlserver(
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str,
+    server: str,
+    username: str,
+    password: str,
+    schema: str = "dbo",
+    delimiter: str = ",",
+    verbose: bool = False,
+) -> None:
+    """
+    Bulk load a file into SQL Server using BCP.
+
+    Args:
+        file_path: Path to the data file.
+        table_name: Target table name.
+        database: Database name.
+        server: SQL Server instance name.
+        username: SQL Server username.
+        password: SQL Server password.
+        schema: Schema name (default: dbo).
+        delimiter: Field delimiter (default: comma).
+        verbose: If True, logs progress messages.
+
+    Raises:
+        ValueError: On invalid inputs.
+        subprocess.CalledProcessError: If BCP command fails.
+    """
+    path_obj = _validate_bulk_load_params(
+        file_path, table_name, database, server, username, password, Dialect.SQLSERVER
+    )
+
+    qualified_table = f"{database}.{schema}.{table_name}"
+
+    bcp_args = [
+        "bcp",
+        qualified_table,
+        "in",
+        str(path_obj),
+        "-c",
+        f"-t{delimiter}",
+        "-S",
+        server,
+        "-U",
+        username,
+        "-P",
+        password,
+    ]
+
+    log_info(f"Executing BCP load to {qualified_table}", verbose)
+    log_debug(
+        f"BCP args: {bcp_args[:-1] + ['***REDACTED***']}",
+        verbose,
+    )
+
+    try:
+        process = subprocess.run(
+            bcp_args,
+            shell=False,
+            check=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        log_info(f"Successfully loaded data to {qualified_table}", verbose)
+        if process.stdout:
+            log_debug(f"BCP output: {process.stdout.decode()}", verbose)
+    except subprocess.CalledProcessError as e:
+        error_msg = e.stderr.decode() if e.stderr else str(e)
+        log_error(f"BCP load failed: {error_msg}", verbose)
+        raise
+
+
+def bulk_load_postgres(
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str,
+    schema: str = "public",
+    connection_string: Optional[str] = None,
+    delimiter: str = ",",
+    header: bool = True,
+    verbose: bool = False,
+) -> str:
+    """
+    Generate a PostgreSQL COPY command for bulk loading.
+
+    Args:
+        file_path: Path to the data file.
+        table_name: Target table name.
+        database: Database name.
+        schema: Schema name (default: public).
+        connection_string: Optional psql connection string.
+        delimiter: Field delimiter (default: comma).
+        header: If True, skip header row.
+        verbose: If True, logs progress messages.
+
+    Returns:
+        The COPY command as a string.
+
+    Raises:
+        ValueError: On invalid inputs.
+    """
+    path_obj = _validate_bulk_load_params(
+        file_path, table_name, database, dialect=Dialect.POSTGRES
+    )
+
+    qualified_table = f"{quote_identifier(schema, Dialect.POSTGRES)}.{quote_identifier(table_name, Dialect.POSTGRES)}"
+
+    # Include HEADER (with its separating comma) only when the file has a
+    # header row, so the generated options list stays valid SQL.
+    header_clause = ", HEADER" if header else ""
+    copy_cmd = (
+        f"\\COPY {qualified_table} FROM '{path_obj}' "
+        f"WITH (FORMAT csv, DELIMITER '{delimiter}'{header_clause})"
+    )
+
+    log_info(f"Generated COPY command for {qualified_table}", verbose)
+    return copy_cmd
+
+
+def bulk_load_mysql(
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str,
+    delimiter: str = ",",
+    enclosed_by: str = '"',
+    lines_terminated_by: str = "\\n",
+    ignore_lines: int = 1,
+    verbose: bool = False,
+) -> str:
+    """
+    Generate a MySQL LOAD DATA command for bulk loading.
+
+    Args:
+        file_path: Path to the data file.
+        table_name: Target table name.
+        database: Database name.
+        delimiter: Field delimiter (default: comma).
+        enclosed_by: Field enclosure character.
+        lines_terminated_by: Line terminator.
+        ignore_lines: Number of header lines to skip.
+        verbose: If True, logs progress messages.
+
+    Returns:
+        The LOAD DATA command as a string.
+
+    Raises:
+        ValueError: On invalid inputs.
+    """
+    path_obj = _validate_bulk_load_params(
+        file_path, table_name, database, dialect=Dialect.MYSQL
+    )
+
+    qualified_table = f"{quote_identifier(database, Dialect.MYSQL)}.{quote_identifier(table_name, Dialect.MYSQL)}"
+
+    load_cmd = (
+        f"LOAD DATA LOCAL INFILE '{path_obj}' "
+        f"INTO TABLE {qualified_table} "
+        f"FIELDS TERMINATED BY '{delimiter}' "
+        f"ENCLOSED BY '{enclosed_by}' "
+        f"LINES TERMINATED BY '{lines_terminated_by}' "
+        f"IGNORE {ignore_lines} LINES"
+    )
+
+    log_info(f"Generated LOAD DATA command for {qualified_table}", verbose)
+    return load_cmd
+
+
+def bulk_load_duckdb(
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str = "memory",
+    schema: str = "main",
+    verbose: bool = False,
+) -> str:
+    """
+    Generate a DuckDB COPY command for bulk loading.
+
+    Args:
+        file_path: Path to the data file.
+        table_name: Target table name.
+        database: Database name (default: memory).
+        schema: Schema name (default: main).
+        verbose: If True, logs progress messages.
+
+    Returns:
+        The COPY command as a string.
+
+    Raises:
+        ValueError: On invalid inputs.
+    """
+    path_obj = _validate_bulk_load_params(
+        file_path, table_name, database, dialect=Dialect.DUCKDB
+    )
+
+    qualified_table = f"{quote_identifier(schema, Dialect.DUCKDB)}.{quote_identifier(table_name, Dialect.DUCKDB)}"
+
+    # DuckDB can infer format from file extension
+    suffix = path_obj.suffix.lower()
+    if suffix == ".parquet":
+        copy_cmd = f"COPY {qualified_table} FROM '{path_obj}' (FORMAT PARQUET)"
+    elif suffix == ".csv":
+        copy_cmd = f"COPY {qualified_table} FROM '{path_obj}' (FORMAT CSV, HEADER)"
+    else:
+        copy_cmd = f"COPY {qualified_table} FROM '{path_obj}'"
+
+    log_info(f"Generated COPY command for {qualified_table}", verbose)
+    return copy_cmd
+
+
+def bulk_load(
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str,
+    dialect: Dialect = Dialect.SQLSERVER,
+    schema: Optional[str] = None,
+    server: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    delimiter: str = ",",
+    verbose: bool = False,
+) -> Optional[str]:
+    """
+    Bulk load a file to a database table using the appropriate dialect method.
+
+    Args:
+        file_path: Path to the data file.
+        table_name: Target table name.
+        database: Database name.
+        dialect: SQL dialect to use.
+        schema: Schema name (dialect-specific default if None).
+        server: Server name (required for SQL Server).
+        username: Username (required for SQL Server).
+        password: Password (required for SQL Server).
+        delimiter: Field delimiter.
+        verbose: If True, logs progress messages.
+
+    Returns:
+        For SQL Server, returns None (executes directly).
+        For other dialects, returns the command string.
+
+    Raises:
+        ValueError: On invalid inputs.
+        NotImplementedError: If dialect is unsupported.
+    """
+    if dialect == Dialect.SQLSERVER:
+        bulk_load_sqlserver(
+            file_path=file_path,
+            table_name=table_name,
+            database=database,
+            server=server,  # type: ignore
+            username=username,  # type: ignore
+            password=password,  # type: ignore
+            schema=schema or "dbo",
+            delimiter=delimiter,
+            verbose=verbose,
+        )
+        return None
+
+    if dialect == Dialect.POSTGRES:
+        return bulk_load_postgres(
+            file_path=file_path,
+            table_name=table_name,
+            database=database,
+            schema=schema or "public",
+            delimiter=delimiter,
+            verbose=verbose,
+        )
+
+    if dialect == Dialect.MYSQL:
+        return bulk_load_mysql(
+            file_path=file_path,
+            table_name=table_name,
+            database=database,
+            delimiter=delimiter,
+            verbose=verbose,
+        )
+
+    if dialect == Dialect.DUCKDB:
+        return bulk_load_duckdb(
+            file_path=file_path,
+            table_name=table_name,
+            database=database,
+            schema=schema or "main",
+            verbose=verbose,
+        )
+
+    raise NotImplementedError(f"Dialect not supported: {dialect}")
+
+
+def write_dataframe_and_load(
+    df: pd.DataFrame,
+    file_path: Union[str, pathlib.Path],
+    table_name: str,
+    database: str,
+    dialect: Dialect = Dialect.SQLSERVER,
+    schema: Optional[str] = None,
+    server: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    delimiter: str = ",",
+    verbose: bool = False,
+) -> Optional[str]:
+    """
+    Write a DataFrame to a file and bulk load it to a database.
+
+    Args:
+        df: The DataFrame to write.
+        file_path: Path where the file will be saved.
+        table_name: Target table name.
+        database: Database name.
+        dialect: SQL dialect to use.
+        schema: Schema name.
+        server: Server name (required for SQL Server).
+        username: Username (required for SQL Server).
+        password: Password (required for SQL Server).
+        delimiter: Field delimiter.
+        verbose: If True, logs progress messages.
+
+    Returns:
+        For SQL Server, returns None (executes directly).
+        For other dialects, returns the command string.
+
+    Raises:
+        TypeError: If df is not a DataFrame.
+        ValueError: On invalid inputs.
+    """
+    if df is None or not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
+
+    path_obj = pathlib.Path(file_path)
+    path_obj.parent.mkdir(parents=True, exist_ok=True)
+
+    # Write based on file extension
+    suffix = path_obj.suffix.lower()
+    if suffix == ".parquet":
+        df.to_parquet(path_obj, index=False)
+    elif suffix == ".csv":
+        df.to_csv(path_obj, index=False, sep=delimiter)
+    else:
+        df.to_csv(path_obj, index=False, sep=delimiter)
+
+    log_info(f"DataFrame written to {path_obj}", verbose)
+
+    return bulk_load(
+        file_path=path_obj,
+        table_name=table_name,
+        database=database,
+        dialect=dialect,
+        schema=schema,
+        server=server,
+        username=username,
+        password=password,
+        delimiter=delimiter,
+        verbose=verbose,
+    )
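
Note that only the SQL Server path executes anything: bulk_load_sqlserver shells out to bcp, while the PostgreSQL, MySQL, and DuckDB helpers return command strings for you to run through psql, the mysql client, or a DuckDB connection. A minimal usage sketch of the dispatcher (the DataFrame, file, and database names here are hypothetical, and the exact identifier quoting depends on datablade/sql/quoting.py):

    import pandas as pd

    from datablade.sql.bulk_load import write_dataframe_and_load
    from datablade.sql.dialects import Dialect

    # Hypothetical sample data; any CSV-writable frame works.
    df = pd.DataFrame({"id": [1, 2], "name": ["ada", "grace"]})

    # Writes people.csv, then returns a psql \COPY command string;
    # no database connection is opened here.
    cmd = write_dataframe_and_load(
        df,
        file_path="people.csv",
        table_name="people",
        database="analytics",
        dialect=Dialect.POSTGRES,
    )
    print(cmd)
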
datablade/sql/ddl.py
ADDED

@@ -0,0 +1,227 @@
+from typing import Any, List, Optional
+
+import pandas as pd
+
+from ..utils.messages import print_verbose
+from .dialects import Dialect
+from .quoting import quote_identifier
+
+
+def _infer_sql_type(series: pd.Series, dialect: Dialect) -> str:  # noqa: C901
+    """Infer a SQL column type for a pandas Series given a dialect."""
+    dtype = series.dtype
+
+    def _is_bytes_like(value: Any) -> bool:
+        return isinstance(value, (bytes, bytearray, memoryview))
+
+    def _is_bytes_like_series(s: pd.Series) -> bool:
+        if not pd.api.types.is_object_dtype(s.dtype):
+            return False
+        non_null = s.dropna()
+        if non_null.empty:
+            return False
+        sample = non_null.iloc[:100]
+        # require all sampled values to be bytes-like
+        return bool(sample.map(_is_bytes_like).all())
+
+    if dialect == Dialect.SQLSERVER:
+        if pd.api.types.is_integer_dtype(dtype):
+            non_null = series.dropna()
+            if non_null.empty:
+                return "bigint"
+            min_value = non_null.min()
+            max_value = non_null.max()
+            if min_value >= 0 and max_value <= 255:
+                return "tinyint"
+            if min_value >= -32768 and max_value <= 32767:
+                return "smallint"
+            if min_value >= -2147483648 and max_value <= 2147483647:
+                return "int"
+            return "bigint"
+        if pd.api.types.is_float_dtype(dtype):
+            return "float"
+        if pd.api.types.is_bool_dtype(dtype):
+            return "bit"
+        if pd.api.types.is_datetime64_any_dtype(dtype):
+            return "datetime2"
+        if _is_bytes_like_series(series):
+            return "varbinary(max)"
+        # strings / objects
+        non_null = series.dropna()
+        if non_null.empty:
+            max_length = 1
+        else:
+            lengths = non_null.astype(str).map(len)
+            max_length = int(lengths.max()) if not lengths.empty else 1
+        if pd.api.types.is_object_dtype(dtype) or isinstance(
+            dtype, pd.CategoricalDtype
+        ):
+            return f"nvarchar({max_length if max_length <= 4000 else 'max'})"
+        return "nvarchar(max)"
+
+    if dialect == Dialect.POSTGRES:
+        if pd.api.types.is_integer_dtype(dtype):
+            non_null = series.dropna()
+            if non_null.empty:
+                return "bigint"
+            min_value = non_null.min()
+            max_value = non_null.max()
+            if min_value >= -32768 and max_value <= 32767:
+                return "smallint"
+            if min_value >= -2147483648 and max_value <= 2147483647:
+                return "integer"
+            return "bigint"
+        if pd.api.types.is_float_dtype(dtype):
+            return "double precision"
+        if pd.api.types.is_bool_dtype(dtype):
+            return "boolean"
+        if pd.api.types.is_datetime64_any_dtype(dtype):
+            return "timestamp"
+        if _is_bytes_like_series(series):
+            return "bytea"
+        non_null = series.dropna()
+        if non_null.empty:
+            max_length = 1
+        else:
+            lengths = non_null.astype(str).map(len)
+            max_length = int(lengths.max()) if not lengths.empty else 1
+        return f"varchar({max_length})" if max_length <= 65535 else "text"
+
+    if dialect == Dialect.MYSQL:
+        if pd.api.types.is_integer_dtype(dtype):
+            non_null = series.dropna()
+            if non_null.empty:
+                return "BIGINT"
+            min_value = non_null.min()
+            max_value = non_null.max()
+            if min_value >= -32768 and max_value <= 32767:
+                return "SMALLINT"
+            if min_value >= -2147483648 and max_value <= 2147483647:
+                return "INT"
+            return "BIGINT"
+        if pd.api.types.is_float_dtype(dtype):
+            return "DOUBLE"
+        if pd.api.types.is_bool_dtype(dtype):
+            return "TINYINT(1)"
+        if pd.api.types.is_datetime64_any_dtype(dtype):
+            return "DATETIME"
+        if _is_bytes_like_series(series):
+            return "LONGBLOB"
+        non_null = series.dropna()
+        if non_null.empty:
+            max_length = 1
+        else:
+            lengths = non_null.astype(str).map(len)
+            max_length = int(lengths.max()) if not lengths.empty else 1
+        return f"VARCHAR({max_length})" if max_length <= 65535 else "TEXT"
+
+    if dialect == Dialect.DUCKDB:
+        if pd.api.types.is_integer_dtype(dtype):
+            return (
+                "BIGINT" if pd.api.types.is_signed_integer_dtype(dtype) else "UBIGINT"
+            )
+        if pd.api.types.is_float_dtype(dtype):
+            return "DOUBLE"
+        if pd.api.types.is_bool_dtype(dtype):
+            return "BOOLEAN"
+        if pd.api.types.is_datetime64_any_dtype(dtype):
+            return "TIMESTAMP"
+        if _is_bytes_like_series(series):
+            return "BLOB"
+        return "VARCHAR"
+
+    raise NotImplementedError(f"Dialect not supported: {dialect}")
+
+
+def _qualify_name(
+    catalog: Optional[str], schema: Optional[str], table: str, dialect: Dialect
+) -> str:
+    if dialect == Dialect.SQLSERVER:
+        # catalog and schema are both used when provided
+        if catalog:
+            return (
+                f"{quote_identifier(catalog, dialect)}."
+                f"{quote_identifier(schema or 'dbo', dialect)}."
+                f"{quote_identifier(table, dialect)}"
+            )
+        return (
+            f"{quote_identifier(schema or 'dbo', dialect)}."
+            f"{quote_identifier(table, dialect)}"
+        )
+
+    if schema:
+        return f"{quote_identifier(schema, dialect)}.{quote_identifier(table, dialect)}"
+    return quote_identifier(table, dialect)
+
+
+def generate_create_table(
+    df: pd.DataFrame,
+    catalog: Optional[str] = None,
+    schema: Optional[str] = None,
+    table: str = "table",
+    drop_existing: bool = True,
+    dialect: Dialect = Dialect.SQLSERVER,
+    verbose: bool = False,
+) -> str:
+    """
+    Generate a CREATE TABLE statement for the given dialect.
+
+    Args:
+        df: Source DataFrame.
+        catalog: Optional catalog/database name.
+        schema: Optional schema name (defaults per dialect).
+        table: Target table name.
+        drop_existing: If True, include a DROP TABLE IF EXISTS stanza.
+        dialect: SQL dialect.
+        verbose: If True, prints progress messages.
+
+    Returns:
+        CREATE TABLE statement as string.
+
+    Raises:
+        ValueError: On missing/invalid inputs.
+        TypeError: If df is not a DataFrame.
+        NotImplementedError: If dialect unsupported.
+    """
+    if df is None:
+        raise ValueError("df must be provided")
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
+    if not isinstance(table, str) or not table.strip():
+        raise ValueError("table must be a non-empty string")
+    if catalog is not None and (not isinstance(catalog, str) or not catalog.strip()):
+        raise ValueError("catalog, if provided, must be a non-empty string")
+    if schema is not None and (not isinstance(schema, str) or not schema.strip()):
+        raise ValueError("schema, if provided, must be a non-empty string")
+
+    qualified_name = _qualify_name(catalog, schema, table, dialect)
+    lines: List[str] = []
+
+    for column in df.columns:
+        series = df[column]
+        nullable = series.isnull().any()
+        sql_type = _infer_sql_type(series, dialect)
+        null_str = "NULL" if nullable else "NOT NULL"
+        lines.append(
+            f"    {quote_identifier(str(column), dialect)} {sql_type} {null_str}"
+        )
+
+    body = ",\n".join(lines)
+
+    drop_clause = ""
+    if drop_existing:
+        if dialect == Dialect.SQLSERVER:
+            if catalog:
+                drop_clause = (
+                    f"USE {quote_identifier(catalog, dialect)};\n"
+                    f"IF OBJECT_ID('{qualified_name}') IS NOT NULL "
+                    f"DROP TABLE {qualified_name};\n"
+                )
+            else:
+                drop_clause = f"IF OBJECT_ID('{qualified_name}') IS NOT NULL DROP TABLE {qualified_name};\n"
+        else:
+            drop_clause = f"DROP TABLE IF EXISTS {qualified_name};\n"
+
+    statement = f"{drop_clause}CREATE TABLE {qualified_name} (\n{body}\n);"
+    print_verbose(f"Generated CREATE TABLE for {qualified_name}", verbose)
+    return statement
+
):
|
|
59
|
+
return f"nvarchar({max_length if max_length <= 4000 else 'max'})"
|
|
60
|
+
return "nvarchar(max)"
|
|
61
|
+
|
|
62
|
+
if dialect == Dialect.POSTGRES:
|
|
63
|
+
if pd.api.types.is_integer_dtype(dtype):
|
|
64
|
+
non_null = series.dropna()
|
|
65
|
+
if non_null.empty:
|
|
66
|
+
return "bigint"
|
|
67
|
+
min_value = non_null.min()
|
|
68
|
+
max_value = non_null.max()
|
|
69
|
+
if min_value >= -32768 and max_value <= 32767:
|
|
70
|
+
return "smallint"
|
|
71
|
+
if min_value >= -2147483648 and max_value <= 2147483647:
|
|
72
|
+
return "integer"
|
|
73
|
+
return "bigint"
|
|
74
|
+
if pd.api.types.is_float_dtype(dtype):
|
|
75
|
+
return "double precision"
|
|
76
|
+
if pd.api.types.is_bool_dtype(dtype):
|
|
77
|
+
return "boolean"
|
|
78
|
+
if pd.api.types.is_datetime64_any_dtype(dtype):
|
|
79
|
+
return "timestamp"
|
|
80
|
+
if _is_bytes_like_series(series):
|
|
81
|
+
return "bytea"
|
|
82
|
+
non_null = series.dropna()
|
|
83
|
+
if non_null.empty:
|
|
84
|
+
max_length = 1
|
|
85
|
+
else:
|
|
86
|
+
lengths = non_null.astype(str).map(len)
|
|
87
|
+
max_length = int(lengths.max()) if not lengths.empty else 1
|
|
88
|
+
return f"varchar({max_length})" if max_length <= 65535 else "text"
|
|
89
|
+
|
|
90
|
+
if dialect == Dialect.MYSQL:
|
|
91
|
+
if pd.api.types.is_integer_dtype(dtype):
|
|
92
|
+
non_null = series.dropna()
|
|
93
|
+
if non_null.empty:
|
|
94
|
+
return "BIGINT"
|
|
95
|
+
min_value = non_null.min()
|
|
96
|
+
max_value = non_null.max()
|
|
97
|
+
if min_value >= -32768 and max_value <= 32767:
|
|
98
|
+
return "SMALLINT"
|
|
99
|
+
if min_value >= -2147483648 and max_value <= 2147483647:
|
|
100
|
+
return "INT"
|
|
101
|
+
return "BIGINT"
|
|
102
|
+
if pd.api.types.is_float_dtype(dtype):
|
|
103
|
+
return "DOUBLE"
|
|
104
|
+
if pd.api.types.is_bool_dtype(dtype):
|
|
105
|
+
return "TINYINT(1)"
|
|
106
|
+
if pd.api.types.is_datetime64_any_dtype(dtype):
|
|
107
|
+
return "DATETIME"
|
|
108
|
+
if _is_bytes_like_series(series):
|
|
109
|
+
return "LONGBLOB"
|
|
110
|
+
non_null = series.dropna()
|
|
111
|
+
if non_null.empty:
|
|
112
|
+
max_length = 1
|
|
113
|
+
else:
|
|
114
|
+
lengths = non_null.astype(str).map(len)
|
|
115
|
+
max_length = int(lengths.max()) if not lengths.empty else 1
|
|
116
|
+
return f"VARCHAR({max_length})" if max_length <= 65535 else "TEXT"
|
|
117
|
+
|
|
118
|
+
if dialect == Dialect.DUCKDB:
|
|
119
|
+
if pd.api.types.is_integer_dtype(dtype):
|
|
120
|
+
return (
|
|
121
|
+
"BIGINT" if pd.api.types.is_signed_integer_dtype(dtype) else "UBIGINT"
|
|
122
|
+
)
|
|
123
|
+
if pd.api.types.is_float_dtype(dtype):
|
|
124
|
+
return "DOUBLE"
|
|
125
|
+
if pd.api.types.is_bool_dtype(dtype):
|
|
126
|
+
return "BOOLEAN"
|
|
127
|
+
if pd.api.types.is_datetime64_any_dtype(dtype):
|
|
128
|
+
return "TIMESTAMP"
|
|
129
|
+
if _is_bytes_like_series(series):
|
|
130
|
+
return "BLOB"
|
|
131
|
+
return "VARCHAR"
|
|
132
|
+
|
|
133
|
+
raise NotImplementedError(f"Dialect not supported: {dialect}")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _qualify_name(
|
|
137
|
+
catalog: Optional[str], schema: Optional[str], table: str, dialect: Dialect
|
|
138
|
+
) -> str:
|
|
139
|
+
if dialect == Dialect.SQLSERVER:
|
|
140
|
+
# catalog and schema are both used when provided
|
|
141
|
+
if catalog:
|
|
142
|
+
return (
|
|
143
|
+
f"{quote_identifier(catalog, dialect)}."
|
|
144
|
+
f"{quote_identifier(schema or 'dbo', dialect)}."
|
|
145
|
+
f"{quote_identifier(table, dialect)}"
|
|
146
|
+
)
|
|
147
|
+
return (
|
|
148
|
+
f"{quote_identifier(schema or 'dbo', dialect)}."
|
|
149
|
+
f"{quote_identifier(table, dialect)}"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
if schema:
|
|
153
|
+
return f"{quote_identifier(schema, dialect)}.{quote_identifier(table, dialect)}"
|
|
154
|
+
return quote_identifier(table, dialect)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def generate_create_table(
|
|
158
|
+
df: pd.DataFrame,
|
|
159
|
+
catalog: Optional[str] = None,
|
|
160
|
+
schema: Optional[str] = None,
|
|
161
|
+
table: str = "table",
|
|
162
|
+
drop_existing: bool = True,
|
|
163
|
+
dialect: Dialect = Dialect.SQLSERVER,
|
|
164
|
+
verbose: bool = False,
|
|
165
|
+
) -> str:
|
|
166
|
+
"""
|
|
167
|
+
Generate a CREATE TABLE statement for the given dialect.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
df: Source DataFrame.
|
|
171
|
+
catalog: Optional catalog/database name.
|
|
172
|
+
schema: Optional schema name (defaults per dialect).
|
|
173
|
+
table: Target table name.
|
|
174
|
+
drop_existing: If True, include a DROP TABLE IF EXISTS stanza.
|
|
175
|
+
dialect: SQL dialect.
|
|
176
|
+
verbose: If True, prints progress messages.
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
CREATE TABLE statement as string.
|
|
180
|
+
|
|
181
|
+
Raises:
|
|
182
|
+
ValueError: On missing/invalid inputs.
|
|
183
|
+
TypeError: If df is not a DataFrame.
|
|
184
|
+
NotImplementedError: If dialect unsupported.
|
|
185
|
+
"""
|
|
186
|
+
if df is None:
|
|
187
|
+
raise ValueError("df must be provided")
|
|
188
|
+
if not isinstance(df, pd.DataFrame):
|
|
189
|
+
raise TypeError("df must be a pandas DataFrame")
|
|
190
|
+
if not isinstance(table, str) or not table.strip():
|
|
191
|
+
raise ValueError("table must be a non-empty string")
|
|
192
|
+
if catalog is not None and (not isinstance(catalog, str) or not catalog.strip()):
|
|
193
|
+
raise ValueError("catalog, if provided, must be a non-empty string")
|
|
194
|
+
if schema is not None and (not isinstance(schema, str) or not schema.strip()):
|
|
195
|
+
raise ValueError("schema, if provided, must be a non-empty string")
|
|
196
|
+
|
|
197
|
+
qualified_name = _qualify_name(catalog, schema, table, dialect)
|
|
198
|
+
lines: List[str] = []
|
|
199
|
+
|
|
200
|
+
for column in df.columns:
|
|
201
|
+
series = df[column]
|
|
202
|
+
nullable = series.isnull().any()
|
|
203
|
+
sql_type = _infer_sql_type(series, dialect)
|
|
204
|
+
null_str = "NULL" if nullable else "NOT NULL"
|
|
205
|
+
lines.append(
|
|
206
|
+
f" {quote_identifier(str(column), dialect)} {sql_type} {null_str}"
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
body = ",\n".join(lines)
|
|
210
|
+
|
|
211
|
+
drop_clause = ""
|
|
212
|
+
if drop_existing:
|
|
213
|
+
if dialect == Dialect.SQLSERVER:
|
|
214
|
+
if catalog:
|
|
215
|
+
drop_clause = (
|
|
216
|
+
f"USE {quote_identifier(catalog, dialect)};\n"
|
|
217
|
+
f"IF OBJECT_ID('{qualified_name}') IS NOT NULL "
|
|
218
|
+
f"DROP TABLE {qualified_name};\n"
|
|
219
|
+
)
|
|
220
|
+
else:
|
|
221
|
+
drop_clause = f"IF OBJECT_ID('{qualified_name}') IS NOT NULL DROP TABLE {qualified_name};\n"
|
|
222
|
+
else:
|
|
223
|
+
drop_clause = f"DROP TABLE IF EXISTS {qualified_name};\n"
|
|
224
|
+
|
|
225
|
+
statement = f"{drop_clause}CREATE TABLE {qualified_name} (\n{body}\n);"
|
|
226
|
+
print_verbose(f"Generated CREATE TABLE for {qualified_name}", verbose)
|
|
227
|
+
return statement
|