datablade 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -5,13 +5,16 @@ Provides dialect-aware bulk loading from files to database tables.
5
5
  Supports SQL Server (BCP), PostgreSQL (COPY), MySQL (LOAD DATA), and DuckDB.
6
6
  """
7
7
 
8
+ import os
8
9
  import pathlib
10
+ import shutil
9
11
  import subprocess
10
- from typing import Optional, Union
12
+ from typing import Iterable, Optional, Union
11
13
 
12
14
  import pandas as pd
13
15
 
14
- from ..utils.logging import log_debug, log_error, log_info
16
+ from ..utils.logging import log_debug, log_error, log_info, log_warning
17
+ from ..utils.strings import coerce_path, ensure_directory
15
18
  from .dialects import Dialect
16
19
  from .quoting import quote_identifier
17
20
 
@@ -23,15 +26,18 @@ def _validate_bulk_load_params(
23
26
  server: Optional[str] = None,
24
27
  username: Optional[str] = None,
25
28
  password: Optional[str] = None,
29
+ use_trusted_connection: bool = False,
30
+ use_azure_ad: bool = False,
26
31
  dialect: Dialect = Dialect.SQLSERVER,
32
+ verbose: bool = False,
27
33
  ) -> pathlib.Path:
28
34
  """Validate bulk load parameters and return resolved path."""
29
- if not file_path:
30
- raise ValueError("file_path must be provided")
31
-
32
- path_obj = pathlib.Path(file_path)
33
- if not path_obj.exists():
34
- raise ValueError(f"File does not exist: {path_obj}")
35
+ path_obj = coerce_path(
36
+ file_path,
37
+ must_exist=True,
38
+ verbose=verbose,
39
+ label="file_path",
40
+ )
35
41
 
36
42
  if not isinstance(table_name, str) or not table_name.strip():
37
43
  raise ValueError("table_name must be a non-empty string")
@@ -41,23 +47,106 @@ def _validate_bulk_load_params(
41
47
  if dialect == Dialect.SQLSERVER:
42
48
  if not server:
43
49
  raise ValueError("server is required for SQL Server")
50
+ if use_trusted_connection and use_azure_ad:
51
+ raise ValueError(
52
+ "use_trusted_connection and use_azure_ad are mutually exclusive"
53
+ )
54
+ if use_trusted_connection:
55
+ return path_obj
44
56
  if not username:
45
57
  raise ValueError("username is required for SQL Server")
46
- if not password:
58
+ if not password and not use_azure_ad:
47
59
  raise ValueError("password is required for SQL Server")
60
+ if use_azure_ad and not username:
61
+ raise ValueError("username is required for Azure AD authentication")
48
62
 
49
63
  return path_obj
50
64
 
51
65
 
66
+ def _build_bcp_args(
67
+ file_path: Union[str, pathlib.Path],
68
+ table_name: str,
69
+ database: str,
70
+ server: str,
71
+ username: Optional[str],
72
+ password: Optional[str],
73
+ schema: str,
74
+ delimiter: str,
75
+ use_trusted_connection: bool,
76
+ use_azure_ad: bool,
77
+ use_env_credentials: bool,
78
+ verbose: bool,
79
+ ) -> list[str]:
80
+ if use_env_credentials:
81
+ if not username:
82
+ username = os.getenv("DATABLADE_SQLSERVER_USERNAME")
83
+ if not password:
84
+ password = os.getenv("DATABLADE_SQLSERVER_PASSWORD")
85
+
86
+ path_obj = _validate_bulk_load_params(
87
+ file_path,
88
+ table_name,
89
+ database,
90
+ server,
91
+ username,
92
+ password,
93
+ use_trusted_connection=use_trusted_connection,
94
+ use_azure_ad=use_azure_ad,
95
+ dialect=Dialect.SQLSERVER,
96
+ verbose=verbose,
97
+ )
98
+
99
+ qualified_table = f"{database}.{schema}.{table_name}"
100
+ bcp_args = [
101
+ "bcp",
102
+ qualified_table,
103
+ "in",
104
+ str(path_obj),
105
+ "-c",
106
+ f"-t{delimiter}",
107
+ "-S",
108
+ server,
109
+ ]
110
+
111
+ if use_trusted_connection:
112
+ bcp_args.append("-T")
113
+ else:
114
+ if use_azure_ad:
115
+ bcp_args.append("-G")
116
+ if username:
117
+ bcp_args.extend(["-U", username])
118
+ if password:
119
+ bcp_args.extend(["-P", password])
120
+
121
+ return bcp_args
122
+
123
+
124
+ def _redact_bcp_args(bcp_args: list[str]) -> list[str]:
125
+ redacted_args: list[str] = []
126
+ redact_next = False
127
+ for arg in bcp_args:
128
+ if redact_next:
129
+ redacted_args.append("***REDACTED***")
130
+ redact_next = False
131
+ continue
132
+ redacted_args.append(arg)
133
+ if arg == "-P":
134
+ redact_next = True
135
+ return redacted_args
136
+
137
+
52
138
  def bulk_load_sqlserver(
53
139
  file_path: Union[str, pathlib.Path],
54
140
  table_name: str,
55
141
  database: str,
56
142
  server: str,
57
- username: str,
58
- password: str,
143
+ username: Optional[str] = None,
144
+ password: Optional[str] = None,
59
145
  schema: str = "dbo",
60
146
  delimiter: str = ",",
147
+ use_trusted_connection: bool = False,
148
+ use_azure_ad: bool = False,
149
+ use_env_credentials: bool = True,
61
150
  verbose: bool = False,
62
151
  ) -> None:
63
152
  """
@@ -72,38 +161,58 @@ def bulk_load_sqlserver(
72
161
  password: SQL Server password.
73
162
  schema: Schema name (default: dbo).
74
163
  delimiter: Field delimiter (default: comma).
164
+ use_trusted_connection: If True, use integrated authentication (-T).
165
+ use_azure_ad: If True, use Azure AD authentication (-G).
166
+ use_env_credentials: If True, fall back to DATABLADE_SQLSERVER_USERNAME
167
+ and DATABLADE_SQLSERVER_PASSWORD when username/password not provided.
75
168
  verbose: If True, logs progress messages.
76
169
 
77
170
  Raises:
78
171
  ValueError: On invalid inputs.
79
172
  subprocess.CalledProcessError: If BCP command fails.
80
173
  """
81
- path_obj = _validate_bulk_load_params(
82
- file_path, table_name, database, server, username, password, Dialect.SQLSERVER
174
+ bcp_args = _build_bcp_args(
175
+ file_path=file_path,
176
+ table_name=table_name,
177
+ database=database,
178
+ server=server,
179
+ username=username,
180
+ password=password,
181
+ schema=schema,
182
+ delimiter=delimiter,
183
+ use_trusted_connection=use_trusted_connection,
184
+ use_azure_ad=use_azure_ad,
185
+ use_env_credentials=use_env_credentials,
186
+ verbose=verbose,
83
187
  )
84
188
 
85
- qualified_table = f"{database}.{schema}.{table_name}"
86
-
87
- bcp_args = [
88
- "bcp",
89
- qualified_table,
90
- "in",
91
- str(path_obj),
92
- "-c",
93
- f"-t{delimiter}",
94
- "-S",
95
- server,
96
- "-U",
97
- username,
98
- "-P",
99
- password,
100
- ]
101
-
102
- log_info(f"Executing BCP load to {qualified_table}", verbose)
103
- log_debug(
104
- f"BCP args: {bcp_args[:-1] + ['***REDACTED***']}",
105
- verbose,
106
- )
189
+ bcp_preview = _redact_bcp_args(bcp_args)
190
+ bcp_path = shutil.which("bcp")
191
+ if not bcp_path:
192
+ install_steps = (
193
+ "Install the SQL Server command line utilities (bcp) and ensure the "
194
+ "binary is on PATH. For example: "
195
+ "macOS (Homebrew): brew install msodbcsql17 mssql-tools; "
196
+ "Linux (Debian/Ubuntu): install mssql-tools; "
197
+ "Windows: install SQL Server Command Line Utilities and restart your shell."
198
+ )
199
+ path_env = os.environ.get("PATH", "")
200
+ message = (
201
+ "BCP executable was not found on PATH. "
202
+ f"PATH={path_env}. {install_steps} "
203
+ f"Command preview: {bcp_preview}."
204
+ )
205
+ log_error(message, verbose)
206
+ raise FileNotFoundError(message)
207
+
208
+ log_info(f"Executing BCP load to {bcp_args[1]}", verbose)
209
+ if "-P" in bcp_args:
210
+ log_warning(
211
+ "BCP authentication uses -P with a plaintext password. "
212
+ "Consider using trusted connection (-T) or Azure AD (-G).",
213
+ verbose,
214
+ )
215
+ log_debug(f"BCP args: {bcp_preview}", verbose)
107
216
 
108
217
  try:
109
218
  process = subprocess.run(
@@ -113,15 +222,142 @@ def bulk_load_sqlserver(
113
222
  stdout=subprocess.PIPE,
114
223
  stderr=subprocess.PIPE,
115
224
  )
116
- log_info(f"Successfully loaded data to {qualified_table}", verbose)
225
+ log_info(f"Successfully loaded data to {bcp_args[1]}", verbose)
117
226
  if process.stdout:
118
227
  log_debug(f"BCP output: {process.stdout.decode()}", verbose)
119
228
  except subprocess.CalledProcessError as e:
120
229
  error_msg = e.stderr.decode() if e.stderr else str(e)
121
- log_error(f"BCP load failed: {error_msg}", verbose)
230
+ log_error(
231
+ f"BCP load failed: {error_msg} "
232
+ f"PATH={os.environ.get('PATH', '')}. "
233
+ "Ensure BCP is installed (SQL Server command line utilities) and on PATH. "
234
+ f"Command preview: {bcp_preview}.",
235
+ verbose,
236
+ )
122
237
  raise
123
238
 
124
239
 
240
+ def bulk_load_sqlserver_command(
241
+ file_path: Union[str, pathlib.Path],
242
+ table_name: str,
243
+ database: str,
244
+ server: str,
245
+ username: Optional[str] = None,
246
+ password: Optional[str] = None,
247
+ schema: str = "dbo",
248
+ delimiter: str = ",",
249
+ use_trusted_connection: bool = False,
250
+ use_azure_ad: bool = False,
251
+ use_env_credentials: bool = True,
252
+ redact_password: bool = False,
253
+ verbose: bool = False,
254
+ ) -> str:
255
+ """Return a BCP command string for a SQL Server bulk load."""
256
+ bcp_args = _build_bcp_args(
257
+ file_path=file_path,
258
+ table_name=table_name,
259
+ database=database,
260
+ server=server,
261
+ username=username,
262
+ password=password,
263
+ schema=schema,
264
+ delimiter=delimiter,
265
+ use_trusted_connection=use_trusted_connection,
266
+ use_azure_ad=use_azure_ad,
267
+ use_env_credentials=use_env_credentials,
268
+ verbose=verbose,
269
+ )
270
+
271
+ if redact_password:
272
+ bcp_args = _redact_bcp_args(bcp_args)
273
+
274
+ return subprocess.list2cmdline(bcp_args)
275
+
276
+
277
+ def bulk_load_sqlserver_many(
278
+ file_paths: list[Union[str, pathlib.Path]],
279
+ table_name: str,
280
+ database: str,
281
+ server: str,
282
+ username: Optional[str] = None,
283
+ password: Optional[str] = None,
284
+ schema: str = "dbo",
285
+ delimiter: str = ",",
286
+ use_trusted_connection: bool = False,
287
+ use_azure_ad: bool = False,
288
+ use_env_credentials: bool = True,
289
+ verbose: bool = False,
290
+ ) -> None:
291
+ """Bulk load multiple files into SQL Server using BCP."""
292
+ if file_paths is None:
293
+ raise ValueError("file_paths must be provided")
294
+ if not isinstance(file_paths, list):
295
+ raise TypeError("file_paths must be a list of paths")
296
+ if not file_paths:
297
+ raise ValueError("file_paths must contain at least one path")
298
+
299
+ for file_path in file_paths:
300
+ bulk_load_sqlserver(
301
+ file_path=file_path,
302
+ table_name=table_name,
303
+ database=database,
304
+ server=server,
305
+ username=username,
306
+ password=password,
307
+ schema=schema,
308
+ delimiter=delimiter,
309
+ use_trusted_connection=use_trusted_connection,
310
+ use_azure_ad=use_azure_ad,
311
+ use_env_credentials=use_env_credentials,
312
+ verbose=verbose,
313
+ )
314
+
315
+
316
+ def bulk_load_sqlserver_commands(
317
+ file_paths: Iterable[Union[str, pathlib.Path]],
318
+ table_name: str,
319
+ database: str,
320
+ server: str,
321
+ username: Optional[str] = None,
322
+ password: Optional[str] = None,
323
+ schema: str = "dbo",
324
+ delimiter: str = ",",
325
+ use_trusted_connection: bool = False,
326
+ use_azure_ad: bool = False,
327
+ use_env_credentials: bool = True,
328
+ redact_password: bool = False,
329
+ verbose: bool = False,
330
+ ) -> list[str]:
331
+ """Return BCP command strings for multiple SQL Server bulk loads."""
332
+ if file_paths is None:
333
+ raise ValueError("file_paths must be provided")
334
+
335
+ commands: list[str] = []
336
+ for file_path in file_paths:
337
+ commands.append(
338
+ bulk_load_sqlserver_command(
339
+ file_path=file_path,
340
+ table_name=table_name,
341
+ database=database,
342
+ server=server,
343
+ username=username,
344
+ password=password,
345
+ schema=schema,
346
+ delimiter=delimiter,
347
+ use_trusted_connection=use_trusted_connection,
348
+ use_azure_ad=use_azure_ad,
349
+ use_env_credentials=use_env_credentials,
350
+ redact_password=redact_password,
351
+ verbose=verbose,
352
+ )
353
+ )
354
+
355
+ if not commands:
356
+ raise ValueError("file_paths must contain at least one path")
357
+
358
+ return commands
359
+
360
+
125
361
  def bulk_load_postgres(
126
362
  file_path: Union[str, pathlib.Path],
127
363
  table_name: str,
@@ -152,17 +388,25 @@ def bulk_load_postgres(
152
388
  ValueError: On invalid inputs.
153
389
  """
154
390
  path_obj = _validate_bulk_load_params(
155
- file_path, table_name, database, dialect=Dialect.POSTGRES
391
+ file_path,
392
+ table_name,
393
+ database,
394
+ dialect=Dialect.POSTGRES,
395
+ verbose=verbose,
156
396
  )
157
397
 
158
- qualified_table = f"{quote_identifier(schema, Dialect.POSTGRES)}.{quote_identifier(table_name, Dialect.POSTGRES)}"
159
-
160
- header_clause = "HEADER" if header else ""
161
- copy_cmd = (
162
- f"\\COPY {qualified_table} FROM '{path_obj}' "
163
- f"WITH (FORMAT csv, DELIMITER '{delimiter}', {header_clause})"
398
+ qualified_table = (
399
+ f"{quote_identifier(schema, Dialect.POSTGRES)}."
400
+ f"{quote_identifier(table_name, Dialect.POSTGRES)}"
164
401
  )
165
402
 
403
+ options = ["FORMAT csv", f"DELIMITER '{delimiter}'"]
404
+ if header:
405
+ options.append("HEADER")
406
+
407
+ options_sql = ", ".join(options)
408
+ copy_cmd = f"\\COPY {qualified_table} FROM '{path_obj}' WITH ({options_sql})"
409
+
166
410
  log_info(f"Generated COPY command for {qualified_table}", verbose)
167
411
  return copy_cmd
168
412
 
@@ -197,10 +441,17 @@ def bulk_load_mysql(
197
441
  ValueError: On invalid inputs.
198
442
  """
199
443
  path_obj = _validate_bulk_load_params(
200
- file_path, table_name, database, dialect=Dialect.MYSQL
444
+ file_path,
445
+ table_name,
446
+ database,
447
+ dialect=Dialect.MYSQL,
448
+ verbose=verbose,
201
449
  )
202
450
 
203
- qualified_table = f"{quote_identifier(database, Dialect.MYSQL)}.{quote_identifier(table_name, Dialect.MYSQL)}"
451
+ qualified_table = (
452
+ f"{quote_identifier(database, Dialect.MYSQL)}."
453
+ f"{quote_identifier(table_name, Dialect.MYSQL)}"
454
+ )
204
455
 
205
456
  load_cmd = (
206
457
  f"LOAD DATA LOCAL INFILE '{path_obj}' "
@@ -239,7 +490,11 @@ def bulk_load_duckdb(
239
490
  ValueError: On invalid inputs.
240
491
  """
241
492
  path_obj = _validate_bulk_load_params(
242
- file_path, table_name, database, dialect=Dialect.DUCKDB
493
+ file_path,
494
+ table_name,
495
+ database,
496
+ dialect=Dialect.DUCKDB,
497
+ verbose=verbose,
243
498
  )
244
499
 
245
500
  qualified_table = f"{quote_identifier(schema, Dialect.DUCKDB)}.{quote_identifier(table_name, Dialect.DUCKDB)}"
@@ -377,8 +632,13 @@ def write_dataframe_and_load(
377
632
  if df is None or not isinstance(df, pd.DataFrame):
378
633
  raise TypeError("df must be a pandas DataFrame")
379
634
 
380
- path_obj = pathlib.Path(file_path)
381
- path_obj.parent.mkdir(parents=True, exist_ok=True)
635
+ path_obj = coerce_path(
636
+ file_path,
637
+ must_exist=False,
638
+ verbose=verbose,
639
+ label="file_path",
640
+ )
641
+ ensure_directory(path_obj.parent, verbose=verbose, label="output_dir")
382
642
 
383
643
  # Write based on file extension
384
644
  suffix = path_obj.suffix.lower()