datablade 0.0.5-py3-none-any.whl → 0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datablade/__init__.py CHANGED
@@ -12,24 +12,28 @@ For backward compatibility, all functions are also available from datablade.core
 
 # Also maintain core for backward compatibility
 # Import from new organized structure
-from . import core, dataframes, io, sql, utils
+from . import core, dataframes, io, registry, sql, utils
 from .blade import Blade
 from .dataframes import read_file_chunked, read_file_smart, read_file_to_parquets
+from .registry import DialectSpec, ObjectNode, ObjectRef, ObjectRegistry
 from .sql import Dialect, bulk_load, generate_create_table
 
 # Convenience re-exports for commonly used functions
 from .utils.logging import configure_logging, get_logger
+from .utils.strings import configure_paths
 
-__version__ = "0.0.5"
+__version__ = "0.0.6"
 
 __all__ = [
     "dataframes",
     "io",
     "utils",
     "sql",
+    "registry",
     "core",  # Maintain backward compatibility
     # Convenience re-exports
     "configure_logging",
+    "configure_paths",
     "get_logger",
     "read_file_smart",
     "read_file_chunked",
@@ -38,4 +42,8 @@ __all__ = [
     "generate_create_table",
     "bulk_load",
     "Blade",
+    "DialectSpec",
+    "ObjectRef",
+    "ObjectNode",
+    "ObjectRegistry",
 ]
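For orientation, the 0.0.6 surface added above can be checked with imports alone. The sketch below assumes nothing beyond what these `__init__.py` hunks confirm (the re-exported names and the bumped `__version__`); constructor and call signatures for the new registry objects are not part of this diff.

```python
# Minimal sketch: only the names and __version__ shown in the hunks above are
# confirmed; no call signatures are assumed.
import datablade
from datablade import (
    DialectSpec,      # new re-export in 0.0.6
    ObjectNode,       # new re-export in 0.0.6
    ObjectRef,        # new re-export in 0.0.6
    ObjectRegistry,   # new re-export in 0.0.6
    configure_paths,  # new re-export in 0.0.6
)

assert datablade.__version__ == "0.0.6"
```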
datablade/blade.py CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Iterator, Optional, Union
+from typing import Any, Iterator, Optional, Sequence, Union
 
 import pandas as pd
 
@@ -19,9 +19,18 @@ from .dataframes import (
     read_file_smart,
     read_file_to_parquets,
     stream_to_parquets,
+    stream_to_sink,
     try_cast_string_columns_to_numeric,
 )
-from .sql import Dialect, generate_create_table, generate_create_table_from_parquet
+from .sql import (
+    Dialect,
+    ParquetDDLMetadata,
+    generate_create_table,
+    generate_create_table_from_parquet,
+    sqlserver_create_and_insert_from_parquet,
+    sqlserver_create_and_stage_from_parquets,
+    sqlserver_openrowset_parquet,
+)
 
 PathLike = Union[str, Path]
 
@@ -37,11 +46,18 @@ class Blade:
     verbose: bool = False
     convert_types: bool = True
 
-    def read(self, file_path: PathLike, **read_kwargs: Any) -> pd.DataFrame:
+    def read(
+        self,
+        file_path: PathLike,
+        *,
+        return_type: str = "dataframe",
+        **read_kwargs: Any,
+    ):
         return read_file_smart(
             file_path=file_path,
             memory_fraction=self.memory_fraction,
             verbose=self.verbose,
+            return_type=return_type,
             **read_kwargs,
         )
 
@@ -52,6 +68,7 @@ class Blade:
         chunksize: Optional[int] = None,
         **read_kwargs: Any,
     ) -> Iterator[pd.DataFrame]:
+        """Yield DataFrame chunks from a file without materializing."""
         return read_file_iter(
             file_path=file_path,
             chunksize=chunksize,
@@ -70,6 +87,7 @@ class Blade:
         convert_types: Optional[bool] = None,
         **read_kwargs: Any,
     ):
+        """Read and materialize a file into Parquet partitions."""
         return read_file_to_parquets(
             file_path=file_path,
             output_dir=output_dir,
@@ -93,6 +111,7 @@ class Blade:
         convert_types: Optional[bool] = None,
         **read_kwargs: Any,
     ):
+        """Stream input file chunks into Parquet partitions."""
         return stream_to_parquets(
             file_path=file_path,
             output_dir=output_dir,
@@ -106,10 +125,30 @@ class Blade:
             **read_kwargs,
         )
 
+    def stream_to_sink(
+        self,
+        chunks: Iterator[pd.DataFrame],
+        output_dir: PathLike,
+        *,
+        output_prefix: str = "part",
+        convert_types: Optional[bool] = None,
+    ):
+        return stream_to_sink(
+            chunks=chunks,
+            output_dir=output_dir,
+            output_prefix=output_prefix,
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            verbose=self.verbose,
+        )
+
     def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Normalize column names and remove duplicate columns."""
         return clean_dataframe_columns(df, verbose=self.verbose)
 
     def cast_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Attempt numeric conversion on string columns."""
         return try_cast_string_columns_to_numeric(df, verbose=self.verbose)
 
     def create_table_sql(
@@ -121,7 +160,10 @@ class Blade:
         table: str = "table",
         drop_existing: bool = True,
         dialect: Dialect = Dialect.SQLSERVER,
-    ) -> str:
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+    ) -> Union[str, tuple[str, ParquetDDLMetadata]]:
+        """Generate CREATE TABLE DDL from a DataFrame schema."""
         return generate_create_table(
             df=df,
             catalog=catalog,
@@ -129,19 +171,26 @@ class Blade:
             table=table,
             drop_existing=drop_existing,
             dialect=dialect,
+            use_go=use_go,
+            schema_spec=schema_spec,
             verbose=self.verbose,
         )
 
     def create_table_sql_from_parquet(
         self,
-        parquet_path: str,
+        parquet_path: PathLike,
         *,
         catalog: Optional[str] = None,
         schema: Optional[str] = None,
         table: str = "table",
         drop_existing: bool = True,
         dialect: Dialect = Dialect.SQLSERVER,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        fallback_to_json: bool = False,
+        return_metadata: bool = False,
     ) -> str:
+        """Generate CREATE TABLE DDL from a Parquet schema."""
         return generate_create_table_from_parquet(
             parquet_path=parquet_path,
             catalog=catalog,
@@ -149,5 +198,125 @@ class Blade:
             table=table,
             drop_existing=drop_existing,
             dialect=dialect,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            verbose=self.verbose,
+            fallback_to_json=fallback_to_json,
+            return_metadata=return_metadata,
+        )
+
+    def sqlserver_openrowset_parquet(
+        self,
+        parquet_path: PathLike,
+        *,
+        data_source: Optional[str] = None,
+        table_alias: str = "rows",
+        select_columns: Optional[Sequence[str]] = None,
+        where: Optional[str] = None,
+        top: Optional[int] = None,
+    ) -> str:
+        """Generate a SQL Server OPENROWSET query over Parquet files."""
+        return sqlserver_openrowset_parquet(
+            parquet_path,
+            data_source=data_source,
+            table_alias=table_alias,
+            select_columns=select_columns,
+            where=where,
+            top=top,
+        )
+
+    def sqlserver_create_and_insert_from_parquet(
+        self,
+        parquet_path: PathLike,
+        output_dir: PathLike,
+        *,
+        table: str,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        drop_existing: bool = True,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        rows_per_file: Optional[int] = None,
+        memory_fraction: Optional[float] = None,
+        convert_types: Optional[bool] = None,
+        output_prefix: str = "part",
+        delimiter: str = ",",
+        include_header: bool = True,
+        line_terminator: str = "\n",
+        first_row: Optional[int] = None,
+        tablock: bool = True,
+        codepage: Optional[str] = None,
+        fallback_to_json: bool = False,
+    ) -> tuple[str, list[Path]]:
+        """Create SQL Server DDL + BULK INSERT statements from Parquet."""
+        return sqlserver_create_and_insert_from_parquet(
+            parquet_path=parquet_path,
+            output_dir=output_dir,
+            table=table,
+            catalog=catalog,
+            schema=schema,
+            drop_existing=drop_existing,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            rows_per_file=rows_per_file,
+            memory_fraction=(
+                self.memory_fraction if memory_fraction is None else memory_fraction
+            ),
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            output_prefix=output_prefix,
+            delimiter=delimiter,
+            include_header=include_header,
+            line_terminator=line_terminator,
+            first_row=first_row,
+            tablock=tablock,
+            codepage=codepage,
+            fallback_to_json=fallback_to_json,
+            verbose=self.verbose,
+        )
+
+    def sqlserver_create_and_stage_from_parquets(
+        self,
+        parquet_paths: Sequence[PathLike],
+        output_dir: PathLike,
+        *,
+        table: str,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        drop_existing: bool = True,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        rows_per_file: Optional[int] = None,
+        memory_fraction: Optional[float] = None,
+        convert_types: Optional[bool] = None,
+        output_prefix: str = "part",
+        delimiter: str = ",",
+        include_header: bool = True,
+        line_terminator: str = "\n",
+        fallback_to_json: bool = False,
+    ) -> tuple[str, list[Path]]:
+        """Generate SQL Server DDL and stage multiple Parquet files as CSVs."""
+        return sqlserver_create_and_stage_from_parquets(
+            parquet_paths=parquet_paths,
+            output_dir=output_dir,
+            table=table,
+            catalog=catalog,
+            schema=schema,
+            drop_existing=drop_existing,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            rows_per_file=rows_per_file,
+            memory_fraction=(
+                self.memory_fraction if memory_fraction is None else memory_fraction
+            ),
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            output_prefix=output_prefix,
+            delimiter=delimiter,
+            include_header=include_header,
+            line_terminator=line_terminator,
+            fallback_to_json=fallback_to_json,
             verbose=self.verbose,
         )
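The Blade wrapper methods added above delegate one-to-one to the underlying datablade.sql helpers. A hedged usage sketch, using only parameter names visible in these hunks; file and table names are placeholders, and the behaviour of the wrapped functions is not shown in this diff.

```python
from datablade import Blade

blade = Blade(verbose=True)

# read() now accepts a keyword-only return_type (default "dataframe").
df = blade.read("input.csv", return_type="dataframe")

# New in 0.0.6: build an OPENROWSET query over a Parquet file.
query = blade.sqlserver_openrowset_parquet(
    "exports/part-0.parquet",  # placeholder path
    table_alias="rows",
    top=100,
)
print(query)
```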
@@ -19,11 +19,15 @@ from .frames import (
     write_to_file_and_sql,
 )
 from .readers import (
+    excel_to_parquets,
+    json_to_jsonl,
+    parquet_to_csv_partitions,
     read_file_chunked,
     read_file_iter,
     read_file_smart,
     read_file_to_parquets,
     stream_to_parquets,
+    stream_to_sink,
 )
 
 __all__ = [
@@ -35,9 +39,13 @@ __all__ = [
     "generate_sql_server_create_table_string",
     "write_to_file_and_sql",
     # Memory-aware readers
+    "excel_to_parquets",
+    "json_to_jsonl",
+    "parquet_to_csv_partitions",
     "read_file_chunked",
     "read_file_iter",
     "read_file_to_parquets",
+    "stream_to_sink",
     "stream_to_parquets",
     "read_file_smart",
 ]
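The hunk above lacks its file header in this diff text; judging from the `from .frames`/`from .readers` imports and the `from .dataframes import` lines in blade.py, it appears to be the `datablade.dataframes` package `__init__`. Assuming that, the new reader/exporter names should import as below; their signatures are not shown here, so this sketch deliberately stops at imports.

```python
from datablade.dataframes import (
    excel_to_parquets,           # new in 0.0.6
    json_to_jsonl,               # new in 0.0.6
    parquet_to_csv_partitions,   # new in 0.0.6
    stream_to_sink,              # new in 0.0.6
    read_file_smart,             # unchanged
)
```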
@@ -1,25 +1,42 @@
+"""DataFrame transformation and export helpers.
+
+This module focuses on schema-aware DataFrame cleanup, type inference,
+and downstream serialization helpers (Parquet schema/table generation
+and SQL Server-compatible DDL helpers).
+"""
+
+import os
 import pathlib
+import shutil
 import subprocess
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 
 from ..utils.logging import log_debug, log_error, log_info, log_warning
+from ..utils.strings import coerce_path, ensure_directory
 
 _BYTES_LIKE_TYPES = (bytes, bytearray, memoryview)
 
 
 def _is_bytes_like(value: Any) -> bool:
+    """Return True when a value should be treated as binary data."""
     return isinstance(value, _BYTES_LIKE_TYPES)
 
 
 def _infer_object_pa_type(col_data: pd.Series) -> pa.DataType:
+    """Infer a PyArrow type for object-dtype Series.
+
+    We sample non-null values and prefer "narrower" types (binary, string,
+    bool, integer, float) before falling back to pyarrow's inference.
+    """
     non_null = col_data.dropna()
     if non_null.empty:
         return pa.string()
 
+    # Sample to avoid scanning very large object columns.
     sample = non_null.iloc[:100].tolist()
 
     if all(_is_bytes_like(v) for v in sample):
@@ -45,6 +62,8 @@ def _infer_object_pa_type(col_data: pd.Series) -> pa.DataType:
         return pa.float64()
 
     try:
+        # PyArrow can infer types for mixed object values; we normalize into
+        # the closest "primitive" type for DDL/Parquet stability.
         inferred = pa.infer_type(sample)
         if pa.types.is_binary(inferred) or pa.types.is_large_binary(inferred):
             return pa.binary()
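The comment added above describes normalizing PyArrow's own inference down to a primitive type. As an illustration of the building block only: `pa.infer_type` is a public PyArrow API, while the normalization rules themselves are datablade's.

```python
import pyarrow as pa

print(pa.infer_type(["a", "b"]))    # string
print(pa.infer_type([b"x", b"y"]))  # binary
print(pa.infer_type([1, 2, 3]))     # int64
```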
@@ -93,9 +112,14 @@ def try_cast_string_columns_to_numeric(
         raise TypeError("df must be a pandas DataFrame")
 
     for col in df.columns:
-        if df[col].dtype == "object":
+        dtype = df[col].dtype
+        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
             non_null = df[col].dropna()
-            if not non_null.empty and non_null.iloc[:100].map(_is_bytes_like).any():
+            if (
+                pd.api.types.is_object_dtype(dtype)
+                and not non_null.empty
+                and non_null.iloc[:100].map(_is_bytes_like).any()
+            ):
                 log_debug(
                     f"Column '{col}' contains bytes-like values; skipping numeric coercion.",
                     verbose,
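The widened dtype check above matters because pandas' string extension dtype is not object dtype, so the old `df[col].dtype == "object"` test silently skipped those columns. A small standalone illustration (pure pandas, no datablade APIs):

```python
import pandas as pd

s_obj = pd.Series(["1", "2"])                  # object dtype
s_str = pd.Series(["1", "2"], dtype="string")  # StringDtype

print(s_obj.dtype == "object")              # True
print(s_str.dtype == "object")              # False -> old check skipped it
print(pd.api.types.is_string_dtype(s_str))  # True  -> new check catches it
```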
@@ -144,16 +168,16 @@ def clean_dataframe_columns(
         raise ValueError("df must be provided")
     if not isinstance(df, pd.DataFrame):
         raise TypeError("df must be a pandas DataFrame")
-    # Step 1: Flatten MultiIndex columns
+    # Step 1: Flatten MultiIndex columns.
     if isinstance(df.columns, pd.MultiIndex):
         df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
         log_info("Flattened MultiIndex columns.", verbose)
 
-    # Step 2: Convert non-string column names to strings
+    # Step 2: Convert non-string column names to strings.
     df.columns = df.columns.map(str)
     log_debug("Converted column names to strings.", verbose)
 
-    # Step 3: Remove duplicate columns, keeping the first occurrence
+    # Step 3: Remove duplicate columns, keeping the first occurrence.
     duplicates = df.columns.duplicated()
     if duplicates.any():
         duplicate_cols = df.columns[duplicates]
@@ -190,14 +214,14 @@ def generate_parquet_schema(
         col_name = column
         dtype = col_data.dtype
 
-        # Determine if the column contains any nulls
+        # Determine if the column contains any nulls.
         nullable = col_data.isnull().any()
 
-        # Map pandas dtype to PyArrow type
+        # Map pandas dtype to PyArrow type.
         pa_type = None
 
         if pd.api.types.is_integer_dtype(dtype):
-            # Check the range to determine the smallest integer type
+            # Check the range to determine the smallest integer type.
             non_null = col_data.dropna()
             if non_null.empty:
                 pa_type = pa.int64()
@@ -249,7 +273,7 @@ def generate_parquet_schema(
         else:
             pa_type = pa.string()
 
-        # Create a field
+        # Create a field.
         field = pa.field(col_name, pa_type, nullable=nullable)
         fields.append(field)
 
@@ -285,6 +309,7 @@ def pandas_to_parquet_table(
         raise TypeError("df must be a pandas DataFrame")
 
     def _unique_col_name(existing: set[str], desired: str) -> str:
+        """Return a column name that does not collide with existing names."""
        if desired not in existing:
             return desired
         i = 1
@@ -293,6 +318,7 @@ def pandas_to_parquet_table(
         return f"{desired}_{i}"
 
     def _materialize_index_columns(input_df: pd.DataFrame) -> pd.DataFrame:
+        """Convert Index/MultiIndex into explicit columns."""
         if isinstance(input_df.index, pd.MultiIndex):
             index_names: list[str] = []
             for i, name in enumerate(input_df.index.names):
@@ -319,16 +345,20 @@ def pandas_to_parquet_table(
         out_df = out_df.reset_index(drop=True)
         return out_df
 
+    # Clean columns before schema inference to avoid duplicate/invalid names.
     df = clean_dataframe_columns(df=df, verbose=verbose)
 
     if preserve_index:
+        # Preserve index by materializing it into real columns.
         df = _materialize_index_columns(df)
 
     if convert:
+        # Attempt numeric coercion so Parquet schema uses numeric types.
         df = try_cast_string_columns_to_numeric(
             df=df, convert_partial=partial, verbose=verbose
         )
 
+    # Build schema explicitly so the Parquet table uses stable, optimized types.
     schema = generate_parquet_schema(df=df, verbose=verbose)
     try:
         # We materialize index into regular columns above so that the schema can
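The new comments above spell out the conversion order: clean columns, optionally materialize the index, attempt numeric coercion, then build an explicit schema before constructing the table. A rough standalone analogue using only public pandas/pyarrow calls, not datablade's own cleaning or schema logic:

```python
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": ["1", "2"], "b": [1.5, 2.5]})
df["a"] = pd.to_numeric(df["a"])                           # numeric coercion step
schema = pa.Schema.from_pandas(df, preserve_index=False)   # explicit schema first
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
print(table.schema)
```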
@@ -347,6 +377,8 @@ def generate_sql_server_create_table_string(
     schema: str = "dbo",
     table: str = "table",
     dropexisting: bool = True,
+    use_go: bool = False,
+    schema_spec: Optional[dict] = None,
     verbose: bool = False,
 ) -> str:
     """
@@ -358,6 +390,7 @@ def generate_sql_server_create_table_string(
         schema: The schema name (default: 'dbo').
         table: The table name.
         dropexisting: If True, includes DROP TABLE IF EXISTS statement.
+        use_go: If True, inserts GO after USE when a catalog is supplied.
         verbose: If True, prints progress messages.
 
     Returns:
@@ -389,18 +422,23 @@ def generate_sql_server_create_table_string(
         table=table,
         drop_existing=dropexisting,
         dialect=Dialect.SQLSERVER,
+        use_go=use_go,
+        schema_spec=schema_spec,
         verbose=verbose,
     )
 
 
 def write_to_file_and_sql(
     df: pd.DataFrame,
-    file_path: str,
+    file_path: Union[str, pathlib.Path],
     table_name: str,
     sql_server: str,
     database: str,
-    username: str,
-    password: str,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    use_trusted_connection: bool = False,
+    use_azure_ad: bool = False,
+    use_env_credentials: bool = True,
     verbose: bool = False,
 ) -> None:
     """
@@ -414,6 +452,10 @@ def write_to_file_and_sql(
         database: Database name.
         username: SQL Server username.
         password: SQL Server password.
+        use_trusted_connection: If True, use integrated authentication (-T).
+        use_azure_ad: If True, use Azure AD authentication (-G).
+        use_env_credentials: If True, fall back to DATABLADE_SQLSERVER_USERNAME
+            and DATABLADE_SQLSERVER_PASSWORD when username/password not provided.
         verbose: If True, prints progress messages.
 
     Raises:
@@ -421,21 +463,34 @@ def write_to_file_and_sql(
     """
     if df is None or not isinstance(df, pd.DataFrame):
         raise TypeError("df must be a pandas DataFrame")
-    if not isinstance(file_path, str) or not file_path:
-        raise ValueError("file_path must be a non-empty string")
+    path_obj = coerce_path(
+        file_path,
+        must_exist=False,
+        verbose=verbose,
+        label="file_path",
+    )
     if not isinstance(table_name, str) or not table_name.strip():
         raise ValueError("table_name must be a non-empty string")
     if not isinstance(sql_server, str) or not sql_server.strip():
         raise ValueError("sql_server must be a non-empty string")
     if not isinstance(database, str) or not database.strip():
         raise ValueError("database must be a non-empty string")
-    if not isinstance(username, str) or not username.strip():
-        raise ValueError("username must be a non-empty string")
-    if not isinstance(password, str) or not password:
-        raise ValueError("password must be provided")
-
-    path_obj = pathlib.Path(file_path)
-    path_obj.parent.mkdir(parents=True, exist_ok=True)
+    if use_trusted_connection and use_azure_ad:
+        raise ValueError(
+            "use_trusted_connection and use_azure_ad are mutually exclusive"
+        )
+    if use_env_credentials:
+        if not username:
+            username = os.getenv("DATABLADE_SQLSERVER_USERNAME")
+        if not password:
+            password = os.getenv("DATABLADE_SQLSERVER_PASSWORD")
+    if not use_trusted_connection:
+        if not isinstance(username, str) or not username.strip():
+            raise ValueError("username must be a non-empty string")
+        if not password and not use_azure_ad:
+            raise ValueError("password must be provided")
+
+    ensure_directory(path_obj.parent, verbose=verbose, label="output_dir")
 
     df.to_csv(path_obj, index=False)
     log_info(f"DataFrame written to file {path_obj}.", verbose)
@@ -450,16 +505,57 @@ def write_to_file_and_sql(
         "-t,",
         "-S",
         sql_server,
-        "-U",
-        username,
-        "-P",
-        password,
     ]
+    bcp_preview = bcp_args[:-1] + ["***REDACTED***"]
+    bcp_path = shutil.which("bcp")
+    if not bcp_path:
+        install_steps = (
+            "Install the SQL Server command line utilities (bcp) and ensure the "
+            "binary is on PATH. For example: "
+            "macOS (Homebrew): brew install msodbcsql17 mssql-tools; "
+            "Linux (Debian/Ubuntu): install mssql-tools; "
+            "Windows: install SQL Server Command Line Utilities and restart your shell."
+        )
+        path_env = os.environ.get("PATH", "")
+        message = (
+            "BCP executable was not found on PATH. "
+            f"PATH={path_env}. {install_steps} "
+            f"Command preview: {bcp_preview}."
+        )
+        log_error(message, verbose)
+        raise FileNotFoundError(message)
+
+    if use_trusted_connection:
+        bcp_args.append("-T")
+    else:
+        if use_azure_ad:
+            bcp_args.append("-G")
+        if username:
+            bcp_args.extend(["-U", username])
+        if password:
+            bcp_args.extend(["-P", password])
 
     log_debug(
         f"Executing BCP load to {qualified_table} on server {sql_server} as user {username}.",
         verbose,
     )
+    redacted_args = []
+    redact_next = False
+    for arg in bcp_args:
+        if redact_next:
+            redacted_args.append("***REDACTED***")
+            redact_next = False
+            continue
+        redacted_args.append(arg)
+        if arg == "-P":
+            redact_next = True
+    if "-P" in bcp_args:
+        log_warning(
+            "BCP authentication uses -P with a plaintext password. "
+            "Consider using trusted connection (-T) or Azure AD (-G).",
+            verbose,
+        )
+    log_debug(f"BCP args: {redacted_args}", verbose)
     process = subprocess.run(
         bcp_args,
         shell=False,
@@ -474,7 +570,11 @@ def write_to_file_and_sql(
     else:
         error_msg = process.stderr.decode()
         log_error(
-            f"Error writing DataFrame to SQL Server table {table_name}: {error_msg}",
+            "Error writing DataFrame to SQL Server table "
+            f"{table_name}: {error_msg} "
+            f"PATH={os.environ.get('PATH', '')}. "
+            "Ensure BCP is installed (SQL Server command line utilities) and on PATH. "
+            f"Command preview: {bcp_preview}.",
             verbose,
         )
         raise subprocess.CalledProcessError(
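Tying the write_to_file_and_sql changes together: credentials are now optional, integrated and Azure AD authentication are supported, and the bcp CLI must still be on PATH. A hedged call sketch with placeholder server, path, and table names:

```python
import pandas as pd
from datablade.dataframes import write_to_file_and_sql

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# Integrated authentication (-T): no username/password required.
write_to_file_and_sql(
    df,
    file_path="staging/out.csv",   # placeholder path
    table_name="dbo.demo",         # placeholder table
    sql_server="localhost",
    database="scratch",
    use_trusted_connection=True,
    verbose=True,
)

# Alternatively, leave username/password unset and rely on the
# DATABLADE_SQLSERVER_USERNAME / DATABLADE_SQLSERVER_PASSWORD environment
# variables (use_env_credentials defaults to True).
```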