datablade-0.0.5-py3-none-any.whl → datablade-0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +10 -2
- datablade/blade.py +174 -5
- datablade/dataframes/__init__.py +8 -0
- datablade/dataframes/frames.py +127 -27
- datablade/dataframes/readers.py +988 -161
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/json.py +45 -8
- datablade/io/zip.py +68 -30
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +25 -1
- datablade/sql/bulk_load.py +309 -49
- datablade/sql/ddl.py +201 -26
- datablade/sql/ddl_pyarrow.py +150 -26
- datablade/sql/dialects.py +2 -0
- datablade/sql/quoting.py +2 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +2 -1
- datablade/utils/lists.py +3 -0
- datablade/utils/logging.py +46 -1
- datablade/utils/strings.py +180 -17
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/METADATA +68 -13
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- datablade-0.0.5.dist-info/RECORD +0 -31
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/licenses/LICENSE +0 -0
- {datablade-0.0.5.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/__init__.py
CHANGED
@@ -12,24 +12,28 @@ For backward compatibility, all functions are also available from datablade.core
 
 # Also maintain core for backward compatibility
 # Import from new organized structure
-from . import core, dataframes, io, sql, utils
+from . import core, dataframes, io, registry, sql, utils
 from .blade import Blade
 from .dataframes import read_file_chunked, read_file_smart, read_file_to_parquets
+from .registry import DialectSpec, ObjectNode, ObjectRef, ObjectRegistry
 from .sql import Dialect, bulk_load, generate_create_table
 
 # Convenience re-exports for commonly used functions
 from .utils.logging import configure_logging, get_logger
+from .utils.strings import configure_paths
 
-__version__ = "0.0.5"
+__version__ = "0.0.6"
 
 __all__ = [
     "dataframes",
     "io",
     "utils",
     "sql",
+    "registry",
     "core",  # Maintain backward compatibility
     # Convenience re-exports
     "configure_logging",
+    "configure_paths",
     "get_logger",
     "read_file_smart",
     "read_file_chunked",
@@ -38,4 +42,8 @@ __all__ = [
     "generate_create_table",
     "bulk_load",
     "Blade",
+    "DialectSpec",
+    "ObjectRef",
+    "ObjectNode",
+    "ObjectRegistry",
 ]
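0.0.6 adds the registry module and `configure_paths` to the package's top-level exports and bumps `__version__`. A minimal sketch of the new import surface (only the exported names are taken from this diff; constructor signatures for the registry classes live in datablade/registry.py and are not shown here):

```python
# Only the names exported above are assumed; how DialectSpec / ObjectRegistry
# are constructed is defined in datablade/registry.py, not in this hunk.
import datablade
from datablade import DialectSpec, ObjectNode, ObjectRef, ObjectRegistry, configure_paths

print(datablade.__version__)      # "0.0.6"
print(ObjectRegistry.__module__)  # "datablade.registry"
```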
datablade/blade.py
CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Iterator, Optional, Union
+from typing import Any, Iterator, Optional, Sequence, Union
 
 import pandas as pd
 
@@ -19,9 +19,18 @@ from .dataframes import (
     read_file_smart,
     read_file_to_parquets,
     stream_to_parquets,
+    stream_to_sink,
     try_cast_string_columns_to_numeric,
 )
-from .sql import
+from .sql import (
+    Dialect,
+    ParquetDDLMetadata,
+    generate_create_table,
+    generate_create_table_from_parquet,
+    sqlserver_create_and_insert_from_parquet,
+    sqlserver_create_and_stage_from_parquets,
+    sqlserver_openrowset_parquet,
+)
 
 PathLike = Union[str, Path]
 
@@ -37,11 +46,18 @@ class Blade:
     verbose: bool = False
     convert_types: bool = True
 
-    def read(
+    def read(
+        self,
+        file_path: PathLike,
+        *,
+        return_type: str = "dataframe",
+        **read_kwargs: Any,
+    ):
         return read_file_smart(
             file_path=file_path,
             memory_fraction=self.memory_fraction,
             verbose=self.verbose,
+            return_type=return_type,
             **read_kwargs,
         )
 
@@ -52,6 +68,7 @@ class Blade:
         chunksize: Optional[int] = None,
         **read_kwargs: Any,
     ) -> Iterator[pd.DataFrame]:
+        """Yield DataFrame chunks from a file without materializing."""
         return read_file_iter(
             file_path=file_path,
             chunksize=chunksize,
@@ -70,6 +87,7 @@ class Blade:
         convert_types: Optional[bool] = None,
         **read_kwargs: Any,
     ):
+        """Read and materialize a file into Parquet partitions."""
         return read_file_to_parquets(
             file_path=file_path,
             output_dir=output_dir,
@@ -93,6 +111,7 @@ class Blade:
         convert_types: Optional[bool] = None,
         **read_kwargs: Any,
     ):
+        """Stream input file chunks into Parquet partitions."""
         return stream_to_parquets(
             file_path=file_path,
             output_dir=output_dir,
@@ -106,10 +125,30 @@ class Blade:
             **read_kwargs,
         )
 
+    def stream_to_sink(
+        self,
+        chunks: Iterator[pd.DataFrame],
+        output_dir: PathLike,
+        *,
+        output_prefix: str = "part",
+        convert_types: Optional[bool] = None,
+    ):
+        return stream_to_sink(
+            chunks=chunks,
+            output_dir=output_dir,
+            output_prefix=output_prefix,
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            verbose=self.verbose,
+        )
+
     def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Normalize column names and remove duplicate columns."""
         return clean_dataframe_columns(df, verbose=self.verbose)
 
     def cast_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Attempt numeric conversion on string columns."""
         return try_cast_string_columns_to_numeric(df, verbose=self.verbose)
 
     def create_table_sql(
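The hunk above adds `Blade.stream_to_sink`, which forwards an existing chunk iterator to the module-level `stream_to_sink` using the blade's `convert_types` and `verbose` defaults. A minimal sketch of feeding it chunks produced by `read_file_iter` (hypothetical input/output paths; the keyword names follow the signatures shown in the hunks above):

```python
# Hypothetical paths; Blade fields and the stream_to_sink/read_file_iter
# signatures are taken from the hunks above.
from datablade import Blade
from datablade.dataframes import read_file_iter

blade = Blade(verbose=True, convert_types=True)
chunks = read_file_iter(file_path="data/big_input.csv", chunksize=100_000)
blade.stream_to_sink(chunks, "out/partitions", output_prefix="part")
```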
@@ -121,7 +160,10 @@ class Blade:
         table: str = "table",
         drop_existing: bool = True,
         dialect: Dialect = Dialect.SQLSERVER,
-
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+    ) -> Union[str, tuple[str, ParquetDDLMetadata]]:
+        """Generate CREATE TABLE DDL from a DataFrame schema."""
         return generate_create_table(
             df=df,
             catalog=catalog,
@@ -129,19 +171,26 @@ class Blade:
             table=table,
             drop_existing=drop_existing,
             dialect=dialect,
+            use_go=use_go,
+            schema_spec=schema_spec,
             verbose=self.verbose,
         )
 
     def create_table_sql_from_parquet(
         self,
-        parquet_path:
+        parquet_path: PathLike,
         *,
         catalog: Optional[str] = None,
         schema: Optional[str] = None,
         table: str = "table",
         drop_existing: bool = True,
         dialect: Dialect = Dialect.SQLSERVER,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        fallback_to_json: bool = False,
+        return_metadata: bool = False,
     ) -> str:
+        """Generate CREATE TABLE DDL from a Parquet schema."""
         return generate_create_table_from_parquet(
             parquet_path=parquet_path,
             catalog=catalog,
@@ -149,5 +198,125 @@ class Blade:
             table=table,
             drop_existing=drop_existing,
             dialect=dialect,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            verbose=self.verbose,
+            fallback_to_json=fallback_to_json,
+            return_metadata=return_metadata,
+        )
+
+    def sqlserver_openrowset_parquet(
+        self,
+        parquet_path: PathLike,
+        *,
+        data_source: Optional[str] = None,
+        table_alias: str = "rows",
+        select_columns: Optional[Sequence[str]] = None,
+        where: Optional[str] = None,
+        top: Optional[int] = None,
+    ) -> str:
+        """Generate a SQL Server OPENROWSET query over Parquet files."""
+        return sqlserver_openrowset_parquet(
+            parquet_path,
+            data_source=data_source,
+            table_alias=table_alias,
+            select_columns=select_columns,
+            where=where,
+            top=top,
+        )
+
+    def sqlserver_create_and_insert_from_parquet(
+        self,
+        parquet_path: PathLike,
+        output_dir: PathLike,
+        *,
+        table: str,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        drop_existing: bool = True,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        rows_per_file: Optional[int] = None,
+        memory_fraction: Optional[float] = None,
+        convert_types: Optional[bool] = None,
+        output_prefix: str = "part",
+        delimiter: str = ",",
+        include_header: bool = True,
+        line_terminator: str = "\n",
+        first_row: Optional[int] = None,
+        tablock: bool = True,
+        codepage: Optional[str] = None,
+        fallback_to_json: bool = False,
+    ) -> tuple[str, list[Path]]:
+        """Create SQL Server DDL + BULK INSERT statements from Parquet."""
+        return sqlserver_create_and_insert_from_parquet(
+            parquet_path=parquet_path,
+            output_dir=output_dir,
+            table=table,
+            catalog=catalog,
+            schema=schema,
+            drop_existing=drop_existing,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            rows_per_file=rows_per_file,
+            memory_fraction=(
+                self.memory_fraction if memory_fraction is None else memory_fraction
+            ),
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            output_prefix=output_prefix,
+            delimiter=delimiter,
+            include_header=include_header,
+            line_terminator=line_terminator,
+            first_row=first_row,
+            tablock=tablock,
+            codepage=codepage,
+            fallback_to_json=fallback_to_json,
+            verbose=self.verbose,
+        )
+
+    def sqlserver_create_and_stage_from_parquets(
+        self,
+        parquet_paths: Sequence[PathLike],
+        output_dir: PathLike,
+        *,
+        table: str,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        drop_existing: bool = True,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        rows_per_file: Optional[int] = None,
+        memory_fraction: Optional[float] = None,
+        convert_types: Optional[bool] = None,
+        output_prefix: str = "part",
+        delimiter: str = ",",
+        include_header: bool = True,
+        line_terminator: str = "\n",
+        fallback_to_json: bool = False,
+    ) -> tuple[str, list[Path]]:
+        """Generate SQL Server DDL and stage multiple Parquet files as CSVs."""
+        return sqlserver_create_and_stage_from_parquets(
+            parquet_paths=parquet_paths,
+            output_dir=output_dir,
+            table=table,
+            catalog=catalog,
+            schema=schema,
+            drop_existing=drop_existing,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            rows_per_file=rows_per_file,
+            memory_fraction=(
+                self.memory_fraction if memory_fraction is None else memory_fraction
+            ),
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            output_prefix=output_prefix,
+            delimiter=delimiter,
+            include_header=include_header,
+            line_terminator=line_terminator,
+            fallback_to_json=fallback_to_json,
             verbose=self.verbose,
         )
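The new SQL Server helpers above are thin wrappers over the `datablade.sql` functions imported at the top of the file. A minimal sketch that only generates SQL text (hypothetical file and table names; nothing is executed against a server):

```python
# Hypothetical paths/identifiers; method names and keyword arguments are the
# ones added in the hunks above.
from datablade import Blade

blade = Blade(verbose=True)

# CREATE TABLE DDL inferred from a Parquet file's schema.
ddl = blade.create_table_sql_from_parquet(
    "exports/orders.parquet",
    schema="dbo",
    table="orders",
    use_go=True,
)

# Ad-hoc OPENROWSET query over the same file.
query = blade.sqlserver_openrowset_parquet(
    "exports/orders.parquet",
    select_columns=["order_id", "order_total"],
    top=10,
)

print(ddl)
print(query)
```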
datablade/dataframes/__init__.py
CHANGED
@@ -19,11 +19,15 @@ from .frames import (
     write_to_file_and_sql,
 )
 from .readers import (
+    excel_to_parquets,
+    json_to_jsonl,
+    parquet_to_csv_partitions,
     read_file_chunked,
     read_file_iter,
     read_file_smart,
     read_file_to_parquets,
     stream_to_parquets,
+    stream_to_sink,
 )
 
 __all__ = [
@@ -35,9 +39,13 @@ __all__ = [
     "generate_sql_server_create_table_string",
     "write_to_file_and_sql",
     # Memory-aware readers
+    "excel_to_parquets",
+    "json_to_jsonl",
+    "parquet_to_csv_partitions",
     "read_file_chunked",
     "read_file_iter",
     "read_file_to_parquets",
+    "stream_to_sink",
     "stream_to_parquets",
     "read_file_smart",
 ]
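Alongside `stream_to_sink`, three new reader helpers (`excel_to_parquets`, `json_to_jsonl`, `parquet_to_csv_partitions`) are now re-exported from `datablade.dataframes`. Their signatures live in readers.py and are not part of this hunk, so the sketch below only exercises the import surface:

```python
# Names taken from the __all__ above; call signatures are defined in
# datablade/dataframes/readers.py and are not shown in this diff.
from datablade.dataframes import (
    excel_to_parquets,
    json_to_jsonl,
    parquet_to_csv_partitions,
    stream_to_sink,
)

for fn in (excel_to_parquets, json_to_jsonl, parquet_to_csv_partitions, stream_to_sink):
    print(fn.__name__, "-", (fn.__doc__ or "").strip().splitlines()[:1])
```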
datablade/dataframes/frames.py
CHANGED
@@ -1,25 +1,42 @@
+"""DataFrame transformation and export helpers.
+
+This module focuses on schema-aware DataFrame cleanup, type inference,
+and downstream serialization helpers (Parquet schema/table generation
+and SQL Server-compatible DDL helpers).
+"""
+
+import os
 import pathlib
+import shutil
 import subprocess
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 
 from ..utils.logging import log_debug, log_error, log_info, log_warning
+from ..utils.strings import coerce_path, ensure_directory
 
 _BYTES_LIKE_TYPES = (bytes, bytearray, memoryview)
 
 
 def _is_bytes_like(value: Any) -> bool:
+    """Return True when a value should be treated as binary data."""
     return isinstance(value, _BYTES_LIKE_TYPES)
 
 
 def _infer_object_pa_type(col_data: pd.Series) -> pa.DataType:
+    """Infer a PyArrow type for object-dtype Series.
+
+    We sample non-null values and prefer "narrower" types (binary, string,
+    bool, integer, float) before falling back to pyarrow's inference.
+    """
     non_null = col_data.dropna()
     if non_null.empty:
         return pa.string()
 
+    # Sample to avoid scanning very large object columns.
     sample = non_null.iloc[:100].tolist()
 
     if all(_is_bytes_like(v) for v in sample):
@@ -45,6 +62,8 @@ def _infer_object_pa_type(col_data: pd.Series) -> pa.DataType:
         return pa.float64()
 
     try:
+        # PyArrow can infer types for mixed object values; we normalize into
+        # the closest "primitive" type for DDL/Parquet stability.
         inferred = pa.infer_type(sample)
         if pa.types.is_binary(inferred) or pa.types.is_large_binary(inferred):
             return pa.binary()
@@ -93,9 +112,14 @@ def try_cast_string_columns_to_numeric(
         raise TypeError("df must be a pandas DataFrame")
 
     for col in df.columns:
-
+        dtype = df[col].dtype
+        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
             non_null = df[col].dropna()
-            if
+            if (
+                pd.api.types.is_object_dtype(dtype)
+                and not non_null.empty
+                and non_null.iloc[:100].map(_is_bytes_like).any()
+            ):
                 log_debug(
                     f"Column '{col}' contains bytes-like values; skipping numeric coercion.",
                     verbose,
@@ -144,16 +168,16 @@ def clean_dataframe_columns(
         raise ValueError("df must be provided")
     if not isinstance(df, pd.DataFrame):
         raise TypeError("df must be a pandas DataFrame")
-    # Step 1: Flatten MultiIndex columns
+    # Step 1: Flatten MultiIndex columns.
     if isinstance(df.columns, pd.MultiIndex):
        df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
        log_info("Flattened MultiIndex columns.", verbose)
 
-    # Step 2: Convert non-string column names to strings
+    # Step 2: Convert non-string column names to strings.
     df.columns = df.columns.map(str)
     log_debug("Converted column names to strings.", verbose)
 
-    # Step 3: Remove duplicate columns, keeping the first occurrence
+    # Step 3: Remove duplicate columns, keeping the first occurrence.
     duplicates = df.columns.duplicated()
     if duplicates.any():
         duplicate_cols = df.columns[duplicates]
@@ -190,14 +214,14 @@ def generate_parquet_schema(
         col_name = column
         dtype = col_data.dtype
 
-        # Determine if the column contains any nulls
+        # Determine if the column contains any nulls.
         nullable = col_data.isnull().any()
 
-        # Map pandas dtype to PyArrow type
+        # Map pandas dtype to PyArrow type.
         pa_type = None
 
         if pd.api.types.is_integer_dtype(dtype):
-            # Check the range to determine the smallest integer type
+            # Check the range to determine the smallest integer type.
             non_null = col_data.dropna()
             if non_null.empty:
                 pa_type = pa.int64()
@@ -249,7 +273,7 @@ def generate_parquet_schema(
         else:
             pa_type = pa.string()
 
-        # Create a field
+        # Create a field.
         field = pa.field(col_name, pa_type, nullable=nullable)
         fields.append(field)
 
@@ -285,6 +309,7 @@ def pandas_to_parquet_table(
         raise TypeError("df must be a pandas DataFrame")
 
     def _unique_col_name(existing: set[str], desired: str) -> str:
+        """Return a column name that does not collide with existing names."""
         if desired not in existing:
             return desired
         i = 1
@@ -293,6 +318,7 @@ def pandas_to_parquet_table(
         return f"{desired}_{i}"
 
     def _materialize_index_columns(input_df: pd.DataFrame) -> pd.DataFrame:
+        """Convert Index/MultiIndex into explicit columns."""
         if isinstance(input_df.index, pd.MultiIndex):
             index_names: list[str] = []
             for i, name in enumerate(input_df.index.names):
@@ -319,16 +345,20 @@ def pandas_to_parquet_table(
         out_df = out_df.reset_index(drop=True)
         return out_df
 
+    # Clean columns before schema inference to avoid duplicate/invalid names.
     df = clean_dataframe_columns(df=df, verbose=verbose)
 
     if preserve_index:
+        # Preserve index by materializing it into real columns.
         df = _materialize_index_columns(df)
 
     if convert:
+        # Attempt numeric coercion so Parquet schema uses numeric types.
         df = try_cast_string_columns_to_numeric(
             df=df, convert_partial=partial, verbose=verbose
         )
 
+    # Build schema explicitly so the Parquet table uses stable, optimized types.
     schema = generate_parquet_schema(df=df, verbose=verbose)
     try:
         # We materialize index into regular columns above so that the schema can
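The hunks above document the cleanup → numeric coercion → schema generation pipeline that `pandas_to_parquet_table` runs. A minimal sketch of calling those steps directly (import path assumes the functions remain in `datablade.dataframes.frames`; the sample frame is hypothetical):

```python
# Sketch of the cleanup/inference pipeline described above.
import pandas as pd
from datablade.dataframes.frames import (
    clean_dataframe_columns,
    generate_parquet_schema,
    try_cast_string_columns_to_numeric,
)

df = pd.DataFrame({"id": ["1", "2", "3"], "amount": ["1.50", "2.75", None]})
df = clean_dataframe_columns(df, verbose=True)              # flatten/stringify/dedupe column names
df = try_cast_string_columns_to_numeric(df, verbose=True)   # object -> numeric where possible
schema = generate_parquet_schema(df=df, verbose=True)       # explicit pyarrow schema
print(schema)
```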
@@ -347,6 +377,8 @@ def generate_sql_server_create_table_string(
     schema: str = "dbo",
     table: str = "table",
     dropexisting: bool = True,
+    use_go: bool = False,
+    schema_spec: Optional[dict] = None,
     verbose: bool = False,
 ) -> str:
     """
@@ -358,6 +390,7 @@ def generate_sql_server_create_table_string(
         schema: The schema name (default: 'dbo').
         table: The table name.
         dropexisting: If True, includes DROP TABLE IF EXISTS statement.
+        use_go: If True, inserts GO after USE when a catalog is supplied.
         verbose: If True, prints progress messages.
 
     Returns:
@@ -389,18 +422,23 @@ def generate_sql_server_create_table_string(
         table=table,
         drop_existing=dropexisting,
         dialect=Dialect.SQLSERVER,
+        use_go=use_go,
+        schema_spec=schema_spec,
         verbose=verbose,
     )
 
 
 def write_to_file_and_sql(
     df: pd.DataFrame,
-    file_path: str,
+    file_path: Union[str, pathlib.Path],
     table_name: str,
     sql_server: str,
     database: str,
-    username: str,
-    password: str,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    use_trusted_connection: bool = False,
+    use_azure_ad: bool = False,
+    use_env_credentials: bool = True,
     verbose: bool = False,
 ) -> None:
     """
@@ -414,6 +452,10 @@ def write_to_file_and_sql(
         database: Database name.
         username: SQL Server username.
         password: SQL Server password.
+        use_trusted_connection: If True, use integrated authentication (-T).
+        use_azure_ad: If True, use Azure AD authentication (-G).
+        use_env_credentials: If True, fall back to DATABLADE_SQLSERVER_USERNAME
+            and DATABLADE_SQLSERVER_PASSWORD when username/password not provided.
         verbose: If True, prints progress messages.
 
     Raises:
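The hunks above make `username`/`password` optional and add three authentication switches, with an environment-variable fallback (`DATABLADE_SQLSERVER_USERNAME` / `DATABLADE_SQLSERVER_PASSWORD`) when `use_env_credentials` is left on. A minimal sketch using integrated authentication (hypothetical server, database, and paths; BCP must be installed and on PATH):

```python
# Hypothetical server/database/paths; parameter names come from the hunks above.
import pandas as pd
from datablade.dataframes import write_to_file_and_sql

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

write_to_file_and_sql(
    df,
    file_path="staging/products.csv",
    table_name="products",
    sql_server="sql01.example.internal",
    database="analytics",
    use_trusted_connection=True,  # -T: no username/password needed
    verbose=True,
)
```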
@@ -421,21 +463,34 @@ def write_to_file_and_sql(
     """
     if df is None or not isinstance(df, pd.DataFrame):
         raise TypeError("df must be a pandas DataFrame")
-
-
+    path_obj = coerce_path(
+        file_path,
+        must_exist=False,
+        verbose=verbose,
+        label="file_path",
+    )
     if not isinstance(table_name, str) or not table_name.strip():
         raise ValueError("table_name must be a non-empty string")
     if not isinstance(sql_server, str) or not sql_server.strip():
         raise ValueError("sql_server must be a non-empty string")
     if not isinstance(database, str) or not database.strip():
         raise ValueError("database must be a non-empty string")
-    if
-        raise ValueError(
-
-
-
-
-
+    if use_trusted_connection and use_azure_ad:
+        raise ValueError(
+            "use_trusted_connection and use_azure_ad are mutually exclusive"
+        )
+    if use_env_credentials:
+        if not username:
+            username = os.getenv("DATABLADE_SQLSERVER_USERNAME")
+        if not password:
+            password = os.getenv("DATABLADE_SQLSERVER_PASSWORD")
+    if not use_trusted_connection:
+        if not isinstance(username, str) or not username.strip():
+            raise ValueError("username must be a non-empty string")
+        if not password and not use_azure_ad:
+            raise ValueError("password must be provided")
+
+    ensure_directory(path_obj.parent, verbose=verbose, label="output_dir")
 
     df.to_csv(path_obj, index=False)
     log_info(f"DataFrame written to file {path_obj}.", verbose)
@@ -450,16 +505,57 @@ def write_to_file_and_sql(
         "-t,",
         "-S",
         sql_server,
-        "-U",
-        username,
-        "-P",
-        password,
     ]
+    bcp_preview = bcp_args[:-1] + ["***REDACTED***"]
+    bcp_path = shutil.which("bcp")
+    if not bcp_path:
+        install_steps = (
+            "Install the SQL Server command line utilities (bcp) and ensure the "
+            "binary is on PATH. For example: "
+            "macOS (Homebrew): brew install msodbcsql17 mssql-tools; "
+            "Linux (Debian/Ubuntu): install mssql-tools; "
+            "Windows: install SQL Server Command Line Utilities and restart your shell."
+        )
+        path_env = os.environ.get("PATH", "")
+        message = (
+            "BCP executable was not found on PATH. "
+            f"PATH={path_env}. {install_steps} "
+            f"Command preview: {bcp_preview}."
+        )
+        log_error(message, verbose)
+        raise FileNotFoundError(message)
+
+    if use_trusted_connection:
+        bcp_args.append("-T")
+    else:
+        if use_azure_ad:
+            bcp_args.append("-G")
+        if username:
+            bcp_args.extend(["-U", username])
+        if password:
+            bcp_args.extend(["-P", password])
 
     log_debug(
         f"Executing BCP load to {qualified_table} on server {sql_server} as user {username}.",
         verbose,
     )
+    redacted_args = []
+    redact_next = False
+    for arg in bcp_args:
+        if redact_next:
+            redacted_args.append("***REDACTED***")
+            redact_next = False
+            continue
+        redacted_args.append(arg)
+        if arg == "-P":
+            redact_next = True
+    if "-P" in bcp_args:
+        log_warning(
+            "BCP authentication uses -P with a plaintext password. "
+            "Consider using trusted connection (-T) or Azure AD (-G).",
+            verbose,
+        )
+    log_debug(f"BCP args: {redacted_args}", verbose)
     process = subprocess.run(
         bcp_args,
         shell=False,
@@ -474,7 +570,11 @@ def write_to_file_and_sql(
     else:
         error_msg = process.stderr.decode()
         log_error(
-
+            "Error writing DataFrame to SQL Server table "
+            f"{table_name}: {error_msg} "
+            f"PATH={os.environ.get('PATH', '')}. "
+            "Ensure BCP is installed (SQL Server command line utilities) and on PATH. "
+            f"Command preview: {bcp_preview}.",
             verbose,
         )
         raise subprocess.CalledProcessError(