datablade 0.0.0__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,485 @@
+ import pathlib
+ import subprocess
+ from typing import Any, Optional
+
+ import numpy as np
+ import pandas as pd
+ import pyarrow as pa
+
+ from ..utils.logging import log_debug, log_error, log_info, log_warning
+
+ _BYTES_LIKE_TYPES = (bytes, bytearray, memoryview)
+
+
+ def _is_bytes_like(value: Any) -> bool:
+     return isinstance(value, _BYTES_LIKE_TYPES)
+
+
+ def _infer_object_pa_type(col_data: pd.Series) -> pa.DataType:
+     non_null = col_data.dropna()
+     if non_null.empty:
+         return pa.string()
+
+     sample = non_null.iloc[:100].tolist()
+
+     if all(_is_bytes_like(v) for v in sample):
+         return pa.binary()
+     if all(isinstance(v, str) for v in sample):
+         return pa.string()
+     if all(isinstance(v, (bool, np.bool_)) for v in sample):
+         return pa.bool_()
+     if all(
+         isinstance(v, (int, np.integer)) and not isinstance(v, (bool, np.bool_))
+         for v in sample
+     ):
+         min_value = min(sample)
+         max_value = max(sample)
+         if min_value >= np.iinfo(np.int8).min and max_value <= np.iinfo(np.int8).max:
+             return pa.int8()
+         if min_value >= np.iinfo(np.int16).min and max_value <= np.iinfo(np.int16).max:
+             return pa.int16()
+         if min_value >= np.iinfo(np.int32).min and max_value <= np.iinfo(np.int32).max:
+             return pa.int32()
+         return pa.int64()
+     if all(isinstance(v, (float, np.floating)) for v in sample):
+         return pa.float64()
+
+     try:
+         inferred = pa.infer_type(sample)
+         if pa.types.is_binary(inferred) or pa.types.is_large_binary(inferred):
+             return pa.binary()
+         if pa.types.is_string(inferred) or pa.types.is_large_string(inferred):
+             return pa.string()
+         if pa.types.is_boolean(inferred):
+             return pa.bool_()
+         if pa.types.is_integer(inferred):
+             return pa.int64()
+         if pa.types.is_floating(inferred):
+             return pa.float64()
+         if pa.types.is_timestamp(inferred) or pa.types.is_date(inferred):
+             return inferred
+     except Exception:
+         pass
+
+     return pa.string()
+
+
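The object-dtype inference above samples the first 100 non-null values and, for an all-integer sample, picks the smallest Arrow integer type that covers the observed range. A standalone sketch of that narrowing rule, using only numpy and pyarrow rather than the package itself:

import numpy as np
import pyarrow as pa

def narrowest_int_type(values):
    # Mirrors the range checks above: try int8, then int16, then int32, else int64.
    lo, hi = min(values), max(values)
    for np_type, pa_type in ((np.int8, pa.int8()), (np.int16, pa.int16()), (np.int32, pa.int32())):
        bounds = np.iinfo(np_type)
        if lo >= bounds.min and hi <= bounds.max:
            return pa_type
    return pa.int64()

narrowest_int_type([0, 127])     # -> int8
narrowest_int_type([0, 40000])   # -> int32
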
+ def try_cast_string_columns_to_numeric(
+     df: Optional[pd.DataFrame] = None,
+     convert_partial: bool = False,
+     verbose: bool = False,
+ ) -> pd.DataFrame:
+     """
+     Attempt to cast DataFrame string columns to numeric values where possible.
+
+     Args:
+         df: The DataFrame to process. Required; raises ValueError if None.
+         convert_partial: If True, columns with some values convertible to numeric types
+             will be converted to numeric types with NaNs where conversion failed.
+             If False, only columns where all values can be converted will be converted.
+         verbose: If True, prints progress messages.
+
+     Returns:
+         DataFrame with string columns converted to numeric types where possible.
+         Raises ValueError if df is None and TypeError if df is not a DataFrame.
+     """
+     if df is None:
+         log_warning(
+             "No DataFrame provided; exiting try_cast_string_columns_to_numeric.",
+             verbose,
+         )
+         raise ValueError("df must be provided")
+     if not isinstance(df, pd.DataFrame):
+         raise TypeError("df must be a pandas DataFrame")
+
+     for col in df.columns:
+         if df[col].dtype == "object":
+             non_null = df[col].dropna()
+             if not non_null.empty and non_null.iloc[:100].map(_is_bytes_like).any():
+                 log_debug(
+                     f"Column '{col}' contains bytes-like values; skipping numeric coercion.",
+                     verbose,
+                 )
+                 continue
+             converted = pd.to_numeric(df[col], errors="coerce")
+             has_nan = converted.isnull().any()
+             if not has_nan:
+                 df[col] = converted
+                 log_info(f"Column '{col}' successfully converted to numeric.", verbose)
+             else:
+                 if convert_partial:
+                     df[col] = converted
+                     log_info(
+                         f"Column '{col}' partially converted to numeric with NaNs where conversion failed.",
+                         verbose,
+                     )
+                 else:
+                     log_debug(
+                         f"Column '{col}' could not be fully converted to numeric; leaving as is.",
+                         verbose,
+                     )
+     return df
+
+
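A brief usage sketch for the function above. The import path is hypothetical; the diff does not show where this module sits inside the datablade package.

import pandas as pd

from datablade.frames import try_cast_string_columns_to_numeric  # hypothetical import path

df = pd.DataFrame({"a": ["1", "2", "3"], "b": ["1.5", "x", "2.5"]})
out = try_cast_string_columns_to_numeric(df=df, convert_partial=False, verbose=True)
# "a" is fully convertible and becomes numeric; "b" stays object because "x" fails to coerce.
# With convert_partial=True, "b" would become float64 with NaN in place of "x".
# Note that converted columns are assigned back onto the DataFrame that was passed in.
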
+ def clean_dataframe_columns(
+     df: Optional[pd.DataFrame] = None, verbose: bool = False
+ ) -> pd.DataFrame:
+     """
+     Clean the DataFrame columns by flattening MultiIndex, converting to strings,
+     and removing duplicates.
+
+     Args:
+         df: The DataFrame to clean. Required; raises ValueError if None.
+         verbose: If True, prints progress messages.
+
+     Returns:
+         The cleaned DataFrame with:
+         - Flattened MultiIndex columns
+         - String column names
+         - Duplicate columns removed (keeping first occurrence)
+         Raises ValueError if df is None and TypeError if df is not a DataFrame.
+     """
+     if df is None:
+         log_warning("No DataFrame provided; exiting clean_dataframe_columns.", verbose)
+         raise ValueError("df must be provided")
+     if not isinstance(df, pd.DataFrame):
+         raise TypeError("df must be a pandas DataFrame")
+     # Step 1: Flatten MultiIndex columns
+     if isinstance(df.columns, pd.MultiIndex):
+         df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
+         log_info("Flattened MultiIndex columns.", verbose)
+
+     # Step 2: Convert non-string column names to strings
+     df.columns = df.columns.map(str)
+     log_debug("Converted column names to strings.", verbose)
+
+     # Step 3: Remove duplicate columns, keeping the first occurrence
+     duplicates = df.columns.duplicated()
+     if duplicates.any():
+         duplicate_cols = df.columns[duplicates]
+         log_warning(f"Duplicate columns found: {list(duplicate_cols)}", verbose)
+         df = df.loc[:, ~duplicates]
+         log_info("Removed duplicate columns, keeping the first occurrence.", verbose)
+
+     return df
+
+
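A short illustration of the cleaning behaviour above, again with a hypothetical import path:

import pandas as pd

from datablade.frames import clean_dataframe_columns  # hypothetical import path

columns = pd.MultiIndex.from_tuples([("price", "open"), ("price", "close"), ("volume", "")])
df = pd.DataFrame([[1.0, 2.0, 300]], columns=columns)
cleaned = clean_dataframe_columns(df=df, verbose=True)
# Columns are flattened to "price_open", "price_close" and "volume_" (levels joined with "_"),
# all names become str, and any duplicates produced by flattening are dropped, first kept.
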
+ def generate_parquet_schema(
+     df: Optional[pd.DataFrame] = None, verbose: bool = False
+ ) -> pa.Schema:
+     """
+     Generate a PyArrow Schema from a pandas DataFrame with optimized data types.
+
+     Args:
+         df: The DataFrame to generate the schema from. Required; raises ValueError if None.
+         verbose: If True, prints progress messages.
+
+     Returns:
+         PyArrow Schema object with optimized types (smallest integer type that fits the data).
+         Raises ValueError if df is None and TypeError if df is not a DataFrame.
+     """
+     if df is None:
+         log_warning("No DataFrame provided; exiting generate_parquet_schema.", verbose)
+         raise ValueError("df must be provided")
+     if not isinstance(df, pd.DataFrame):
+         raise TypeError("df must be a pandas DataFrame")
+
+     fields = []
+     for column in df.columns:
+         col_data = df[column]
+         col_name = column
+         dtype = col_data.dtype
+
+         # Determine if the column contains any nulls
+         nullable = col_data.isnull().any()
+
+         # Map pandas dtype to PyArrow type
+         pa_type = None
+
+         if pd.api.types.is_integer_dtype(dtype):
+             # Check the range to determine the smallest integer type
+             non_null = col_data.dropna()
+             if non_null.empty:
+                 pa_type = pa.int64()
+             else:
+                 min_value = non_null.min()
+                 max_value = non_null.max()
+                 if (
+                     min_value >= np.iinfo(np.int8).min
+                     and max_value <= np.iinfo(np.int8).max
+                 ):
+                     pa_type = pa.int8()
+                 elif (
+                     min_value >= np.iinfo(np.int16).min
+                     and max_value <= np.iinfo(np.int16).max
+                 ):
+                     pa_type = pa.int16()
+                 elif (
+                     min_value >= np.iinfo(np.int32).min
+                     and max_value <= np.iinfo(np.int32).max
+                 ):
+                     pa_type = pa.int32()
+                 else:
+                     pa_type = pa.int64()
+
+         elif pd.api.types.is_float_dtype(dtype):
+             pa_type = pa.float64()
+
+         elif pd.api.types.is_bool_dtype(dtype):
+             pa_type = pa.bool_()
+
+         elif isinstance(dtype, pd.DatetimeTZDtype):
+             tz = getattr(getattr(col_data.dt, "tz", None), "zone", None) or str(
+                 col_data.dt.tz
+             )
+             pa_type = pa.timestamp("ms", tz=tz)
+
+         elif pd.api.types.is_datetime64_any_dtype(dtype):
+             pa_type = pa.timestamp("ms")
+
+         elif pd.api.types.is_timedelta64_dtype(dtype):
+             pa_type = pa.duration("ns")
+
+         elif isinstance(dtype, pd.CategoricalDtype):
+             pa_type = pa.string()
+
+         elif pd.api.types.is_object_dtype(dtype):
+             pa_type = _infer_object_pa_type(col_data)
+
+         else:
+             pa_type = pa.string()
+
+         # Create a field
+         field = pa.field(col_name, pa_type, nullable=nullable)
+         fields.append(field)
+
+     schema = pa.schema(fields)
+     return schema
+
+
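A sketch of the schema generation above. The expected field types follow directly from the branches shown; the import path is a guess.

import pandas as pd

from datablade.frames import generate_parquet_schema  # hypothetical import path

df = pd.DataFrame(
    {
        "small": [1, 2, 3],                # range fits int8
        "big": [1, 2, 2_000_000_000_000],  # exceeds int32, so int64
        "when": pd.to_datetime(["2024-01-01", "2024-01-02", None]),  # contains a null
        "label": ["x", "y", "z"],          # object dtype, all strings
    }
)
schema = generate_parquet_schema(df=df)
# Expected fields: small: int8, big: int64, when: timestamp[ms] (nullable), label: string
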
+ def pandas_to_parquet_table(
+     df: Optional[pd.DataFrame] = None,
+     convert: bool = True,
+     partial: bool = False,
+     preserve_index: bool = False,
+     verbose: bool = False,
+ ) -> pa.Table:
+     """
+     Generate a PyArrow Table from a pandas DataFrame with automatic type conversion.
+
+     Args:
+         df: The DataFrame to convert. Required; raises ValueError if None.
+         convert: If True, attempts to cast string columns to numeric types.
+         partial: If True with convert, allows partial conversion with NaNs.
+         preserve_index: If True, preserves the DataFrame index in the table.
+         verbose: If True, prints progress messages.
+
+     Returns:
+         PyArrow Table object with optimized schema. Raises ValueError if df is None,
+         TypeError if df is not a DataFrame, and re-raises any PyArrow conversion error.
+     """
+     if df is None:
+         log_warning("No DataFrame provided; exiting pandas_to_parquet_table.", verbose)
+         raise ValueError("df must be provided")
+     if not isinstance(df, pd.DataFrame):
+         raise TypeError("df must be a pandas DataFrame")
+
+     def _unique_col_name(existing: set[str], desired: str) -> str:
+         if desired not in existing:
+             return desired
+         i = 1
+         while f"{desired}_{i}" in existing:
+             i += 1
+         return f"{desired}_{i}"
+
+     def _materialize_index_columns(input_df: pd.DataFrame) -> pd.DataFrame:
+         if isinstance(input_df.index, pd.MultiIndex):
+             index_names: list[str] = []
+             for i, name in enumerate(input_df.index.names):
+                 index_names.append(name or f"__index_level_{i}__")
+
+             existing = set(map(str, input_df.columns))
+             index_names = [_unique_col_name(existing, str(n)) for n in index_names]
+
+             idx_df = input_df.index.to_frame(index=False)
+             idx_df.columns = index_names
+             out_df = pd.concat([idx_df, input_df.reset_index(drop=True)], axis=1)
+             return out_df
+
+         index_name = (
+             str(input_df.index.name)
+             if input_df.index.name is not None
+             else "__index_level_0__"
+         )
+         existing = set(map(str, input_df.columns))
+         index_name = _unique_col_name(existing, index_name)
+
+         out_df = input_df.copy()
+         out_df.insert(0, index_name, out_df.index.to_numpy(copy=False))
+         out_df = out_df.reset_index(drop=True)
+         return out_df
+
+     df = clean_dataframe_columns(df=df, verbose=verbose)
+
+     if preserve_index:
+         df = _materialize_index_columns(df)
+
+     if convert:
+         df = try_cast_string_columns_to_numeric(
+             df=df, convert_partial=partial, verbose=verbose
+         )
+
+     schema = generate_parquet_schema(df=df, verbose=verbose)
+     try:
+         # We materialize the index into regular columns above so that the schema can
+         # fully describe the resulting table and the index is preserved even
+         # when an explicit schema is supplied.
+         table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
+         return table
+     except Exception as e:
+         log_error(f"Error generating PyArrow Table: {e}", verbose)
+         raise
+
+
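Putting the pieces together, a hedged end-to-end sketch of the table conversion above (hypothetical import path):

import pandas as pd
import pyarrow.parquet as pq

from datablade.frames import pandas_to_parquet_table  # hypothetical import path

df = pd.DataFrame(
    {"id": ["1", "2"], "name": ["alice", "bob"]},
    index=pd.Index([10, 20], name="row"),
)
table = pandas_to_parquet_table(df=df, convert=True, preserve_index=True)
# "row" is materialized as the first regular column, "id" is cast to an integer type,
# and the table carries the schema produced by generate_parquet_schema.
pq.write_table(table, "example.parquet")  # writing the file is up to the caller
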
+ def generate_sql_server_create_table_string(
+     df: Optional[pd.DataFrame] = None,
+     catalog: str = "database",
+     schema: str = "dbo",
+     table: str = "table",
+     dropexisting: bool = True,
+     verbose: bool = False,
+ ) -> str:
+     """
+     Generate a SQL Server CREATE TABLE statement from a pandas DataFrame.
+
+     Args:
+         df: The DataFrame to generate the schema from. Required; raises ValueError if None.
+         catalog: The database catalog name.
+         schema: The schema name (default: 'dbo').
+         table: The table name.
+         dropexisting: If True, includes DROP TABLE IF EXISTS statement.
+         verbose: If True, prints progress messages.
+
+     Returns:
+         SQL Server CREATE TABLE statement string with optimized column types.
+         Raises ValueError if df is None and TypeError if df is not a DataFrame.
+     """
+     if df is None:
+         log_warning(
+             "No DataFrame provided; exiting generate_sql_server_create_table_string.",
+             verbose,
+         )
+         raise ValueError("df must be provided")
+     if not isinstance(df, pd.DataFrame):
+         raise TypeError("df must be a pandas DataFrame")
+     if not isinstance(catalog, str) or not catalog.strip():
+         raise ValueError("catalog must be a non-empty string")
+     if not isinstance(schema, str) or not schema.strip():
+         raise ValueError("schema must be a non-empty string")
+     if not isinstance(table, str) or not table.strip():
+         raise ValueError("table must be a non-empty string")
+
+     from ..sql.ddl import generate_create_table
+     from ..sql.dialects import Dialect
+
+     return generate_create_table(
+         df=df,
+         catalog=catalog,
+         schema=schema,
+         table=table,
+         drop_existing=dropexisting,
+         dialect=Dialect.SQLSERVER,
+         verbose=verbose,
+     )
+
+
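A minimal sketch of the DDL helper above. The exact statement text comes from datablade's sql.ddl.generate_create_table, which this diff does not show, so the output is only described loosely; the import path is hypothetical.

import pandas as pd

from datablade.frames import generate_sql_server_create_table_string  # hypothetical import path

df = pd.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})
ddl = generate_sql_server_create_table_string(
    df=df, catalog="analytics", schema="dbo", table="customers", dropexisting=True, verbose=True
)
print(ddl)  # a SQL Server CREATE TABLE statement (preceded by a drop when dropexisting=True)
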
+ def write_to_file_and_sql(
+     df: pd.DataFrame,
+     file_path: str,
+     table_name: str,
+     sql_server: str,
+     database: str,
+     username: str,
+     password: str,
+     verbose: bool = False,
+ ) -> None:
+     """
+     Write a DataFrame to a CSV file and import it to SQL Server using BCP.
+
+     Args:
+         df: The DataFrame to write.
+         file_path: Path where the CSV file will be saved.
+         table_name: Name of the SQL Server table.
+         sql_server: SQL Server instance name.
+         database: Database name.
+         username: SQL Server username.
+         password: SQL Server password.
+         verbose: If True, prints progress messages.
+
+     Raises:
+         subprocess.CalledProcessError: If BCP command fails.
+     """
+     if df is None or not isinstance(df, pd.DataFrame):
+         raise TypeError("df must be a pandas DataFrame")
+     if not isinstance(file_path, str) or not file_path:
+         raise ValueError("file_path must be a non-empty string")
+     if not isinstance(table_name, str) or not table_name.strip():
+         raise ValueError("table_name must be a non-empty string")
+     if not isinstance(sql_server, str) or not sql_server.strip():
+         raise ValueError("sql_server must be a non-empty string")
+     if not isinstance(database, str) or not database.strip():
+         raise ValueError("database must be a non-empty string")
+     if not isinstance(username, str) or not username.strip():
+         raise ValueError("username must be a non-empty string")
+     if not isinstance(password, str) or not password:
+         raise ValueError("password must be provided")
+
+     path_obj = pathlib.Path(file_path)
+     path_obj.parent.mkdir(parents=True, exist_ok=True)
+
+     df.to_csv(path_obj, index=False)
+     log_info(f"DataFrame written to file {path_obj}.", verbose)
+
+     qualified_table = f"{database}.dbo.{table_name}"
+     bcp_args = [
+         "bcp",
+         qualified_table,
+         "in",
+         str(path_obj),
+         "-c",
+         "-t,",
+         "-S",
+         sql_server,
+         "-U",
+         username,
+         "-P",
+         password,
+     ]
+
+     log_debug(
+         f"Executing BCP load to {qualified_table} on server {sql_server} as user {username}.",
+         verbose,
+     )
+     process = subprocess.run(
+         bcp_args,
+         shell=False,
+         check=True,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.PIPE,
+     )
+     if process.returncode == 0:
+         log_info(
+             f"DataFrame successfully written to SQL Server table {table_name}.", verbose
+         )
+     else:
+         error_msg = process.stderr.decode()
+         log_error(
+             f"Error writing DataFrame to SQL Server table {table_name}: {error_msg}",
+             verbose,
+         )
+         raise subprocess.CalledProcessError(
+             process.returncode,
+             bcp_args,
+             output=process.stdout,
+             stderr=process.stderr,
+         )
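
Finally, a hedged sketch of the BCP loader above. Server, database, and credential values are placeholders, the bcp utility must be installed and on PATH, and the import path is again a guess.

import pandas as pd

from datablade.frames import write_to_file_and_sql  # hypothetical import path

df = pd.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})
write_to_file_and_sql(
    df=df,
    file_path="./staging/customers.csv",  # parent directory is created if missing
    table_name="customers",
    sql_server="sqlserver01",              # placeholder server name
    database="analytics",                  # placeholder database
    username="loader",                     # placeholder credentials
    password="change-me",
    verbose=True,
)
# Because subprocess.run is called with check=True, a failing bcp invocation raises
# subprocess.CalledProcessError before the explicit error-handling branch is reached.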