datablade 0.0.0-py3-none-any.whl → 0.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datablade/__init__.py +49 -1
- datablade/blade.py +322 -0
- datablade/core/__init__.py +28 -7
- datablade/core/frames.py +23 -236
- datablade/core/json.py +5 -10
- datablade/core/lists.py +5 -10
- datablade/core/messages.py +23 -11
- datablade/core/strings.py +5 -43
- datablade/core/zip.py +5 -24
- datablade/dataframes/__init__.py +51 -0
- datablade/dataframes/frames.py +585 -0
- datablade/dataframes/readers.py +1367 -0
- datablade/docs/ARCHITECTURE.md +102 -0
- datablade/docs/OBJECT_REGISTRY.md +194 -0
- datablade/docs/README.md +57 -0
- datablade/docs/TESTING.md +37 -0
- datablade/docs/USAGE.md +409 -0
- datablade/docs/__init__.py +87 -0
- datablade/docs/__main__.py +6 -0
- datablade/io/__init__.py +15 -0
- datablade/io/json.py +70 -0
- datablade/io/zip.py +111 -0
- datablade/registry.py +581 -0
- datablade/sql/__init__.py +56 -0
- datablade/sql/bulk_load.py +665 -0
- datablade/sql/ddl.py +402 -0
- datablade/sql/ddl_pyarrow.py +411 -0
- datablade/sql/dialects.py +12 -0
- datablade/sql/quoting.py +44 -0
- datablade/sql/schema_spec.py +65 -0
- datablade/sql/sqlserver.py +390 -0
- datablade/utils/__init__.py +38 -0
- datablade/utils/lists.py +32 -0
- datablade/utils/logging.py +204 -0
- datablade/utils/messages.py +29 -0
- datablade/utils/strings.py +249 -0
- datablade-0.0.6.dist-info/METADATA +406 -0
- datablade-0.0.6.dist-info/RECORD +41 -0
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/WHEEL +1 -1
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info/licenses}/LICENSE +20 -20
- datablade-0.0.0.dist-info/METADATA +0 -13
- datablade-0.0.0.dist-info/RECORD +0 -13
- {datablade-0.0.0.dist-info → datablade-0.0.6.dist-info}/top_level.txt +0 -0
datablade/__init__.py
CHANGED
@@ -1 +1,49 @@
-
+"""
+datablade - A suite of functions providing standard syntax across data engineering projects.
+
+The package is organized into four main modules:
+- dataframes: DataFrame operations, transformations, and memory-aware file reading
+- io: Input/output operations for external data
+- utils: General utility functions and logging
+- sql: Multi-dialect SQL generation, quoting, and bulk loading
+
+For backward compatibility, all functions are also available from datablade.core.
+"""
+
+# Also maintain core for backward compatibility
+# Import from new organized structure
+from . import core, dataframes, io, registry, sql, utils
+from .blade import Blade
+from .dataframes import read_file_chunked, read_file_smart, read_file_to_parquets
+from .registry import DialectSpec, ObjectNode, ObjectRef, ObjectRegistry
+from .sql import Dialect, bulk_load, generate_create_table
+
+# Convenience re-exports for commonly used functions
+from .utils.logging import configure_logging, get_logger
+from .utils.strings import configure_paths
+
+__version__ = "0.0.6"
+
+__all__ = [
+    "dataframes",
+    "io",
+    "utils",
+    "sql",
+    "registry",
+    "core",  # Maintain backward compatibility
+    # Convenience re-exports
+    "configure_logging",
+    "configure_paths",
+    "get_logger",
+    "read_file_smart",
+    "read_file_chunked",
+    "read_file_to_parquets",
+    "Dialect",
+    "generate_create_table",
+    "bulk_load",
+    "Blade",
+    "DialectSpec",
+    "ObjectRef",
+    "ObjectNode",
+    "ObjectRegistry",
+]
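The new top-level __init__ re-exports the most commonly used entry points. A minimal usage sketch based only on the exports declared above (the CSV path and table name are illustrative):

    import datablade as db

    df = db.read_file_smart(file_path="data/input.csv", return_type="dataframe")  # memory-aware read; path is illustrative
    ddl = db.generate_create_table(df=df, table="staging_input", dialect=db.Dialect.SQLSERVER)
    print(ddl)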
datablade/blade.py
ADDED
@@ -0,0 +1,322 @@
+"""Optional facade class for datablade.
+
+The canonical API is module-level functions (e.g., datablade.dataframes.read_file_iter).
+This module provides a small convenience wrapper for users who prefer an object-style
+entrypoint with shared defaults.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Iterator, Optional, Sequence, Union
+
+import pandas as pd
+
+from .dataframes import (
+    clean_dataframe_columns,
+    read_file_iter,
+    read_file_smart,
+    read_file_to_parquets,
+    stream_to_parquets,
+    stream_to_sink,
+    try_cast_string_columns_to_numeric,
+)
+from .sql import (
+    Dialect,
+    ParquetDDLMetadata,
+    generate_create_table,
+    generate_create_table_from_parquet,
+    sqlserver_create_and_insert_from_parquet,
+    sqlserver_create_and_stage_from_parquets,
+    sqlserver_openrowset_parquet,
+)
+
+PathLike = Union[str, Path]
+
+
+@dataclass(frozen=True)
+class Blade:
+    """Convenience facade for common datablade workflows.
+
+    Stores default options that are threaded through to the underlying functions.
+    """
+
+    memory_fraction: float = 0.5
+    verbose: bool = False
+    convert_types: bool = True
+
+    def read(
+        self,
+        file_path: PathLike,
+        *,
+        return_type: str = "dataframe",
+        **read_kwargs: Any,
+    ):
+        return read_file_smart(
+            file_path=file_path,
+            memory_fraction=self.memory_fraction,
+            verbose=self.verbose,
+            return_type=return_type,
+            **read_kwargs,
+        )
+
+    def iter(
+        self,
+        file_path: PathLike,
+        *,
+        chunksize: Optional[int] = None,
+        **read_kwargs: Any,
+    ) -> Iterator[pd.DataFrame]:
+        """Yield DataFrame chunks from a file without materializing."""
+        return read_file_iter(
+            file_path=file_path,
+            chunksize=chunksize,
+            memory_fraction=self.memory_fraction,
+            verbose=self.verbose,
+            **read_kwargs,
+        )
+
+    def partition_to_parquets(
+        self,
+        file_path: PathLike,
+        output_dir: PathLike,
+        *,
+        output_prefix: str = "part",
+        rows_per_file: Optional[int] = None,
+        convert_types: Optional[bool] = None,
+        **read_kwargs: Any,
+    ):
+        """Read and materialize a file into Parquet partitions."""
+        return read_file_to_parquets(
+            file_path=file_path,
+            output_dir=output_dir,
+            output_prefix=output_prefix,
+            rows_per_file=rows_per_file,
+            memory_fraction=self.memory_fraction,
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            verbose=self.verbose,
+            **read_kwargs,
+        )
+
+    def stream_to_parquets(
+        self,
+        file_path: PathLike,
+        output_dir: PathLike,
+        *,
+        output_prefix: str = "part",
+        rows_per_file: Optional[int] = None,
+        convert_types: Optional[bool] = None,
+        **read_kwargs: Any,
+    ):
+        """Stream input file chunks into Parquet partitions."""
+        return stream_to_parquets(
+            file_path=file_path,
+            output_dir=output_dir,
+            output_prefix=output_prefix,
+            rows_per_file=rows_per_file,
+            memory_fraction=self.memory_fraction,
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            verbose=self.verbose,
+            **read_kwargs,
+        )
+
+    def stream_to_sink(
+        self,
+        chunks: Iterator[pd.DataFrame],
+        output_dir: PathLike,
+        *,
+        output_prefix: str = "part",
+        convert_types: Optional[bool] = None,
+    ):
+        return stream_to_sink(
+            chunks=chunks,
+            output_dir=output_dir,
+            output_prefix=output_prefix,
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            verbose=self.verbose,
+        )
+
+    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Normalize column names and remove duplicate columns."""
+        return clean_dataframe_columns(df, verbose=self.verbose)
+
+    def cast_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Attempt numeric conversion on string columns."""
+        return try_cast_string_columns_to_numeric(df, verbose=self.verbose)
+
+    def create_table_sql(
+        self,
+        df: pd.DataFrame,
+        *,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        table: str = "table",
+        drop_existing: bool = True,
+        dialect: Dialect = Dialect.SQLSERVER,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+    ) -> Union[str, tuple[str, ParquetDDLMetadata]]:
+        """Generate CREATE TABLE DDL from a DataFrame schema."""
+        return generate_create_table(
+            df=df,
+            catalog=catalog,
+            schema=schema,
+            table=table,
+            drop_existing=drop_existing,
+            dialect=dialect,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            verbose=self.verbose,
+        )
+
+    def create_table_sql_from_parquet(
+        self,
+        parquet_path: PathLike,
+        *,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        table: str = "table",
+        drop_existing: bool = True,
+        dialect: Dialect = Dialect.SQLSERVER,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        fallback_to_json: bool = False,
+        return_metadata: bool = False,
+    ) -> str:
+        """Generate CREATE TABLE DDL from a Parquet schema."""
+        return generate_create_table_from_parquet(
+            parquet_path=parquet_path,
+            catalog=catalog,
+            schema=schema,
+            table=table,
+            drop_existing=drop_existing,
+            dialect=dialect,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            verbose=self.verbose,
+            fallback_to_json=fallback_to_json,
+            return_metadata=return_metadata,
+        )
+
+    def sqlserver_openrowset_parquet(
+        self,
+        parquet_path: PathLike,
+        *,
+        data_source: Optional[str] = None,
+        table_alias: str = "rows",
+        select_columns: Optional[Sequence[str]] = None,
+        where: Optional[str] = None,
+        top: Optional[int] = None,
+    ) -> str:
+        """Generate a SQL Server OPENROWSET query over Parquet files."""
+        return sqlserver_openrowset_parquet(
+            parquet_path,
+            data_source=data_source,
+            table_alias=table_alias,
+            select_columns=select_columns,
+            where=where,
+            top=top,
+        )
+
+    def sqlserver_create_and_insert_from_parquet(
+        self,
+        parquet_path: PathLike,
+        output_dir: PathLike,
+        *,
+        table: str,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        drop_existing: bool = True,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        rows_per_file: Optional[int] = None,
+        memory_fraction: Optional[float] = None,
+        convert_types: Optional[bool] = None,
+        output_prefix: str = "part",
+        delimiter: str = ",",
+        include_header: bool = True,
+        line_terminator: str = "\n",
+        first_row: Optional[int] = None,
+        tablock: bool = True,
+        codepage: Optional[str] = None,
+        fallback_to_json: bool = False,
+    ) -> tuple[str, list[Path]]:
+        """Create SQL Server DDL + BULK INSERT statements from Parquet."""
+        return sqlserver_create_and_insert_from_parquet(
+            parquet_path=parquet_path,
+            output_dir=output_dir,
+            table=table,
+            catalog=catalog,
+            schema=schema,
+            drop_existing=drop_existing,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            rows_per_file=rows_per_file,
+            memory_fraction=(
+                self.memory_fraction if memory_fraction is None else memory_fraction
+            ),
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            output_prefix=output_prefix,
+            delimiter=delimiter,
+            include_header=include_header,
+            line_terminator=line_terminator,
+            first_row=first_row,
+            tablock=tablock,
+            codepage=codepage,
+            fallback_to_json=fallback_to_json,
+            verbose=self.verbose,
+        )
+
+    def sqlserver_create_and_stage_from_parquets(
+        self,
+        parquet_paths: Sequence[PathLike],
+        output_dir: PathLike,
+        *,
+        table: str,
+        catalog: Optional[str] = None,
+        schema: Optional[str] = None,
+        drop_existing: bool = True,
+        use_go: bool = False,
+        schema_spec: Optional[dict] = None,
+        rows_per_file: Optional[int] = None,
+        memory_fraction: Optional[float] = None,
+        convert_types: Optional[bool] = None,
+        output_prefix: str = "part",
+        delimiter: str = ",",
+        include_header: bool = True,
+        line_terminator: str = "\n",
+        fallback_to_json: bool = False,
+    ) -> tuple[str, list[Path]]:
+        """Generate SQL Server DDL and stage multiple Parquet files as CSVs."""
+        return sqlserver_create_and_stage_from_parquets(
+            parquet_paths=parquet_paths,
+            output_dir=output_dir,
+            table=table,
+            catalog=catalog,
+            schema=schema,
+            drop_existing=drop_existing,
+            use_go=use_go,
+            schema_spec=schema_spec,
+            rows_per_file=rows_per_file,
+            memory_fraction=(
+                self.memory_fraction if memory_fraction is None else memory_fraction
+            ),
+            convert_types=(
+                self.convert_types if convert_types is None else convert_types
+            ),
+            output_prefix=output_prefix,
+            delimiter=delimiter,
+            include_header=include_header,
+            line_terminator=line_terminator,
+            fallback_to_json=fallback_to_json,
+            verbose=self.verbose,
+        )
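Blade is a frozen dataclass that simply forwards to the module-level functions with shared defaults. A short usage sketch based on the signatures above (the file path, schema, and table name are illustrative):

    from datablade import Blade

    blade = Blade(memory_fraction=0.25, verbose=True)   # shared defaults threaded into every call
    df = blade.read("data/events.csv")                   # delegates to read_file_smart
    df = blade.clean(df)                                 # normalize and deduplicate column names
    df = blade.cast_numeric(df)                          # best-effort numeric conversion of string columns
    ddl = blade.create_table_sql(df, schema="dbo", table="events")  # SQL Server DDL by default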
datablade/core/__init__.py
CHANGED
@@ -1,7 +1,28 @@
-
-
-
-
-
-
-
+"""Backward-compatible exports for the legacy datablade.core namespace.
+
+Historically, this package used dynamic imports to re-export everything.
+We keep the same runtime surface area but use explicit imports so that IDEs,
+type checkers, and static analysis tools can reason about the module.
+"""
+
+from . import frames as _frames
+from . import json as _json
+from . import lists as _lists
+from . import messages as _messages
+from . import strings as _strings
+from . import zip as _zip
+from .frames import *  # noqa: F401,F403
+from .json import *  # noqa: F401,F403
+from .lists import *  # noqa: F401,F403
+from .messages import *  # noqa: F401,F403
+from .strings import *  # noqa: F401,F403
+from .zip import *  # noqa: F401,F403
+
+__all__ = [
+    *_frames.__all__,
+    *_json.__all__,
+    *_lists.__all__,
+    *_messages.__all__,
+    *_strings.__all__,
+    *_zip.__all__,
+]
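Because datablade.core now star-imports from its submodules, which themselves re-export the new implementations (see the datablade/core/frames.py diff below), legacy imports keep resolving to the same objects. A sketch of what that equivalence implies, assuming both import paths declared in this diff:

    from datablade.core import clean_dataframe_columns
    from datablade.dataframes.frames import clean_dataframe_columns as new_impl

    assert clean_dataframe_columns is new_impl  # the legacy name is the same function object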
datablade/core/frames.py
CHANGED
@@ -1,236 +1,23 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-""
-
-
-
-
-    for col in df.columns:
-        if df[col].dtype == 'object':
-            converted = pd.to_numeric(df[col], errors='coerce')
-            has_nan = converted.isnull().any()
-            if not has_nan:
-                df[col] = converted
-                print_verbose(f"Column '{col}' successfully converted to numeric.", verbose)
-            else:
-                if convert_partial:
-                    df[col] = converted
-                    print_verbose(f"Column '{col}' partially converted to numeric with NaNs where conversion failed.", verbose)
-                else:
-                    print_verbose(f"Column '{col}' could not be fully converted to numeric; leaving as is.", verbose)
-    return df
-
-def clean_dataframe_columns(df: pd.DataFrame=None, verbose: bool=False) -> pd.DataFrame|None:
-    """
-    Clean the DataFrame columns by:
-    - Flattening MultiIndex columns
-    - Converting non-string column names to strings
-    - Removing duplicate columns, keeping the first occurrence
-
-    Parameters:
-    df (pd.DataFrame): The DataFrame to clean.
-
-    Returns:
-    pd.DataFrame: The cleaned DataFrame.
-    """
-    if df is None:
-        print_verbose("No DataFrame provided; exiting clean_dataframe_columns.", verbose)
-        exit
-    # Step 1: Flatten MultiIndex columns
-    if isinstance(df.columns, pd.MultiIndex):
-        df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
-        print_verbose("Flattened MultiIndex columns.", verbose)
-
-    # Step 2: Convert non-string column names to strings
-    df.columns = df.columns.map(str)
-    print_verbose("Converted column names to strings.", verbose)
-
-    # Step 3: Remove duplicate columns, keeping the first occurrence
-    duplicates = df.columns.duplicated()
-    if duplicates.any():
-        duplicate_cols = df.columns[duplicates]
-        print_verbose(f"Duplicate columns found: {list(duplicate_cols)}", verbose)
-        df = df.loc[:, ~duplicates]
-        print_verbose("Removed duplicate columns, keeping the first occurrence.", verbose)
-
-    return df
-
-def generate_parquet_schema(df: pd.DataFrame=None, verbose: bool=False) -> pa.Schema|None:
-    """
-    Generate a PyArrow Schema from a pandas DataFrame.
-    Parameters:
-    df (pandas.DataFrame): The DataFrame to generate the schema from.
-    Returns:
-    pyarrow.Schema: The PyArrow Schema object.
-    """
-    if df is None:
-        print_verbose("No DataFrame provided; exiting generate_parquet_schema.", verbose)
-        exit
-
-    fields = []
-    for column in df.columns:
-        col_data = df[column]
-        col_name = column
-        dtype = col_data.dtype
-
-        # Determine if the column contains any nulls
-        nullable = col_data.isnull().any()
-
-        # Map pandas dtype to PyArrow type
-        pa_type = None
-
-        if pd.api.types.is_integer_dtype(dtype):
-            # Check the range to determine the smallest integer type
-            min_value = col_data.min()
-            max_value = col_data.max()
-            if min_value >= np.iinfo(np.int8).min and max_value <= np.iinfo(np.int8).max:
-                pa_type = pa.int8()
-            elif min_value >= np.iinfo(np.int16).min and max_value <= np.iinfo(np.int16).max:
-                pa_type = pa.int16()
-            elif min_value >= np.iinfo(np.int32).min and max_value <= np.iinfo(np.int32).max:
-                pa_type = pa.int32()
-            else:
-                pa_type = pa.int64()
-
-        elif pd.api.types.is_float_dtype(dtype):
-            pa_type = pa.float64()
-
-        elif pd.api.types.is_bool_dtype(dtype):
-            pa_type = pa.bool_()
-
-        elif pd.api.types.is_datetime64_any_dtype(dtype):
-            pa_type = pa.timestamp('ms')
-
-        elif isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype):
-            pa_type = pa.string()
-
-        else:
-            pa_type = pa.string()
-
-        # Create a field
-        field = pa.field(col_name, pa_type, nullable=nullable)
-        fields.append(field)
-
-    schema = pa.schema(fields)
-    return schema
-
-def pandas_to_parquet_table(df: pd.DataFrame=None, convert: bool=True, partial: bool=False, preserve_index: bool=False, verbose: bool=False) -> pa.Table|None:
-    """
-    Generate a PyArrow Table from a pandas DataFrame.
-
-    Parameters:
-    df (pandas.DataFrame): The DataFrame to generate the table from.
-    table (str): The name of the table.
-
-    Returns:
-    pyarrow.Table: The PyArrow Table object.
-    """
-    if df is None:
-        print_verbose("No DataFrame provided; exiting generate_parquet_table.", verbose)
-        exit
-
-    df = clean_dataframe_columns(df=df, verbose=verbose)
-
-    if convert:
-        df = try_cast_string_columns_to_numeric(df=df, convert_partial=partial, verbose=verbose)
-
-    schema = generate_parquet_schema(df=df, verbose=verbose)
-    try:
-        table = pa.Table.from_pandas(df, schema=schema, preserve_index=preserve_index)
-        return table
-    except Exception as e:
-        print_verbose(f"Error generating PyArrow Table: {e}", verbose)
-        exit
-
-def generate_sql_server_create_table_string(df: pd.DataFrame=None, catalog: str='database', schema: str='dbo', table: str='table', dropexisting: bool=True, verbose: bool=False) -> str|None:
-    """
-    Generate a SQL Server CREATE TABLE string from a pandas DataFrame.
-
-    Parameters:
-    df (pandas.DataFrame): The DataFrame to generate the schema from.
-    table_name (str): The name of the SQL table.
-
-    Returns:
-    str: The SQL Server CREATE TABLE statement.
-    """
-    if df is None:
-        print_verbose("No DataFrame provided; exiting try_cast_string_columns_to_numeric.", verbose)
-        exit
-
-    table_name = f"{sql_quotename(catalog)}.{sql_quotename(schema)}.{sql_quotename(table)}"
-    drop_statement = f"use {sql_quotename(catalog)}\rgo\rif object_id('{table_name}') is not null drop table {table_name};\r" if dropexisting else ""
-
-    create_statement = [f"{drop_statement};create table {table_name} ("]
-    indent = " "
-    column_lines = []
-
-    for column in df.columns:
-        col_data = df[column]
-        col_name = column
-        dtype = col_data.dtype
-
-        # Determine if the column contains any nulls
-        nullable = col_data.isnull().any()
-        null_str = f"{' ' if nullable else 'not'} null"
-
-        # Map pandas dtype to SQL Server type
-        sql_type = None
-
-        if pd.api.types.is_integer_dtype(dtype):
-            min_value = col_data.min()
-            max_value = col_data.max()
-            if min_value >= 0 and max_value <= 255:
-                sql_type = "tinyint"
-            elif min_value >= -32768 and max_value <= 32767:
-                sql_type = "smallint"
-            elif min_value >= -2147483648 and max_value <= 2147483647:
-                sql_type = "int"
-            else:
-                sql_type = "bigint"
-
-        elif pd.api.types.is_float_dtype(dtype):
-            sql_type = "float"
-
-        elif pd.api.types.is_bool_dtype(dtype):
-            sql_type = "bit"
-
-        elif pd.api.types.is_datetime64_any_dtype(dtype):
-            sql_type = "datetime2"
-
-        elif isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype):
-            # Determine maximum length of string data
-            max_length = col_data.dropna().astype(str).map(len).max()
-            sql_type = f"nvarchar({str(max_length) if max_length <= 4000 else 'max'})"
-
-        else:
-            sql_type = "nvarchar(max)"
-
-        # Build the column definition
-        column_line = f"{indent}{sql_quotename(col_name)} {sql_type} {null_str},"
-        column_lines.append(column_line)
-
-    # Remove the last comma from the last column definition
-    if column_lines:
-        column_lines[-1] = column_lines[-1].rstrip(',')
-
-    create_statement.extend(column_lines)
-    create_statement.append(");")
-    return_statement = "\r".join(create_statement)
-    return return_statement
+"""Backward-compatibility re-exports.
+
+This module intentionally contains no independent implementations.
+All functionality is provided by the newer modules in datablade.dataframes.
+"""
+
+from ..dataframes.frames import (  # noqa: F401
+    clean_dataframe_columns,
+    generate_parquet_schema,
+    generate_sql_server_create_table_string,
+    pandas_to_parquet_table,
+    try_cast_string_columns_to_numeric,
+    write_to_file_and_sql,
+)
+
+__all__ = [
+    "try_cast_string_columns_to_numeric",
+    "clean_dataframe_columns",
+    "generate_parquet_schema",
+    "pandas_to_parquet_table",
+    "generate_sql_server_create_table_string",
+    "write_to_file_and_sql",
+]
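The removed 0.0.0 implementations above (numeric casting, column cleaning, PyArrow schema and table generation, SQL Server DDL) now live in datablade.dataframes.frames, and this module only forwards to them. A small sketch of the forwarded helpers, using the call style shown in blade.py (the sample data is illustrative, and the comment about which column converts assumes the new implementation keeps the old behaviour):

    import pandas as pd
    from datablade.core.frames import clean_dataframe_columns, try_cast_string_columns_to_numeric

    df = pd.DataFrame({"id": ["1", "2", "3"], "name": ["a", "b", "c"]})
    df = clean_dataframe_columns(df, verbose=True)
    df = try_cast_string_columns_to_numeric(df, verbose=True)  # "id" should become numeric; "name" stays as strings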
datablade/core/json.py
CHANGED
@@ -1,10 +1,5 @@
-
-
-
-
-
-    try:
-        response = requests.get(url, **kwargs)
-        return response.json()
-    except requests.exceptions.RequestException as e:
-        print_verbose(f"Error: {e}", verbose=verbose)
+"""Backward-compatibility re-exports for IO JSON helpers."""
+
+from ..io.json import get  # noqa: F401
+
+__all__ = ["get"]
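The legacy JSON helper is now a one-line re-export of datablade.io.json.get. A sketch of the legacy call path, assuming the new get keeps the old signature (a URL plus optional requests keyword arguments) and returns the parsed JSON body; the URL is illustrative:

    from datablade.core.json import get

    payload = get("https://api.example.com/items")  # parsed JSON on success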