datablade 0.0.0__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
@@ -0,0 +1,540 @@
+ """
+ Memory-aware file reading utilities with Polars support.
+
+ This module provides intelligent file reading that:
+ - Estimates memory requirements before loading
+ - Automatically chunks large files
+ - Uses Polars for high-performance reading when available
+ - Writes large files to multiple Parquet partitions
+ """
+
+ import pathlib
+ from typing import Iterator, List, Optional, Union
+
+ import pandas as pd
+ import pyarrow.parquet as pq
+
+ from ..utils.logging import log_debug, log_info, log_warning
+
+
+ def _normalize_text_delimiter_kwargs(suffix: str, read_kwargs: dict) -> dict:
+     """Normalize delimiter/sep kwargs for delimited text formats.
+
+     - For TSV, default to tab separator unless the caller supplied one.
+     - For CSV/TXT, leave pandas defaults unless the caller supplied one.
+     """
+     if suffix not in (".csv", ".tsv", ".txt"):
+         return read_kwargs
+
+     if "sep" in read_kwargs or "delimiter" in read_kwargs:
+         return read_kwargs
+
+     if suffix == ".tsv":
+         out = dict(read_kwargs)
+         out["sep"] = "\t"
+         return out
+
+     return read_kwargs
+
+
+ def _polars_scan_csv_kwargs(suffix: str, read_kwargs: dict) -> dict:
+     """Best-effort mapping of pandas-style kwargs to polars scan_csv kwargs."""
+     # Polars uses `separator` (not `sep`). We only map delimiters because other
+     # pandas kwargs are not generally compatible.
+     if "sep" in read_kwargs:
+         return {"separator": read_kwargs["sep"]}
+     if "delimiter" in read_kwargs:
+         return {"separator": read_kwargs["delimiter"]}
+     if suffix == ".tsv":
+         return {"separator": "\t"}
+     return {}
+
+
+ def _infer_parquet_batch_rows(
+     file_path: pathlib.Path,
+     parquet_file: pq.ParquetFile,
+     memory_fraction: float,
+     verbose: bool,
+ ) -> int:
+     """Infer an approximate Parquet batch size (rows) to keep memory bounded."""
+     try:
+         available_memory = _get_available_memory()
+         target_memory = int(available_memory * memory_fraction)
+         file_size = file_path.stat().st_size
+         num_rows = int(getattr(parquet_file.metadata, "num_rows", 0) or 0)
+         if num_rows <= 0 or file_size <= 0 or target_memory <= 0:
+             return 65_536
+
+         # Parquet is compressed on disk; materialized batches are larger.
+         # We use a conservative multiplier to avoid overshooting.
+         bytes_per_row_on_disk = file_size / num_rows
+         inflated_bytes_per_row = max(1.0, bytes_per_row_on_disk * 3.0)
+         batch_rows = int(target_memory / inflated_bytes_per_row)
+
+         # Keep within sane bounds.
+         batch_rows = max(1_024, min(1_000_000, batch_rows))
+         log_debug(f"Auto Parquet batch_rows={batch_rows}", verbose)
+         return batch_rows
+     except Exception:
+         return 65_536
+
+
+ def _get_available_memory() -> int:
+     """Get available system memory in bytes."""
+     try:
+         import psutil
+
+         return psutil.virtual_memory().available
+     except ImportError:
+         log_warning("psutil not installed; assuming 4GB available memory", verbose=True)
+         return 4 * 1024 * 1024 * 1024
+
+
+ def _estimate_file_memory(file_path: pathlib.Path, sample_rows: int = 1000) -> int:
+     """
+     Estimate memory required to load a file by sampling.
+
+     Returns estimated bytes needed to load entire file.
+     """
+     file_size = file_path.stat().st_size
+     suffix = file_path.suffix.lower()
+
+     if suffix == ".parquet":
+         return file_size * 3
+
+     if suffix in (".csv", ".tsv", ".txt"):
+         try:
+             sample_kwargs = {}
+             if suffix == ".tsv":
+                 sample_kwargs["sep"] = "\t"
+
+             sample = pd.read_csv(file_path, nrows=sample_rows, **sample_kwargs)
+             memory_per_row = sample.memory_usage(deep=True).sum() / len(sample)
+             bytes_per_row = file_size / max(1, _count_lines_estimate(file_path))
+             estimated_rows = file_size / bytes_per_row
+             return int(memory_per_row * estimated_rows * 1.2)
+         except Exception:
+             return file_size * 3
+
+     if suffix in (".xlsx", ".xls"):
+         return file_size * 10
+
+     return file_size * 3
+
+
+ def _count_lines_estimate(file_path: pathlib.Path, sample_size: int = 65536) -> int:
+     """Estimate number of lines in a file by sampling."""
+     file_size = file_path.stat().st_size
+     with open(file_path, "rb") as f:
+         sample = f.read(sample_size)
+     lines_in_sample = sample.count(b"\n")
+
+     if lines_in_sample == 0:
+         return 1
+
+     return int(file_size * lines_in_sample / len(sample))
+
+
+ def read_file_chunked(
+     file_path: Union[str, pathlib.Path],
+     chunksize: Optional[int] = None,
+     memory_fraction: float = 0.5,
+     verbose: bool = False,
+     **read_kwargs,
+ ) -> Iterator[pd.DataFrame]:
+     """
+     Read a file in chunks, automatically determining chunk size based on available memory.
+
+     Args:
+         file_path: Path to the file to read.
+         chunksize: Optional explicit chunk size (rows). If None, auto-calculated.
+         memory_fraction: Fraction of available memory to use (default: 0.5).
+         verbose: If True, logs progress messages.
+         **read_kwargs: Additional arguments passed to pandas read function.
+
+     Yields:
+         DataFrame chunks.
+
+     Raises:
+         ValueError: If file does not exist or format is unsupported.
+     """
+     path = pathlib.Path(file_path)
+     if not path.exists():
+         raise ValueError(f"File does not exist: {path}")
+
+     suffix = path.suffix.lower()
+
+     if suffix == ".parquet":
+         parquet_file = pq.ParquetFile(path)
+         batch_rows = chunksize
+         if batch_rows is None:
+             batch_rows = _infer_parquet_batch_rows(
+                 file_path=path,
+                 parquet_file=parquet_file,
+                 memory_fraction=memory_fraction,
+                 verbose=verbose,
+             )
+
+         for batch in parquet_file.iter_batches(
+             batch_size=int(batch_rows), use_threads=True
+         ):
+             yield batch.to_pandas()
+             log_debug(f"Read batch of {len(batch)} rows from parquet.", verbose)
+         return
+
+     if suffix not in (".csv", ".tsv", ".txt"):
+         raise ValueError(f"Unsupported file format for chunked reading: {suffix}")
+
+     if chunksize is None:
+         available_memory = _get_available_memory()
+         target_memory = int(available_memory * memory_fraction)
+         estimated_total = _estimate_file_memory(path)
+
+         if estimated_total <= target_memory:
+             log_info(
+                 f"File fits in memory ({estimated_total / 1e6:.1f}MB); reading all at once.",
+                 verbose,
+             )
+             normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, read_kwargs)
+             df = pd.read_csv(path, **normalized_kwargs)
+             yield df
+             return
+
+         total_lines = _count_lines_estimate(path)
+         memory_per_row = estimated_total / max(1, total_lines)
+         chunksize = max(1000, int(target_memory / memory_per_row))
+         log_info(
+             f"File too large ({estimated_total / 1e6:.1f}MB); reading in chunks of {chunksize} rows.",
+             verbose,
+         )
+
+     chunk_num = 0
+     normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, read_kwargs)
+     for chunk in pd.read_csv(path, chunksize=chunksize, **normalized_kwargs):
+         chunk_num += 1
+         log_debug(f"Read chunk {chunk_num} with {len(chunk)} rows.", verbose)
+         yield chunk
+
+
+ def read_file_iter(
+     file_path: Union[str, pathlib.Path],
+     chunksize: Optional[int] = None,
+     memory_fraction: float = 0.5,
+     verbose: bool = False,
+     **read_kwargs,
+ ) -> Iterator[pd.DataFrame]:
+     """Stream a file as an iterator of DataFrame chunks.
+
+     This is the "never materialize" API: unlike read_file_smart(), this function
+     does not concatenate chunks into a single DataFrame.
+
+     Supported streaming formats:
+     - .csv / .tsv / .txt (via pandas chunking)
+     - .parquet (via pyarrow iter_batches)
+     - .json (JSON Lines only: requires lines=True; uses pandas chunks)
+
+     Non-streaming formats:
+     - .xlsx / .xls are loaded fully and yielded as a single DataFrame (only if
+       the file is estimated to fit within memory_fraction of available memory).
+
+     Raises:
+         ValueError: If the file is missing, unsupported, or too large for a
+             non-streaming format.
+     """
+     path = pathlib.Path(file_path)
+     if not path.exists():
+         raise ValueError(f"File does not exist: {path}")
+
+     suffix = path.suffix.lower()
+
+     if suffix in (".csv", ".tsv", ".txt", ".parquet"):
+         yield from read_file_chunked(
+             file_path=path,
+             chunksize=chunksize,
+             memory_fraction=memory_fraction,
+             verbose=verbose,
+             **read_kwargs,
+         )
+         return
+
+     if suffix == ".json":
+         # pandas can stream JSON only for JSON Lines (one JSON object per line).
+         lines = bool(read_kwargs.get("lines", False))
+         if not lines:
+             available_memory = _get_available_memory()
+             target_memory = int(available_memory * memory_fraction)
+             estimated_total = _estimate_file_memory(path)
+             if estimated_total > target_memory:
+                 raise ValueError(
+                     "JSON streaming requires lines=True (JSON Lines) or the file must fit in memory. "
+                     "Consider converting to JSON Lines or using read_file_to_parquets/read_file_chunked for delimited text."
+                 )
+             yield pd.read_json(path, **read_kwargs)
+             return
+
+         # JSON Lines streaming.
+         if chunksize is None:
+             available_memory = _get_available_memory()
+             target_memory = int(available_memory * memory_fraction)
+             estimated_total = _estimate_file_memory(path)
+
+             if estimated_total <= target_memory:
+                 yield pd.read_json(path, **read_kwargs)
+                 return
+
+             total_lines = _count_lines_estimate(path)
+             memory_per_line = estimated_total / max(1, total_lines)
+             chunksize = max(1000, int(target_memory / max(1.0, memory_per_line)))
+             log_info(
+                 f"JSON Lines too large ({estimated_total / 1e6:.1f}MB); streaming in chunks of {chunksize} rows.",
+                 verbose,
+             )
+
+         # pandas returns a TextFileReader-like iterator when chunksize is provided.
+         json_iter = pd.read_json(path, chunksize=chunksize, **read_kwargs)
+         for i, chunk in enumerate(json_iter, start=1):
+             log_debug(f"Read json chunk {i} with {len(chunk)} rows.", verbose)
+             yield chunk
+         return
+
+     if suffix in (".xlsx", ".xls"):
+         available_memory = _get_available_memory()
+         target_memory = int(available_memory * memory_fraction)
+         estimated_total = _estimate_file_memory(path)
+         if estimated_total > target_memory:
+             raise ValueError(
+                 f"Excel file is estimated too large to load safely ({estimated_total / 1e6:.1f}MB). "
+                 "Excel streaming is not supported; consider exporting to CSV/Parquet first."
+             )
+         yield pd.read_excel(path, **read_kwargs)
+         return
+
+     raise ValueError(f"Unsupported file format for streaming: {suffix}")
+
+
+ def read_file_to_parquets(
+     file_path: Union[str, pathlib.Path],
+     output_dir: Union[str, pathlib.Path],
+     output_prefix: str = "part",
+     rows_per_file: Optional[int] = None,
+     memory_fraction: float = 0.5,
+     convert_types: bool = True,
+     verbose: bool = False,
+     **read_kwargs,
+ ) -> List[pathlib.Path]:
+     """
+     Read a large file and write it to multiple Parquet files if it doesn't fit in memory.
+
+     Args:
+         file_path: Path to the input file.
+         output_dir: Directory where Parquet files will be written.
+         output_prefix: Prefix for output file names (default: "part").
+         rows_per_file: Optional explicit rows per output file. If None, auto-calculated.
+         memory_fraction: Fraction of available memory to use.
+         convert_types: If True, attempts to convert string columns to numeric.
+         verbose: If True, logs progress messages.
+         **read_kwargs: Additional arguments passed to pandas read function.
+
+     Returns:
+         List of paths to the created Parquet files.
+
+     Raises:
+         ValueError: If file does not exist or format is unsupported.
+     """
+     # Import here to avoid circular imports
+     from .frames import clean_dataframe_columns, try_cast_string_columns_to_numeric
+
+     path = pathlib.Path(file_path)
+     output_path = pathlib.Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     output_files: List[pathlib.Path] = []
+     part_num = 0
+
+     for chunk in read_file_chunked(
+         file_path=path,
+         chunksize=rows_per_file,
+         memory_fraction=memory_fraction,
+         verbose=verbose,
+         **read_kwargs,
+     ):
+         chunk = clean_dataframe_columns(chunk, verbose=verbose)
+         if convert_types:
+             chunk = try_cast_string_columns_to_numeric(chunk, verbose=verbose)
+
+         output_file = output_path / f"{output_prefix}_{part_num:05d}.parquet"
+         chunk.to_parquet(output_file, index=False)
+         output_files.append(output_file)
+
+         log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
+         part_num += 1
+
+     log_info(f"Created {len(output_files)} Parquet files in {output_path}", verbose)
+     return output_files
+
+
+ def stream_to_parquets(
+     file_path: Union[str, pathlib.Path],
+     output_dir: Union[str, pathlib.Path],
+     output_prefix: str = "part",
+     rows_per_file: Optional[int] = None,
+     memory_fraction: float = 0.5,
+     convert_types: bool = True,
+     verbose: bool = False,
+     **read_kwargs,
+ ) -> List[pathlib.Path]:
+     """Stream a file and write it to Parquet partitions without materializing.
+
+     This helper is the "no concat" companion to read_file_to_parquets(). It uses
+     read_file_iter() under the hood and writes each incoming chunk to a separate
+     Parquet file.
+
+     Args:
+         file_path: Input file path.
+         output_dir: Directory where Parquet partitions are written.
+         output_prefix: Output filename prefix.
+         rows_per_file: Desired rows per partition. For streaming formats this
+             is passed as chunksize; if None, chunk sizes are chosen automatically
+             based on memory_fraction.
+         memory_fraction: Fraction of available memory to use when auto-sizing.
+         convert_types: If True, attempts to convert numeric-looking strings.
+         verbose: If True, logs progress.
+         **read_kwargs: Passed to the underlying reader.
+
+     Returns:
+         List of Parquet file paths.
+
+     Raises:
+         ValueError: If the input is missing/unsupported.
+     """
+     from .frames import clean_dataframe_columns, try_cast_string_columns_to_numeric
+
+     path = pathlib.Path(file_path)
+     output_path = pathlib.Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     output_files: List[pathlib.Path] = []
+     part_num = 0
+
+     for chunk in read_file_iter(
+         file_path=path,
+         chunksize=rows_per_file,
+         memory_fraction=memory_fraction,
+         verbose=verbose,
+         **read_kwargs,
+     ):
+         chunk = clean_dataframe_columns(chunk, verbose=verbose)
+         if convert_types:
+             chunk = try_cast_string_columns_to_numeric(chunk, verbose=verbose)
+
+         output_file = output_path / f"{output_prefix}_{part_num:05d}.parquet"
+         chunk.to_parquet(output_file, index=False)
+         output_files.append(output_file)
+
+         log_info(f"Wrote {len(chunk)} rows to {output_file}", verbose)
+         part_num += 1
+
+     log_info(f"Created {len(output_files)} Parquet files in {output_path}", verbose)
+     return output_files
+
+
+ def read_file_smart(
+     file_path: Union[str, pathlib.Path],
+     use_polars: bool = True,
+     memory_fraction: float = 0.5,
+     verbose: bool = False,
+     **read_kwargs,
+ ) -> pd.DataFrame:
+     """
+     Intelligently read a file, using Polars for large files if available.
+
+     For files that fit in memory, reads directly. For large files, uses
+     Polars lazy evaluation or pandas chunking as a fallback.
+
+     Args:
+         file_path: Path to the file to read.
+         use_polars: If True and Polars is available, uses Polars for large files.
+         memory_fraction: Fraction of available memory to use.
+         verbose: If True, logs progress messages.
+         **read_kwargs: Additional arguments passed to the read function.
+
+     Returns:
+         DataFrame with the file contents.
+
+     Raises:
+         ValueError: If file does not exist or format is unsupported.
+     """
+     path = pathlib.Path(file_path)
+     if not path.exists():
+         raise ValueError(f"File does not exist: {path}")
+
+     suffix = path.suffix.lower()
+     available_memory = _get_available_memory()
+     target_memory = int(available_memory * memory_fraction)
+     estimated_memory = _estimate_file_memory(path)
+
+     fits_in_memory = estimated_memory <= target_memory
+
+     log_debug(
+         f"File: {path.name}, Estimated: {estimated_memory / 1e6:.1f}MB, "
+         f"Available: {target_memory / 1e6:.1f}MB, Fits: {fits_in_memory}",
+         verbose,
+     )
+
+     # Try Polars for better performance on large files
+     if use_polars and not fits_in_memory:
+         try:
+             import polars as pl
+
+             log_info(
+                 f"Using Polars for large file ({estimated_memory / 1e6:.1f}MB)", verbose
+             )
+
+             if suffix == ".parquet":
+                 lf = pl.scan_parquet(path)
+             elif suffix in (".csv", ".tsv", ".txt"):
+                 lf = pl.scan_csv(path, **_polars_scan_csv_kwargs(suffix, read_kwargs))
+             else:
+                 raise ValueError(f"Unsupported format for Polars: {suffix}")
+
+             df_polars = lf.collect(streaming=True)
+             log_info(f"Polars read {len(df_polars)} rows.", verbose)
+             return df_polars.to_pandas()
+
+         except ImportError:
+             log_warning(
+                 "Polars not installed; falling back to pandas chunked reading.", verbose
+             )
+         except Exception as e:
+             log_warning(f"Polars failed: {e}; falling back to pandas.", verbose)
+
+     # Direct read for small files
+     if fits_in_memory:
+         log_info(
+             f"Reading file directly ({estimated_memory / 1e6:.1f}MB fits in memory).",
+             verbose,
+         )
+         if suffix == ".parquet":
+             return pd.read_parquet(path, **read_kwargs)
+         elif suffix in (".csv", ".tsv", ".txt"):
+             normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, read_kwargs)
+             return pd.read_csv(path, **normalized_kwargs)
+         elif suffix in (".xlsx", ".xls"):
+             return pd.read_excel(path, **read_kwargs)
+         elif suffix == ".json":
+             return pd.read_json(path, **read_kwargs)
+         else:
+             raise ValueError(f"Unsupported file format: {suffix}")
+
+     # Chunked reading for large files when Polars isn't available
+     log_info(f"Reading large file in chunks ({estimated_memory / 1e6:.1f}MB).", verbose)
+     normalized_kwargs = _normalize_text_delimiter_kwargs(suffix, read_kwargs)
+     return pd.concat(
+         read_file_chunked(
+             file_path=path,
+             memory_fraction=memory_fraction,
+             verbose=verbose,
+             **normalized_kwargs,
+         ),
+         ignore_index=True,
+     )
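
A minimal usage sketch for the three public readers added above (read_file_smart, read_file_iter, stream_to_parquets). The import path is not shown in this hunk (its file header is missing from the diff), so the functions are assumed to already be in scope; the file names, fractions, and partition sizes are placeholders, not values taken from the package.

# Hedged usage sketch; assumes read_file_smart, read_file_iter, and
# stream_to_parquets (defined above) are importable from the installed package.
# "measurements.csv" is a placeholder path.

# 1) Let the reader choose a strategy (direct read, Polars streaming, or pandas chunks).
df = read_file_smart("measurements.csv", memory_fraction=0.5, verbose=True)
print(df.shape)

# 2) Stream chunks without ever materializing the full table.
total_rows = 0
for chunk in read_file_iter("measurements.csv", memory_fraction=0.25):
    total_rows += len(chunk)
print(total_rows)

# 3) Re-partition a large delimited file into Parquet files on disk.
parts = stream_to_parquets(
    "measurements.csv",
    output_dir="parquet_parts",
    rows_per_file=250_000,
    verbose=True,
)
print(len(parts), "partitions written")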
@@ -0,0 +1,15 @@
+ """
+ Input/output operations for fetching and processing external data.
+
+ This module provides functions for:
+ - JSON data retrieval from URLs
+ - ZIP file downloading and extraction
+ """
+
+ from .json import get as get_json
+ from .zip import get as get_zip
+
+ __all__ = [
+     "get_json",
+     "get_zip",
+ ]
datablade/io/json.py ADDED
@@ -0,0 +1,33 @@
+ from typing import Any, Dict
+
+ import requests
+
+ from ..utils.messages import print_verbose
+
+
+ def get(url: str, verbose: bool = False, **kwargs: Any) -> Dict[str, Any]:
+     """
+     Get JSON data from a URL using HTTP GET request.
+
+     Args:
+         url: The URL to fetch JSON data from (must be non-empty string).
+         verbose: If True, prints error messages.
+         **kwargs: Additional keyword arguments passed to requests.get().
+
+     Returns:
+         Dictionary containing the JSON response.
+
+     Raises:
+         ValueError: If url is empty or not a string.
+         requests.RequestException: If the HTTP request fails.
+     """
+     if not isinstance(url, str) or not url.strip():
+         raise ValueError("url must be a non-empty string")
+
+     try:
+         response = requests.get(url, **kwargs)
+         response.raise_for_status()  # Raise exception for bad status codes
+         return response.json()
+     except requests.exceptions.RequestException as e:
+         print_verbose(f"Error fetching JSON from {url}: {e}", verbose=verbose)
+         raise
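
A minimal usage sketch for this helper via the get_json alias re-exported by datablade.io (see the package __init__ earlier in this diff). The URL and timeout are placeholders; timeout is simply forwarded to requests.get() through **kwargs.

# Hedged usage sketch; the URL is a placeholder endpoint that returns JSON.
from datablade.io import get_json

payload = get_json("https://example.com/api/data.json", verbose=True, timeout=30)
print(type(payload))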
datablade/io/zip.py ADDED
@@ -0,0 +1,73 @@
+ import io
+ import pathlib
+ import zipfile
+ from typing import Any, Optional
+
+ import requests
+
+ from ..utils.messages import print_verbose
+
+
+ def get(
+     url: str,
+     path: Optional[str | pathlib.Path] = None,
+     verbose: bool = False,
+     **kwargs: Any,
+ ) -> Optional[io.BytesIO]:
+     """
+     Download a ZIP file from a URL and either extract it to a path or return as BytesIO.
+
+     Args:
+         url: The URL of the ZIP file to download.
+         path: Optional path where the ZIP contents should be extracted.
+             If None, returns the ZIP data as a BytesIO object.
+             If provided, extracts all files to the specified path.
+         verbose: If True, prints progress messages.
+         **kwargs: Additional keyword arguments passed to requests.get().
+
+     Returns:
+         io.BytesIO containing the ZIP data if path is None,
+         otherwise None after extracting files to path.
+         Raises on error.
+
+     Raises:
+         ValueError: If url is empty or not a string.
+         requests.RequestException: If the download fails.
+         zipfile.BadZipFile: If the response is not a valid zip.
+         Exception: For other unexpected errors while extracting.
+     """
+     if not isinstance(url, str) or not url.strip():
+         raise ValueError("url must be a non-empty string")
+
+     try:
+         print_verbose(f"Downloading {url}", verbose=verbose)
+         response = requests.get(url, **kwargs)
+         response.raise_for_status()  # Raise exception for bad status codes
+         data = response.content
+         zip_buffer = io.BytesIO(data)
+
+         if path is None:
+             return zip_buffer
+
+         print_verbose(f"Saving data to {path}", verbose=verbose)
+         zip_buffer.seek(0)
+         with zipfile.ZipFile(zip_buffer, "r") as zip_ref:
+             # Unlike utils.strings.pathing(), extracting should work even if the
+             # destination directory doesn't exist yet.
+             extract_root = pathlib.Path(path)
+             extract_root.mkdir(parents=True, exist_ok=True)
+             for zip_info in zip_ref.infolist():
+                 extract_path = extract_root / zip_info.filename
+                 extract_path.parent.mkdir(parents=True, exist_ok=True)
+                 with open(extract_path, "wb") as f:
+                     f.write(zip_ref.read(zip_info.filename))
+         return None
+     except requests.exceptions.RequestException as e:
+         print_verbose(f"Error downloading ZIP from {url}: {e}", verbose=verbose)
+         raise
+     except zipfile.BadZipFile as e:
+         print_verbose(f"Error: Invalid ZIP file from {url}: {e}", verbose=verbose)
+         raise
+     except Exception as e:
+         print_verbose(f"Error processing ZIP file: {e}", verbose=verbose)
+         raise
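
A minimal usage sketch for this downloader via the get_zip alias re-exported by datablade.io, covering both the in-memory and extract-to-directory paths. The URL, output directory, and timeout are placeholders.

# Hedged usage sketch; the URL is a placeholder for any ZIP reachable over HTTP.
import zipfile

from datablade.io import get_zip

URL = "https://example.com/archive.zip"

# 1) Keep the archive in memory and inspect it.
buffer = get_zip(URL, verbose=True, timeout=60)
with zipfile.ZipFile(buffer) as zf:
    print(zf.namelist())

# 2) Or extract it straight to a directory (created if missing).
get_zip(URL, path="extracted_data", verbose=True, timeout=60)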
@@ -0,0 +1,32 @@
+ """
+ SQL utilities for datablade.
+
+ Provides dialect-aware quoting, DDL generation, and bulk loading.
+ Supports SQL Server, PostgreSQL, MySQL, and DuckDB.
+ """
+
+ from .bulk_load import (
+     bulk_load,
+     bulk_load_duckdb,
+     bulk_load_mysql,
+     bulk_load_postgres,
+     bulk_load_sqlserver,
+     write_dataframe_and_load,
+ )
+ from .ddl import generate_create_table
+ from .ddl_pyarrow import generate_create_table_from_parquet
+ from .dialects import Dialect
+ from .quoting import quote_identifier
+
+ __all__ = [
+     "Dialect",
+     "quote_identifier",
+     "generate_create_table",
+     "generate_create_table_from_parquet",
+     "bulk_load",
+     "bulk_load_sqlserver",
+     "bulk_load_postgres",
+     "bulk_load_mysql",
+     "bulk_load_duckdb",
+     "write_dataframe_and_load",
+ ]
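
The submodules behind these re-exports (bulk_load, ddl, ddl_pyarrow, dialects, quoting) are not included in this diff, so only the import surface can be illustrated. A minimal, hedged sketch that uses nothing beyond the exported names:

# Only names re-exported by datablade.sql's __init__ are used here; the
# underlying signatures are defined in submodules not shown in this diff,
# so they are inspected rather than called.
from datablade.sql import Dialect, generate_create_table, quote_identifier

help(quote_identifier)
help(generate_create_table)
help(Dialect)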