adamops-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. adamops/__init__.py +40 -0
  2. adamops/cli.py +163 -0
  3. adamops/data/__init__.py +24 -0
  4. adamops/data/feature_engineering.py +284 -0
  5. adamops/data/loaders.py +922 -0
  6. adamops/data/preprocessors.py +227 -0
  7. adamops/data/splitters.py +218 -0
  8. adamops/data/validators.py +148 -0
  9. adamops/deployment/__init__.py +21 -0
  10. adamops/deployment/api.py +237 -0
  11. adamops/deployment/cloud.py +191 -0
  12. adamops/deployment/containerize.py +262 -0
  13. adamops/deployment/exporters.py +148 -0
  14. adamops/evaluation/__init__.py +24 -0
  15. adamops/evaluation/comparison.py +133 -0
  16. adamops/evaluation/explainability.py +143 -0
  17. adamops/evaluation/metrics.py +233 -0
  18. adamops/evaluation/reports.py +165 -0
  19. adamops/evaluation/visualization.py +238 -0
  20. adamops/models/__init__.py +21 -0
  21. adamops/models/automl.py +277 -0
  22. adamops/models/ensembles.py +228 -0
  23. adamops/models/modelops.py +308 -0
  24. adamops/models/registry.py +250 -0
  25. adamops/monitoring/__init__.py +21 -0
  26. adamops/monitoring/alerts.py +200 -0
  27. adamops/monitoring/dashboard.py +117 -0
  28. adamops/monitoring/drift.py +212 -0
  29. adamops/monitoring/performance.py +195 -0
  30. adamops/pipelines/__init__.py +15 -0
  31. adamops/pipelines/orchestrators.py +183 -0
  32. adamops/pipelines/workflows.py +212 -0
  33. adamops/utils/__init__.py +18 -0
  34. adamops/utils/config.py +457 -0
  35. adamops/utils/helpers.py +663 -0
  36. adamops/utils/logging.py +412 -0
  37. adamops-0.1.0.dist-info/METADATA +310 -0
  38. adamops-0.1.0.dist-info/RECORD +42 -0
  39. adamops-0.1.0.dist-info/WHEEL +5 -0
  40. adamops-0.1.0.dist-info/entry_points.txt +2 -0
  41. adamops-0.1.0.dist-info/licenses/LICENSE +21 -0
  42. adamops-0.1.0.dist-info/top_level.txt +1 -0
adamops/data/loaders.py
@@ -0,0 +1,922 @@
+ """
+ AdamOps Data Loaders Module
+
+ Provides comprehensive data loading capabilities from various sources:
+ - CSV files with auto-encoding detection
+ - Excel files (.xlsx, .xls)
+ - JSON files
+ - SQL databases (SQLite, PostgreSQL, MySQL)
+ - API/URL endpoints
+ - Compressed files (.zip, .gz)
+ """
+
+ import os
+ import io
+ import json
+ import gzip
+ import zipfile
+ from pathlib import Path
+ from typing import Any, Dict, Iterator, List, Optional, Union
+ from urllib.parse import urlparse
+
+ import pandas as pd
+ import numpy as np
+
+ try:
+     import chardet
+     CHARDET_AVAILABLE = True
+ except ImportError:
+     CHARDET_AVAILABLE = False
+
+ try:
+     import requests
+     REQUESTS_AVAILABLE = True
+ except ImportError:
+     REQUESTS_AVAILABLE = False
+
+ try:
+     from sqlalchemy import create_engine, text
+     SQLALCHEMY_AVAILABLE = True
+ except ImportError:
+     SQLALCHEMY_AVAILABLE = False
+
+ from adamops.utils.logging import get_logger
+ from adamops.utils.helpers import ensure_dir
+
+ logger = get_logger(__name__)
+
+
+ # =============================================================================
+ # Encoding Detection
+ # =============================================================================
+
+ def detect_encoding(filepath: Union[str, Path], sample_size: int = 10000) -> str:
+     """
+     Detect the encoding of a file.
+
+     Args:
+         filepath: Path to the file.
+         sample_size: Number of bytes to sample for detection.
+
+     Returns:
+         str: Detected encoding (e.g., 'utf-8', 'latin-1').
+
+     Example:
+         >>> encoding = detect_encoding("data.csv")
+         >>> print(encoding)
+         'utf-8'
+     """
+     if not CHARDET_AVAILABLE:
+         logger.warning("chardet not available, defaulting to utf-8")
+         return "utf-8"
+
+     with open(filepath, "rb") as f:
+         raw_data = f.read(sample_size)
+
+     result = chardet.detect(raw_data)
+     encoding = result.get("encoding", "utf-8")
+     confidence = result.get("confidence", 0)
+
+     logger.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")
+
+     # Fall back to utf-8 if detection is uncertain
+     if confidence < 0.5:
+         encoding = "utf-8"
+
+     return encoding or "utf-8"
+
+
+ # =============================================================================
+ # CSV Loading
+ # =============================================================================
+
+ def load_csv(
+     filepath: Union[str, Path],
+     encoding: Optional[str] = None,
+     auto_detect_encoding: bool = True,
+     sep: str = ",",
+     header: Union[int, List[int], str] = "infer",
+     index_col: Optional[Union[int, str, List]] = None,
+     usecols: Optional[List] = None,
+     dtype: Optional[Dict] = None,
+     parse_dates: Optional[Union[bool, List]] = None,
+     na_values: Optional[List] = None,
+     nrows: Optional[int] = None,
+     skiprows: Optional[Union[int, List]] = None,
+     low_memory: bool = True,
+     **kwargs
+ ) -> pd.DataFrame:
+     """
+     Load data from a CSV file with auto-encoding detection.
+
+     Args:
+         filepath: Path to the CSV file.
+         encoding: File encoding. If None and auto_detect_encoding is True,
+             encoding will be detected automatically.
+         auto_detect_encoding: Whether to auto-detect encoding.
+         sep: Column separator.
+         header: Row number(s) to use as column names.
+         index_col: Column(s) to use as index.
+         usecols: Columns to load.
+         dtype: Data types for columns.
+         parse_dates: Columns to parse as dates.
+         na_values: Additional values to treat as NA.
+         nrows: Number of rows to read.
+         skiprows: Rows to skip.
+         low_memory: Use low memory mode.
+         **kwargs: Additional arguments passed to pd.read_csv.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+
+     Example:
+         >>> df = load_csv("data.csv")
+         >>> df = load_csv("data.csv", usecols=["id", "name", "value"])
+         >>> df = load_csv("data.csv", parse_dates=["date_column"])
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     # Detect encoding if not specified
+     if encoding is None and auto_detect_encoding:
+         encoding = detect_encoding(filepath)
+     elif encoding is None:
+         encoding = "utf-8"
+
+     logger.info(f"Loading CSV: {filepath} (encoding: {encoding})")
+
+     try:
+         df = pd.read_csv(
+             filepath,
+             encoding=encoding,
+             sep=sep,
+             header=header,
+             index_col=index_col,
+             usecols=usecols,
+             dtype=dtype,
+             parse_dates=parse_dates,
+             na_values=na_values,
+             nrows=nrows,
+             skiprows=skiprows,
+             low_memory=low_memory,
+             **kwargs
+         )
+
+         logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+         return df
+
+     except UnicodeDecodeError:
+         # Try with different encodings
+         for fallback_encoding in ["latin-1", "cp1252", "iso-8859-1"]:
+             try:
+                 logger.warning(f"Retrying with {fallback_encoding} encoding")
+                 df = pd.read_csv(
+                     filepath,
+                     encoding=fallback_encoding,
+                     sep=sep,
+                     header=header,
+                     index_col=index_col,
+                     usecols=usecols,
+                     dtype=dtype,
+                     parse_dates=parse_dates,
+                     na_values=na_values,
+                     nrows=nrows,
+                     skiprows=skiprows,
+                     low_memory=low_memory,
+                     **kwargs
+                 )
+                 logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+                 return df
+             except UnicodeDecodeError:
+                 continue
+
+         raise
+
+
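For orientation, a minimal usage sketch of the CSV loader above; the file path and column names are hypothetical. With no explicit encoding, load_csv consults detect_encoding (via chardet, when installed) and then falls back through latin-1, cp1252, and iso-8859-1 if decoding still fails.

    from adamops.data.loaders import load_csv

    df = load_csv(
        "exports/orders.csv",            # hypothetical file
        parse_dates=["order_date"],      # hypothetical date column
        dtype={"order_id": "string"},    # hypothetical id column
        usecols=["order_id", "order_date", "total"],
    )
    print(df.dtypes)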
+ # =============================================================================
+ # Excel Loading
+ # =============================================================================
+
+ def load_excel(
+     filepath: Union[str, Path],
+     sheet_name: Union[str, int, List, None] = 0,
+     header: Union[int, List[int], None] = 0,
+     index_col: Optional[Union[int, str, List]] = None,
+     usecols: Optional[Union[str, List]] = None,
+     dtype: Optional[Dict] = None,
+     parse_dates: Optional[Union[bool, List]] = None,
+     na_values: Optional[List] = None,
+     nrows: Optional[int] = None,
+     skiprows: Optional[Union[int, List]] = None,
+     **kwargs
+ ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
+     """
+     Load data from an Excel file (.xlsx, .xls).
+
+     Args:
+         filepath: Path to the Excel file.
+         sheet_name: Sheet name or index, or list for multiple sheets.
+             Use None to read all sheets.
+         header: Row number(s) to use as column names.
+         index_col: Column(s) to use as index.
+         usecols: Columns to load.
+         dtype: Data types for columns.
+         parse_dates: Columns to parse as dates.
+         na_values: Additional values to treat as NA.
+         nrows: Number of rows to read.
+         skiprows: Rows to skip.
+         **kwargs: Additional arguments passed to pd.read_excel.
+
+     Returns:
+         pd.DataFrame or Dict[str, pd.DataFrame]: Loaded data.
+
+     Example:
+         >>> df = load_excel("data.xlsx")
+         >>> df = load_excel("data.xlsx", sheet_name="Sheet1")
+         >>> sheets = load_excel("data.xlsx", sheet_name=None)  # All sheets
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     logger.info(f"Loading Excel: {filepath}")
+
+     result = pd.read_excel(
+         filepath,
+         sheet_name=sheet_name,
+         header=header,
+         index_col=index_col,
+         usecols=usecols,
+         dtype=dtype,
+         parse_dates=parse_dates,
+         na_values=na_values,
+         nrows=nrows,
+         skiprows=skiprows,
+         **kwargs
+     )
+
+     if isinstance(result, dict):
+         for name, df in result.items():
+             logger.info(f"Sheet '{name}': {len(df)} rows, {len(df.columns)} columns")
+     else:
+         logger.info(f"Loaded {len(result)} rows, {len(result.columns)} columns")
+
+     return result
+
+
+ def get_excel_sheet_names(filepath: Union[str, Path]) -> List[str]:
+     """
+     Get sheet names from an Excel file.
+
+     Args:
+         filepath: Path to the Excel file.
+
+     Returns:
+         List[str]: List of sheet names.
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     excel_file = pd.ExcelFile(filepath)
+     return excel_file.sheet_names
+
+
+ # =============================================================================
+ # JSON Loading
+ # =============================================================================
+
+ def load_json(
+     filepath: Union[str, Path],
+     orient: Optional[str] = None,
+     lines: bool = False,
+     encoding: str = "utf-8",
+     **kwargs
+ ) -> pd.DataFrame:
+     """
+     Load data from a JSON file.
+
+     Args:
+         filepath: Path to the JSON file.
+         orient: JSON structure orientation. Options:
+             'split', 'records', 'index', 'columns', 'values', 'table'
+         lines: Read file as line-delimited JSON.
+         encoding: File encoding.
+         **kwargs: Additional arguments passed to pd.read_json.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+
+     Example:
+         >>> df = load_json("data.json")
+         >>> df = load_json("data.jsonl", lines=True)
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     logger.info(f"Loading JSON: {filepath}")
+
+     df = pd.read_json(
+         filepath,
+         orient=orient,
+         lines=lines,
+         encoding=encoding,
+         **kwargs
+     )
+
+     logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+     return df
+
+
+ def load_json_nested(
+     filepath: Union[str, Path],
+     record_path: Optional[Union[str, List[str]]] = None,
+     meta: Optional[List[str]] = None,
+     max_level: Optional[int] = None,
+     encoding: str = "utf-8",
+ ) -> pd.DataFrame:
+     """
+     Load nested JSON data and normalize it to a flat DataFrame.
+
+     Args:
+         filepath: Path to the JSON file.
+         record_path: Path to the records in the JSON structure.
+         meta: Fields to include from higher level.
+         max_level: Maximum normalization depth.
+         encoding: File encoding.
+
+     Returns:
+         pd.DataFrame: Normalized data.
+
+     Example:
+         >>> # For JSON like: {"data": [{"id": 1, "info": {"name": "A"}}]}
+         >>> df = load_json_nested("data.json", record_path="data")
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     logger.info(f"Loading nested JSON: {filepath}")
+
+     with open(filepath, "r", encoding=encoding) as f:
+         data = json.load(f)
+
+     df = pd.json_normalize(
+         data,
+         record_path=record_path,
+         meta=meta,
+         max_level=max_level,
+     )
+
+     logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+     return df
+
+
+ # =============================================================================
+ # SQL Loading
+ # =============================================================================
+
+ def load_sql(
+     query: str,
+     connection_string: str,
+     params: Optional[Dict] = None,
+     index_col: Optional[Union[str, List[str]]] = None,
+     parse_dates: Optional[Union[List[str], Dict]] = None,
+     chunksize: Optional[int] = None,
+     **kwargs
+ ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
+     """
+     Load data from a SQL database.
+
+     Supports SQLite, PostgreSQL, MySQL, and other SQLAlchemy-compatible databases.
+
+     Args:
+         query: SQL query to execute.
+         connection_string: Database connection string.
+             Examples:
+             - SQLite: "sqlite:///database.db"
+             - PostgreSQL: "postgresql://user:pass@host:port/db"
+             - MySQL: "mysql+pymysql://user:pass@host:port/db"
+         params: Query parameters.
+         index_col: Column(s) to use as index.
+         parse_dates: Columns to parse as dates.
+         chunksize: Number of rows per chunk (for large datasets).
+         **kwargs: Additional arguments passed to pd.read_sql.
+
+     Returns:
+         pd.DataFrame or Iterator: Loaded data.
+
+     Example:
+         >>> df = load_sql("SELECT * FROM users", "sqlite:///app.db")
+         >>> df = load_sql(
+         ...     "SELECT * FROM orders WHERE date > :date",
+         ...     "postgresql://user:pass@localhost:5432/shop",
+         ...     params={"date": "2023-01-01"}
+         ... )
+     """
+     if not SQLALCHEMY_AVAILABLE:
+         raise ImportError("SQLAlchemy is required for SQL loading. Install with: pip install sqlalchemy")
+
+     logger.info("Loading from SQL database")
+
+     engine = create_engine(connection_string)
+
+     # Use text() for raw SQL queries with params
+     if params:
+         query = text(query)
+
+     df = pd.read_sql(
+         query,
+         engine,
+         params=params,
+         index_col=index_col,
+         parse_dates=parse_dates,
+         chunksize=chunksize,
+         **kwargs
+     )
+
+     if chunksize is None:
+         logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+     else:
+         logger.info(f"Created chunked reader with chunksize={chunksize}")
+
+     return df
+
+
+ def load_sql_table(
+     table_name: str,
+     connection_string: str,
+     schema: Optional[str] = None,
+     columns: Optional[List[str]] = None,
+     index_col: Optional[Union[str, List[str]]] = None,
+     chunksize: Optional[int] = None,
+     **kwargs
+ ) -> pd.DataFrame:
+     """
+     Load an entire table from a SQL database.
+
+     Args:
+         table_name: Name of the table to load.
+         connection_string: Database connection string.
+         schema: Database schema.
+         columns: Columns to load (None for all).
+         index_col: Column(s) to use as index.
+         chunksize: Number of rows per chunk.
+         **kwargs: Additional arguments.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+     """
+     if not SQLALCHEMY_AVAILABLE:
+         raise ImportError("SQLAlchemy is required for SQL loading. Install with: pip install sqlalchemy")
+
+     logger.info(f"Loading table: {table_name}")
+
+     engine = create_engine(connection_string)
+
+     df = pd.read_sql_table(
+         table_name,
+         engine,
+         schema=schema,
+         columns=columns,
+         index_col=index_col,
+         chunksize=chunksize,
+         **kwargs
+     )
+
+     if chunksize is None:
+         logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+
+     return df
+
+
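A minimal sketch of chunked loading with load_sql, assuming a local SQLite file; the database path and table name are hypothetical. With chunksize set, pandas returns an iterator of DataFrames rather than a single frame, which keeps memory bounded for large result sets.

    from adamops.data.loaders import load_sql

    total_rows = 0
    for chunk in load_sql(
        "SELECT * FROM events",       # hypothetical table
        "sqlite:///warehouse.db",     # hypothetical database file
        chunksize=50_000,
    ):
        total_rows += len(chunk)      # each chunk is a regular DataFrame
    print(f"processed {total_rows} rows")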
+ # =============================================================================
+ # API/URL Loading
+ # =============================================================================
+
+ def load_url(
+     url: str,
+     format: str = "csv",
+     params: Optional[Dict] = None,
+     headers: Optional[Dict] = None,
+     auth: Optional[tuple] = None,
+     timeout: int = 30,
+     **kwargs
+ ) -> pd.DataFrame:
+     """
+     Load data from a URL.
+
+     Args:
+         url: URL to load data from.
+         format: Data format ('csv', 'json', 'excel').
+         params: Query parameters.
+         headers: HTTP headers.
+         auth: Authentication tuple (username, password).
+         timeout: Request timeout in seconds.
+         **kwargs: Additional arguments for the format loader.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+
+     Example:
+         >>> df = load_url("https://example.com/data.csv")
+         >>> df = load_url(
+         ...     "https://api.example.com/data",
+         ...     format="json",
+         ...     headers={"Authorization": "Bearer token"}
+         ... )
+     """
+     if not REQUESTS_AVAILABLE:
+         raise ImportError("requests is required for URL loading. Install with: pip install requests")
+
+     logger.info(f"Loading from URL: {url}")
+
+     response = requests.get(
+         url,
+         params=params,
+         headers=headers,
+         auth=auth,
+         timeout=timeout,
+     )
+     response.raise_for_status()
+
+     content = io.BytesIO(response.content)
+
+     if format == "csv":
+         df = pd.read_csv(content, **kwargs)
+     elif format == "json":
+         df = pd.read_json(content, **kwargs)
+     elif format == "excel":
+         df = pd.read_excel(content, **kwargs)
+     else:
+         raise ValueError(f"Unsupported format: {format}")
+
+     logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+     return df
+
+
+ def load_api(
+     url: str,
+     method: str = "GET",
+     params: Optional[Dict] = None,
+     data: Optional[Dict] = None,
+     json_data: Optional[Dict] = None,
+     headers: Optional[Dict] = None,
+     auth: Optional[tuple] = None,
+     timeout: int = 30,
+     data_key: Optional[str] = None,
+     paginate: bool = False,
+     page_key: str = "page",
+     limit_key: str = "limit",
+     limit: int = 100,
+     max_pages: int = 100,
+ ) -> pd.DataFrame:
+     """
+     Load data from a REST API with pagination support.
+
+     Args:
+         url: API endpoint URL.
+         method: HTTP method.
+         params: Query parameters.
+         data: Form data.
+         json_data: JSON body data.
+         headers: HTTP headers.
+         auth: Authentication tuple.
+         timeout: Request timeout.
+         data_key: Key in response containing the data array.
+         paginate: Whether to paginate through results.
+         page_key: Parameter name for page number.
+         limit_key: Parameter name for page size.
+         limit: Number of items per page.
+         max_pages: Maximum number of pages to fetch.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+
+     Example:
+         >>> df = load_api(
+         ...     "https://api.example.com/users",
+         ...     headers={"Authorization": "Bearer token"},
+         ...     data_key="users",
+         ...     paginate=True
+         ... )
+     """
+     if not REQUESTS_AVAILABLE:
+         raise ImportError("requests is required for API loading. Install with: pip install requests")
+
+     logger.info(f"Loading from API: {url}")
+
+     all_data = []
+     page = 1
+
+     while True:
+         # Build params for this request
+         request_params = dict(params or {})
+         if paginate:
+             request_params[page_key] = page
+             request_params[limit_key] = limit
+
+         response = requests.request(
+             method=method,
+             url=url,
+             params=request_params,
+             data=data,
+             json=json_data,
+             headers=headers,
+             auth=auth,
+             timeout=timeout,
+         )
+         response.raise_for_status()
+
+         result = response.json()
+
+         # Extract data
+         if data_key:
+             page_data = result.get(data_key, [])
+         else:
+             page_data = result if isinstance(result, list) else [result]
+
+         all_data.extend(page_data)
+
+         # Check if we should continue paginating
+         if not paginate or len(page_data) < limit or page >= max_pages:
+             break
+
+         page += 1
+         logger.debug(f"Fetching page {page}...")
+
+     df = pd.DataFrame(all_data)
+     logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+     return df
+
+
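A hedged sketch tying load_api to save_csv (defined in the saving helpers near the end of this file); the endpoint, bearer token, and the assumption that the response wraps records under a top-level "users" key are all hypothetical.

    from adamops.data.loaders import load_api, save_csv

    users = load_api(
        "https://api.example.com/users",            # hypothetical endpoint
        headers={"Authorization": "Bearer TOKEN"},  # hypothetical credential
        data_key="users",    # records live under result["users"]
        paginate=True,
        limit=200,           # sent as the `limit` query parameter per page
        max_pages=10,        # hard stop regardless of remaining pages
    )
    save_csv(users, "output/users.csv")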
+ # =============================================================================
+ # Compressed Files
+ # =============================================================================
+
+ def load_compressed(
+     filepath: Union[str, Path],
+     format: str = "csv",
+     compression: Optional[str] = None,
+     **kwargs
+ ) -> pd.DataFrame:
+     """
+     Load data from a compressed file (.zip, .gz, .bz2, .xz).
+
+     Args:
+         filepath: Path to the compressed file.
+         format: Data format inside the archive ('csv', 'json', 'excel').
+         compression: Compression type. Auto-detected if None.
+         **kwargs: Additional arguments for the format loader.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+
+     Example:
+         >>> df = load_compressed("data.csv.gz")
+         >>> df = load_compressed("archive.zip", format="csv")
+     """
+     filepath = Path(filepath)
+
+     if not filepath.exists():
+         raise FileNotFoundError(f"File not found: {filepath}")
+
+     # Auto-detect compression type
+     if compression is None:
+         suffix = filepath.suffix.lower()
+         if suffix == ".gz":
+             compression = "gzip"
+         elif suffix == ".bz2":
+             compression = "bz2"
+         elif suffix == ".xz":
+             compression = "xz"
+         elif suffix == ".zip":
+             compression = "zip"
+         else:
+             compression = "infer"
+
+     logger.info(f"Loading compressed file: {filepath} ({compression})")
+
+     if compression == "zip":
+         return _load_from_zip(filepath, format, **kwargs)
+     else:
+         if format == "csv":
+             df = pd.read_csv(filepath, compression=compression, **kwargs)
+         elif format == "json":
+             df = pd.read_json(filepath, compression=compression, **kwargs)
+         else:
+             raise ValueError(f"Unsupported format for compression: {format}")
+
+         logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
+         return df
+
+
+ def _load_from_zip(
+     filepath: Union[str, Path],
+     format: str = "csv",
+     file_pattern: Optional[str] = None,
+     **kwargs
+ ) -> pd.DataFrame:
+     """Load data from within a ZIP archive."""
+     with zipfile.ZipFile(filepath, "r") as z:
+         file_list = z.namelist()
+
+         # Filter files by pattern or extension
+         if file_pattern:
+             import fnmatch
+             matching_files = [f for f in file_list if fnmatch.fnmatch(f, file_pattern)]
+         else:
+             ext = f".{format}"
+             matching_files = [f for f in file_list if f.endswith(ext)]
+
+         if not matching_files:
+             raise ValueError(f"No {format} files found in archive")
+
+         # Load the first matching file (or concatenate all)
+         if len(matching_files) == 1:
+             with z.open(matching_files[0]) as f:
+                 content = io.BytesIO(f.read())
+                 if format == "csv":
+                     return pd.read_csv(content, **kwargs)
+                 elif format == "json":
+                     return pd.read_json(content, **kwargs)
+                 elif format == "excel":
+                     return pd.read_excel(content, **kwargs)
+         else:
+             # Concatenate all matching files
+             dfs = []
+             for filename in matching_files:
+                 with z.open(filename) as f:
+                     content = io.BytesIO(f.read())
+                     if format == "csv":
+                         df = pd.read_csv(content, **kwargs)
+                     elif format == "json":
+                         df = pd.read_json(content, **kwargs)
+                     elif format == "excel":
+                         df = pd.read_excel(content, **kwargs)
+                     else:
+                         raise ValueError(f"Unsupported format: {format}")
+                     dfs.append(df)
+             return pd.concat(dfs, ignore_index=True)
+
+
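A short usage sketch for the compressed-file loaders; archive names and member patterns are hypothetical. Note that load_compressed forwards extra keyword arguments to the internal ZIP helper, so file_pattern can be passed through to select archive members, and multiple matches are concatenated into one DataFrame.

    from adamops.data.loaders import load_compressed

    # Single gzipped CSV
    logs = load_compressed("logs/2024-01.csv.gz")

    # ZIP archive with several member CSVs selected by a glob pattern
    monthly = load_compressed(
        "exports/monthly.zip",          # hypothetical archive
        format="csv",
        file_pattern="reports/*.csv",   # hypothetical member pattern
    )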
+ # =============================================================================
+ # Auto Loader
+ # =============================================================================
+
+ def load_auto(
+     source: Union[str, Path],
+     **kwargs
+ ) -> pd.DataFrame:
+     """
+     Automatically detect and load data from various sources.
+
+     Supports CSV, Excel, JSON, SQL, and compressed files.
+     Automatically detects the format based on file extension or URL.
+
+     Args:
+         source: Path to file, URL, or SQL connection string.
+         **kwargs: Additional arguments passed to the appropriate loader.
+
+     Returns:
+         pd.DataFrame: Loaded data.
+
+     Example:
+         >>> df = load_auto("data.csv")
+         >>> df = load_auto("https://example.com/data.json")
+         >>> df = load_auto("data.xlsx")
+     """
+     source_str = str(source)
+
+     # Check if it's a URL
+     if source_str.startswith(("http://", "https://")):
+         parsed = urlparse(source_str)
+         path = parsed.path.lower()
+
+         if path.endswith(".csv"):
+             return load_url(source_str, format="csv", **kwargs)
+         elif path.endswith(".json") or path.endswith(".jsonl"):
+             return load_url(source_str, format="json", **kwargs)
+         elif path.endswith((".xlsx", ".xls")):
+             return load_url(source_str, format="excel", **kwargs)
+         else:
+             # Try JSON by default for API endpoints
+             return load_url(source_str, format="json", **kwargs)
+
+     # It's a file path
+     filepath = Path(source)
+     suffix = filepath.suffix.lower()
+
+     # Remove compression suffix to get actual format
+     if suffix in [".gz", ".bz2", ".xz", ".zip"]:
+         if suffix == ".zip":
+             return load_compressed(filepath, **kwargs)
+
+         # Get the format from the second-to-last suffix
+         stem = filepath.stem
+         inner_suffix = Path(stem).suffix.lower()
+
+         if inner_suffix == ".csv":
+             return load_compressed(filepath, format="csv", **kwargs)
+         elif inner_suffix == ".json":
+             return load_compressed(filepath, format="json", **kwargs)
+         else:
+             return load_compressed(filepath, format="csv", **kwargs)
+
+     # Standard file types
+     if suffix == ".csv":
+         return load_csv(filepath, **kwargs)
+     elif suffix in [".xlsx", ".xls"]:
+         return load_excel(filepath, **kwargs)
+     elif suffix in [".json", ".jsonl"]:
+         lines = suffix == ".jsonl"
+         return load_json(filepath, lines=lines, **kwargs)
+     elif suffix == ".parquet":
+         return pd.read_parquet(filepath, **kwargs)
+     elif suffix == ".feather":
+         return pd.read_feather(filepath, **kwargs)
+     elif suffix == ".pickle" or suffix == ".pkl":
+         return pd.read_pickle(filepath, **kwargs)
+     else:
+         # Try CSV as default
+         logger.warning(f"Unknown file format: {suffix}, trying CSV")
+         return load_csv(filepath, **kwargs)
+
+
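As a sketch of the dispatch behaviour above (all paths and the URL are hypothetical): extensions route to the matching loader, compression suffixes are stripped first, and unrecognised extensions fall back to CSV.

    from adamops.data.loaders import load_auto

    customers = load_auto("data/customers.xlsx")         # dispatched to load_excel
    events = load_auto("data/events.csv.gz")             # gzipped CSV via load_compressed
    remote = load_auto("https://example.com/data.json")  # fetched via load_url, JSON format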
+ # =============================================================================
+ # Data Saving
+ # =============================================================================
+
+ def save_csv(
+     df: pd.DataFrame,
+     filepath: Union[str, Path],
+     index: bool = False,
+     encoding: str = "utf-8",
+     **kwargs
+ ) -> None:
+     """
+     Save DataFrame to CSV file.
+
+     Args:
+         df: DataFrame to save.
+         filepath: Output file path.
+         index: Whether to include index.
+         encoding: File encoding.
+         **kwargs: Additional arguments passed to df.to_csv.
+     """
+     filepath = Path(filepath)
+     ensure_dir(filepath.parent)
+
+     df.to_csv(filepath, index=index, encoding=encoding, **kwargs)
+     logger.info(f"Saved {len(df)} rows to {filepath}")
+
+
+ def save_excel(
+     df: pd.DataFrame,
+     filepath: Union[str, Path],
+     sheet_name: str = "Sheet1",
+     index: bool = False,
+     **kwargs
+ ) -> None:
+     """
+     Save DataFrame to Excel file.
+
+     Args:
+         df: DataFrame to save.
+         filepath: Output file path.
+         sheet_name: Name of the sheet.
+         index: Whether to include index.
+         **kwargs: Additional arguments.
+     """
+     filepath = Path(filepath)
+     ensure_dir(filepath.parent)
+
+     df.to_excel(filepath, sheet_name=sheet_name, index=index, **kwargs)
+     logger.info(f"Saved {len(df)} rows to {filepath}")
+
+
+ def save_json(
+     df: pd.DataFrame,
+     filepath: Union[str, Path],
+     orient: str = "records",
+     indent: int = 2,
+     **kwargs
+ ) -> None:
+     """
+     Save DataFrame to JSON file.
+
+     Args:
+         df: DataFrame to save.
+         filepath: Output file path.
+         orient: JSON structure orientation.
+         indent: Indentation level.
+         **kwargs: Additional arguments.
+     """
+     filepath = Path(filepath)
+     ensure_dir(filepath.parent)
+
+     df.to_json(filepath, orient=orient, indent=indent, **kwargs)
+     logger.info(f"Saved {len(df)} rows to {filepath}")