adamops-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adamops/__init__.py +40 -0
- adamops/cli.py +163 -0
- adamops/data/__init__.py +24 -0
- adamops/data/feature_engineering.py +284 -0
- adamops/data/loaders.py +922 -0
- adamops/data/preprocessors.py +227 -0
- adamops/data/splitters.py +218 -0
- adamops/data/validators.py +148 -0
- adamops/deployment/__init__.py +21 -0
- adamops/deployment/api.py +237 -0
- adamops/deployment/cloud.py +191 -0
- adamops/deployment/containerize.py +262 -0
- adamops/deployment/exporters.py +148 -0
- adamops/evaluation/__init__.py +24 -0
- adamops/evaluation/comparison.py +133 -0
- adamops/evaluation/explainability.py +143 -0
- adamops/evaluation/metrics.py +233 -0
- adamops/evaluation/reports.py +165 -0
- adamops/evaluation/visualization.py +238 -0
- adamops/models/__init__.py +21 -0
- adamops/models/automl.py +277 -0
- adamops/models/ensembles.py +228 -0
- adamops/models/modelops.py +308 -0
- adamops/models/registry.py +250 -0
- adamops/monitoring/__init__.py +21 -0
- adamops/monitoring/alerts.py +200 -0
- adamops/monitoring/dashboard.py +117 -0
- adamops/monitoring/drift.py +212 -0
- adamops/monitoring/performance.py +195 -0
- adamops/pipelines/__init__.py +15 -0
- adamops/pipelines/orchestrators.py +183 -0
- adamops/pipelines/workflows.py +212 -0
- adamops/utils/__init__.py +18 -0
- adamops/utils/config.py +457 -0
- adamops/utils/helpers.py +663 -0
- adamops/utils/logging.py +412 -0
- adamops-0.1.0.dist-info/METADATA +310 -0
- adamops-0.1.0.dist-info/RECORD +42 -0
- adamops-0.1.0.dist-info/WHEEL +5 -0
- adamops-0.1.0.dist-info/entry_points.txt +2 -0
- adamops-0.1.0.dist-info/licenses/LICENSE +21 -0
- adamops-0.1.0.dist-info/top_level.txt +1 -0
adamops/data/loaders.py
ADDED
@@ -0,0 +1,922 @@
"""
AdamOps Data Loaders Module

Provides comprehensive data loading capabilities from various sources:
- CSV files with auto-encoding detection
- Excel files (.xlsx, .xls)
- JSON files
- SQL databases (SQLite, PostgreSQL, MySQL)
- API/URL endpoints
- Compressed files (.zip, .gz)
"""

import os
import io
import json
import gzip
import zipfile
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Union
from urllib.parse import urlparse

import pandas as pd
import numpy as np

try:
    import chardet
    CHARDET_AVAILABLE = True
except ImportError:
    CHARDET_AVAILABLE = False

try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False

try:
    from sqlalchemy import create_engine, text
    SQLALCHEMY_AVAILABLE = True
except ImportError:
    SQLALCHEMY_AVAILABLE = False

from adamops.utils.logging import get_logger
from adamops.utils.helpers import ensure_dir

logger = get_logger(__name__)

# =============================================================================
# Encoding Detection
# =============================================================================

def detect_encoding(filepath: Union[str, Path], sample_size: int = 10000) -> str:
    """
    Detect the encoding of a file.

    Args:
        filepath: Path to the file.
        sample_size: Number of bytes to sample for detection.

    Returns:
        str: Detected encoding (e.g., 'utf-8', 'latin-1').

    Example:
        >>> encoding = detect_encoding("data.csv")
        >>> print(encoding)
        'utf-8'
    """
    if not CHARDET_AVAILABLE:
        logger.warning("chardet not available, defaulting to utf-8")
        return "utf-8"

    with open(filepath, "rb") as f:
        raw_data = f.read(sample_size)

    result = chardet.detect(raw_data)
    encoding = result.get("encoding", "utf-8")
    confidence = result.get("confidence", 0)

    logger.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2%})")

    # Fall back to utf-8 if detection is uncertain
    if confidence < 0.5:
        encoding = "utf-8"

    return encoding or "utf-8"

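Usage sketch (editor's illustration, not part of the packaged file): detect an encoding once, then pass it to load_csv explicitly so the loader skips its own detection pass. The file path is a placeholder.

from adamops.data.loaders import detect_encoding, load_csv

# Placeholder path; chardet falls back to utf-8 when missing or unsure.
enc = detect_encoding("exports/customers.csv")
df = load_csv("exports/customers.csv", encoding=enc, auto_detect_encoding=False)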
# =============================================================================
# CSV Loading
# =============================================================================

def load_csv(
    filepath: Union[str, Path],
    encoding: Optional[str] = None,
    auto_detect_encoding: bool = True,
    sep: str = ",",
    header: Union[int, List[int], str] = "infer",
    index_col: Optional[Union[int, str, List]] = None,
    usecols: Optional[List] = None,
    dtype: Optional[Dict] = None,
    parse_dates: Optional[Union[bool, List]] = None,
    na_values: Optional[List] = None,
    nrows: Optional[int] = None,
    skiprows: Optional[Union[int, List]] = None,
    low_memory: bool = True,
    **kwargs
) -> pd.DataFrame:
    """
    Load data from a CSV file with auto-encoding detection.

    Args:
        filepath: Path to the CSV file.
        encoding: File encoding. If None and auto_detect_encoding is True,
            encoding will be detected automatically.
        auto_detect_encoding: Whether to auto-detect encoding.
        sep: Column separator.
        header: Row number(s) to use as column names.
        index_col: Column(s) to use as index.
        usecols: Columns to load.
        dtype: Data types for columns.
        parse_dates: Columns to parse as dates.
        na_values: Additional values to treat as NA.
        nrows: Number of rows to read.
        skiprows: Rows to skip.
        low_memory: Use low memory mode.
        **kwargs: Additional arguments passed to pd.read_csv.

    Returns:
        pd.DataFrame: Loaded data.

    Example:
        >>> df = load_csv("data.csv")
        >>> df = load_csv("data.csv", usecols=["id", "name", "value"])
        >>> df = load_csv("data.csv", parse_dates=["date_column"])
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    # Detect encoding if not specified
    if encoding is None and auto_detect_encoding:
        encoding = detect_encoding(filepath)
    elif encoding is None:
        encoding = "utf-8"

    logger.info(f"Loading CSV: {filepath} (encoding: {encoding})")

    try:
        df = pd.read_csv(
            filepath,
            encoding=encoding,
            sep=sep,
            header=header,
            index_col=index_col,
            usecols=usecols,
            dtype=dtype,
            parse_dates=parse_dates,
            na_values=na_values,
            nrows=nrows,
            skiprows=skiprows,
            low_memory=low_memory,
            **kwargs
        )

        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
        return df

    except UnicodeDecodeError:
        # Try with different encodings
        for fallback_encoding in ["latin-1", "cp1252", "iso-8859-1"]:
            try:
                logger.warning(f"Retrying with {fallback_encoding} encoding")
                df = pd.read_csv(
                    filepath,
                    encoding=fallback_encoding,
                    sep=sep,
                    header=header,
                    index_col=index_col,
                    usecols=usecols,
                    dtype=dtype,
                    parse_dates=parse_dates,
                    na_values=na_values,
                    nrows=nrows,
                    skiprows=skiprows,
                    low_memory=low_memory,
                    **kwargs
                )
                logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
                return df
            except UnicodeDecodeError:
                continue

        raise

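Usage sketch (editor's illustration): a quick preview followed by a typed, date-parsed load, mirroring the load_csv signature above. The file and column names are made up.

from adamops.data.loaders import load_csv

preview = load_csv("sales.csv", nrows=100)          # cheap first look
df = load_csv(
    "sales.csv",
    usecols=["order_id", "amount", "created_at"],   # hypothetical columns
    dtype={"order_id": "int64"},
    parse_dates=["created_at"],
)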
# =============================================================================
# Excel Loading
# =============================================================================

def load_excel(
    filepath: Union[str, Path],
    sheet_name: Union[str, int, List, None] = 0,
    header: Union[int, List[int], None] = 0,
    index_col: Optional[Union[int, str, List]] = None,
    usecols: Optional[Union[str, List]] = None,
    dtype: Optional[Dict] = None,
    parse_dates: Optional[Union[bool, List]] = None,
    na_values: Optional[List] = None,
    nrows: Optional[int] = None,
    skiprows: Optional[Union[int, List]] = None,
    **kwargs
) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
    """
    Load data from an Excel file (.xlsx, .xls).

    Args:
        filepath: Path to the Excel file.
        sheet_name: Sheet name or index, or list for multiple sheets.
            Use None to read all sheets.
        header: Row number(s) to use as column names.
        index_col: Column(s) to use as index.
        usecols: Columns to load.
        dtype: Data types for columns.
        parse_dates: Columns to parse as dates.
        na_values: Additional values to treat as NA.
        nrows: Number of rows to read.
        skiprows: Rows to skip.
        **kwargs: Additional arguments passed to pd.read_excel.

    Returns:
        pd.DataFrame or Dict[str, pd.DataFrame]: Loaded data.

    Example:
        >>> df = load_excel("data.xlsx")
        >>> df = load_excel("data.xlsx", sheet_name="Sheet1")
        >>> sheets = load_excel("data.xlsx", sheet_name=None)  # All sheets
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    logger.info(f"Loading Excel: {filepath}")

    result = pd.read_excel(
        filepath,
        sheet_name=sheet_name,
        header=header,
        index_col=index_col,
        usecols=usecols,
        dtype=dtype,
        parse_dates=parse_dates,
        na_values=na_values,
        nrows=nrows,
        skiprows=skiprows,
        **kwargs
    )

    if isinstance(result, dict):
        for name, df in result.items():
            logger.info(f"Sheet '{name}': {len(df)} rows, {len(df.columns)} columns")
    else:
        logger.info(f"Loaded {len(result)} rows, {len(result.columns)} columns")

    return result


def get_excel_sheet_names(filepath: Union[str, Path]) -> List[str]:
    """
    Get sheet names from an Excel file.

    Args:
        filepath: Path to the Excel file.

    Returns:
        List[str]: List of sheet names.
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    excel_file = pd.ExcelFile(filepath)
    return excel_file.sheet_names

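Usage sketch (editor's illustration): list the sheets first, then read them all into a dict of DataFrames via the sheet_name=None branch above. The workbook name is a placeholder.

from adamops.data.loaders import get_excel_sheet_names, load_excel

print(get_excel_sheet_names("report.xlsx"))          # placeholder workbook
sheets = load_excel("report.xlsx", sheet_name=None)  # dict of sheet name -> DataFrame
for name, frame in sheets.items():
    print(name, frame.shape)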
# =============================================================================
# JSON Loading
# =============================================================================

def load_json(
    filepath: Union[str, Path],
    orient: Optional[str] = None,
    lines: bool = False,
    encoding: str = "utf-8",
    **kwargs
) -> pd.DataFrame:
    """
    Load data from a JSON file.

    Args:
        filepath: Path to the JSON file.
        orient: JSON structure orientation. Options:
            'split', 'records', 'index', 'columns', 'values', 'table'
        lines: Read file as line-delimited JSON.
        encoding: File encoding.
        **kwargs: Additional arguments passed to pd.read_json.

    Returns:
        pd.DataFrame: Loaded data.

    Example:
        >>> df = load_json("data.json")
        >>> df = load_json("data.jsonl", lines=True)
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    logger.info(f"Loading JSON: {filepath}")

    df = pd.read_json(
        filepath,
        orient=orient,
        lines=lines,
        encoding=encoding,
        **kwargs
    )

    logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    return df


def load_json_nested(
    filepath: Union[str, Path],
    record_path: Optional[Union[str, List[str]]] = None,
    meta: Optional[List[str]] = None,
    max_level: Optional[int] = None,
    encoding: str = "utf-8",
) -> pd.DataFrame:
    """
    Load nested JSON data and normalize it to a flat DataFrame.

    Args:
        filepath: Path to the JSON file.
        record_path: Path to the records in the JSON structure.
        meta: Fields to include from higher level.
        max_level: Maximum normalization depth.
        encoding: File encoding.

    Returns:
        pd.DataFrame: Normalized data.

    Example:
        >>> # For JSON like: {"data": [{"id": 1, "info": {"name": "A"}}]}
        >>> df = load_json_nested("data.json", record_path="data")
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    logger.info(f"Loading nested JSON: {filepath}")

    with open(filepath, "r", encoding=encoding) as f:
        data = json.load(f)

    df = pd.json_normalize(
        data,
        record_path=record_path,
        meta=meta,
        max_level=max_level,
    )

    logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    return df

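Usage sketch (editor's illustration): line-delimited JSON and a nested payload flattened with load_json_nested. The paths, record_path, and meta field are assumptions about the payload shape.

from adamops.data.loaders import load_json, load_json_nested

events = load_json("events.jsonl", lines=True)
# Assumes a payload like {"request_id": "...", "data": [{...}, ...]}
flat = load_json_nested("response.json", record_path="data", meta=["request_id"])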
# =============================================================================
# SQL Loading
# =============================================================================

def load_sql(
    query: str,
    connection_string: str,
    params: Optional[Dict] = None,
    index_col: Optional[Union[str, List[str]]] = None,
    parse_dates: Optional[Union[List[str], Dict]] = None,
    chunksize: Optional[int] = None,
    **kwargs
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Load data from a SQL database.

    Supports SQLite, PostgreSQL, MySQL, and other SQLAlchemy-compatible databases.

    Args:
        query: SQL query to execute.
        connection_string: Database connection string.
            Examples:
            - SQLite: "sqlite:///database.db"
            - PostgreSQL: "postgresql://user:pass@host:port/db"
            - MySQL: "mysql+pymysql://user:pass@host:port/db"
        params: Query parameters.
        index_col: Column(s) to use as index.
        parse_dates: Columns to parse as dates.
        chunksize: Number of rows per chunk (for large datasets).
        **kwargs: Additional arguments passed to pd.read_sql.

    Returns:
        pd.DataFrame or Iterator: Loaded data.

    Example:
        >>> df = load_sql("SELECT * FROM users", "sqlite:///app.db")
        >>> df = load_sql(
        ...     "SELECT * FROM orders WHERE date > :date",
        ...     "postgresql://user:pass@localhost:5432/shop",
        ...     params={"date": "2023-01-01"}
        ... )
    """
    if not SQLALCHEMY_AVAILABLE:
        raise ImportError("SQLAlchemy is required for SQL loading. Install with: pip install sqlalchemy")

    logger.info("Loading from SQL database")

    engine = create_engine(connection_string)

    # Use text() for raw SQL queries with params
    if params:
        query = text(query)

    df = pd.read_sql(
        query,
        engine,
        params=params,
        index_col=index_col,
        parse_dates=parse_dates,
        chunksize=chunksize,
        **kwargs
    )

    if chunksize is None:
        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    else:
        logger.info(f"Created chunked reader with chunksize={chunksize}")

    return df


def load_sql_table(
    table_name: str,
    connection_string: str,
    schema: Optional[str] = None,
    columns: Optional[List[str]] = None,
    index_col: Optional[Union[str, List[str]]] = None,
    chunksize: Optional[int] = None,
    **kwargs
) -> pd.DataFrame:
    """
    Load an entire table from a SQL database.

    Args:
        table_name: Name of the table to load.
        connection_string: Database connection string.
        schema: Database schema.
        columns: Columns to load (None for all).
        index_col: Column(s) to use as index.
        chunksize: Number of rows per chunk.
        **kwargs: Additional arguments.

    Returns:
        pd.DataFrame: Loaded data.
    """
    if not SQLALCHEMY_AVAILABLE:
        raise ImportError("SQLAlchemy is required for SQL loading. Install with: pip install sqlalchemy")

    logger.info(f"Loading table: {table_name}")

    engine = create_engine(connection_string)

    df = pd.read_sql_table(
        table_name,
        engine,
        schema=schema,
        columns=columns,
        index_col=index_col,
        chunksize=chunksize,
        **kwargs
    )

    if chunksize is None:
        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")

    return df

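Usage sketch (editor's illustration): with chunksize set, load_sql returns an iterator of DataFrames rather than a single frame, so large tables can be streamed. The connection string and query are placeholders.

from adamops.data.loaders import load_sql

total = 0
for chunk in load_sql("SELECT * FROM events", "sqlite:///analytics.db", chunksize=50_000):
    total += len(chunk)  # process each chunk without holding the full result in memory
print(total)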
# =============================================================================
# API/URL Loading
# =============================================================================

def load_url(
    url: str,
    format: str = "csv",
    params: Optional[Dict] = None,
    headers: Optional[Dict] = None,
    auth: Optional[tuple] = None,
    timeout: int = 30,
    **kwargs
) -> pd.DataFrame:
    """
    Load data from a URL.

    Args:
        url: URL to load data from.
        format: Data format ('csv', 'json', 'excel').
        params: Query parameters.
        headers: HTTP headers.
        auth: Authentication tuple (username, password).
        timeout: Request timeout in seconds.
        **kwargs: Additional arguments for the format loader.

    Returns:
        pd.DataFrame: Loaded data.

    Example:
        >>> df = load_url("https://example.com/data.csv")
        >>> df = load_url(
        ...     "https://api.example.com/data",
        ...     format="json",
        ...     headers={"Authorization": "Bearer token"}
        ... )
    """
    if not REQUESTS_AVAILABLE:
        raise ImportError("requests is required for URL loading. Install with: pip install requests")

    logger.info(f"Loading from URL: {url}")

    response = requests.get(
        url,
        params=params,
        headers=headers,
        auth=auth,
        timeout=timeout,
    )
    response.raise_for_status()

    content = io.BytesIO(response.content)

    if format == "csv":
        df = pd.read_csv(content, **kwargs)
    elif format == "json":
        df = pd.read_json(content, **kwargs)
    elif format == "excel":
        df = pd.read_excel(content, **kwargs)
    else:
        raise ValueError(f"Unsupported format: {format}")

    logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    return df


def load_api(
    url: str,
    method: str = "GET",
    params: Optional[Dict] = None,
    data: Optional[Dict] = None,
    json_data: Optional[Dict] = None,
    headers: Optional[Dict] = None,
    auth: Optional[tuple] = None,
    timeout: int = 30,
    data_key: Optional[str] = None,
    paginate: bool = False,
    page_key: str = "page",
    limit_key: str = "limit",
    limit: int = 100,
    max_pages: int = 100,
) -> pd.DataFrame:
    """
    Load data from a REST API with pagination support.

    Args:
        url: API endpoint URL.
        method: HTTP method.
        params: Query parameters.
        data: Form data.
        json_data: JSON body data.
        headers: HTTP headers.
        auth: Authentication tuple.
        timeout: Request timeout.
        data_key: Key in response containing the data array.
        paginate: Whether to paginate through results.
        page_key: Parameter name for page number.
        limit_key: Parameter name for page size.
        limit: Number of items per page.
        max_pages: Maximum number of pages to fetch.

    Returns:
        pd.DataFrame: Loaded data.

    Example:
        >>> df = load_api(
        ...     "https://api.example.com/users",
        ...     headers={"Authorization": "Bearer token"},
        ...     data_key="users",
        ...     paginate=True
        ... )
    """
    if not REQUESTS_AVAILABLE:
        raise ImportError("requests is required for API loading. Install with: pip install requests")

    logger.info(f"Loading from API: {url}")

    all_data = []
    page = 1

    while True:
        # Build params for this request
        request_params = dict(params or {})
        if paginate:
            request_params[page_key] = page
            request_params[limit_key] = limit

        response = requests.request(
            method=method,
            url=url,
            params=request_params,
            data=data,
            json=json_data,
            headers=headers,
            auth=auth,
            timeout=timeout,
        )
        response.raise_for_status()

        result = response.json()

        # Extract data
        if data_key:
            page_data = result.get(data_key, [])
        else:
            page_data = result if isinstance(result, list) else [result]

        all_data.extend(page_data)

        # Check if we should continue paginating
        if not paginate or len(page_data) < limit or page >= max_pages:
            break

        page += 1
        logger.debug(f"Fetching page {page}...")

    df = pd.DataFrame(all_data)
    logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    return df

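Usage sketch (editor's illustration), following the docstring example above: with paginate=True, load_api keeps adding page/limit query parameters until a short page or max_pages stops the loop. The endpoint and token are placeholders.

from adamops.data.loaders import load_api

users = load_api(
    "https://api.example.com/users",
    headers={"Authorization": "Bearer <token>"},  # placeholder credentials
    data_key="users",
    paginate=True,
    limit=200,
    max_pages=10,
)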
# =============================================================================
# Compressed Files
# =============================================================================

def load_compressed(
    filepath: Union[str, Path],
    format: str = "csv",
    compression: Optional[str] = None,
    **kwargs
) -> pd.DataFrame:
    """
    Load data from a compressed file (.zip, .gz, .bz2, .xz).

    Args:
        filepath: Path to the compressed file.
        format: Data format inside the archive ('csv', 'json', 'excel').
        compression: Compression type. Auto-detected if None.
        **kwargs: Additional arguments for the format loader.

    Returns:
        pd.DataFrame: Loaded data.

    Example:
        >>> df = load_compressed("data.csv.gz")
        >>> df = load_compressed("archive.zip", format="csv")
    """
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    # Auto-detect compression type
    if compression is None:
        suffix = filepath.suffix.lower()
        if suffix == ".gz":
            compression = "gzip"
        elif suffix == ".bz2":
            compression = "bz2"
        elif suffix == ".xz":
            compression = "xz"
        elif suffix == ".zip":
            compression = "zip"
        else:
            compression = "infer"

    logger.info(f"Loading compressed file: {filepath} ({compression})")

    if compression == "zip":
        return _load_from_zip(filepath, format, **kwargs)
    else:
        if format == "csv":
            df = pd.read_csv(filepath, compression=compression, **kwargs)
        elif format == "json":
            df = pd.read_json(filepath, compression=compression, **kwargs)
        else:
            raise ValueError(f"Unsupported format for compression: {format}")

        logger.info(f"Loaded {len(df)} rows, {len(df.columns)} columns")
        return df


def _load_from_zip(
    filepath: Union[str, Path],
    format: str = "csv",
    file_pattern: Optional[str] = None,
    **kwargs
) -> pd.DataFrame:
    """Load data from within a ZIP archive."""
    with zipfile.ZipFile(filepath, "r") as z:
        file_list = z.namelist()

        # Filter files by pattern or extension
        if file_pattern:
            import fnmatch
            matching_files = [f for f in file_list if fnmatch.fnmatch(f, file_pattern)]
        else:
            ext = f".{format}"
            matching_files = [f for f in file_list if f.endswith(ext)]

        if not matching_files:
            raise ValueError(f"No {format} files found in archive")

        # Load the first matching file (or concatenate all)
        if len(matching_files) == 1:
            with z.open(matching_files[0]) as f:
                content = io.BytesIO(f.read())
                if format == "csv":
                    return pd.read_csv(content, **kwargs)
                elif format == "json":
                    return pd.read_json(content, **kwargs)
                elif format == "excel":
                    return pd.read_excel(content, **kwargs)
        else:
            # Concatenate all matching files
            dfs = []
            for filename in matching_files:
                with z.open(filename) as f:
                    content = io.BytesIO(f.read())
                    if format == "csv":
                        df = pd.read_csv(content, **kwargs)
                    elif format == "json":
                        df = pd.read_json(content, **kwargs)
                    dfs.append(df)
            return pd.concat(dfs, ignore_index=True)

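Usage sketch (editor's illustration): a gzip-compressed CSV read directly, and a ZIP archive filtered by file_pattern, which load_compressed forwards to _load_from_zip via **kwargs. Paths and pattern are made up.

from adamops.data.loaders import load_compressed

df_gz = load_compressed("snapshots/data.csv.gz")
df_zip = load_compressed("archive.zip", format="csv", file_pattern="2023-*.csv")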
# =============================================================================
# Auto Loader
# =============================================================================

def load_auto(
    source: Union[str, Path],
    **kwargs
) -> pd.DataFrame:
    """
    Automatically detect and load data from various sources.

    Supports CSV, Excel, JSON, SQL, and compressed files.
    Automatically detects the format based on file extension or URL.

    Args:
        source: Path to file, URL, or SQL connection string.
        **kwargs: Additional arguments passed to the appropriate loader.

    Returns:
        pd.DataFrame: Loaded data.

    Example:
        >>> df = load_auto("data.csv")
        >>> df = load_auto("https://example.com/data.json")
        >>> df = load_auto("data.xlsx")
    """
    source_str = str(source)

    # Check if it's a URL
    if source_str.startswith(("http://", "https://")):
        parsed = urlparse(source_str)
        path = parsed.path.lower()

        if path.endswith(".csv"):
            return load_url(source_str, format="csv", **kwargs)
        elif path.endswith(".json") or path.endswith(".jsonl"):
            return load_url(source_str, format="json", **kwargs)
        elif path.endswith((".xlsx", ".xls")):
            return load_url(source_str, format="excel", **kwargs)
        else:
            # Try JSON by default for API endpoints
            return load_url(source_str, format="json", **kwargs)

    # It's a file path
    filepath = Path(source)
    suffix = filepath.suffix.lower()

    # Remove compression suffix to get actual format
    if suffix in [".gz", ".bz2", ".xz", ".zip"]:
        if suffix == ".zip":
            return load_compressed(filepath, **kwargs)

        # Get the format from the second-to-last suffix
        stem = filepath.stem
        inner_suffix = Path(stem).suffix.lower()

        if inner_suffix == ".csv":
            return load_compressed(filepath, format="csv", **kwargs)
        elif inner_suffix == ".json":
            return load_compressed(filepath, format="json", **kwargs)
        else:
            return load_compressed(filepath, format="csv", **kwargs)

    # Standard file types
    if suffix == ".csv":
        return load_csv(filepath, **kwargs)
    elif suffix in [".xlsx", ".xls"]:
        return load_excel(filepath, **kwargs)
    elif suffix in [".json", ".jsonl"]:
        lines = suffix == ".jsonl"
        return load_json(filepath, lines=lines, **kwargs)
    elif suffix == ".parquet":
        return pd.read_parquet(filepath, **kwargs)
    elif suffix == ".feather":
        return pd.read_feather(filepath, **kwargs)
    elif suffix == ".pickle" or suffix == ".pkl":
        return pd.read_pickle(filepath, **kwargs)
    else:
        # Try CSV as default
        logger.warning(f"Unknown file format: {suffix}, trying CSV")
        return load_csv(filepath, **kwargs)

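Usage sketch (editor's illustration): load_auto dispatches on the extension or URL, including the Parquet/Feather/pickle passthroughs at the end of the function. The source paths are placeholders.

from adamops.data.loaders import load_auto

df_csv = load_auto("data/train.csv")
df_parquet = load_auto("data/features.parquet")
df_remote = load_auto("https://example.com/metrics.json")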
# =============================================================================
# Data Saving
# =============================================================================

def save_csv(
    df: pd.DataFrame,
    filepath: Union[str, Path],
    index: bool = False,
    encoding: str = "utf-8",
    **kwargs
) -> None:
    """
    Save DataFrame to CSV file.

    Args:
        df: DataFrame to save.
        filepath: Output file path.
        index: Whether to include index.
        encoding: File encoding.
        **kwargs: Additional arguments passed to df.to_csv.
    """
    filepath = Path(filepath)
    ensure_dir(filepath.parent)

    df.to_csv(filepath, index=index, encoding=encoding, **kwargs)
    logger.info(f"Saved {len(df)} rows to {filepath}")


def save_excel(
    df: pd.DataFrame,
    filepath: Union[str, Path],
    sheet_name: str = "Sheet1",
    index: bool = False,
    **kwargs
) -> None:
    """
    Save DataFrame to Excel file.

    Args:
        df: DataFrame to save.
        filepath: Output file path.
        sheet_name: Name of the sheet.
        index: Whether to include index.
        **kwargs: Additional arguments.
    """
    filepath = Path(filepath)
    ensure_dir(filepath.parent)

    df.to_excel(filepath, sheet_name=sheet_name, index=index, **kwargs)
    logger.info(f"Saved {len(df)} rows to {filepath}")


def save_json(
    df: pd.DataFrame,
    filepath: Union[str, Path],
    orient: str = "records",
    indent: int = 2,
    **kwargs
) -> None:
    """
    Save DataFrame to JSON file.

    Args:
        df: DataFrame to save.
        filepath: Output file path.
        orient: JSON structure orientation.
        indent: Indentation level.
        **kwargs: Additional arguments.
    """
    filepath = Path(filepath)
    ensure_dir(filepath.parent)

    df.to_json(filepath, orient=orient, indent=indent, **kwargs)
    logger.info(f"Saved {len(df)} rows to {filepath}")
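Usage sketch (editor's illustration): a small load/save round trip; the save_* helpers create missing parent directories via ensure_dir before writing. Paths are placeholders.

from adamops.data.loaders import load_csv, save_csv, save_json

df = load_csv("raw/input.csv")
save_csv(df, "clean/output.csv")
save_json(df, "clean/output.json", orient="records", indent=2)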