py2ls 0.1.10.26__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py
CHANGED
@@ -46,7 +46,7 @@ from collections import Counter
|
|
46
46
|
from fuzzywuzzy import fuzz, process
|
47
47
|
from langdetect import detect
|
48
48
|
from duckduckgo_search import DDGS
|
49
|
-
|
49
|
+
from typing import List, Optional, Union
|
50
50
|
from bs4 import BeautifulSoup
|
51
51
|
|
52
52
|
from . import netfinder
|
@@ -1250,6 +1250,119 @@ def get_encoding(fpath, alternative_encodings=None, verbose=False):
|
|
1250
1250
|
return None
|
1251
1251
|
|
1252
1252
|
|
1253
|
+
def _remove_existing(path):
    """Delete *path* (directory tree or single file); do nothing if it is absent."""
    if not os.path.exists(path):
        return
    if os.path.isdir(path):
        shutil.rmtree(path)
    else:
        os.remove(path)


def unzip(dir_path, output_dir=None):
    """
    Unzips or extracts various compressed file formats
    (.tar.gz, .tgz, .tar.bz2, .tar, .gz, .bz2, .xz, .zip, .7z, .rar).

    Multi-member archives (tar*, zip, 7z, rar) are extracted into ``output_dir``;
    if that directory already exists it is replaced. Single-stream formats
    (.gz, .bz2, .xz) are decompressed to one file: next to the input when
    ``output_dir`` is None, otherwise inside ``output_dir`` (which is created if
    needed and whose other contents are left untouched).

    Parameters:
        dir_path (str): Path to the compressed file.
        output_dir (str): Directory where the extracted files will be saved.
            If None, it extracts next to the input file, using the input path
            with its last extension removed.

    Returns:
        str: The output directory (archives) or the decompressed file path
        (single-stream formats).

    Raises:
        ValueError: If the file extension is not a supported format. Nothing on
            disk is touched in that case.
        ImportError: If a .7z/.rar file is given and py7zr/rarfile is missing.
            Raised before any existing output is removed.
    """
    import bz2
    import gzip
    import lzma
    import tarfile
    import zipfile

    # Suffix matching is case-insensitive so "DATA.ZIP" works like "data.zip".
    lowered = dir_path.lower()

    # Order matters: compound tar suffixes must be tested before ".gz"/".bz2".
    tar_modes = {".tar.gz": "r:gz", ".tgz": "r:gz", ".tar.bz2": "r:bz2", ".tar": "r"}
    stream_openers = {".gz": gzip.open, ".bz2": bz2.open, ".xz": lzma.open}
    other_archives = (".zip", ".7z", ".rar")

    def _match(suffixes):
        return next((s for s in suffixes if lowered.endswith(s)), None)

    tar_suffix = _match(tar_modes)
    stream_suffix = None if tar_suffix else _match(stream_openers)
    archive_suffix = None if (tar_suffix or stream_suffix) else _match(other_archives)

    # Validate the format BEFORE deleting anything on disk.
    if not (tar_suffix or stream_suffix or archive_suffix):
        raise ValueError(f"Unsupported file format: {os.path.splitext(dir_path)[1]}")

    # --- single compressed stream -> single output file ---
    if stream_suffix:
        default_target = os.path.splitext(dir_path)[0]  # drop .gz/.bz2/.xz
        if output_dir is None:
            target = default_target
        else:
            os.makedirs(output_dir, exist_ok=True)
            target = os.path.join(output_dir, os.path.basename(default_target))
        # Only the exact output path is replaced, never the containing folder.
        _remove_existing(target)
        with stream_openers[stream_suffix](dir_path, "rb") as src:
            with open(target, "wb") as dst:
                shutil.copyfileobj(src, dst)
        return target

    # --- multi-member archives -> output directory ---
    # Resolve optional third-party readers first so a missing dependency does
    # not cost the caller an already-existing output directory.
    if archive_suffix == ".7z":
        import py7zr
    elif archive_suffix == ".rar":
        import rarfile

    if output_dir is None:
        output_dir = os.path.splitext(dir_path)[0]
    _remove_existing(output_dir)

    # NOTE(review): extractall() trusts member paths; do not use on archives
    # from untrusted sources without an extraction filter.
    if tar_suffix:
        with tarfile.open(dir_path, tar_modes[tar_suffix]) as tar_ref:
            tar_ref.extractall(output_dir)
    elif archive_suffix == ".zip":
        with zipfile.ZipFile(dir_path, "r") as zip_ref:
            zip_ref.extractall(output_dir)
    elif archive_suffix == ".7z":
        with py7zr.SevenZipFile(dir_path, mode="r") as z:
            z.extractall(path=output_dir)
    else:  # ".rar"
        with rarfile.RarFile(dir_path) as rar_ref:
            rar_ref.extractall(output_dir)
    return output_dir
|
1357
|
+
|
1358
|
+
|
1359
|
+
# Example usage:
|
1360
|
+
# output_dir = unzip('data.tar.gz')
|
1361
|
+
# output_file = unzip('file.csv.gz')
|
1362
|
+
# output_dir_zip = unzip('archive.zip')
|
1363
|
+
# output_dir_7z = unzip('archive.7z')
|
1364
|
+
|
1365
|
+
|
1253
1366
|
def fload(fpath, kind=None, **kwargs):
|
1254
1367
|
"""
|
1255
1368
|
Load content from a file with specified file type.
|
@@ -1286,13 +1399,65 @@ def fload(fpath, kind=None, **kwargs):
|
|
1286
1399
|
root = tree.getroot()
|
1287
1400
|
return etree.tostring(root, pretty_print=True).decode()
|
1288
1401
|
|
1289
|
-
def load_csv(fpath,
|
1290
|
-
|
1291
|
-
|
1402
|
+
def load_csv(fpath, **kwargs):
    """
    Read a CSV file into a DataFrame with ``pd.read_csv``.

    The file is first read with the requested encoding (default "utf-8"). If
    that raises ``UnicodeDecodeError`` the encoding is detected with
    ``get_encoding(fpath)`` and the read is retried once.

    Recognised keyword arguments (all others are forwarded to ``pd.read_csv``):
        engine (str): parser engine, default "pyarrow".
        index_col: default None.
        memory_map (bool): default True; only passed when engine != "pyarrow".
        skipinitialspace (bool): default True; only passed when engine != "pyarrow".
        encoding (str): default "utf-8".

    Returns:
        pd.DataFrame: the parsed file.
    """
    engine = kwargs.pop("engine", "pyarrow")
    index_col = kwargs.pop("index_col", None)
    memory_map = kwargs.pop("memory_map", True)
    skipinitialspace = kwargs.pop("skipinitialspace", True)
    encoding = kwargs.pop("encoding", "utf-8")

    def _read(enc):
        # Single place that builds the read_csv call for both attempts.
        options = dict(engine=engine, index_col=index_col, encoding=enc, **kwargs)
        if engine != "pyarrow":
            # These two options are only passed to the non-pyarrow engines.
            options.update(memory_map=memory_map, skipinitialspace=skipinitialspace)
        return pd.read_csv(fpath, **options)

    try:
        df = _read(encoding)
    except UnicodeDecodeError:
        failed_encoding = encoding
        encoding = get_encoding(fpath)
        print(f"{failed_encoding} failed. Retrying with detected encoding: {encoding}")
        df = _read(encoding)
    # Report the encoding that actually succeeded, not a hard-coded "utf-8".
    print(f"File loaded successfully with {encoding} encoding.")
    return df
|
1293
1456
|
|
1294
1457
|
def load_xlsx(fpath, **kwargs):
    """
    Read a spreadsheet into a DataFrame with ``pd.read_excel``.

    ``engine`` defaults to "openpyxl" unless the caller supplies one; every
    other keyword argument is forwarded to ``pd.read_excel`` unchanged.
    """
    # A caller-supplied "engine" overrides the default entry.
    read_options = {"engine": "openpyxl", **kwargs}
    return pd.read_excel(fpath, **read_options)
|
1297
1462
|
|
1298
1463
|
def load_ipynb(fpath, **kwargs):
|
@@ -1398,11 +1563,37 @@ def fload(fpath, kind=None, **kwargs):
|
|
1398
1563
|
"pdf",
|
1399
1564
|
"ipynb",
|
1400
1565
|
]
|
1401
|
-
|
1566
|
+
zip_types = [
|
1567
|
+
"gz",
|
1568
|
+
"zip",
|
1569
|
+
"7z",
|
1570
|
+
"tar",
|
1571
|
+
"tar.gz",
|
1572
|
+
"tar.bz2",
|
1573
|
+
"bz2",
|
1574
|
+
"xz",
|
1575
|
+
"rar",
|
1576
|
+
"tgz",
|
1577
|
+
]
|
1578
|
+
supported_types = [*doc_types, *img_types, *zip_types]
|
1402
1579
|
if kind not in supported_types:
|
1403
|
-
|
1404
|
-
|
1405
|
-
|
1580
|
+
print(f'Error:\n"{kind}" is not in the supported list {supported_types}')
|
1581
|
+
# if os.path.splitext(fpath)[1][1:].lower() in zip_types:
|
1582
|
+
# keep=kwargs.get("keep", False)
|
1583
|
+
# ifile=kwargs.get("ifile",(0,0))
|
1584
|
+
# kwargs.pop("keep",None)
|
1585
|
+
# kwargs.pop("ifile",None)
|
1586
|
+
# fpath_unzip=unzip(fpath)
|
1587
|
+
# if isinstance(fpath_unzip,list):
|
1588
|
+
# fpath_unzip=fpath_unzip[ifile[0]]
|
1589
|
+
# if os.path.isdir(fpath_unzip):
|
1590
|
+
# fpath_selected=listdir(fpath_unzip,kind=kind).fpath[ifile[1]]
|
1591
|
+
# fpath_unzip=fpath_selected
|
1592
|
+
# content_unzip=fload(fpath_unzip, **kwargs)
|
1593
|
+
# if not keep:
|
1594
|
+
# os.remove(fpath_unzip)
|
1595
|
+
# return content_unzip
|
1596
|
+
|
1406
1597
|
if kind == "docx":
|
1407
1598
|
return load_docx(fpath)
|
1408
1599
|
elif kind == "txt" or kind == "md":
|
@@ -1417,6 +1608,14 @@ def fload(fpath, kind=None, **kwargs):
|
|
1417
1608
|
return load_xml(fpath)
|
1418
1609
|
elif kind == "csv":
|
1419
1610
|
return load_csv(fpath, **kwargs)
|
1611
|
+
elif kind in ["ods", "ods", "odt"]:
|
1612
|
+
engine = kwargs.get("engine", "odf")
|
1613
|
+
kwargs.pop("engine", None)
|
1614
|
+
return load_xlsx(fpath, engine=engine, **kwargs)
|
1615
|
+
elif kind == "xls":
|
1616
|
+
engine = kwargs.get("engine", "xlrd")
|
1617
|
+
kwargs.pop("engine", None)
|
1618
|
+
return load_xlsx(fpath, engine=engine, **kwargs)
|
1420
1619
|
elif kind == "xlsx":
|
1421
1620
|
return load_xlsx(fpath, **kwargs)
|
1422
1621
|
elif kind == "ipynb":
|
@@ -1427,10 +1626,55 @@ def fload(fpath, kind=None, **kwargs):
|
|
1427
1626
|
elif kind.lower() in img_types:
|
1428
1627
|
print(f'Image ".{kind}" is loaded.')
|
1429
1628
|
return load_img(fpath)
|
1629
|
+
elif kind.lower() in zip_types:
|
1630
|
+
keep = kwargs.get("keep", False)
|
1631
|
+
fpath_unzip = unzip(fpath)
|
1632
|
+
if os.path.isdir(fpath_unzip):
|
1633
|
+
print(f"{fpath_unzip} is a folder. fload stoped.")
|
1634
|
+
fpath_list = os.listdir("./datasets/GSE10927_family.xml")
|
1635
|
+
print(f"{len(fpath_list)} files within the folder")
|
1636
|
+
if len(fpath_list) > 5:
|
1637
|
+
pp(fpath_list[:5])
|
1638
|
+
print("there are more ...")
|
1639
|
+
else:
|
1640
|
+
pp(fpath_list)
|
1641
|
+
return fpath_list
|
1642
|
+
elif os.path.isfile(fpath_unzip):
|
1643
|
+
print(f"{fpath_unzip} is a file.")
|
1644
|
+
content_unzip = fload(fpath_unzip, **kwargs)
|
1645
|
+
if not keep:
|
1646
|
+
os.remove(fpath_unzip)
|
1647
|
+
return content_unzip
|
1648
|
+
else:
|
1649
|
+
print(f"{fpath_unzip} does not exist or is a different type.")
|
1650
|
+
|
1651
|
+
elif kind.lower() == "gmt":
|
1652
|
+
import gseapy as gp
|
1653
|
+
|
1654
|
+
gene_sets = gp.read_gmt(fpath)
|
1655
|
+
return gene_sets
|
1430
1656
|
else:
|
1431
|
-
|
1432
|
-
|
1433
|
-
|
1657
|
+
try:
|
1658
|
+
try:
|
1659
|
+
with open(fpath, "r", encoding="utf-8") as f:
|
1660
|
+
content = f.readlines()
|
1661
|
+
except UnicodeDecodeError:
|
1662
|
+
print("Failed to read as utf-8, trying different encoding...")
|
1663
|
+
with open(
|
1664
|
+
fpath, "r", encoding=get_encoding(fpath)
|
1665
|
+
) as f: # Trying with a different encoding
|
1666
|
+
content = f.readlines()
|
1667
|
+
except:
|
1668
|
+
try:
|
1669
|
+
with open(fpath, "r", encoding="utf-8") as f:
|
1670
|
+
content = f.read()
|
1671
|
+
except UnicodeDecodeError:
|
1672
|
+
print("Failed to read as utf-8, trying different encoding...")
|
1673
|
+
with open(
|
1674
|
+
fpath, "r", encoding=get_encoding(fpath)
|
1675
|
+
) as f: # Trying with a different encoding
|
1676
|
+
content = f.read()
|
1677
|
+
return content
|
1434
1678
|
|
1435
1679
|
|
1436
1680
|
# Example usage
|
@@ -2030,10 +2274,10 @@ def mkdir(*args, **kwargs):
|
|
2030
2274
|
if isinstance(arg, (str, list)):
|
2031
2275
|
if "/" in arg or "\\" in arg:
|
2032
2276
|
pardir = arg
|
2033
|
-
print(f"
|
2277
|
+
print(f'pardir: "{pardir}"')
|
2034
2278
|
else:
|
2035
2279
|
chdir = arg
|
2036
|
-
print(f"
|
2280
|
+
print(f'chdir:"{chdir}"')
|
2037
2281
|
elif isinstance(arg, bool):
|
2038
2282
|
overwrite = arg
|
2039
2283
|
print(overwrite)
|
@@ -2049,6 +2293,13 @@ def mkdir(*args, **kwargs):
|
|
2049
2293
|
pardir = os.path.normpath(pardir)
|
2050
2294
|
# Get the slash type: "/" or "\"
|
2051
2295
|
stype = "/" if "/" in pardir else "\\"
|
2296
|
+
if "mac" in get_os().lower() or "lin" in get_os().lower():
|
2297
|
+
stype = "/"
|
2298
|
+
elif "win" in get_os().lower():
|
2299
|
+
stype = "\\"
|
2300
|
+
else:
|
2301
|
+
stype = "/"
|
2302
|
+
|
2052
2303
|
# Check if the parent directory exists and is a directory path
|
2053
2304
|
if os.path.isdir(pardir):
|
2054
2305
|
os.chdir(pardir) # Set current path
|
@@ -4046,3 +4297,234 @@ def preview(var):
|
|
4046
4297
|
# preview("# This is a Markdown header")
|
4047
4298
|
# preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
|
4048
4299
|
# preview({"key": "value", "numbers": [1, 2, 3]})
|
4300
|
+
|
4301
|
+
|
4302
|
+
# ! DataFrame
|
4303
|
+
def df_as_type(
    df: pd.DataFrame,
    columns: Optional[Union[str, List[str]]] = None,
    astype: str = "datetime",
    format: Optional[str] = None,
    inplace: bool = True,
    errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
    **kwargs,
) -> Optional[pd.DataFrame]:
    """
    Convert specified columns of a DataFrame to a specified type (e.g., datetime, float, int, numeric, timedelta).
    If columns is None, all columns in the DataFrame will be converted.

    Parameters:
    - df: DataFrame containing the columns to convert.
    - columns: A single column, a list of columns, or None to convert all columns.
      An integer that is not an existing column label is treated as a column position.
    - astype: The target type ('datetime', 'float', 'int', 'numeric', 'timedelta', ...).
      The value is matched against the supported names with ``strcmp``.
      The date parts 'time', 'date', 'year', 'month', 'hour', 'minute', 'second'
      and 'week' (weekday name) first convert to datetime, then extract that part.
    - format: Optional datetime format (only used for datetime-based conversions).
    - inplace: If True (default) the passed DataFrame is modified; if False a copy is converted.
    - errors: "ignore", "raise", or "coerce"; passed to pd.to_datetime / pd.to_numeric / pd.to_timedelta.
    - **kwargs: Additional keyword arguments for the pandas conversion function.

    Returns:
    - The converted DataFrame. With inplace=True this is the same object that was passed in;
      with inplace=False it is a converted copy.

    Notes:
    - A failure on one column is printed and the remaining columns are still processed.
    """
    astypes = [
        "datetime",
        "timedelta",
        "numeric",
        "int",
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float",
        "float16",
        "float32",
        "float64",
        "complex",
        "complex64",
        "complex128",
        "str",
        "string",
        "bool",
        "datetime64",
        "datetime64[ns]",
        "timedelta64",
        "timedelta64[ns]",
        "category",
        "object",
        "Sparse",
        "hour",
        "minute",
        "second",
        "time",
        "week",
        "date",
        "month",
        "year",
    ]
    # Date parts extracted from a datetime Series via the .dt accessor.
    datetime_parts = {
        "time": lambda s: s.dt.time,
        "month": lambda s: s.dt.month,
        "year": lambda s: s.dt.year,
        "date": lambda s: s.dt.date,
        "hour": lambda s: s.dt.hour,
        "minute": lambda s: s.dt.minute,
        "second": lambda s: s.dt.second,
        "week": lambda s: s.dt.day_name(),
    }

    # correct the astype input
    astype = strcmp(astype, astypes)[0]
    print(f"converting {columns} as type: {astype}")

    # If inplace is False, work on a copy of the DataFrame
    if not inplace:
        df = df.copy()
    # If columns is None, apply to all columns
    if columns is None:
        columns = df.columns
    # Ensure columns is iterable
    if isinstance(columns, (str, int)):
        columns = [columns]

    for column in columns:
        try:
            # Resolve an integer to a label so every branch assigns with
            # df[label]. Assigning through df.iloc can keep the old dtype,
            # which broke the subsequent .dt access.
            if isinstance(column, int) and column not in df.columns:
                label = df.columns[column]
            else:
                label = column

            if astype == "datetime" or astype in datetime_parts:
                converted = pd.to_datetime(
                    df[label], format=format, errors=errors, **kwargs
                )
                if astype in datetime_parts:
                    converted = datetime_parts[astype](converted)
                df[label] = converted
            elif astype == "numeric":
                df[label] = pd.to_numeric(df[label], errors=errors, **kwargs)
            elif astype == "timedelta":
                df[label] = pd.to_timedelta(df[label], errors=errors, **kwargs)
            else:
                # Convert to other types (e.g., float, int)
                df[label] = df[label].astype(astype)
        except Exception as e:
            # Best effort: report and continue with the next column.
            print(f"Error converting '{column}' to {astype}: {e}")

    return df
|
4462
|
+
|
4463
|
+
|
4464
|
+
# ! DataFrame
|
4465
|
+
def df_sort_values(df, column, by=None, ascending=True, inplace=False, **kwargs):
    """
    Sort a DataFrame by a specified column based on a custom order.

    The column is converted to an ordered Categorical whose categories are
    ``by``. Values that do not appear in ``by`` become NaN in the result.

    Parameters:
    - df: DataFrame to be sorted.
    - column: The name of the column to sort by.
    - by: List (or tuple) specifying the custom order for sorting. Required.
    - ascending: Boolean or list of booleans, default True.
                 Sort ascending vs. descending.
    - inplace: If True, modify ``df`` itself and return None.
               If False, ``df`` is left untouched and a sorted copy is returned.
    - **kwargs: Additional arguments to pass to sort_values.

    Returns:
    - Sorted DataFrame if inplace is False, otherwise None.
      If sorting fails, the error is printed and the original ``df`` is returned.

    Raises:
    - ValueError: If ``column`` is not in ``df`` or ``by`` is not a list/tuple.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    if not isinstance(by, (list, tuple)):
        raise ValueError("'by' must be a list specifying the custom order.")

    try:
        # Work on a copy unless asked to modify in place, so that a
        # non-inplace call does not turn the caller's column into a Categorical.
        target = df if inplace else df.copy()
        target[column] = pd.Categorical(
            target[column], categories=list(by), ordered=True
        )
        target.sort_values(column, ascending=ascending, inplace=True, **kwargs)
        if inplace:
            print(f"Successfully sorted DataFrame by '{column}'")
            return None
        print(f"Successfully sorted DataFrame by '{column}' using custom order.")
        return target
    except Exception as e:
        print(f"Error sorting DataFrame by '{column}': {e}")
        return df
|
4501
|
+
|
4502
|
+
|
4503
|
+
# # Example usage:
|
4504
|
+
# # Sample DataFrame
|
4505
|
+
# data = {
|
4506
|
+
# "month": ["March", "January", "February", "April", "December"],
|
4507
|
+
# "Amount": [200, 100, 150, 300, 250],
|
4508
|
+
# }
|
4509
|
+
# df_month = pd.DataFrame(data)
|
4510
|
+
|
4511
|
+
# # Define the month order
|
4512
|
+
# month_order = [
|
4513
|
+
# "January",
|
4514
|
+
# "February",
|
4515
|
+
# "March",
|
4516
|
+
# "April",
|
4517
|
+
# "May",
|
4518
|
+
# "June",
|
4519
|
+
# "July",
|
4520
|
+
# "August",
|
4521
|
+
# "September",
|
4522
|
+
# "October",
|
4523
|
+
# "November",
|
4524
|
+
# "December",
|
4525
|
+
# ]
|
4526
|
+
# display(df_month)
|
4527
|
+
# sorted_df_month = df_sort_values(df_month, "month", month_order, ascending=True)
|
4528
|
+
# display(sorted_df_month)
|
4529
|
+
# df_sort_values(df_month, "month", month_order, ascending=True, inplace=True)
|
4530
|
+
# display(df_month)
|
py2ls/plot.py
CHANGED
@@ -17,6 +17,96 @@ from .stats import *
|
|
17
17
|
logging.getLogger("fontTools").setLevel(logging.WARNING)
|
18
18
|
|
19
19
|
|
20
|
+
def df_corr(
    df,
    columns="all",
    tri="u",
    mask=True,
    k=1,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    cluster=False,  # New parameter for clustermap option
    figsize=(10, 8),
    row_cluster=True,  # Perform clustering on rows
    col_cluster=True,  # Perform clustering on columns
    dendrogram_ratio=(0.2, 0.1),  # Adjust size of dendrograms
    cbar_pos=(0.02, 1, 0.02, 0.1),  # Adjust colorbar position
    xticklabels=True,  # Show column labels
    yticklabels=True,  # Show row labels
    **kwargs,
):
    """
    Plot the correlation matrix of a DataFrame as a heatmap or a clustermap.

    Parameters:
    - df: source DataFrame.
    - columns: "all" to use the float/int columns of ``df``, otherwise a column
      selection passed to ``df[...]``.
    - tri: which triangle to keep visible when ``mask`` is True; any string
      containing "u" keeps the upper triangle, anything else keeps the lower.
    - mask: if True, hide the opposite triangle.
    - k: diagonal offset of the mask. With k=1 the diagonal stays visible,
      with k=0 the diagonal is hidden too (same meaning for both triangles).
    - annot, cmap, fmt: forwarded to seaborn.
    - cluster: if True draw ``sns.clustermap``, otherwise ``sns.heatmap``.
    - figsize: figure size.
    - row_cluster, col_cluster, dendrogram_ratio, cbar_pos: clustermap options.
    - xticklabels, yticklabels: tick-label options for either plot type.
    - **kwargs: extra arguments for the seaborn plotting function.

    Returns:
    - cluster=True: tuple (row dendrogram Axes, column dendrogram Axes, heatmap Axes).
    - cluster=False: the heatmap Axes.
    """
    # Guard with isinstance: comparing a list/array selection to "all" with ==
    # could be element-wise and have no single truth value.
    if isinstance(columns, str) and columns == "all":
        df_numeric = df.select_dtypes(include=[float, int])
    else:
        df_numeric = df[columns]

    # Compute the correlation matrix
    correlation_matrix = df_numeric.corr()

    # Cells that are True in the mask are hidden by seaborn.
    if mask:
        ones = np.ones_like(correlation_matrix, dtype=bool)
        if "u" in tri.lower():
            # Keep the upper triangle: hide the part below it. The offset is
            # negated so that k means the same thing as in the lower case.
            # tril(k=+1) would also hide the diagonal and the first
            # super-diagonal (a 2x2 matrix would be completely blank).
            mask_array = np.tril(ones, k=-k)
        else:
            # Keep the lower triangle: hide the part above it.
            mask_array = np.triu(ones, k=k)
    else:
        mask_array = None

    # Legacy keyword that seaborn does not understand; the named parameters
    # above can never appear in kwargs, so nothing else needs removing.
    kwargs.pop("clustermap", None)

    # Plot the heatmap or clustermap
    if cluster:
        cluster_obj = sns.clustermap(
            correlation_matrix,
            mask=mask_array,
            annot=annot,
            cmap=cmap,
            fmt=fmt,
            figsize=figsize,
            row_cluster=row_cluster,
            col_cluster=col_cluster,
            dendrogram_ratio=dendrogram_ratio,
            cbar_pos=cbar_pos,
            xticklabels=xticklabels,
            yticklabels=yticklabels,
            **kwargs,  # Pass any additional arguments to sns.clustermap
        )

        return (
            cluster_obj.ax_row_dendrogram,
            cluster_obj.ax_col_dendrogram,
            cluster_obj.ax_heatmap,
        )

    # Create a standard heatmap
    plt.figure(figsize=figsize)
    ax = sns.heatmap(
        correlation_matrix,
        mask=mask_array,
        annot=annot,
        cmap=cmap,
        fmt=fmt,
        # Previously accepted but silently ignored in heatmap mode.
        xticklabels=xticklabels,
        yticklabels=yticklabels,
        **kwargs,  # Pass any additional arguments to sns.heatmap
    )
    # Return the Axes object for further customization if needed
    return ax
|
108
|
+
|
109
|
+
|
20
110
|
def catplot(data, *args, **kwargs):
|
21
111
|
"""
|
22
112
|
catplot(data, opt=None, ax=None)
|
@@ -207,15 +207,15 @@ py2ls/doc.py,sha256=xN3g1OWfoaGUhikbJ0NqbN5eKy1VZVvWwRlhHMgyVEc,4243
|
|
207
207
|
py2ls/export_requirements.py,sha256=x2WgUF0jYKz9GfA1MVKN-MdsM-oQ8yUeC6Ua8oCymio,2325
|
208
208
|
py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
|
209
209
|
py2ls/ich2ls.py,sha256=3E9R8oVpyYZXH5PiIQgT3CN5NxLe4Dwtm2LwaeacE6I,21381
|
210
|
-
py2ls/ips.py,sha256=
|
210
|
+
py2ls/ips.py,sha256=HjMZDXzfOiqhgNOdtoX7dxoY2cRsrD78LXilWyIUffE,164940
|
211
211
|
py2ls/netfinder.py,sha256=vgOOMhzwbjRuLWMAPyf_kh3HoOhsJ9dlA-tCkMf7kNU,55371
|
212
212
|
py2ls/ocr.py,sha256=5lhUbJufIKRSOL6wAWVLEo8TqMYSjoI_Q-IO-_4u3DE,31419
|
213
|
-
py2ls/plot.py,sha256=
|
213
|
+
py2ls/plot.py,sha256=x_bvQyPM6sl7IscgHPUbOEnqR82Iefcyur1JOweEAZw,100536
|
214
214
|
py2ls/setuptools-70.1.0-py3-none-any.whl,sha256=2bi3cUVal8ip86s0SOvgspteEF8SKLukECi-EWmFomc,882588
|
215
215
|
py2ls/sleep_events_detectors.py,sha256=bQA3HJqv5qnYKJJEIhCyhlDtkXQfIzqksnD0YRXso68,52145
|
216
216
|
py2ls/stats.py,sha256=fJmXQ9Lq460StOn-kfEljE97cySq7876HUPTnpB5hLs,38123
|
217
217
|
py2ls/translator.py,sha256=zBeq4pYZeroqw3DT-5g7uHfVqKd-EQptT6LJ-Adi8JY,34244
|
218
218
|
py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
|
219
|
-
py2ls-0.1.
|
220
|
-
py2ls-0.1.
|
221
|
-
py2ls-0.1.
|
219
|
+
py2ls-0.2.1.dist-info/METADATA,sha256=Qr6DFCoJWEj0_JrHmUDLJYRtoPqO7GyHth0Apsq5wOk,20036
|
220
|
+
py2ls-0.2.1.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
|
221
|
+
py2ls-0.2.1.dist-info/RECORD,,
|
File without changes
|