py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -16,7 +16,12 @@ import warnings
 
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
-
+warnings.filterwarnings("ignore")
+import os
+import shutil
+import logging
+from pathlib import Path
+from datetime import datetime
 
 def run_once_within(duration=60, reverse=False):  # default 60s
     import time
@@ -541,8 +546,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
-def shared(*args, strict=True, n_shared=2, verbose=True):
+def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elements in two lists.
     usage:
@@ -587,12 +591,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         elements2show = (
             shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         )
+        tail = '' if len(shared_elements) < 10 else '......'
+        elements2show.append(tail)
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
     print("********* checking shared elements *********")
     return shared_elements
 
+def shared(*args, n_shared=None, verbose=True, **kwargs):
+    """
+    check the shared elements in two lists.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2, list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+    if n_shared is None:
+        n_shared = len(flattened_lists)
+        strict = True
+    else:
+        strict = False
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        from collections import Counter
 
-def not_shared(*args, strict=True, n_shared=2, verbose=False):
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
+
+    shared_elements = flatten(shared_elements, verbose=verbose)
+    if verbose:
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+    print("********* checking shared elements *********")
+    return shared_elements
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
@@ -600,7 +660,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
        list2 = [4, 5, 6, 7, 8]
        not_shared(list1, list2)  # output [1,3]
    """
-    _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
+def not_shared(*args, n_shared=None, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+        list1 = [1, 8, 3, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        not_shared(list1, list2)  # output [1,3]
+    """
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
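
Usage sketch for the renamed `share` and the reworked `shared`/`not_shared` above (illustrative values; assumes the helpers are importable via `from py2ls.ips import ...` and that `flatten` keeps element order):

    from py2ls.ips import shared, not_shared

    list1 = [1, 2, 3, 4, 5]
    list2 = [4, 5, 6, 7, 8]
    list3 = [5, 6, 9, 10]

    shared(list1, list2, list3)              # strict: in every list -> [5]
    shared(list1, list2, list3, n_shared=2)  # in at least 2 lists -> [4, 5, 6]
    not_shared(list1, list2)                 # in list1 but not common -> [1, 2, 3]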
@@ -806,6 +878,19 @@ def counter(list_, verbose=True):
     # print(f"Return a list of the n most common elements:\n{c.most_common()}")
     # print(f"Compute the sum of the counts:\n{c.total()}")
 
+def dict2df(dict_, fill=None):
+    len_max = 0
+    for key, value in dict_.items():
+        # each value must be a list
+        if isinstance(value, list):
+            pass
+        # get the max length
+        len_max = len(value) if len(value) > len_max else len_max
+    # pad every list to the same length
+    for key, value in dict_.items():
+        value.extend([fill] * (len_max - len(value)))
+        dict_[key] = value
+    return pd.DataFrame.from_dict(dict_)
 
 def str2time(time_str, fmt="24"):
     """
@@ -1254,7 +1339,7 @@ def docx2pdf(dir_docx, dir_pdf=None):
     convert(dir_docx)
 
 
-def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=300):
     import img2pdf as image2pdf
 
     def mm_to_point(size):
@@ -1263,7 +1348,8 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
     def set_dpi(x):
         dpix = dpiy = x
         return image2pdf.get_fixed_dpi_layout_fun((dpix, dpiy))
-
+    if kind is None:
+        _, kind = os.path.splitext(dir_img)
     if not kind.startswith("."):
         kind = "." + kind
     if dir_save is None:
@@ -1286,8 +1372,10 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
                 continue
             imgs.append(path)
     else:
-        imgs = [os.path.isdir(dir_img), dir_img]
-
+        imgs = [
+            # os.path.isdir(dir_img),
+            dir_img]
+        print(imgs)
     if page_size:
         if isinstance(page_size, str):
             pdf_in_mm = mm_to_point(paper_size(page_size))
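
With `kind=None` the extension is now inferred from `dir_img` itself, so a single-file call reduces to this sketch (paths are placeholders):

    from py2ls.ips import img2pdf

    # ".png" is taken from the file name; page size and resolution as before
    img2pdf("scan.png", dir_save="scan.pdf", page_size="a4", dpi=300)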
@@ -1983,7 +2071,6 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
         engine = kwargs.pop("engine", "pyarrow")  # default: None
         sep = kwargs.pop("sep", None)  # default: ','
         index_col = kwargs.pop("index_col", None)  # default: None
@@ -1994,13 +2081,20 @@ def fload(fpath, kind=None, **kwargs):
         comment = kwargs.pop("comment", None)  # default: None
         fmt = kwargs.pop("fmt", False)  # default:
         chunksize = kwargs.pop("chunksize", None)  # default: None
+
+        # check filesize
+        f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size >= 50:  # 50 MB
+            if chunksize is None:
+                chunksize = 5000
+                print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
         engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
         low_memory = kwargs.pop("low_memory", True)  # default: True
         low_memory = (
             False if chunksize else True
         )  # when chunksize, recommend low_memory=False  # default:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv", verbose=verbose)
 
         if comment is None:  # default: None
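
The new size check corresponds roughly to this plain-pandas pattern (a sketch mirroring the 50 MB / 5000-row thresholds added above; the file name is a placeholder):

    import os
    import pandas as pd

    fpath = "big.csv"
    f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)  # size in MB
    if f_size >= 50:
        # chunked read: the "c" engine with low_memory=False is recommended here
        df = pd.concat(pd.read_csv(fpath, engine="c", chunksize=5000, low_memory=False))
    else:
        df = pd.read_csv(fpath, engine="pyarrow")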
@@ -2176,7 +2270,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2206,7 +2300,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)
 
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2477,13 @@ def fload(fpath, kind=None, **kwargs):
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
         # verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
     elif kind == "pkl":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_pickle")
         return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2514,12 @@ def fload(fpath, kind=None, **kwargs):
         return load_ipynb(fpath, **kwargs)
     elif kind in ["parquet", "snappy"]:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet")
         return load_parquet(fpath, **kwargs)
     elif kind == "feather":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_feather")
         content = pd.read_feather(fpath, **kwargs)
         return content
@@ -2684,7 +2778,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2716,7 +2810,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose = kwargs.pop("verbose", False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -3131,21 +3225,437 @@ def isa(content, kind):
     return False
 
 
-import sys
+def get_os(full=False, verbose=False):
+    """Collects comprehensive system information.
+    full(bool): True, get more detailed info
+    verbose(bool): True, print it
+    usage:
+        info = get_os(full=True, verbose=False)
+    """
+    import sys
+    import platform
+    import psutil
+    import GPUtil
+    import socket
+    import uuid
+    import cpuinfo
+    import os
+    import subprocess
+    from datetime import datetime, timedelta
+    from collections import defaultdict
+
+    def get_os_type():
+        os_name = sys.platform
+        if "dar" in os_name:
+            return "macOS"
+        else:
+            if "win" in os_name:
+                return "Windows"
+            elif "linux" in os_name:
+                return "Linux"
+            else:
+                print(f"{os_name}, returned 'None'")
+                return None
+
+    def get_os_info():
+        """Get the detailed OS name, version, and other platform-specific details."""
+
+        def get_mac_os_info():
+            """Get detailed macOS version and product name."""
+            try:
+                sw_vers = subprocess.check_output(["sw_vers"]).decode("utf-8")
+                product_name = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductName")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+                product_version = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductVersion")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+                build_version = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("BuildVersion")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+
+                # Return the formatted macOS name, version, and build
+                return f"{product_name} {product_version} (Build {build_version})"
+            except Exception as e:
+                return f"Error retrieving macOS name: {str(e)}"
+
+        def get_windows_info():
+            """Get detailed Windows version and edition."""
+            try:
+                # Get basic Windows version using platform
+                windows_version = platform.version()
+                release = platform.release()
+                version = platform.win32_ver()[0]
+
+                # Additional information using Windows-specific system commands
+                edition_command = "wmic os get caption"
+                edition = (
+                    subprocess.check_output(edition_command, shell=True)
+                    .decode("utf-8")
+                    .strip()
+                    .split("\n")[1]
+                )
 
+                # Return Windows information
+                return f"Windows {version} {release} ({edition})"
+            except Exception as e:
+                return f"Error retrieving Windows information: {str(e)}"
 
-def get_os():
-    os_name = sys.platform
-    if "dar" in os_name:
-        return "macOS"
-    else:
-        if "win" in os_name:
-            return "Windows"
-        elif "linux" in os_name:
-            return "Linux"
+        def get_linux_info():
+            """Get detailed Linux version and distribution info."""
+            try:
+                # Check /etc/os-release for modern Linux distros
+                with open("/etc/os-release") as f:
+                    os_info = f.readlines()
+
+                os_name = (
+                    next(line for line in os_info if line.startswith("NAME"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+                os_version = (
+                    next(line for line in os_info if line.startswith("VERSION"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+
+                # For additional info, check for the package manager (e.g., apt, dnf)
+                package_manager = "Unknown"
+                if os.path.exists("/usr/bin/apt"):
+                    package_manager = "APT (Debian/Ubuntu)"
+                elif os.path.exists("/usr/bin/dnf"):
+                    package_manager = "DNF (Fedora/RHEL)"
+
+                # Return Linux distribution, version, and package manager
+                return f"{os_name} {os_version} (Package Manager: {package_manager})"
+            except Exception as e:
+                return f"Error retrieving Linux information: {str(e)}"
+
+        os_name = platform.system()
+
+        if os_name == "Darwin":
+            return get_mac_os_info()
+        elif os_name == "Windows":
+            return get_windows_info()
+        elif os_name == "Linux":
+            return get_linux_info()
         else:
-            print(f"{os_name}, returned 'None'")
-            return None
+            return f"Unknown OS: {os_name} {platform.release()}"
+
+    def get_os_name_and_version():
+        os_name = platform.system()
+        if os_name == "Darwin":
+            try:
+                # Run 'sw_vers' command to get macOS details like "macOS Sequoia"
+                sw_vers = subprocess.check_output(["sw_vers"]).decode("utf-8")
+                product_name = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductName")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+                product_version = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductVersion")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+
+                # Return the formatted macOS name and version
+                return f"{product_name} {product_version}"
+
+            except Exception as e:
+                return f"Error retrieving macOS name: {str(e)}"
+
+        # For Windows, we use platform to get the OS name and version
+        elif os_name == "Windows":
+            os_version = platform.version()
+            return f"Windows {os_version}"
+
+        # For Linux, check for distribution info using platform and os-release file
+        elif os_name == "Linux":
+            try:
+                # Try to read Linux distribution info from '/etc/os-release'
+                with open("/etc/os-release") as f:
+                    os_info = f.readlines()
+
+                # Find fields like NAME and VERSION
+                os_name = (
+                    next(line for line in os_info if line.startswith("NAME"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+                os_version = (
+                    next(line for line in os_info if line.startswith("VERSION"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+                return f"{os_name} {os_version}"
+
+            except Exception as e:
+                return f"Error retrieving Linux name: {str(e)}"
+
+        # Default fallback (for unknown OS or edge cases)
+        return f"{os_name} {platform.release()}"
+
+    def get_system_uptime():
+        """Returns system uptime as a human-readable string."""
+        boot_time = datetime.fromtimestamp(psutil.boot_time())
+        uptime = datetime.now() - boot_time
+        return str(uptime).split(".")[0]  # Remove microseconds
+
+    def get_active_processes(limit=10):
+        processes = []
+        for proc in psutil.process_iter(
+            ["pid", "name", "cpu_percent", "memory_percent"]
+        ):
+            try:
+                processes.append(proc.info)
+            except psutil.NoSuchProcess:
+                pass
+        # Handle NoneType values by treating them as 0
+        processes.sort(key=lambda x: x["cpu_percent"] or 0, reverse=True)
+        return processes[:limit]
+
+    def get_virtual_environment_info():
+        """Checks if the script is running in a virtual environment and returns details."""
+        try:
+            # Check if running in a virtual environment
+            if hasattr(sys, "real_prefix") or (
+                hasattr(sys, "base_prefix") and sys.base_prefix != sys.prefix
+            ):
+                return {
+                    "Virtual Environment": sys.prefix,
+                    "Site-Packages Path": os.path.join(
+                        sys.prefix,
+                        "lib",
+                        "python{}/site-packages".format(sys.version_info.major),
+                    ),
+                }
+            else:
+                return {"Virtual Environment": "Not in a virtual environment"}
+        except Exception as e:
+            return {"Error": str(e)}
+
+    def get_temperatures():
+        """Returns temperature sensor readings."""
+        try:
+            return psutil.sensors_temperatures(fahrenheit=False)
+        except AttributeError:
+            return {"Error": "Temperature sensors not available"}
+
+    def get_battery_status():
+        """Returns battery status."""
+        battery = psutil.sensors_battery()
+        if battery:
+            time_left = (
+                str(timedelta(seconds=battery.secsleft))
+                if battery.secsleft != psutil.POWER_TIME_UNLIMITED
+                else "Charging/Unlimited"
+            )
+            return {
+                "Percentage": battery.percent,
+                "Plugged In": battery.power_plugged,
+                "Time Left": time_left,
+            }
+        return {"Status": "No battery detected"}
+
+    def get_disk_io():
+        """Returns disk I/O statistics."""
+        disk_io = psutil.disk_io_counters()
+        return {
+            "Read (GB)": disk_io.read_bytes / (1024**3),
+            "Write (GB)": disk_io.write_bytes / (1024**3),
+            "Read Count": disk_io.read_count,
+            "Write Count": disk_io.write_count,
+        }
+
+    def get_network_io():
+        """Returns network I/O statistics."""
+        net_io = psutil.net_io_counters()
+        return {
+            "Bytes Sent (GB)": net_io.bytes_sent / (1024**3),
+            "Bytes Received (GB)": net_io.bytes_recv / (1024**3),
+            "Packets Sent": net_io.packets_sent,
+            "Packets Received": net_io.packets_recv,
+        }
+
+    def run_shell_command(command):
+        """Runs a shell command and returns its output."""
+        try:
+            result = subprocess.run(
+                command,
+                shell=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            return (
+                result.stdout.strip()
+                if result.returncode == 0
+                else result.stderr.strip()
+            )
+        except Exception as e:
+            return f"Error running command: {e}"
+
+    system_info = {
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "os": get_os_type(),
+        "system": {
+            "os": get_os_info(),
+            "platform": f"{platform.system()} {platform.release()}",
+            "version": platform.version(),
+            "machine": platform.machine(),
+            "processor": platform.processor(),
+            "architecture": platform.architecture()[0],
+            "hostname": socket.gethostname(),
+            "ip address": socket.gethostbyname(socket.gethostname()),
+            "mac address": ":".join(
+                ["{:02x}".format((uuid.getnode() >> i) & 0xFF) for i in range(0, 48, 8)]
+            ),
+            "cpu brand": cpuinfo.get_cpu_info().get("brand_raw", "Unknown"),
+            "python version": platform.python_version(),
+            "uptime": get_system_uptime(),
+        },
+        "cpu": {
+            "physical cores": psutil.cpu_count(logical=False),
+            "logical cores": psutil.cpu_count(logical=True),
+            "max frequency (MHz)": psutil.cpu_freq().max,
+            "min frequency (MHz)": psutil.cpu_freq().min,
+            "current frequency (MHz)": psutil.cpu_freq().current,
+            "usage per core (%)": psutil.cpu_percent(percpu=True),
+            "total cpu Usage (%)": psutil.cpu_percent(),
+            "load average (1m, 5m, 15m)": (
+                os.getloadavg() if hasattr(os, "getloadavg") else "N/A"
+            ),
+        },
+        "memory": {
+            "total memory (GB)": psutil.virtual_memory().total / (1024**3),
+            "available memory (GB)": psutil.virtual_memory().available / (1024**3),
+            "used memory (GB)": psutil.virtual_memory().used / (1024**3),
+            "memory usage (%)": psutil.virtual_memory().percent,
+            "swap total (GB)": psutil.swap_memory().total / (1024**3),
+            "swap free (GB)": psutil.swap_memory().free / (1024**3),
+            "swap used (GB)": psutil.swap_memory().used / (1024**3),
+            "swap usage (%)": psutil.swap_memory().percent,
+        },
+        "disk": {},
+        "disk io": get_disk_io(),
+        "network": {},
+        "network io": get_network_io(),
+        "gpu": [],
+        "temperatures": get_temperatures(),
+        "battery": get_battery_status(),
+        "active processes": get_active_processes(),
+        "environment": {
+            "user": os.getenv("USER", "Unknown"),
+            "environment variables": dict(os.environ),
+            "virtual environment info": get_virtual_environment_info(),  # Virtual env details
+            "docker running": os.path.exists("/.dockerenv"),  # Check for Docker
+            "shell": os.environ.get("SHELL", "Unknown"),
+            "default terminal": run_shell_command("echo $TERM"),
+            "kernel version": platform.uname().release,
+            "virtualization type": run_shell_command("systemd-detect-virt"),
+        },
+        "additional info": {
+            "Shell": os.environ.get("SHELL", "Unknown"),
+            "default terminal": run_shell_command("echo $TERM"),
+            "kernel version": platform.uname().release,
+            "virtualization type": run_shell_command("systemd-detect-virt"),
+            "running in docker": os.path.exists("/.dockerenv"),
+        },
+    }
+
+    # Disk Information
+    for partition in psutil.disk_partitions():
+        try:
+            usage = psutil.disk_usage(partition.mountpoint)
+            system_info["disk"][partition.device] = {
+                "mountpoint": partition.mountpoint,
+                "file system type": partition.fstype,
+                "total size (GB)": usage.total / (1024**3),
+                "used (GB)": usage.used / (1024**3),
+                "free (GB)": usage.free / (1024**3),
+                "usage (%)": usage.percent,
+            }
+        except PermissionError:
+            system_info["disk"][partition.device] = "Permission Denied"
+
+    # Network Information
+    if_addrs = psutil.net_if_addrs()
+    for interface_name, interface_addresses in if_addrs.items():
+        system_info["network"][interface_name] = []
+        for address in interface_addresses:
+            if str(address.family) == "AddressFamily.AF_INET":
+                system_info["network"][interface_name].append(
+                    {
+                        "ip address": address.address,
+                        "netmask": address.netmask,
+                        "broadcast ip": address.broadcast,
+                    }
+                )
+            elif str(address.family) == "AddressFamily.AF_PACKET":
+                system_info["network"][interface_name].append(
+                    {
+                        "mac address": address.address,
+                        "netmask": address.netmask,
+                        "broadcast mac": address.broadcast,
+                    }
+                )
+
+    # GPU Information
+    gpus = GPUtil.getGPUs()
+    for gpu in gpus:
+        gpu_info = {
+            "name": gpu.name,
+            "load (%)": gpu.load * 100,
+            "free memory (MB)": gpu.memoryFree,
+            "used memory (MB)": gpu.memoryUsed,
+            "total memory (MB)": gpu.memoryTotal,
+            "driver version": gpu.driver,
+            "temperature (°C)": gpu.temperature,
+        }
+        if hasattr(gpu, "powerDraw"):
+            gpu_info["Power Draw (W)"] = gpu.powerDraw
+        if hasattr(gpu, "powerLimit"):
+            gpu_info["Power Limit (W)"] = gpu.powerLimit
+        system_info["gpu"].append(gpu_info)
+
+    res = system_info if full else get_os_type()
+    if verbose:
+        try:
+            preview(res)
+        except Exception as e:
+            print(e)
+    return res
 
 
 def listdir(
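
A usage sketch for the expanded `get_os` (assumes the optional dependencies psutil, GPUtil and py-cpuinfo are installed; key names follow the dict built above):

    from py2ls.ips import get_os

    get_os()                  # just the platform family, e.g. "macOS"
    info = get_os(full=True)  # nested dict: system, cpu, memory, disk, network, gpu, ...
    print(info["system"]["hostname"], info["memory"]["memory usage (%)"])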
3168
3678
  print(ls)
3169
3679
  df_all = pd.DataFrame(
3170
3680
  {
3171
- "fname": ls,
3172
- "fpath": [os.path.join(rootdir, i) for i in ls],
3681
+ "name": ls,
3682
+ "path": [os.path.join(rootdir, i) for i in ls],
3683
+ "kind":[os.path.splitext(i)[1] for i in ls]
3173
3684
  }
3174
3685
  )
3175
3686
  if verbose:
@@ -3308,7 +3819,94 @@ def listfunc(lib_name, opt="call"):
3308
3819
  def func_list(lib_name, opt="call"):
3309
3820
  return list_func(lib_name, opt=opt)
3310
3821
 
3822
+ def copy(src, dst, overwrite=False):
3823
+ """Copy a file from src to dst."""
3824
+ try:
3825
+ src = Path(src)
3826
+ dst = Path(dst)
3827
+ if not src.is_dir():
3828
+ if dst.is_dir():
3829
+ dst = dst / src.name
3830
+
3831
+ if dst.exists():
3832
+ if overwrite:
3833
+ dst.unlink()
3834
+ else:
3835
+ dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
3836
+ shutil.copy(src, dst)
3837
+ print(f"\n Done! copy to {dst}\n")
3838
+ else:
3839
+ dst = dst/src.name
3840
+ if dst.exists():
3841
+ if overwrite:
3842
+ shutil.rmtree(dst) # Remove existing directory
3843
+ else:
3844
+ dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('%H%M%S')}")
3845
+ shutil.copytree(src, dst)
3846
+ print(f"\n Done! copy to {dst}\n")
3847
+
3848
+ except Exception as e:
3849
+ logging.error(f"Failed {e}")
3850
+
3851
+ def move(src, dst, overwrite=False):
3852
+ return cut(src=src, dst=dst, overwrite=overwrite)
3311
3853
 
3854
+ def cut(src, dst, overwrite=False):
3855
+ try:
3856
+ src = Path(src)
3857
+ dst = Path(dst)
3858
+ if dst.is_dir():
3859
+ dst = dst / src.name
3860
+ if dst.exists():
3861
+ if overwrite:
3862
+ # dst.unlink() # Delete the existing file
3863
+ pass
3864
+ else:
3865
+ dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
3866
+ shutil.move(src, dst)
3867
+ print(f"\n Done! moved to {dst}\n")
3868
+ except Exception as e:
3869
+ logging.error(f"Failed to move file from {src} to {dst}: {e}")
3870
+
3871
+ def delete(fpath):
3872
+ """Delete a file/folder."""
3873
+ try:
3874
+ fpath = Path(fpath)
3875
+ if not fpath.is_dir(): # file
3876
+ if fpath.exists():
3877
+ fpath.unlink()
3878
+ print(f"\n Done! delete {fpath}\n")
3879
+ else:
3880
+ print(f"File '{fpath}' does not exist.")
3881
+ else:#folder
3882
+ if fpath.exists():
3883
+ shutil.rmtree(fpath) # Remove existing directory
3884
+ print(f"\n Done! delete {fpath}\n")
3885
+ else:
3886
+ print(f"Folder '{fpath}' does not exist.")
3887
+ except Exception as e:
3888
+ logging.error(f"Failed to delete {fpath}: {e}")
3889
+ def rename(fpath, dst, smart=True):
3890
+ """Rename a file or folder."""
3891
+ try:
3892
+ src_kind,dst_kind = None,None
3893
+ if smart:
3894
+ dir_name_src=os.path.dirname(fpath)
3895
+ dir_name_dst=os.path.dirname(dst)
3896
+ src_kind=os.path.splitext(fpath)[1]
3897
+ dst_kind=os.path.splitext(dst)[1]
3898
+ if dir_name_dst!=dir_name_src:
3899
+ dst=os.path.join(dir_name_src,dst)
3900
+ if dst_kind is not None and src_kind is not None:
3901
+ if dst_kind!=src_kind:
3902
+ dst=dst + src_kind
3903
+ if os.path.exists(fpath):
3904
+ os.rename(fpath,dst)
3905
+ print(f"Done! rename to {dst}")
3906
+ else:
3907
+ print(f"Failed: {fpath} does not exist.")
3908
+ except Exception as e:
3909
+ logging.error(f"Failed to rename {fpath} to {dst}: {e}")
3312
3910
  def mkdir_nest(fpath: str) -> str:
3313
3911
  """
3314
3912
  Create nested directories based on the provided file path.
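
A hedged sketch of the new file helpers (paths are placeholders; on a name collision without overwrite=True, `copy` and `cut` append an _HHMMSS suffix instead of overwriting):

    from py2ls.ips import copy, cut, delete, rename

    copy("report.pdf", "backup/")                  # file into directory
    cut("report.pdf", "archive/", overwrite=True)  # move, replacing an existing file
    rename("archive/report.pdf", "report_2024")    # smart=True keeps the dir and ".pdf"
    delete("backup/")                              # removes a file or a whole tree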
@@ -3327,7 +3925,9 @@ def mkdir_nest(fpath: str) -> str:
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
-    current_path = ""
+    root_dir = os.path.splitdrive(fpath)[0]  # Get the root drive on Windows (e.g., 'C:')
+    current_path = root_dir if root_dir else f_slash  # Start from the root directory or POSIX '/'
+
     for part in dir_parts:
         if part:
             current_path = os.path.join(current_path, part)
@@ -3351,10 +3951,13 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     Returns:
     - str: The path of the created directory or an error message.
     """
-
     rootdir = []
+    pardir = mkdir_nest(pardir)
     if chdir is None:
-        return mkdir_nest(pardir)
+        return pardir
+    else:
+        pass
+    print(pardir)
     if isinstance(chdir, str):
         chdir = [chdir]
     chdir = list(set(chdir))
@@ -3392,7 +3995,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
-    rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
+        rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
 
     return rootdir
 
@@ -3791,6 +4394,114 @@ def apply_filter(img, *args):
         )
     return img.filter(supported_filters[filter_name])
 
+def detect_angle(image, by="median", template=None):
+    """Detect the angle of rotation using various methods."""
+    from sklearn.decomposition import PCA
+    from skimage import transform, feature, filters, measure
+    from skimage.color import rgb2gray
+    from scipy.fftpack import fftshift, fft2
+    import numpy as np
+    import cv2
+    # Convert to grayscale
+    gray_image = rgb2gray(image)
+
+    # Detect edges using Canny edge detector
+    edges = feature.canny(gray_image, sigma=2)
+
+    # Use Hough transform to detect lines
+    lines = transform.probabilistic_hough_line(edges)
+
+    if not lines and any(["me" in by, "pca" in by]):
+        print("No lines detected. Adjust the edge detection parameters.")
+        return 0
+
+    # Hough Transform-based angle detection (Median/Mean)
+    if "me" in by:
+        angles = []
+        for line in lines:
+            (x0, y0), (x1, y1) = line
+            angle = np.arctan2(y1 - y0, x1 - x0) * 180 / np.pi
+            if 80 < abs(angle) < 100:
+                angles.append(angle)
+        if not angles:
+            return 0
+        if "di" in by:
+            median_angle = np.median(angles)
+            rotation_angle = (
+                90 - median_angle if median_angle > 0 else -90 - median_angle
+            )
+
+            return rotation_angle
+        else:
+            mean_angle = np.mean(angles)
+            rotation_angle = 90 - mean_angle if mean_angle > 0 else -90 - mean_angle
+
+            return rotation_angle
+
+    # PCA-based angle detection
+    elif "pca" in by:
+        y, x = np.nonzero(edges)
+        if len(x) == 0:
+            return 0
+        pca = PCA(n_components=2)
+        pca.fit(np.vstack((x, y)).T)
+        angle = np.arctan2(pca.components_[0, 1], pca.components_[0, 0]) * 180 / np.pi
+        return angle
+
+    # Gradient Orientation-based angle detection
+    elif "gra" in by:
+        gx, gy = np.gradient(gray_image)
+        angles = np.arctan2(gy, gx) * 180 / np.pi
+        hist, bin_edges = np.histogram(angles, bins=360, range=(-180, 180))
+        return bin_edges[np.argmax(hist)]
+
+    # Template Matching-based angle detection
+    elif "temp" in by:
+        if template is None:
+            # Automatically extract a template from the center of the image
+            height, width = gray_image.shape
+            center_x, center_y = width // 2, height // 2
+            size = (
+                min(height, width) // 4
+            )  # Size of the template as a fraction of image size
+            template = gray_image[
+                center_y - size : center_y + size, center_x - size : center_x + size
+            ]
+        best_angle = None
+        best_corr = -1
+        for angle in range(0, 180, 1):  # Checking every degree
+            rotated_template = transform.rotate(template, angle)
+            res = cv2.matchTemplate(gray_image, rotated_template, cv2.TM_CCOEFF)
+            _, max_val, _, _ = cv2.minMaxLoc(res)
+            if max_val > best_corr:
+                best_corr = max_val
+                best_angle = angle
+        return best_angle
+
+    # Image Moments-based angle detection
+    elif "mo" in by:
+        moments = measure.moments_central(gray_image)
+        angle = (
+            0.5
+            * np.arctan2(2 * moments[1, 1], moments[0, 2] - moments[2, 0])
+            * 180
+            / np.pi
+        )
+        return angle
+
+    # Fourier Transform-based angle detection
+    elif "fft" in by:
+        f = fft2(gray_image)
+        fshift = fftshift(f)
+        magnitude_spectrum = np.log(np.abs(fshift) + 1)
+        rows, cols = magnitude_spectrum.shape
+        r, c = np.unravel_index(np.argmax(magnitude_spectrum), (rows, cols))
+        angle = np.arctan2(r - rows // 2, c - cols // 2) * 180 / np.pi
+        return angle
+
+    else:
+        print(f"Unknown method {by}")
+        return 0
 
 def imgsets(img, **kwargs):
     """
@@ -5911,6 +6622,9 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
+    feature_range=None,  # specific for 'minmax'
+    vmin=0,
+    vmax=1,
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # default, column-wise
@@ -5943,11 +6657,13 @@
         scaler = StandardScaler(**kwargs)
     elif method == "minmax":
         from sklearn.preprocessing import MinMaxScaler
+        if feature_range is None:
+            feature_range = (vmin, vmax)
         if verbose:
             print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
             print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
             print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
-        scaler = MinMaxScaler(**kwargs)
+        scaler = MinMaxScaler(feature_range=feature_range, **kwargs)
     elif method == "robust":
         from sklearn.preprocessing import RobustScaler
         if verbose:
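
With the new vmin/vmax parameters, the minmax branch can be driven like this (sketch):

    import pandas as pd
    from py2ls.ips import df_scaler

    df = pd.DataFrame({"x": [1.0, 5.0, 10.0], "y": [2.0, 4.0, 8.0]})
    df01 = df_scaler(df, method="minmax")                   # defaults: vmin=0, vmax=1
    df11 = df_scaler(df, method="minmax", vmin=-1, vmax=1)  # same as feature_range=(-1, 1)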
@@ -6035,15 +6751,20 @@ def df_special_characters_cleaner(
 
     # 1. Clean column names by replacing special characters with underscores
     if "column" in where_:
-        data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        try:
+            data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        except Exception as e:
+            print(e)
 
     # 2. Clean only object-type columns (text columns)
-    if "content" in where_:
-        for col in data.select_dtypes(include=["object"]).columns:
-            data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
-    if data.index.dtype == "object" and index in where_:
-        data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
-
+    try:
+        if "content" in where_:
+            for col in data.select_dtypes(include=["object"]).columns:
+                data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+        if data.index.dtype == "object" and index in where_:
+            data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+    except:
+        pass
     return data
 
 
@@ -6426,6 +7147,9 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
+    if run_once_within(reverse=True):
+        print(f"support methods:{methods}")
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
@@ -6433,16 +7157,20 @@
         columns = data.select_dtypes(include="number").columns.tolist()
     if hue is None:
         hue = data.select_dtypes(exclude="number").columns.tolist()
+        print(f"auto select the non-number as 'hue':{hue}")
     if isinstance(hue, list):
         print("Warning: hue is a list, only select the 1st one")
         hue = hue[0]
-    if not hue:
+    if not any(hue):
         # Select columns if specified, else use all columns
         X = data[columns].values if columns else data.values
     else:
         # Select columns to reduce and hue for LDA
-        X = data[columns].values if columns else data.drop(columns=[hue]).values
-        y = data[hue].values
+        try:
+            X = data[columns].values if columns else data.drop(columns=[hue]).values
+            y = data[hue].values
+        except:
+            pass
     print(X.shape)
     # Handle missing values
     if fill_missing:
@@ -6909,33 +7637,49 @@ def df_reducer(
             colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy
-        if ax is None:
-            if figsize is None:
-                _, ax = plt.subplots(figsize=cm2inch(8, 8))
-            else:
-                _, ax = plt.subplots(figsize=figsize)
-        else:
-            ax = ax.cla()
+        from .plot import plotxy, figsets, get_color
+        # if ax is None:
+        #     if figsize is None:
+        #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
+        #     else:
+        #         _, ax = plt.subplots(figsize=figsize)
+        # else:
+        #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
+        palette = get_color(len(flatten(data[hue], verbose=0)))
+
+        reduced_df = reduced_df.sort_values(by=hue)
+        print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
             x=colname_met + "1",
             y=colname_met + "2",
             hue=hue,
-            s=size,
+            palette=palette,
+            # size=size,
             edgecolor=edgecolor,
-            kind_="scater",
-            figsets=dict(
-                legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
-                xlabel=xlabel if xlabel else None,
-                ylabel=ylabel if ylabel else None,
-            ),
-            ax=ax,
+            kind_=["joint",
+                   # "kde",
+                   "ell",
+                   ],
+            kws_kde=dict(
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind='scatter', joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1, lw=1, label=None),
             verbose=False,
             **kwargs,
         )
+        figsets(
+            legend=dict(loc=legend_loc, markerscale=markerscale, bbox_to_anchor=bbox_to_anchor, ncols=ncols, fontsize=8),
+            xlabel=xlabel if xlabel else None,
+            ylabel=ylabel if ylabel else None,
+        )
 
     if inplace:
         # If inplace=True, add components back into the original data
@@ -7412,6 +8156,7 @@ def df_qc(
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
 
+    pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #! skim
     if columns is not None:
@@ -7428,16 +8173,18 @@
         data = data.copy()
         data.loc[:, data.isna().all()] = 0
     res_qc = {}
-    print(f"data.shape:{data.shape}")
+    print(f"data.shape:{data.shape}\n⤵ data.sample(10):")
+    display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100, 2)
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["unique_values"] = data.nunique()
+    res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -7453,33 +8200,42 @@
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage = (outlier_num / len(data_outliers)) * 100
+    outlier_percentage = round((outlier_num / len(data_outliers)) * 100, 2)
     res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage > 0]
-    # Correlation and multicollinearity (VIF)
-    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
-        numeric_df = data.select_dtypes(include=[np.number]).dropna()
-        corr_matrix = numeric_df.corr()
-        high_corr_pairs = [
-            (col1, col2)
-            for col1 in corr_matrix.columns
-            for col2 in corr_matrix.columns
-            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
-        ]
-        res_qc["high_correlations"] = high_corr_pairs
-
-        # VIF for multicollinearity check
-        numeric_df = data.select_dtypes(include=[np.number]).dropna()
-        vif_data = pd.DataFrame()
-        res_qc["vif"] = vif_data
-        if numeric_df.shape[1] > 1 and not numeric_df.empty:
-            vif_data["feature"] = numeric_df.columns
-            vif_data["VIF"] = [
-                variance_inflation_factor(numeric_df.values, i)
-                for i in range(numeric_df.shape[1])
+    try:
+        # Correlation and multicollinearity (VIF)
+        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            corr_matrix = numeric_df.corr()
+            high_corr_pairs = [
+                (col1, col2)
+                for col1 in corr_matrix.columns
+                for col2 in corr_matrix.columns
+                if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
             ]
-        res_qc["vif"] = vif_data[
-            vif_data["VIF"] > 5
-        ]  # Typically VIF > 5 indicates multicollinearity
+            res_qc["high_correlations"] = high_corr_pairs
+
+            # VIF for multicollinearity check
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            if isinstance(numeric_df.columns, pd.MultiIndex):
+                numeric_df.columns = [
+                    "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+                ]
+
+
+            vif_data = pd.DataFrame()
+            res_qc["vif"] = vif_data
+            if numeric_df.shape[1] > 1 and not numeric_df.empty:
+                vif_data["feature"] = numeric_df.columns.tolist()
+                vif_data["VIF"] = [
+                    round(variance_inflation_factor(numeric_df.values, i), 2)
+                    for i in range(numeric_df.shape[1])
+                ]
+                res_qc["vif"] = vif_data[
+                    vif_data["VIF"] > 5
+                ]  # Typically VIF > 5 indicates multicollinearity
+    except Exception as e:
+        print(e)
     # Skewness and Kurtosis
     skewness = data.skew(numeric_only=True)
     kurtosis_vals = data.kurt(numeric_only=True)
@@ -7492,8 +8248,7 @@ def df_qc(
         col: entropy(data[col].value_counts(normalize=True), base=2)
         for col in categorical_cols
     }
-    # number of unique
-    res_qc["unique_counts"] = data.nunique()
+
     # dtypes counts
     res_qc['dtype_counts'] = data.dtypes.value_counts()
 
@@ -7540,7 +8295,7 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths
 
     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
     # Automated warnings
     warnings = []
@@ -7562,28 +8317,45 @@
 
     # Report generation
     if verbose:
-        print("=== QC Report Summary ===")
        print("\n⤵ Summary Statistics:")
        display(res_qc["summary_statistics"])
        print("\n⤵ Data Types:")
        display(res_qc["data_types"])
        if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
            print(" ⤵ Missing Values Counts:")
-            display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+            display(pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
            # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
        print("\n⤵ Rows with Missing Values:", res_qc["rows_with_missing"])
 
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
        if any(res_qc["outlier_num"]):
            print("\n⤵ Outlier Report:")
-            display(res_qc["outlier_num"])
-        if any(res_qc["unique_values"]):
-            print("\n⤵ Unique Values per Column:")
-            display(res_qc["unique_values"])
+            display(pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
 
-        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        if any(res_qc["unique_counts"]):
+            print("\n⤵ Unique Values per Column:")
+            display(pd.DataFrame({"unique_counts": res_qc["unique_counts"],
+                                  "unique_values": res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
        if res_qc["empty_columns"]:
            print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7595,7 +8367,7 @@ def df_qc(
 
         if "vif" in res_qc:
             print("\n⤵ Features with High VIF (>|5|):")
-            print(res_qc["vif"])
+            display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
         if any(res_qc["high_cardinality_categoricals"]):
             print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +8386,8 @@ def df_qc(
         print("\nWarnings:")
         for warning in res_qc["warnings"]:
             print(" -", warning)
+
+    pd.reset_option("display.max_seq_items")
     if plot_:
         df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols, hue=hue, dir_save=dir_save)
     if output or not plot_:
@@ -7632,7 +8406,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
     if isinstance(columns, (list, pd.core.indexes.base.Index)):
         data = data[columns]
     len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10) / 3), 3
+    n_row, n_col = int((len_total + 10)), 3
     nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
 
     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +8563,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8 if len(dtype_counts.index)<=20 else 6,
+        fontsize=8 if len(dtype_counts.index) <= 20 else 6,
     )
+    # from .plot import pie
+    # pie()
 
     # High cardinality: Show top categorical columns by unique value count
     high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8647,17 @@
         title="Correlation Heatmap",
         ax=ax_heatmap
     )
-    # save figure
-    if dir_save:
-        figsave(dir_save, f"qc_plot_{now_}.pdf")
+    # # save figure
+    # if dir_save:
+    #     figsave(dir_save, f"qc_plot_{now_}.pdf")
 
     if columns is not None:
         if isinstance(columns, (list, pd.core.indexes.base.Index)):
             data = data[columns]
-    len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10) / 3), 3
-    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
+
+    # len_total = len(res_qc)
+    # n_row, n_col = int((len_total + 10) / 3), 3
+    # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
@@ -7907,7 +8684,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
             figsets(ylabel=f'Q-Q Plot:{column}', title=None)
     # save figure
     if dir_save:
-        figsave(dir_save, f"qq_plot_{now_}.pdf")
+        figsave(dir_save, f"qc_plot_{now_}.pdf")
+
+def df_corr(df: pd.DataFrame, method="pearson"):
+    """
+    Compute correlation coefficients and p-values for a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with numeric data.
+    - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+    Returns:
+    - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+    - pval_matrix (pd.DataFrame): P-value matrix.
+    """
+    from scipy.stats import pearsonr, spearmanr, kendalltau
+
+    methods = ["pearson", "spearman", "kendall"]
+    method = strcmp(method, methods)[0]
+    methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+    cols = df.columns
+    corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    correlation_func = methods_dict[method]
+
+    for col1 in cols:
+        for col2 in cols:
+            if col1 == col2:
+                corr_matrix.loc[col1, col2] = 1.0
+                pval_matrix.loc[col1, col2] = 0.0
+            else:
+                corr, pval = correlation_func(df[col1], df[col2])
+                corr_matrix.loc[col1, col2] = corr
+                pval_matrix.loc[col1, col2] = pval
+
+    return corr_matrix, pval_matrix
+
 def use_pd(
     func_name="excel",
     verbose=True,
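
A sketch of the new `df_corr` (random data for illustration; both matrices are symmetric because each pair is computed in both directions):

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_corr

    df = pd.DataFrame(np.random.default_rng(0).normal(size=(50, 3)), columns=list("abc"))
    corr, pval = df_corr(df, method="spearman")
    print(corr.round(2))
    print(pval < 0.05)  # boolean mask of nominally significant pairs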
@@ -7927,3 +8740,135 @@ def use_pd(
     except Exception as e:
         if verbose:
             print(e)
+
+def get_phone(phone_number: str, region: str = None, verbose=True):
+    """
+    usage:
+        info = get_phone(15237654321, "DE")
+        preview(info)
+
+    Extremely advanced phone number analysis function.
+
+    Args:
+        phone_number (str): The phone number to analyze.
+        region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+    Returns:
+        dict: Comprehensive information about the phone number.
+    """
+    import phonenumbers
+    from phonenumbers import geocoder, carrier, timezone, number_type
+    from datetime import datetime
+    import pytz
+    from tzlocal import get_localzone
+
+    if not isinstance(phone_number, str):
+        phone_number = str(phone_number)
+    if isinstance(region, str):
+        region = region.upper()
+
+    try:
+        # Parse the phone number
+        parsed_number = phonenumbers.parse(phone_number, region)
+
+        # Validate the phone number
+        valid = phonenumbers.is_valid_number(parsed_number)
+        possible = phonenumbers.is_possible_number(parsed_number)
+
+        if not valid:
+            suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+            return {
+                "valid": False,
+                "error": "Invalid phone number",
+                "suggested_fix": suggested_fix,
+            }
+
+        # Basic details
+        formatted_international = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+        )
+        formatted_national = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+        )
+        formatted_e164 = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.E164
+        )
+        country_code = parsed_number.country_code
+        region_code = geocoder.region_code_for_number(parsed_number)
+        country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+        location = geocoder.description_for_number(parsed_number, "en")
+        carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+        time_zones = timezone.time_zones_for_number(parsed_number)[0]
+        current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+            "%Y-%m-%d %H:%M:%S %Z"
+        )
+        number_type_str = {
+            phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+            phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+            phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+            phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+            phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+            phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+            phonenumbers.PhoneNumberType.VOIP: "VOIP",
+            phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+            phonenumbers.PhoneNumberType.PAGER: "Pager",
+            phonenumbers.PhoneNumberType.UAN: "UAN",
+            phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+        }.get(number_type(parsed_number), "Unknown")
+
+        # Advanced Features
+        is_toll_free = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+        )
+        is_premium_rate = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+        )
+
+        # Dialing Information
+        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+        # Advanced Timezone Handling
+        gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds() / 3600
+        # Get the local timezone (current computer's time)
+        local_timezone = get_localzone()
+        # local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+        offset_diff = local_offset - gmt_offsets
+        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+        res = {
+            "valid": True,
+            "possible": possible,
+            "formatted": {
+                "international": formatted_international,
+                "national": formatted_national,
+                "e164": formatted_e164,
+            },
+            "country_code": country_code,
+            "country_name": country_name,
+            "region_code": region_code,
+            "location": location if location else "Unknown",
+            "carrier": carrier_name,
+            "time_zone": time_zones,
+            "current_times": current_times,
+            "local_offset": f"{local_offset} utcoffset",
+            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+            "number_type": number_type_str,
+            "is_toll_free": is_toll_free,
+            "is_premium_rate": is_premium_rate,
+            "dialing_instructions": dialing_instructions,
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "logs": {
+                "number_analysis_completed": datetime.now().strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                ),
+                "raw_input": phone_number,
+                "parsed_number": str(parsed_number),
+            },
+        }
+
+    except phonenumbers.NumberParseException as e:
+        res = {"valid": False, "error": str(e)}
+    if verbose:
+        preview(res)
+    return res