py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -16,7 +16,12 @@ import warnings
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
-
+warnings.filterwarnings("ignore")
+import os
+import shutil
+import logging
+from pathlib import Path
+from datetime import datetime
 
 def run_once_within(duration=60, reverse=False):  # default 60s
     import time
@@ -541,8 +546,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
-def shared(*args, strict=True, n_shared=2, verbose=True):
+def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elements in two lists.
     usage:
@@ -587,12 +591,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         elements2show = (
             shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         )
+        tail = '' if len(shared_elements) < 10 else '......'
+        elements2show.append(tail)
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+def shared(*args, n_shared=None, verbose=True, **kwargs):
+    """
+    check the shared elements in two lists.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2, list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+    if n_shared is None:
+        n_shared = len(flattened_lists)
+        strict = True
+    else:
+        strict = False
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        from collections import Counter
 
-def not_shared(*args, strict=True, n_shared=2, verbose=False):
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
+
+    shared_elements = flatten(shared_elements, verbose=verbose)
+    if verbose:
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
@@ -600,7 +660,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
        list2 = [4, 5, 6, 7, 8]
        not_shared(list1, list2)  # output [1, 3]
    """
-    _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
+
+def not_shared(*args, n_shared=None, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+        list1 = [1, 8, 3, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        not_shared(list1, list2)  # output [1, 3]
+    """
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
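A minimal usage sketch for the reworked helpers (illustrative, not part of the diff; results assume flatten() simply flattens nested lists):

    list1 = [1, 2, 3, 4, 5]
    list2 = [4, 5, 6, 7, 8]
    list3 = [5, 6, 9, 10]
    shared(list1, list2, list3)              # strict: only 5 appears in all three lists
    shared(list1, list2, list3, n_shared=2)  # relaxed: [4, 5, 6] appear in at least two lists
    share_not(list1, list2)                  # in list1 but not shared -> [1, 2, 3]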
@@ -806,6 +878,19 @@ def counter(list_, verbose=True):
     # print(f"Return a list of the n most common elements:\n{c.most_common()}")
     # print(f"Compute the sum of the counts:\n{c.total()}")
 
+def dict2df(dict_, fill=None):
+    len_max = 0
+    for key, value in dict_.items():
+        # each value must be a list
+        if isinstance(value, list):
+            pass
+        # get the max length
+        len_max = len(value) if len(value) > len_max else len_max
+    # pad every list to the same length
+    for key, value in dict_.items():
+        value.extend([fill] * (len_max - len(value)))
+        dict_[key] = value
+    return pd.DataFrame.from_dict(dict_)
 
 def str2time(time_str, fmt="24"):
     """
@@ -1254,7 +1339,7 @@ def docx2pdf(dir_docx, dir_pdf=None):
     convert(dir_docx)
 
 
-def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=300):
     import img2pdf as image2pdf
 
     def mm_to_point(size):
@@ -1263,7 +1348,8 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
     def set_dpi(x):
         dpix = dpiy = x
         return image2pdf.get_fixed_dpi_layout_fun((dpix, dpiy))
-
+    if kind is None:
+        _, kind = os.path.splitext(dir_img)
     if not kind.startswith("."):
         kind = "." + kind
     if dir_save is None:
@@ -1286,8 +1372,10 @@ def img2pdf(dir_img, kind=None, page=None, dir_save=None, page_size="a4", dpi=300):
             continue
         imgs.append(path)
     else:
-        imgs = [os.path.isdir(dir_img), dir_img]
-
+        imgs = [
+            # os.path.isdir(dir_img),
+            dir_img]
+        print(imgs)
     if page_size:
         if isinstance(page_size, str):
             pdf_in_mm = mm_to_point(paper_size(page_size))
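With kind=None the extension is now inferred from the path itself, so a single-file call can be as simple as (hypothetical paths):

    img2pdf("/path/to/scan.png", dir_save="/path/to/scan.pdf", page_size="a4", dpi=300)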
@@ -1983,7 +2071,6 @@ def fload(fpath, kind=None, **kwargs):
 
 def load_csv(fpath, **kwargs):
     from pandas.errors import EmptyDataError
-
     engine = kwargs.pop("engine", "pyarrow")  # default: None
     sep = kwargs.pop("sep", None)  # default: ','
     index_col = kwargs.pop("index_col", None)  # default: None
@@ -1994,13 +2081,20 @@ def fload(fpath, kind=None, **kwargs):
     comment = kwargs.pop("comment", None)  # default: None
     fmt = kwargs.pop("fmt", False)  # default:
     chunksize = kwargs.pop("chunksize", None)  # default: None
+
+    # check file size
+    f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)
+    if f_size >= 50:  # 50 MB
+        if chunksize is None:
+            chunksize = 5000
+            print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
     engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
     low_memory = kwargs.pop("low_memory", True)  # default: True
     low_memory = (
         False if chunksize else True
     )  # when chunksize, recommend low_memory=False # default:
     verbose = kwargs.pop("verbose", False)
-    if run_once_within():
+    if run_once_within(reverse=True):
         use_pd("read_csv", verbose=verbose)
 
     if comment is None:  # default: None
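Effect of the new size check in load_csv, as read from the diff: any file of at least 50 MB falls back to chunked reading with the "c" engine unless the caller sets chunksize explicitly (paths hypothetical):

    df = fload("/data/big_table.csv")                  # >= 50 MB: chunksize auto-set to 5000
    df = fload("/data/big_table.csv", chunksize=2000)  # an explicit chunksize is respected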
@@ -2176,7 +2270,7 @@ def fload(fpath, kind=None, **kwargs):
 def load_excel(fpath, **kwargs):
     engine = kwargs.get("engine", "openpyxl")
     verbose = kwargs.pop("verbose", False)
-    if run_once_within():
+    if run_once_within(reverse=True):
         use_pd("read_excel", verbose=verbose)
     df = pd.read_excel(fpath, engine=engine, **kwargs)
     try:
@@ -2206,7 +2300,7 @@ def fload(fpath, kind=None, **kwargs):
     engine = kwargs.get("engine", "pyarrow")
     verbose = kwargs.pop("verbose", False)
 
-    if run_once_within():
+    if run_once_within(reverse=True):
         use_pd("read_parquet", verbose=verbose)
     try:
         df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2477,13 @@ def fload(fpath, kind=None, **kwargs):
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
         # verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
     elif kind == "pkl":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_pickle")
         return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2514,12 @@ def fload(fpath, kind=None, **kwargs):
         return load_ipynb(fpath, **kwargs)
     elif kind in ["parquet", "snappy"]:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet")
         return load_parquet(fpath, **kwargs)
     elif kind == "feather":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_feather")
         content = pd.read_feather(fpath, **kwargs)
         return content
@@ -2684,7 +2778,7 @@ def fsave(
     # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
     verbose = kwargs.pop("verbose", False)
-    if run_once_within():
+    if run_once_within(reverse=True):
         use_pd("to_csv", verbose=verbose)
     kwargs_csv = dict(
         path_or_buf=None,
@@ -2716,7 +2810,7 @@ def fsave(
 def save_xlsx(fpath, data, **kwargs):
     verbose = kwargs.pop("verbose", False)
     sheet_name = kwargs.pop("sheet_name", "Sheet1")
-    if run_once_within():
+    if run_once_within(reverse=True):
         use_pd("to_excel", verbose=verbose)
     if any(kwargs):
         format_excel(df=data, filename=fpath, **kwargs)
@@ -3131,21 +3225,437 @@ def isa(content, kind):
     return False
 
 
-import sys
+def get_os(full=False, verbose=False):
+    """Collects comprehensive system information.
+    full(bool): True, get more detailed info
+    verbose(bool): True, print it
+    usage:
+        info = get_os(full=True, verbose=False)
+    """
+    import sys
+    import platform
+    import psutil
+    import GPUtil
+    import socket
+    import uuid
+    import cpuinfo
+    import os
+    import subprocess
+    from datetime import datetime, timedelta
+    from collections import defaultdict
+
+    def get_os_type():
+        os_name = sys.platform
+        if "dar" in os_name:
+            return "macOS"
+        else:
+            if "win" in os_name:
+                return "Windows"
+            elif "linux" in os_name:
+                return "Linux"
+            else:
+                print(f"{os_name}, returned 'None'")
+                return None
+
+    def get_os_info():
+        """Get the detailed OS name, version, and other platform-specific details."""
+
+        def get_mac_os_info():
+            """Get detailed macOS version and product name."""
+            try:
+                sw_vers = subprocess.check_output(["sw_vers"]).decode("utf-8")
+                product_name = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductName")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+                product_version = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductVersion")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+                build_version = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("BuildVersion")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+
+                # Return the formatted macOS name, version, and build
+                return f"{product_name} {product_version} (Build {build_version})"
+            except Exception as e:
+                return f"Error retrieving macOS name: {str(e)}"
+
+        def get_windows_info():
+            """Get detailed Windows version and edition."""
+            try:
+                # Get basic Windows version using platform
+                windows_version = platform.version()
+                release = platform.release()
+                version = platform.win32_ver()[0]
+
+                # Additional information using Windows-specific system commands
+                edition_command = "wmic os get caption"
+                edition = (
+                    subprocess.check_output(edition_command, shell=True)
+                    .decode("utf-8")
+                    .strip()
+                    .split("\n")[1]
+                )
 
+                # Return Windows information
+                return f"Windows {version} {release} ({edition})"
+            except Exception as e:
+                return f"Error retrieving Windows information: {str(e)}"
 
-def get_os():
-    os_name = sys.platform
-    if "dar" in os_name:
-        return "macOS"
-    else:
-        if "win" in os_name:
-            return "Windows"
-        elif "linux" in os_name:
-            return "Linux"
+        def get_linux_info():
+            """Get detailed Linux version and distribution info."""
+            try:
+                # Check /etc/os-release for modern Linux distros
+                with open("/etc/os-release") as f:
+                    os_info = f.readlines()
+
+                os_name = (
+                    next(line for line in os_info if line.startswith("NAME"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+                os_version = (
+                    next(line for line in os_info if line.startswith("VERSION"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+
+                # For additional info, check for the package manager (e.g., apt, dnf)
+                package_manager = "Unknown"
+                if os.path.exists("/usr/bin/apt"):
+                    package_manager = "APT (Debian/Ubuntu)"
+                elif os.path.exists("/usr/bin/dnf"):
+                    package_manager = "DNF (Fedora/RHEL)"
+
+                # Return Linux distribution, version, and package manager
+                return f"{os_name} {os_version} (Package Manager: {package_manager})"
+            except Exception as e:
+                return f"Error retrieving Linux information: {str(e)}"
+
+        os_name = platform.system()
+
+        if os_name == "Darwin":
+            return get_mac_os_info()
+        elif os_name == "Windows":
+            return get_windows_info()
+        elif os_name == "Linux":
+            return get_linux_info()
         else:
-            print(f"{os_name}, returned 'None'")
-            return None
+            return f"Unknown OS: {os_name} {platform.release()}"
+
+    def get_os_name_and_version():
+        os_name = platform.system()
+        if os_name == "Darwin":
+            try:
+                # Run 'sw_vers' command to get macOS details like "macOS Sequoia"
+                sw_vers = subprocess.check_output(["sw_vers"]).decode("utf-8")
+                product_name = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductName")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+                product_version = (
+                    [
+                        line
+                        for line in sw_vers.split("\n")
+                        if line.startswith("ProductVersion")
+                    ][0]
+                    .split(":")[1]
+                    .strip()
+                )
+
+                # Return the formatted macOS name and version
+                return f"{product_name} {product_version}"
+
+            except Exception as e:
+                return f"Error retrieving macOS name: {str(e)}"
+
+        # For Windows, we use platform to get the OS name and version
+        elif os_name == "Windows":
+            os_version = platform.version()
+            return f"Windows {os_version}"
+
+        # For Linux, check for distribution info using platform and os-release file
+        elif os_name == "Linux":
+            try:
+                # Try to read Linux distribution info from '/etc/os-release'
+                with open("/etc/os-release") as f:
+                    os_info = f.readlines()
+
+                # Find fields like NAME and VERSION
+                os_name = (
+                    next(line for line in os_info if line.startswith("NAME"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+                os_version = (
+                    next(line for line in os_info if line.startswith("VERSION"))
+                    .split("=")[1]
+                    .strip()
+                    .replace('"', "")
+                )
+                return f"{os_name} {os_version}"
+
+            except Exception as e:
+                return f"Error retrieving Linux name: {str(e)}"
+
+        # Default fallback (for unknown OS or edge cases)
+        return f"{os_name} {platform.release()}"
+
+    def get_system_uptime():
+        """Returns system uptime as a human-readable string."""
+        boot_time = datetime.fromtimestamp(psutil.boot_time())
+        uptime = datetime.now() - boot_time
+        return str(uptime).split(".")[0]  # Remove microseconds
+
+    def get_active_processes(limit=10):
+        processes = []
+        for proc in psutil.process_iter(
+            ["pid", "name", "cpu_percent", "memory_percent"]
+        ):
+            try:
+                processes.append(proc.info)
+            except psutil.NoSuchProcess:
+                pass
+        # Handle NoneType values by treating them as 0
+        processes.sort(key=lambda x: x["cpu_percent"] or 0, reverse=True)
+        return processes[:limit]
+
+    def get_virtual_environment_info():
+        """Checks if the script is running in a virtual environment and returns details."""
+        try:
+            # Check if running in a virtual environment
+            if hasattr(sys, "real_prefix") or (
+                hasattr(sys, "base_prefix") and sys.base_prefix != sys.prefix
+            ):
+                return {
+                    "Virtual Environment": sys.prefix,
+                    "Site-Packages Path": os.path.join(
+                        sys.prefix,
+                        "lib",
+                        "python{}/site-packages".format(sys.version_info.major),
+                    ),
+                }
+            else:
+                return {"Virtual Environment": "Not in a virtual environment"}
+        except Exception as e:
+            return {"Error": str(e)}
+
+    def get_temperatures():
+        """Returns temperature sensor readings."""
+        try:
+            return psutil.sensors_temperatures(fahrenheit=False)
+        except AttributeError:
+            return {"Error": "Temperature sensors not available"}
+
+    def get_battery_status():
+        """Returns battery status."""
+        battery = psutil.sensors_battery()
+        if battery:
+            time_left = (
+                str(timedelta(seconds=battery.secsleft))
+                if battery.secsleft != psutil.POWER_TIME_UNLIMITED
+                else "Charging/Unlimited"
+            )
+            return {
+                "Percentage": battery.percent,
+                "Plugged In": battery.power_plugged,
+                "Time Left": time_left,
+            }
+        return {"Status": "No battery detected"}
+
+    def get_disk_io():
+        """Returns disk I/O statistics."""
+        disk_io = psutil.disk_io_counters()
+        return {
+            "Read (GB)": disk_io.read_bytes / (1024**3),
+            "Write (GB)": disk_io.write_bytes / (1024**3),
+            "Read Count": disk_io.read_count,
+            "Write Count": disk_io.write_count,
+        }
+
+    def get_network_io():
+        """Returns network I/O statistics."""
+        net_io = psutil.net_io_counters()
+        return {
+            "Bytes Sent (GB)": net_io.bytes_sent / (1024**3),
+            "Bytes Received (GB)": net_io.bytes_recv / (1024**3),
+            "Packets Sent": net_io.packets_sent,
+            "Packets Received": net_io.packets_recv,
+        }
+
+    def run_shell_command(command):
+        """Runs a shell command and returns its output."""
+        try:
+            result = subprocess.run(
+                command,
+                shell=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            return (
+                result.stdout.strip()
+                if result.returncode == 0
+                else result.stderr.strip()
+            )
+        except Exception as e:
+            return f"Error running command: {e}"
+
+    system_info = {
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "os": get_os_type(),
+        "system": {
+            "os": get_os_info(),
+            "platform": f"{platform.system()} {platform.release()}",
+            "version": platform.version(),
+            "machine": platform.machine(),
+            "processor": platform.processor(),
+            "architecture": platform.architecture()[0],
+            "hostname": socket.gethostname(),
+            "ip address": socket.gethostbyname(socket.gethostname()),
+            "mac address": ":".join(
+                ["{:02x}".format((uuid.getnode() >> i) & 0xFF) for i in range(0, 48, 8)]
+            ),
+            "cpu brand": cpuinfo.get_cpu_info().get("brand_raw", "Unknown"),
+            "python version": platform.python_version(),
+            "uptime": get_system_uptime(),
+        },
+        "cpu": {
+            "physical cores": psutil.cpu_count(logical=False),
+            "logical cores": psutil.cpu_count(logical=True),
+            "max frequency (MHz)": psutil.cpu_freq().max,
+            "min frequency (MHz)": psutil.cpu_freq().min,
+            "current frequency (MHz)": psutil.cpu_freq().current,
+            "usage per core (%)": psutil.cpu_percent(percpu=True),
+            "total cpu Usage (%)": psutil.cpu_percent(),
+            "load average (1m, 5m, 15m)": (
+                os.getloadavg() if hasattr(os, "getloadavg") else "N/A"
+            ),
+        },
+        "memory": {
+            "total memory (GB)": psutil.virtual_memory().total / (1024**3),
+            "available memory (GB)": psutil.virtual_memory().available / (1024**3),
+            "used memory (GB)": psutil.virtual_memory().used / (1024**3),
+            "memory usage (%)": psutil.virtual_memory().percent,
+            "swap total (GB)": psutil.swap_memory().total / (1024**3),
+            "swap free (GB)": psutil.swap_memory().free / (1024**3),
+            "swap used (GB)": psutil.swap_memory().used / (1024**3),
+            "swap usage (%)": psutil.swap_memory().percent,
+        },
+        "disk": {},
+        "disk io": get_disk_io(),
+        "network": {},
+        "network io": get_network_io(),
+        "gpu": [],
+        "temperatures": get_temperatures(),
+        "battery": get_battery_status(),
+        "active processes": get_active_processes(),
+        "environment": {
+            "user": os.getenv("USER", "Unknown"),
+            "environment variables": dict(os.environ),
+            "virtual environment info": get_virtual_environment_info(),  # Virtual env details
+            "docker running": os.path.exists("/.dockerenv"),  # Check for Docker
+            "shell": os.environ.get("SHELL", "Unknown"),
+            "default terminal": run_shell_command("echo $TERM"),
+            "kernel version": platform.uname().release,
+            "virtualization type": run_shell_command("systemd-detect-virt"),
+        },
+        "additional info": {
+            "Shell": os.environ.get("SHELL", "Unknown"),
+            "default terminal": run_shell_command("echo $TERM"),
+            "kernel version": platform.uname().release,
+            "virtualization type": run_shell_command("systemd-detect-virt"),
+            "running in docker": os.path.exists("/.dockerenv"),
+        },
+    }
+
+    # Disk Information
+    for partition in psutil.disk_partitions():
+        try:
+            usage = psutil.disk_usage(partition.mountpoint)
+            system_info["disk"][partition.device] = {
+                "mountpoint": partition.mountpoint,
+                "file system type": partition.fstype,
+                "total size (GB)": usage.total / (1024**3),
+                "used (GB)": usage.used / (1024**3),
+                "free (GB)": usage.free / (1024**3),
+                "usage (%)": usage.percent,
+            }
+        except PermissionError:
+            system_info["disk"][partition.device] = "Permission Denied"
+
+    # Network Information
+    if_addrs = psutil.net_if_addrs()
+    for interface_name, interface_addresses in if_addrs.items():
+        system_info["network"][interface_name] = []
+        for address in interface_addresses:
+            if str(address.family) == "AddressFamily.AF_INET":
+                system_info["network"][interface_name].append(
+                    {
+                        "ip address": address.address,
+                        "netmask": address.netmask,
+                        "broadcast ip": address.broadcast,
+                    }
+                )
+            elif str(address.family) == "AddressFamily.AF_PACKET":
+                system_info["network"][interface_name].append(
+                    {
+                        "mac address": address.address,
+                        "netmask": address.netmask,
+                        "broadcast mac": address.broadcast,
+                    }
+                )
+
+    # GPU Information
+    gpus = GPUtil.getGPUs()
+    for gpu in gpus:
+        gpu_info = {
+            "name": gpu.name,
+            "load (%)": gpu.load * 100,
+            "free memory (MB)": gpu.memoryFree,
+            "used memory (MB)": gpu.memoryUsed,
+            "total memory (MB)": gpu.memoryTotal,
+            "driver version": gpu.driver,
+            "temperature (°C)": gpu.temperature,
+        }
+        if hasattr(gpu, "powerDraw"):
+            gpu_info["Power Draw (W)"] = gpu.powerDraw
+        if hasattr(gpu, "powerLimit"):
+            gpu_info["Power Limit (W)"] = gpu.powerLimit
+        system_info["gpu"].append(gpu_info)
+
+    res = system_info if full else get_os_type()
+    if verbose:
+        try:
+            preview(res)
+        except Exception as e:
+            print(e)
+    return res
 
 
 def listdir(
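Usage sketch mirroring the docstring (psutil, GPUtil and py-cpuinfo must be installed for the full report):

    get_os()                           # just the OS family, e.g. "macOS"
    info = get_os(full=True)           # nested dict: system / cpu / memory / disk / network / gpu ...
    info["memory"]["memory usage (%)"]
    get_os(full=True, verbose=True)    # also pretty-prints the report via preview()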
@@ -3168,8 +3678,9 @@ def listdir(
     print(ls)
     df_all = pd.DataFrame(
         {
-            "fname": ls,
-            "fpath": [os.path.join(rootdir, i) for i in ls],
+            "name": ls,
+            "path": [os.path.join(rootdir, i) for i in ls],
+            "kind": [os.path.splitext(i)[1] for i in ls],
         }
     )
     if verbose:
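Assuming listdir returns the DataFrame built above, the renamed columns allow simple filtering (folder hypothetical):

    df = listdir("/data")
    csv_paths = df[df["kind"] == ".csv"]["path"]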
@@ -3308,7 +3819,94 @@ def listfunc(lib_name, opt="call"):
 def func_list(lib_name, opt="call"):
     return list_func(lib_name, opt=opt)
 
+def copy(src, dst, overwrite=False):
+    """Copy a file from src to dst."""
+    try:
+        src = Path(src)
+        dst = Path(dst)
+        if not src.is_dir():
+            if dst.is_dir():
+                dst = dst / src.name
+
+            if dst.exists():
+                if overwrite:
+                    dst.unlink()
+                else:
+                    dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
+            shutil.copy(src, dst)
+            print(f"\n Done! copy to {dst}\n")
+        else:
+            dst = dst / src.name
+            if dst.exists():
+                if overwrite:
+                    shutil.rmtree(dst)  # Remove existing directory
+                else:
+                    dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('%H%M%S')}")
+            shutil.copytree(src, dst)
+            print(f"\n Done! copy to {dst}\n")
+
+    except Exception as e:
+        logging.error(f"Failed {e}")
+
+def move(src, dst, overwrite=False):
+    return cut(src=src, dst=dst, overwrite=overwrite)
 
+def cut(src, dst, overwrite=False):
+    try:
+        src = Path(src)
+        dst = Path(dst)
+        if dst.is_dir():
+            dst = dst / src.name
+        if dst.exists():
+            if overwrite:
+                # dst.unlink()  # Delete the existing file
+                pass
+            else:
+                dst = dst.with_name(f"{dst.stem}_{datetime.now().strftime('_%H%M%S')}{dst.suffix}")
+        shutil.move(src, dst)
+        print(f"\n Done! moved to {dst}\n")
+    except Exception as e:
+        logging.error(f"Failed to move file from {src} to {dst}: {e}")
+
+def delete(fpath):
+    """Delete a file/folder."""
+    try:
+        fpath = Path(fpath)
+        if not fpath.is_dir():  # file
+            if fpath.exists():
+                fpath.unlink()
+                print(f"\n Done! delete {fpath}\n")
+            else:
+                print(f"File '{fpath}' does not exist.")
+        else:  # folder
+            if fpath.exists():
+                shutil.rmtree(fpath)  # Remove existing directory
+                print(f"\n Done! delete {fpath}\n")
+            else:
+                print(f"Folder '{fpath}' does not exist.")
+    except Exception as e:
+        logging.error(f"Failed to delete {fpath}: {e}")
+
+def rename(fpath, dst, smart=True):
+    """Rename a file or folder."""
+    try:
+        src_kind, dst_kind = None, None
+        if smart:
+            dir_name_src = os.path.dirname(fpath)
+            dir_name_dst = os.path.dirname(dst)
+            src_kind = os.path.splitext(fpath)[1]
+            dst_kind = os.path.splitext(dst)[1]
+            if dir_name_dst != dir_name_src:
+                dst = os.path.join(dir_name_src, dst)
+            if dst_kind is not None and src_kind is not None:
+                if dst_kind != src_kind:
+                    dst = dst + src_kind
+        if os.path.exists(fpath):
+            os.rename(fpath, dst)
+            print(f"Done! rename to {dst}")
+        else:
+            print(f"Failed: {fpath} does not exist.")
+    except Exception as e:
+        logging.error(f"Failed to rename {fpath} to {dst}: {e}")
 def mkdir_nest(fpath: str) -> str:
     """
     Create nested directories based on the provided file path.
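A hypothetical round-trip with the new file helpers; on a name collision they fall back to a timestamped name instead of overwriting unless overwrite=True:

    copy("report.txt", "backup/")           # file into folder; collision -> "report__HHMMSS.txt"
    move("backup/report.txt", "archive/")   # thin wrapper around cut()
    rename("archive/report.txt", "final")   # smart=True keeps the folder and the ".txt" suffix
    delete("archive/final.txt")             # works for files and folders alike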
@@ -3327,7 +3925,9 @@ def mkdir_nest(fpath: str) -> str:
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
-    current_path = ""
+    root_dir = os.path.splitdrive(fpath)[0]  # Get the root drive on Windows (e.g., 'C:')
+    current_path = root_dir if root_dir else f_slash  # Start from the root directory or POSIX '/'
+
     for part in dir_parts:
         if part:
             current_path = os.path.join(current_path, part)
@@ -3351,10 +3951,13 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     Returns:
     - str: The path of the created directory or an error message.
     """
-
     rootdir = []
+    pardir = mkdir_nest(pardir)
     if chdir is None:
-        return mkdir_nest(pardir)
+        return pardir
+    else:
+        pass
+    print(pardir)
     if isinstance(chdir, str):
         chdir = [chdir]
     chdir = list(set(chdir))
@@ -3392,7 +3995,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
-        rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
+        rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
 
     return rootdir
 
@@ -3791,6 +4394,114 @@ def apply_filter(img, *args):
     )
     return img.filter(supported_filters[filter_name])
 
+def detect_angle(image, by="median", template=None):
+    """Detect the angle of rotation using various methods."""
+    from sklearn.decomposition import PCA
+    from skimage import transform, feature, filters, measure
+    from skimage.color import rgb2gray
+    from scipy.fftpack import fftshift, fft2
+    import numpy as np
+    import cv2
+
+    # Convert to grayscale
+    gray_image = rgb2gray(image)
+
+    # Detect edges using Canny edge detector
+    edges = feature.canny(gray_image, sigma=2)
+
+    # Use Hough transform to detect lines
+    lines = transform.probabilistic_hough_line(edges)
+
+    if not lines and any(["me" in by, "pca" in by]):
+        print("No lines detected. Adjust the edge detection parameters.")
+        return 0
+
+    # Hough Transform-based angle detection (Median/Mean)
+    if "me" in by:
+        angles = []
+        for line in lines:
+            (x0, y0), (x1, y1) = line
+            angle = np.arctan2(y1 - y0, x1 - x0) * 180 / np.pi
+            if 80 < abs(angle) < 100:
+                angles.append(angle)
+        if not angles:
+            return 0
+        if "di" in by:
+            median_angle = np.median(angles)
+            rotation_angle = (
+                90 - median_angle if median_angle > 0 else -90 - median_angle
+            )
+
+            return rotation_angle
+        else:
+            mean_angle = np.mean(angles)
+            rotation_angle = 90 - mean_angle if mean_angle > 0 else -90 - mean_angle
+
+            return rotation_angle
+
+    # PCA-based angle detection
+    elif "pca" in by:
+        y, x = np.nonzero(edges)
+        if len(x) == 0:
+            return 0
+        pca = PCA(n_components=2)
+        pca.fit(np.vstack((x, y)).T)
+        angle = np.arctan2(pca.components_[0, 1], pca.components_[0, 0]) * 180 / np.pi
+        return angle
+
+    # Gradient Orientation-based angle detection
+    elif "gra" in by:
+        gx, gy = np.gradient(gray_image)
+        angles = np.arctan2(gy, gx) * 180 / np.pi
+        hist, bin_edges = np.histogram(angles, bins=360, range=(-180, 180))
+        return bin_edges[np.argmax(hist)]
+
+    # Template Matching-based angle detection
+    elif "temp" in by:
+        if template is None:
+            # Automatically extract a template from the center of the image
+            height, width = gray_image.shape
+            center_x, center_y = width // 2, height // 2
+            size = (
+                min(height, width) // 4
+            )  # Size of the template as a fraction of image size
+            template = gray_image[
+                center_y - size : center_y + size, center_x - size : center_x + size
+            ]
+        best_angle = None
+        best_corr = -1
+        for angle in range(0, 180, 1):  # Checking every degree
+            rotated_template = transform.rotate(template, angle)
+            res = cv2.matchTemplate(gray_image, rotated_template, cv2.TM_CCOEFF)
+            _, max_val, _, _ = cv2.minMaxLoc(res)
+            if max_val > best_corr:
+                best_corr = max_val
+                best_angle = angle
+        return best_angle
+
+    # Image Moments-based angle detection
+    elif "mo" in by:
+        moments = measure.moments_central(gray_image)
+        angle = (
+            0.5
+            * np.arctan2(2 * moments[1, 1], moments[0, 2] - moments[2, 0])
+            * 180
+            / np.pi
+        )
+        return angle
+
+    # Fourier Transform-based angle detection
+    elif "fft" in by:
+        f = fft2(gray_image)
+        fshift = fftshift(f)
+        magnitude_spectrum = np.log(np.abs(fshift) + 1)
+        rows, cols = magnitude_spectrum.shape
+        r, c = np.unravel_index(np.argmax(magnitude_spectrum), (rows, cols))
+        angle = np.arctan2(r - rows // 2, c - cols // 2) * 180 / np.pi
+        return angle
+
+    else:
+        print(f"Unknown method {by}")
+        return 0
 
 def imgsets(img, **kwargs):
     """
@@ -5911,6 +6622,9 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
+    feature_range=None,  # specific for 'minmax'
+    vmin=0,
+    vmax=1,
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # default column-wise
@@ -5943,11 +6657,13 @@ def df_scaler(
         scaler = StandardScaler(**kwargs)
     elif method == "minmax":
         from sklearn.preprocessing import MinMaxScaler
+        if feature_range is None:
+            feature_range = (vmin, vmax)
         if verbose:
             print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
             print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
             print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
-        scaler = MinMaxScaler(**kwargs)
+        scaler = MinMaxScaler(feature_range=feature_range, **kwargs)
     elif method == "robust":
         from sklearn.preprocessing import RobustScaler
         if verbose:
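With the new arguments these two calls should be equivalent (a sketch, assuming the first positional argument is the DataFrame):

    df_scaler(df, method="minmax", vmin=-1, vmax=1)
    df_scaler(df, method="minmax", feature_range=(-1, 1))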
@@ -6035,15 +6751,20 @@ def df_special_characters_cleaner(
 
     # 1. Clean column names by replacing special characters with underscores
     if "column" in where_:
-        data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        try:
+            data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        except Exception as e:
+            print(e)
 
     # 2. Clean only object-type columns (text columns)
-    if "content" in where_:
-        for col in data.select_dtypes(include=["object"]).columns:
-            data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
-    if data.index.dtype == "object" and index in where_:
-        data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
-
+    try:
+        if "content" in where_:
+            for col in data.select_dtypes(include=["object"]).columns:
+                data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+        if data.index.dtype == "object" and index in where_:
+            data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+    except:
+        pass
     return data
 
 
@@ -6426,6 +7147,9 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
+    if run_once_within(reverse=True):
+        print(f"support methods:{methods}")
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
@@ -6433,16 +7157,20 @@ def df_reducer(
     columns = data.select_dtypes(include="number").columns.tolist()
     if hue is None:
         hue = data.select_dtypes(exclude="number").columns.tolist()
+        print(f"auto select the non-number as 'hue':{hue}")
     if isinstance(hue, list):
         print("Warning: hue is a list, only select the 1st one")
         hue = hue[0]
-    if not hue:
+    if not any(hue):
         # Select columns if specified, else use all columns
         X = data[columns].values if columns else data.values
     else:
         # Select columns to reduce and hue for LDA
-        X = data[columns].values if columns else data.drop(columns=[hue]).values
-        y = data[hue].values
+        try:
+            X = data[columns].values if columns else data.drop(columns=[hue]).values
+            y = data[hue].values
+        except:
+            pass
     print(X.shape)
     # Handle missing values
     if fill_missing:
@@ -6909,33 +7637,49 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy
-        if ax is None:
-            if figsize is None:
-                _, ax = plt.subplots(figsize=cm2inch(8, 8))
-            else:
-                _, ax = plt.subplots(figsize=figsize)
-        else:
-            ax = ax.cla()
+        from .plot import plotxy, figsets, get_color
+        # if ax is None:
+        #     if figsize is None:
+        #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
+        #     else:
+        #         _, ax = plt.subplots(figsize=figsize)
+        # else:
+        #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
+        palette = get_color(len(flatten(data[hue], verbose=0)))
+
+        reduced_df = reduced_df.sort_values(by=hue)
+        print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
             x=colname_met + "1",
             y=colname_met + "2",
             hue=hue,
-            s=size,
+            palette=palette,
+            # size=size,
             edgecolor=edgecolor,
-            kind_="scater",
-            figsets=dict(
-                legend=dict(loc=legend_loc, markerscale=markerscale, bbox_to_anchor=bbox_to_anchor, ncols=ncols, fontsize=8),
-                xlabel=xlabel if xlabel else None,
-                ylabel=ylabel if ylabel else None,
-            ),
-            ax=ax,
+            kind_=["joint",
+                   # "kde",
+                   "ell",
+                   ],
+            kws_kde=dict(
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind='scatter', joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1, lw=1, label=None),
             verbose=False,
             **kwargs,
         )
+        figsets(
+            legend=dict(loc=legend_loc, markerscale=markerscale, bbox_to_anchor=bbox_to_anchor, ncols=ncols, fontsize=8),
+            xlabel=xlabel if xlabel else None,
+            ylabel=ylabel if ylabel else None,
+        )
 
     if inplace:
         # If inplace=True, add components back into the original data
@@ -7412,6 +8156,7 @@ def df_qc(
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
 
+    pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
@@ -7428,16 +8173,18 @@ def df_qc(
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     res_qc = {}
-    print(f"data.shape:{data.shape}")
+    print(f"data.shape:{data.shape}\n⤵ data.sample(10):")
+    display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100, 2)
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["unique_values"] = data.nunique()
+    res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -7453,33 +8200,42 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage = (outlier_num / len(data_outliers)) * 100
+    outlier_percentage = round((outlier_num / len(data_outliers)) * 100, 2)
     res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage > 0]
-    # Correlation and multicollinearity (VIF)
-    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
-        numeric_df = data.select_dtypes(include=[np.number]).dropna()
-        corr_matrix = numeric_df.corr()
-        high_corr_pairs = [
-            (col1, col2)
-            for col1 in corr_matrix.columns
-            for col2 in corr_matrix.columns
-            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
-        ]
-        res_qc["high_correlations"] = high_corr_pairs
-
-        # VIF for multicollinearity check
-        numeric_df = data.select_dtypes(include=[np.number]).dropna()
-        vif_data = pd.DataFrame()
-        res_qc["vif"] = vif_data
-        if numeric_df.shape[1] > 1 and not numeric_df.empty:
-            vif_data["feature"] = numeric_df.columns
-            vif_data["VIF"] = [
-                variance_inflation_factor(numeric_df.values, i)
-                for i in range(numeric_df.shape[1])
-            ]
-            res_qc["vif"] = vif_data[
-                vif_data["VIF"] > 5
-            ]  # Typically VIF > 5 indicates multicollinearity
+    try:
+        # Correlation and multicollinearity (VIF)
+        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            corr_matrix = numeric_df.corr()
+            high_corr_pairs = [
+                (col1, col2)
+                for col1 in corr_matrix.columns
+                for col2 in corr_matrix.columns
+                if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+            ]
+            res_qc["high_correlations"] = high_corr_pairs
+
+            # VIF for multicollinearity check
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            if isinstance(numeric_df.columns, pd.MultiIndex):
+                numeric_df.columns = [
+                    "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+                ]
+
+            vif_data = pd.DataFrame()
+            res_qc["vif"] = vif_data
+            if numeric_df.shape[1] > 1 and not numeric_df.empty:
+                vif_data["feature"] = numeric_df.columns.tolist()
+                vif_data["VIF"] = [
+                    round(variance_inflation_factor(numeric_df.values, i), 2)
+                    for i in range(numeric_df.shape[1])
+                ]
+                res_qc["vif"] = vif_data[
+                    vif_data["VIF"] > 5
+                ]  # Typically VIF > 5 indicates multicollinearity
+    except Exception as e:
+        print(e)
     # Skewness and Kurtosis
     skewness = data.skew(numeric_only=True)
     kurtosis_vals = data.kurt(numeric_only=True)
@@ -7492,8 +8248,7 @@ def df_qc(
         col: entropy(data[col].value_counts(normalize=True), base=2)
         for col in categorical_cols
     }
-    # number of unique
-    res_qc["unique_counts"] = data.nunique()
+
     # dtypes counts
     res_qc['dtype_counts'] = data.dtypes.value_counts()
 
@@ -7540,7 +8295,7 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths
 
     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
     # Automated warnings
     warnings = []
@@ -7562,28 +8317,45 @@ def df_qc(
 
     # Report generation
     if verbose:
-        print("=== QC Report Summary ===")
         print("\n⤵ Summary Statistics:")
         display(res_qc["summary_statistics"])
         print("\n⤵ Data Types:")
         display(res_qc["data_types"])
         if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
             print(" ⤵ Missing Values Counts:")
-            display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+            display(pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
         # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
         print("\n⤵ Rows with Missing Values:", res_qc["rows_with_missing"])
 
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
        if any(res_qc["outlier_num"]):
            print("\n⤵ Outlier Report:")
-            display(res_qc["outlier_num"])
-        if any(res_qc["unique_values"]):
-            print("\n⤵ Unique Values per Column:")
-            display(res_qc["unique_values"])
+            display(pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
 
-        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        if any(res_qc["unique_counts"]):
+            print("\n⤵ Unique Values per Column:")
+            display(pd.DataFrame({"unique_counts": res_qc["unique_counts"],
+                                  "unique_values": res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
         if res_qc["empty_columns"]:
             print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7595,7 +8367,7 @@ def df_qc(
 
     if "vif" in res_qc:
         print("\n⤵ Features with High VIF (>|5|):")
-        print(res_qc["vif"])
+        display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
     if any(res_qc["high_cardinality_categoricals"]):
         print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +8386,8 @@ def df_qc(
     print("\nWarnings:")
     for warning in res_qc["warnings"]:
         print(" -", warning)
+
+    pd.reset_option("display.max_seq_items")
     if plot_:
         df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols, hue=hue, dir_save=dir_save)
     if output or not plot_:
@@ -7632,7 +8406,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
     if isinstance(columns, (list, pd.core.indexes.base.Index)):
         data = data[columns]
     len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10) / 3), 3
+    n_row, n_col = int((len_total + 10)), 3
     nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
 
     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +8563,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8 if len(dtype_counts.index) <= 20 else 6,
+        fontsize=8 if len(dtype_counts.index) <= 20 else 6,
     )
+    # from .plot import pie
+    # pie()
 
     # High cardinality: Show top categorical columns by unique value count
     high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8647,17 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
         title="Correlation Heatmap",
         ax=ax_heatmap
     )
-    # save figure
-    if dir_save:
-        figsave(dir_save, f"qc_plot_{now_}.pdf")
+    # # save figure
+    # if dir_save:
+    #     figsave(dir_save, f"qc_plot_{now_}.pdf")
 
     if columns is not None:
         if isinstance(columns, (list, pd.core.indexes.base.Index)):
             data = data[columns]
-    len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10) / 3), 3
-    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
+
+    # len_total = len(res_qc)
+    # n_row, n_col = int((len_total + 10) / 3), 3
+    # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row], verbose=False)
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
@@ -7907,7 +8684,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict=None, max_cols=20,
         figsets(ylabel=f'Q-Q Plot:{column}', title=None)
     # save figure
     if dir_save:
-        figsave(dir_save, f"qq_plot_{now_}.pdf")
+        figsave(dir_save, f"qc_plot_{now_}.pdf")
+
+def df_corr(df: pd.DataFrame, method="pearson"):
+    """
+    Compute correlation coefficients and p-values for a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with numeric data.
+    - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+    Returns:
+    - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+    - pval_matrix (pd.DataFrame): P-value matrix.
+    """
+    from scipy.stats import pearsonr, spearmanr, kendalltau
+
+    methods = ["pearson", "spearman", "kendall"]
+    method = strcmp(method, methods)[0]
+    methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+    cols = df.columns
+    corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    correlation_func = methods_dict[method]
+
+    for col1 in cols:
+        for col2 in cols:
+            if col1 == col2:
+                corr_matrix.loc[col1, col2] = 1.0
+                pval_matrix.loc[col1, col2] = 0.0
+            else:
+                corr, pval = correlation_func(df[col1], df[col2])
+                corr_matrix.loc[col1, col2] = corr
+                pval_matrix.loc[col1, col2] = pval
+
+    return corr_matrix, pval_matrix
+
 def use_pd(
     func_name="excel",
     verbose=True,
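Usage sketch for the new df_corr (column names hypothetical):

    corr, pvals = df_corr(df[["height", "weight", "age"]], method="spearman")
    corr[pvals < 0.05]  # keep coefficients with p < 0.05; the rest become NaN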
@@ -7927,3 +8740,135 @@ def use_pd(
     except Exception as e:
         if verbose:
             print(e)
+
+def get_phone(phone_number: str, region: str = None, verbose=True):
+    """
+    usage:
+        info = get_phone(15237654321, "DE")
+        preview(info)
+
+    Extremely advanced phone number analysis function.
+
+    Args:
+        phone_number (str): The phone number to analyze.
+        region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+    Returns:
+        dict: Comprehensive information about the phone number.
+    """
+    import phonenumbers
+    from phonenumbers import geocoder, carrier, timezone, number_type
+    from datetime import datetime
+    import pytz
+    from tzlocal import get_localzone
+
+    if not isinstance(phone_number, str):
+        phone_number = str(phone_number)
+    if isinstance(region, str):
+        region = region.upper()
+
+    try:
+        # Parse the phone number
+        parsed_number = phonenumbers.parse(phone_number, region)
+
+        # Validate the phone number
+        valid = phonenumbers.is_valid_number(parsed_number)
+        possible = phonenumbers.is_possible_number(parsed_number)
+
+        if not valid:
+            suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+            return {
+                "valid": False,
+                "error": "Invalid phone number",
+                "suggested_fix": suggested_fix,
+            }
+
+        # Basic details
+        formatted_international = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+        )
+        formatted_national = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+        )
+        formatted_e164 = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.E164
+        )
+        country_code = parsed_number.country_code
+        region_code = geocoder.region_code_for_number(parsed_number)
+        country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+        location = geocoder.description_for_number(parsed_number, "en")
+        carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+        time_zones = timezone.time_zones_for_number(parsed_number)[0]
+        current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+            "%Y-%m-%d %H:%M:%S %Z"
+        )
+        number_type_str = {
+            phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+            phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+            phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+            phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+            phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+            phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+            phonenumbers.PhoneNumberType.VOIP: "VOIP",
+            phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+            phonenumbers.PhoneNumberType.PAGER: "Pager",
+            phonenumbers.PhoneNumberType.UAN: "UAN",
+            phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+        }.get(number_type(parsed_number), "Unknown")
+
+        # Advanced Features
+        is_toll_free = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+        )
+        is_premium_rate = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+        )
+
+        # Dialing Information
+        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+        # Advanced Timezone Handling
+        gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds() / 3600
+        # Get the local timezone (current computer's time)
+        local_timezone = get_localzone()
+        # local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+        offset_diff = local_offset - gmt_offsets
+        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+        res = {
+            "valid": True,
+            "possible": possible,
+            "formatted": {
+                "international": formatted_international,
+                "national": formatted_national,
+                "e164": formatted_e164,
+            },
+            "country_code": country_code,
+            "country_name": country_name,
+            "region_code": region_code,
+            "location": location if location else "Unknown",
+            "carrier": carrier_name,
+            "time_zone": time_zones,
+            "current_times": current_times,
+            "local_offset": f"{local_offset} utcoffset",
+            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+            "number_type": number_type_str,
+            "is_toll_free": is_toll_free,
+            "is_premium_rate": is_premium_rate,
+            "dialing_instructions": dialing_instructions,
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "logs": {
+                "number_analysis_completed": datetime.now().strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                ),
+                "raw_input": phone_number,
+                "parsed_number": str(parsed_number),
+            },
+        }
+
+    except phonenumbers.NumberParseException as e:
+        res = {"valid": False, "error": str(e)}
+    if verbose:
+        preview(res)
+    return res
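Usage sketch mirroring the docstring (phonenumbers, pytz and tzlocal must be importable; the number is made up):

    info = get_phone(15237654321, "DE", verbose=False)
    info["formatted"]["e164"], info["number_type"], info["time_zone_diff"]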