py2ls 0.2.4.5__py3-none-any.whl → 0.2.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -60,6 +60,7 @@ except NameError:
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     Add the Chinese (default) font to the font manager
+    so that Chinese characters display correctly in matplotlib figures
     Args:
         dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
     """
@@ -554,14 +555,28 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
     # Get elements that appear in at least n_shared lists
     shared_elements = [item for item, count in element_count.items() if count >= n_shared]
 
-    shared_elements = flatten(shared_elements, verbose=verbose)
+    shared_elements = flatten(shared_elements, verbose=verbose)
     if verbose:
         elements2show = shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
     print("********* checking shared elements *********")
     return shared_elements
 
- def flatten(nested: Any, unique_list=True, verbose=True):
+ def not_shared(*args, strict=True, n_shared=2, verbose=False):
+     """
+     Find the elements of list1 that are not shared with list2, keeping list1's original order.
+     usage:
+         list1 = [1, 8, 3, 3, 4, 5]
+         list2 = [4, 5, 6, 7, 8]
+         not_shared(list1, list2)  # output: [1, 3]
+     """
+     _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
+     list1 = args[0]
+     _not_shared = [item for item in list1 if item not in _common]
+     return flatten(_not_shared, verbose=verbose)
+
+
+ def flatten(nested: Any, unique_list=True, verbose=False):
     """
     Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
     Parameters:
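A minimal usage sketch of the new helper, taken from the docstring above (import path assumed; the exact ordering returned by shared() depends on its counting logic, so only the documented not_shared result is shown):

    from py2ls.ips import not_shared, flatten   # assumed import path

    list1 = [1, 8, 3, 3, 4, 5]
    list2 = [4, 5, 6, 7, 8]
    not_shared(list1, list2)   # -> [1, 3], list1's original order preserved
    flatten([[1, [2, 3]], (4, 5)], unique_list=True)   # nested structure -> flat list of unique items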
@@ -589,7 +604,7 @@ def flatten(nested: Any, unique_list=True, verbose=True):
     else:
         return flattened_list
 
- def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
+ def strcmp(search_term, candidates, ignore_case=True, get_rank=False, verbose=False, scorer="WR"):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
 
@@ -623,6 +638,11 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
             similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
         else:
             similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
+        if get_rank:
+            idx = [similarity_scores.index(i) for i in sorted(similarity_scores, reverse=True)]
+            if verbose:
+                display([candidates[ii] for ii in idx])
+            return [candidates[ii] for ii in idx]
         best_match_index = similarity_scores.index(max(similarity_scores))
         best_match_score = similarity_scores[best_match_index]
     else:
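A hedged sketch of the new get_rank option (import path assumed; the default scorer "WR" uses fuzz.WRatio as shown above):

    from py2ls.ips import strcmp   # assumed import path

    candidates = ["apple", "apply", "ample", "maple"]
    strcmp("appel", candidates)                  # default behaviour: best single match (see function body)
    strcmp("appel", candidates, get_rank=True)   # new: all candidates ordered by similarity, best first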
@@ -1554,13 +1574,26 @@ def unzip(dir_path, output_dir=None):
            tar_ref.extractall(output_dir)
        return output_dir
    # Handle .gz files
-    if dir_path.endswith(".gz"):
+    if dir_path.endswith(".gz") or dir_path.endswith(".gzip"):
        import gzip
 
        output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
-        with gzip.open(dir_path, "rb") as gz_file:
-            with open(output_file, "wb") as out_file:
-                shutil.copyfileobj(gz_file, out_file)
+        try:
+            with gzip.open(dir_path, "rb") as gz_file:
+                with open(output_file, "wb") as out_file:
+                    shutil.copyfileobj(gz_file, out_file)
+            print(f"unzipped '{dir_path}' to '{output_file}'")
+        except FileNotFoundError:
+            print(f"Error: The file '{dir_path}' was not found.")
+        except PermissionError:
+            print(f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'.")
+        except Exception as e:
+            try:
+                import tarfile
+                with tarfile.open(dir_path, "r:gz") as tar:
+                    tar.extractall(path=output_file)
+            except Exception as final_e:
+                print(f"A final unexpected error occurred: {final_e}")
        return output_file
 
    # Handle .zip files
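A short, hedged usage sketch of the extended .gz/.gzip handling (paths are placeholders):

    from py2ls.ips import unzip   # assumed import path

    unzip("data/table.csv.gz")       # writes data/table.csv and returns that path
    unzip("data/archive.tar.gzip")   # the except branch falls back to tarfile extraction if plain gunzip fails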
@@ -1642,9 +1675,12 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
    False: normal
 
    """
+    if not isinstance(df, pd.DataFrame):
+        return False
+    df.columns = df.columns.astype(str)  # cast column names to str so the counting checks below work
    # Initialize a list to hold messages about abnormalities
    messages = []
-    is_abnormal = False
+    is_abnormal = True
    # Check the shape of the DataFrame
    actual_shape = df.shape
    messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1739,10 +1775,12 @@ def fload(fpath, kind=None, **kwargs):
        content = file.read()
        return content
 
-    def load_html(fpath):
-        with open(fpath, "r") as file:
-            content = file.read()
-        return content
+    # def load_html(fpath):
+    #     with open(fpath, "r") as file:
+    #         content = file.read()
+    #     return content
+    def load_html(fpath, **kwargs):
+        return pd.read_html(fpath, **kwargs)
 
    def load_json(fpath, **kwargs):
        output = kwargs.pop("output", "json")
@@ -1956,8 +1994,8 @@ def fload(fpath, kind=None, **kwargs):
                # display(df.head(2))
                # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
                if not is_df_abnormal(df, verbose=0):
-                    display(df.head(2))
-                    print(f"shape: {df.shape}")
+                    display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                    print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
                    return df
            except EmptyDataError as e:
                continue
@@ -1981,6 +2019,42 @@ def fload(fpath, kind=None, **kwargs):
            pass
        return df
 
+
+    def load_parquet(fpath, **kwargs):
+        """
+        Load a Parquet file into a Pandas DataFrame with advanced options.
+
+        Parameters (optional ones are passed via **kwargs):
+        - fpath (str): The file path to the Parquet file.
+        - engine (str): The engine used to read the Parquet file (default 'pyarrow').
+        - columns (list): List of columns to load; if None, loads all columns.
+        - verbose (bool): If True, prints additional information about the loading process.
+        - filters (list): List of filter conditions for predicate pushdown.
+        - **kwargs: Additional keyword arguments for `pd.read_parquet`.
+
+        Returns:
+        - df (DataFrame): The loaded DataFrame, or None if loading failed.
+        """
+
+        engine = kwargs.pop("engine", "pyarrow")
+        verbose = kwargs.pop("verbose", False)
+
+        if verbose:
+            use_pd("read_parquet", verbose=verbose)
+        try:
+            df = pd.read_parquet(fpath, engine=engine, **kwargs)
+            if verbose:
+                if "columns" in kwargs:
+                    print(f"Loaded columns: {kwargs['columns']}")
+                else:
+                    print("Loaded all columns.")
+                print(f"shape: {df.shape}")
+        except Exception as e:
+            print(f"An error occurred while loading the Parquet file: {e}")
+            df = None
+
+        return df
+
    def load_ipynb(fpath, **kwargs):
        as_version = kwargs.get("as_version", 4)
        with open(fpath, "r") as file:
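A hedged usage sketch of the new Parquet loader as exposed through fload (import path and file names are placeholders; columns and filters are forwarded to pd.read_parquet):

    from py2ls.ips import fload   # assumed import path

    df = fload("results.parquet", verbose=True)                       # load everything
    df_sub = fload("results.parquet", columns=["id", "score"])        # only two columns
    df_flt = fload("results.parquet", filters=[("score", ">", 0.5)])  # predicate pushdown (pyarrow engine)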
@@ -2049,51 +2123,21 @@ def fload(fpath, kind=None, **kwargs):
    kind = kind.lower()
    kind = kind.lstrip(".").lower()
    img_types = [
-        "bmp",
-        "eps",
-        "gif",
-        "icns",
-        "ico",
-        "im",
-        "jpg",
-        "jpeg",
-        "jpeg2000",
-        "msp",
-        "pcx",
-        "png",
-        "ppm",
-        "sgi",
-        "spider",
-        "tga",
-        "tiff",
-        "tif",
-        "webp",
-        "json",
+        "bmp","eps","gif","png","jpg","jpeg","jpeg2000","tiff","tif",
+        "icns","ico","im","msp","pcx","ppm","sgi","spider","tga","webp",
    ]
    doc_types = [
-        "docx",
-        "txt",
-        "md",
-        "html",
-        "json",
-        "yaml",
-        "xml",
-        "csv",
-        "xlsx",
-        "pdf",
+        "docx","pdf",
+        "txt","csv","xlsx","tsv","parquet","snappy",
+        "md","html",
+        "json","yaml","xml",
        "ipynb",
+        "mtx"
    ]
    zip_types = [
-        "gz",
-        "zip",
-        "7z",
-        "tar",
-        "tar.gz",
-        "tar.bz2",
-        "bz2",
-        "xz",
-        "rar",
-        "tgz",
+        "gz","zip","7z","rar","tgz",
+        "tar","tar.gz","tar.bz2",
+        "bz2","xz","gzip"
    ]
    other_types = ["fcs"]
    supported_types = [*doc_types, *img_types, *zip_types, *other_types]
@@ -2122,14 +2166,14 @@ def fload(fpath, kind=None, **kwargs):
    elif kind == "txt" or kind == "md":
        return load_txt_md(fpath)
    elif kind == "html":
-        return load_html(fpath)
+        return load_html(fpath, **kwargs)
    elif kind == "json":
-        return load_json(fpath)
+        return load_json(fpath, **kwargs)
    elif kind == "yaml":
        return load_yaml(fpath)
    elif kind == "xml":
        return load_xml(fpath)
-    elif kind == "csv":
+    elif kind in ["csv", "tsv"]:
        content = load_csv(fpath, **kwargs)
        return content
    elif kind in ["ods", "ods", "odt"]:
@@ -2140,14 +2184,25 @@ def fload(fpath, kind=None, **kwargs):
        engine = kwargs.get("engine", "xlrd")
        kwargs.pop("engine", None)
        content = load_excel(fpath, engine=engine, **kwargs)
+        print(f"shape: {content.shape}")
        display(content.head(3))
        return content
    elif kind == "xlsx":
        content = load_excel(fpath, **kwargs)
        display(content.head(3))
+        print(f"shape: {content.shape}")
+        return content
+    elif kind == "mtx":
+        from scipy.io import mmread
+        dat_mtx = mmread(fpath)
+        content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
+        display(content.head(3))
+        print(f"shape: {content.shape}")
        return content
    elif kind == "ipynb":
        return load_ipynb(fpath, **kwargs)
+    elif kind in ["parquet", "snappy"]:
+        return load_parquet(fpath, **kwargs)
    elif kind == "pdf":
        # print('usage:load_pdf(fpath, page="all", verbose=False)')
        return load_pdf(fpath, **kwargs)
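A hedged sketch of how the new dispatch branches are reached (import path and file names are placeholders):

    from py2ls.ips import fload   # assumed import path

    fload("data.tsv")              # now routed through the csv loader
    fload("matrix.mtx")            # scipy.io.mmread -> pd.DataFrame.sparse.from_spmatrix
    fload("table.parquet")         # load_parquet via pd.read_parquet
    fload("page.html", match="Results")   # kwargs are forwarded to pd.read_html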
@@ -2193,9 +2248,7 @@ def fload(fpath, kind=None, **kwargs):
        return meta, data
 
    else:
-        # try:
-        #     content = load_csv(fpath, **kwargs)
-        # except:
+        print("direct reading...")
        try:
            try:
                with open(fpath, "r", encoding="utf-8") as f:
@@ -2495,6 +2548,25 @@ def fsave(
        tree = etree.ElementTree(root)
        tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
 
+    def save_parquet(fpath: str, data: pd.DataFrame, **kwargs):
+        engine = kwargs.pop("engine", "auto")  # "auto" tries pyarrow first, then falls back to fastparquet; options: 'auto', 'pyarrow', 'fastparquet'
+        compression = kwargs.pop("compression", None)  # None means no compression; supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+        try:
+            # Attempt to save with "pyarrow" if engine is set to "auto"
+            data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
+            print(f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression.")
+        except Exception as e:
+            print(f"Error with engine '{engine}' and {compression} compression: {e}")
+            if "Sparse" in str(e):
+                try:
+                    # Handle sparse data by converting columns to dense
+                    print("Attempting to convert sparse columns to dense format...")
+                    data = data.apply(lambda x: x.sparse.to_dense() if pd.api.types.is_sparse(x) else x)
+                    save_parquet(fpath, data=data, **kwargs)
+                except Exception as last_e:
+                    print(f"After converting sparse columns to dense format, saving still failed with engine '{engine}' and {compression} compression: {last_e}")
+
+
    if kind is None:
        _, kind = os.path.splitext(fpath)
        kind = kind.lower()
@@ -2540,6 +2612,15 @@ def fsave(
        save_yaml(fpath, content, **kwargs)
    elif kind == "ipynb":
        save_ipynb(fpath, content, **kwargs)
+    elif kind.lower() in ["parquet", "pq", "big", "par"]:
+        compression = kwargs.pop("compression", None)  # None means no compression; supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+        # normalize the file extension to .parquet
+        if not ".parquet" in fpath:
+            fpath = fpath.replace(kind, "parquet")
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath = fpath + f".{compression}"
+        save_parquet(fpath=fpath, data=content, compression=compression, **kwargs)
    else:
        try:
            netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
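A hedged round-trip sketch for the new Parquet branch (import path assumed; pyarrow or fastparquet must be installed):

    import pandas as pd
    from py2ls.ips import fsave, fload   # assumed import path

    df = pd.DataFrame({"id": [1, 2, 3], "score": [0.1, 0.5, 0.9]})
    fsave("scores.parquet", df)        # dispatched to save_parquet (engine "auto", no compression)
    df_back = fload("scores.parquet")  # dispatched to load_parquet / pd.read_parquet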
@@ -3058,8 +3139,11 @@ def figsave(*args, dpi=300):
 
 def is_str_color(s):
     # Regular expression pattern for hexadecimal color codes
-    color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
-    return re.match(color_code_pattern, s) is not None
+    if isinstance(s, str):
+        color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
+        return re.match(color_code_pattern, s) is not None
+    else:
+        return True
 
 
 def is_num(s):
@@ -5509,7 +5593,21 @@ def df_reducer(
 ) -> pd.DataFrame:
     """
     Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
-
+    method:
+        1. 'umap':
+           - preferred for large datasets; it is fast and balances the preservation of
+             local and global structure, so it is a common default for visualization
+             and general dimensionality reduction.
+        2. 'pca':
+           - a linear projection that preserves global variance structure;
+           - useful as a preprocessing step and in datasets where linear relationships
+             dominate.
+        3. 't-SNE':
+           - excels at preserving local structure (i.e., clusters), but often loses
+             global relationships, so clusters can end up at arbitrary distances from
+             each other;
+           - therefore mainly used for visualization rather than as a general-purpose
+             preprocessing step.
     Parameters:
     -----------
     data : pd.DataFrame
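A heavily hedged call sketch; apart from `data`, the parameter names below (`columns`, `method`, `n_components`) are assumptions about df_reducer's signature and are not confirmed by this diff:

    from py2ls.ips import df_reducer   # assumed import path

    # hypothetical call: reduce selected numeric columns to 2 dimensions with UMAP
    embedding = df_reducer(data=df, columns=["x1", "x2", "x3"], method="umap", n_components=2)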
py2ls/mol.py ADDED
@@ -0,0 +1,289 @@
+ import os
+ import subprocess
+ from rdkit import Chem
+ from rdkit.Chem import AllChem, Draw
+ from openbabel import openbabel
+ import matplotlib.pyplot as plt
+ # import pymol2  # use the PyMOL API to display molecules
+
+ from typing import Any, Dict, Union, List
+
+ def load_mol(fpath: str) -> Union[Dict[str, Any], None]:
+     """
+     Master function to read various molecular structure files and return a consistent molecule dictionary.
+     Supports formats: .pdb, .mol, .sdf, .xyz, .gro, and others through RDKit, Pybel, MDAnalysis, and ASE.
+
+     Parameters:
+     - fpath (str): Path to the molecular file
+
+     Returns:
+     - mol_dict (Dict[str, Any]): Dictionary with molecule information:
+         - 'atoms': List of atom information dictionaries
+         - 'bonds': List of bond information dictionaries
+         - 'metadata': Metadata for the molecule (e.g., file name)
+     """
+     ext = os.path.splitext(fpath)[-1].lower()  # Get the file extension
+
+     def create_atom_dict(atom) -> Dict[str, Any]:
+         """Helper to create a consistent atom dictionary."""
+         return {
+             'element': atom.atomic_symbol,
+             'coords': atom.coords,
+             'index': atom.idx,
+             'charge': atom.formalcharge
+         }
+
+     def create_bond_dict(bond) -> Dict[str, Any]:
+         """Helper to create a consistent bond dictionary."""
+         return {
+             'start_atom_idx': bond.GetBeginAtomIdx(),
+             'end_atom_idx': bond.GetEndAtomIdx(),
+             'bond_type': bond.GetBondTypeAsDouble()
+         }
+
+     mol_dict = {
+         "atoms": [],
+         "bonds": [],
+         "metadata": {
+             "file_name": os.path.basename(fpath),
+             "format": ext
+         }
+     }
+
+     try:
+         # Handling with RDKit (for .mol and .sdf)
+         if ext in ['.mol', '.sdf']:
+             from rdkit import Chem
+             if ext == '.mol':
+                 mol = Chem.MolFromMolFile(fpath)
+                 if mol is None:
+                     raise ValueError("RDKit failed to parse the .mol file.")
+                 atoms = mol.GetAtoms()
+                 bonds = mol.GetBonds()
+             elif ext == '.sdf':
+                 supplier = Chem.SDMolSupplier(fpath)
+                 mol = next(supplier, None)
+                 if mol is None:
+                     raise ValueError("RDKit failed to parse the .sdf file.")
+                 atoms = mol.GetAtoms()
+                 bonds = mol.GetBonds()
+
+             # Populate atom and bond data
+             mol_dict["atoms"] = [
+                 {
+                     "element": atom.GetSymbol(),
+                     "coords": atom.GetOwningMol().GetConformer().GetAtomPosition(atom.GetIdx()),
+                     "index": atom.GetIdx(),
+                     "charge": atom.GetFormalCharge()
+                 }
+                 for atom in atoms
+             ]
+             mol_dict["bonds"] = [
+                 create_bond_dict(bond)
+                 for bond in bonds
+             ]
+
+         # Handling with Pybel (supports multiple formats: .pdb, .mol, .xyz, etc.)
+         elif ext in ['.pdb', '.mol', '.xyz', '.sdf']:
+             from openbabel import pybel
+
+             mol = next(pybel.readfile(ext[1:], fpath), None)
+             if mol is None:
+                 raise ValueError("Pybel failed to parse the file.")
+             # Populate atom and bond data
+             mol_dict["atoms"] = [
+                 {
+                     "element": atom.type,
+                     "coords": atom.coords,
+                     "index": atom.idx,
+                     "charge": atom.partialcharge
+                 }
+                 for atom in mol.atoms
+             ]
+             mol_dict["bonds"] = [
+                 {
+                     "start_atom_idx": bond.GetBeginAtomIdx(),
+                     "end_atom_idx": bond.GetEndAtomIdx(),
+                     "bond_type": bond.GetBondOrder()
+                 }
+                 for bond in openbabel.OBMolBondIter(mol.OBMol)
+             ]
+
+         # Handling with MDAnalysis (for .pdb, .gro, and trajectory files)
+         elif ext in ['.pdb', '.gro', '.xyz', '.xtc', '.dcd', '.trr']:
+             import MDAnalysis as mda
+             u = mda.Universe(fpath)
+             atoms = u.atoms
+             mol_dict["atoms"] = [
+                 {
+                     "element": atom.name,
+                     "coords": atom.position,
+                     "index": atom.id,
+                     "charge": atom.charge if hasattr(atom, 'charge') else None
+                 }
+                 for atom in atoms
+             ]
+             mol_dict["bonds"] = [
+                 {"start_atom_idx": bond[0], "end_atom_idx": bond[1], "bond_type": 1}
+                 for bond in u.bonds.indices
+             ]
+
+         # Handling with ASE (for .xyz, .pdb, and other atomic structure formats)
+         elif ext in ['.xyz', '.pdb', '.vasp', '.cif']:
+             from ase.io import read as ase_read
+             atoms = ase_read(fpath)
+             mol_dict["atoms"] = [
+                 {
+                     "element": atom.symbol,
+                     "coords": atom.position,
+                     "index": i,
+                     "charge": None
+                 }
+                 for i, atom in enumerate(atoms)
+             ]
+             # ASE does not explicitly support bonds by default, so bonds are not populated here.
+
+         else:
+             raise ValueError(f"Unsupported file extension: {ext}")
+
+     except Exception as e:
+         print(f"Error loading molecule from {fpath}: {e}")
+         return None
+
+     return mol_dict
+
+ class DockingConfig:
+     def __init__(self, receptor_file, ligand_smiles_list, center=(0, 0, 0), size=(20, 20, 20), output_dir="docking_results"):
+         self.receptor_file = receptor_file
+         self.ligand_smiles_list = ligand_smiles_list
+         self.center = center
+         self.size = size
+         self.output_dir = output_dir
+         os.makedirs(output_dir, exist_ok=True)
+
+ def mol_to_pdbqt(mol, output_file):
+     """Converts an RDKit Mol object to PDBQT format."""
+     obConversion = openbabel.OBConversion()
+     obConversion.SetInAndOutFormats("mol", "pdbqt")
+     obMol = openbabel.OBMol()
+     obConversion.ReadString(obMol, Chem.MolToMolBlock(mol))
+     obConversion.WriteFile(obMol, output_file)
+
+ def prepare_ligand(smiles, ligand_id):
+     """Prepare the ligand file in PDBQT format."""
+     mol = Chem.MolFromSmiles(smiles)
+     mol = Chem.AddHs(mol)
+     AllChem.EmbedMolecule(mol)
+     AllChem.UFFOptimizeMolecule(mol)
+     ligand_file = f"ligand_{ligand_id}.pdbqt"
+     mol_to_pdbqt(mol, ligand_file)
+     return ligand_file
+
+ def run_docking(receptor_file, ligand_file, output_file, center, size):
+     """Runs Vina docking using the receptor and ligand files."""
+     vina_command = [
+         "vina",
+         "--receptor", receptor_file,
+         "--ligand", ligand_file,
+         "--center_x", str(center[0]),
+         "--center_y", str(center[1]),
+         "--center_z", str(center[2]),
+         "--size_x", str(size[0]),
+         "--size_y", str(size[1]),
+         "--size_z", str(size[2]),
+         "--out", output_file,
+         "--log", output_file.replace(".pdbqt", ".log")
+     ]
+     subprocess.run(vina_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ def parse_vina_output(output_file):
+     """Parses the Vina output log file to extract docking scores."""
+     scores = []
+     with open(output_file.replace(".pdbqt", ".log"), 'r') as f:
+         for line in f:
+             if line.startswith("REMARK VINA RESULT"):
+                 score = float(line.split()[3])
+                 scores.append(score)
+     return scores
+
+ def docking_master_function(config: DockingConfig):
+     """Master function to run molecular docking for multiple ligands."""
+     receptor_pdbqt = config.receptor_file
+     results = {}
+
+     for i, smiles in enumerate(config.ligand_smiles_list):
+         ligand_file = prepare_ligand(smiles, ligand_id=i)
+         output_file = os.path.join(config.output_dir, f"docked_ligand_{i}.pdbqt")
+
+         # Run docking for each ligand
+         run_docking(
+             receptor_file=receptor_pdbqt,
+             ligand_file=ligand_file,
+             output_file=output_file,
+             center=config.center,
+             size=config.size
+         )
+
+         # Parse docking results and store them
+         scores = parse_vina_output(output_file)
+         results[smiles] = scores
+         print(f"Ligand {i} (SMILES: {smiles}) docking scores: {scores}")
+
+         # Visualize individual docking result
+         visualize_docking(config.receptor_file, output_file, f"{config.output_dir}/ligand_{i}_visualization.png")
+
+         # Clean up intermediate files
+         os.remove(ligand_file)
+
+     # Plot binding affinity distribution
+     plot_binding_affinities(results, f"{config.output_dir}/binding_affinities.png")
+     return results
+
+ def visualize_docking(receptor_file, ligand_file, dir_save):
+     """Generates a 2D visualization of the docking result using RDKit and Matplotlib."""
+     # Load the receptor and ligand molecules
+     receptor = Chem.MolFromPDBFile(receptor_file, removeHs=False)
+     ligand = Chem.MolFromPDBFile(ligand_file, removeHs=False)
+
+     # Draw the receptor and ligand
+     img = Draw.MolToImage(receptor, size=(300, 300))
+     img_ligand = Draw.MolToImage(ligand, size=(300, 300))
+
+     # Save images
+     img.save(dir_save.replace('.png', '_receptor.png'))
+     img_ligand.save(dir_save.replace('.png', '_ligand.png'))
+
+     print(f"Saved 2D visualizations to {dir_save.replace('.png', '_receptor.png')} and {dir_save.replace('.png', '_ligand.png')}")
+
+
+ def plot_binding_affinities(results, dir_save):
+     """Plots binding affinities for all ligands."""
+     ligands = list(results.keys())
+     affinities = [min(scores) for scores in results.values()]  # Minimum binding affinity per ligand
+
+     plt.figure(figsize=(10, 6))
+     plt.barh(ligands, affinities, color="skyblue")
+     plt.xlabel("Binding Affinity (kcal/mol)")
+     plt.ylabel("Ligands (SMILES)")
+     plt.title("Binding Affinities of Different Ligands")
+     plt.gca().invert_yaxis()
+     plt.tight_layout()
+     plt.savefig(dir_save)
+     plt.show()
+     print(f"Saved binding affinity plot to {dir_save}")
+
+ # Example usage
+ if __name__ == "__main__":
+     # configuration
+     receptor_file = "receptor.pdbqt"
+     ligand_smiles_list = ["CCO", "CCC", "CCN"]  # example ligand SMILES list
+     docking_config = DockingConfig(
+         receptor_file=receptor_file,
+         ligand_smiles_list=ligand_smiles_list,
+         center=(10, 10, 10),  # assumed docking center
+         size=(20, 20, 20)     # assumed docking box size
+     )
+
+     # run the master function
+     docking_results = docking_master_function(docking_config)
+     print("Final docking results:", docking_results)