rcsb-embedding-model 0.0.31__tar.gz → 0.0.33__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rcsb-embedding-model might be problematic. Click here for more details.

Files changed (50)
  1. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/PKG-INFO +1 -1
  2. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/pyproject.toml +1 -1
  3. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/dataset/esm_prot_from_chain.py +9 -3
  4. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/utils/data.py +4 -6
  5. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/utils/structure_parser.py +39 -23
  6. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/.dockerignore +0 -0
  7. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/.github/workflows/_workflow-docker.yaml +0 -0
  8. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/.github/workflows/publish.yaml +0 -0
  9. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/.gitignore +0 -0
  10. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/Dockerfile +0 -0
  11. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/LICENSE.md +0 -0
  12. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/README.md +0 -0
  13. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/assets/embedding-model-architecture.png +0 -0
  14. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/examples/esm_embeddings.py +0 -0
  15. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/__init__.py +0 -0
  16. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/cli/args_utils.py +0 -0
  17. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/cli/inference.py +0 -0
  18. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/dataset/esm_prot_from_structure.py +0 -0
  19. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/dataset/resdiue_assembly_embedding_from_structure.py +0 -0
  20. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/dataset/residue_assembly_embedding_from_tensor_file.py +0 -0
  21. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/dataset/residue_embedding_from_structure.py +0 -0
  22. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/dataset/residue_embedding_from_tensor_file.py +0 -0
  23. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/inference/assembly_inferece.py +0 -0
  24. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/inference/chain_inference.py +0 -0
  25. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/inference/esm_inference.py +0 -0
  26. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/inference/structure_inference.py +0 -0
  27. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/model/layers.py +0 -0
  28. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/model/residue_embedding_aggregator.py +0 -0
  29. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/modules/chain_module.py +0 -0
  30. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/modules/esm_module.py +0 -0
  31. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/modules/structure_module.py +0 -0
  32. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/rcsb_structure_embedding.py +0 -0
  33. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/types/api_types.py +0 -0
  34. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/utils/model.py +0 -0
  35. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/utils/structure_provider.py +0 -0
  36. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/src/rcsb_embedding_model/writer/batch_writer.py +0 -0
  37. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/embeddings/1acb.A.pt +0 -0
  38. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/embeddings/1acb.B.pt +0 -0
  39. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/embeddings/2uzi.A.pt +0 -0
  40. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/embeddings/2uzi.B.pt +0 -0
  41. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/embeddings/2uzi.C.pt +0 -0
  42. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/pdb/1acb.cif +0 -0
  43. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/pdb/2uzi.cif +0 -0
  44. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/src_stream/assembly-complete-test.csv +0 -0
  45. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/src_stream/instance-complete-test.csv +0 -0
  46. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/resources/src_stream/instance.csv +0 -0
  47. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/test_cli_inference.py +0 -0
  48. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/test_embedding_model.py +0 -0
  49. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/test_inference.py +0 -0
  50. {rcsb_embedding_model-0.0.31 → rcsb_embedding_model-0.0.33}/tests/test_remote_inference.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb-embedding-model
3
- Version: 0.0.31
3
+ Version: 0.0.33
4
4
  Summary: Protein Embedding Model for Structure Search
5
5
  Project-URL: Homepage, https://github.com/rcsb/rcsb-embedding-model
6
6
  Project-URL: Issues, https://github.com/rcsb/rcsb-embedding-model/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "rcsb-embedding-model"
3
- version = "0.0.31"
3
+ version = "0.0.33"
4
4
  authors = [
5
5
  { name="Joan Segura", email="joan.segura@rcsb.org" },
6
6
  ]
@@ -11,7 +11,7 @@ import pandas as pd
11
11
 
12
12
  from rcsb_embedding_model.types.api_types import StructureFormat, StructureLocation, SrcLocation
13
13
  from rcsb_embedding_model.utils.data import stringio_from_url
14
- from rcsb_embedding_model.utils.structure_parser import rename_atom_ch
14
+ from rcsb_embedding_model.utils.structure_parser import rename_atom_ch, check_all_hetero, remove_hetero
15
15
  from rcsb_embedding_model.utils.structure_provider import StructureProvider
16
16
 
17
17
 
@@ -72,8 +72,14 @@ class EsmProtFromChain(Dataset):
72
72
  for atom_ch in chain_iter(structure):
73
73
  if len(atom_ch) == 0:
74
74
  raise IOError(f"No atoms were found in structure chain {src_name}.{chain_id}")
75
- protein_chain = ProteinChain.from_atomarray(rename_atom_ch(atom_ch))
76
- return ESMProtein.from_protein_chain(protein_chain), item_name
75
+ if check_all_hetero(atom_ch):
76
+ atom_ch = remove_hetero(atom_ch)
77
+ atom_ch = rename_atom_ch(atom_ch)
78
+ protein_chain = ProteinChain.from_atomarray(atom_ch)
79
+ protein_chain = ESMProtein.from_protein_chain(protein_chain)
80
+ if len(protein_chain) == 0:
81
+ raise IOError(f"No atoms were found in structure chain {src_name}.{chain_id}")
82
+ return protein_chain, item_name
77
83
  raise IOError(f"No atoms were found in structure chain {src_name}.{chain_id}")
78
84
 
79
85
 
@@ -4,6 +4,7 @@ import gzip
4
4
  from io import StringIO, BytesIO
5
5
 
6
6
  import torch
7
+ from requests import RequestException
7
8
 
8
9
 
9
10
  def collate_seq_embeddings(batch_list):
@@ -52,13 +53,10 @@ def stringio_from_url(url):
52
53
  return StringIO(f.read())
53
54
  else:
54
55
  return StringIO(response.text)
55
- except requests.exceptions.RequestException as e:
56
- print(f"Error fetching URL: {e}")
57
- return None
56
+ except RequestException as e:
57
+ raise RuntimeError(f"Error fetching URL: {e}")
58
58
  except (OSError, gzip.BadGzipFile) as e:
59
- print(f"Error decompressing gzip file: {e}")
60
- return None
61
-
59
+ raise RuntimeError(f"Error decompressing gzip file: {e}")
62
60
 
63
61
 
64
62
  def concatenate_tensors(file_list, max_residues, dim=0):
@@ -9,17 +9,20 @@ def get_structure_from_src(
9
9
  chain_id=None,
10
10
  assembly_id=None
11
11
  ):
12
- if structure_format == "pdb":
13
- pdb_file = PDBFile.read(src_structure)
14
- structure = __get_pdb_structure(pdb_file, assembly_id)
15
- elif structure_format == "mmcif":
16
- cif_file = CIFFile.read(src_structure)
17
- structure = __get_structure(cif_file, assembly_id)
18
- elif structure_format == "binarycif":
19
- cif_file = BinaryCIFFile.read(src_structure)
20
- structure = __get_structure(cif_file, assembly_id)
21
- else:
22
- raise RuntimeError(f"Unknown file format {structure_format}")
12
+ try:
13
+ if structure_format == "pdb":
14
+ pdb_file = PDBFile.read(src_structure)
15
+ structure = __get_pdb_structure(pdb_file, assembly_id)
16
+ elif structure_format == "mmcif":
17
+ cif_file = CIFFile.read(src_structure)
18
+ structure = __get_structure(cif_file, assembly_id)
19
+ elif structure_format == "binarycif":
20
+ cif_file = BinaryCIFFile.read(src_structure)
21
+ structure = __get_structure(cif_file, assembly_id)
22
+ else:
23
+ raise RuntimeError(f"Unknown file format {structure_format}")
24
+ except Exception as e:
25
+ raise RuntimeError(f"Error reading structure from {src_structure}: {e}")
23
26
 
24
27
  if chain_id is not None:
25
28
  return structure[structure.chain_id == chain_id]
@@ -38,26 +41,39 @@ def get_protein_chains(structure, min_res_n=0):
38
41
 
39
42
 
40
43
  def get_assemblies(structure, structure_format="mmcif"):
41
- if structure_format == "pdb":
42
- return tuple(list_pdb_assemblies(PDBFile.read(structure)))
43
- elif structure_format == "mmcif":
44
- return tuple(list_assemblies(CIFFile.read(structure)).keys())
45
- elif structure_format == "binarycif":
46
- return tuple(list_assemblies(BinaryCIFFile.read(structure)))
47
- else:
48
- raise RuntimeError(f"Unknown file format {structure_format}")
44
+ try:
45
+ if structure_format == "pdb":
46
+ return tuple(list_pdb_assemblies(PDBFile.read(structure)))
47
+ elif structure_format == "mmcif":
48
+ return tuple(list_assemblies(CIFFile.read(structure)).keys())
49
+ elif structure_format == "binarycif":
50
+ return tuple(list_assemblies(BinaryCIFFile.read(structure)))
51
+ else:
52
+ raise RuntimeError(f"Unknown file format {structure_format}")
53
+ except Exception as e:
54
+ raise RuntimeError(f"Error reading assemblies from {structure}: {e}")
49
55
 
50
56
 
51
57
  def rename_atom_ch(atom_ch, ch="A"):
52
58
  renamed_atom_ch = AtomArray(len(atom_ch))
53
- n = 0
54
- for atom in atom_ch:
59
+ for idx, atom in enumerate(atom_ch):
55
60
  atom.chain_id = ch
56
- renamed_atom_ch[n] = atom
57
- n += 1
61
+ renamed_atom_ch[idx] = atom
58
62
  return renamed_atom_ch
59
63
 
60
64
 
65
+ def remove_hetero(atom_ch):
66
+ renamed_atom_ch = AtomArray(len(atom_ch))
67
+ for idx, atom in enumerate(atom_ch):
68
+ atom.hetero = False
69
+ renamed_atom_ch[idx] = atom
70
+ return renamed_atom_ch
71
+
72
+
73
+ def check_all_hetero(atom_ch):
74
+ return sum(atom_ch.hetero) == len(atom_ch)
75
+
76
+
61
77
  def __get_pdb_structure(pdb_file, assembly_id=None):
62
78
  return get_pdb_structure(
63
79
  pdb_file,