biotite 1.5.0__cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,963 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.database.rcsb"
|
|
6
|
+
__author__ = "Patrick Kunzmann, Maximilian Dombrowsky"
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Query",
|
|
9
|
+
"SingleQuery",
|
|
10
|
+
"CompositeQuery",
|
|
11
|
+
"BasicQuery",
|
|
12
|
+
"FieldQuery",
|
|
13
|
+
"SequenceQuery",
|
|
14
|
+
"StructureQuery",
|
|
15
|
+
"MotifQuery",
|
|
16
|
+
"Sorting",
|
|
17
|
+
"Grouping",
|
|
18
|
+
"DepositGrouping",
|
|
19
|
+
"IdentityGrouping",
|
|
20
|
+
"UniprotGrouping",
|
|
21
|
+
"search",
|
|
22
|
+
"count",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
import abc
|
|
26
|
+
import copy
|
|
27
|
+
import json
|
|
28
|
+
from datetime import datetime
|
|
29
|
+
import numpy as np
|
|
30
|
+
import requests
|
|
31
|
+
from biotite.database.error import RequestError
|
|
32
|
+
from biotite.sequence.seqtypes import NucleotideSequence
|
|
33
|
+
|
|
34
|
+
_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
|
|
35
|
+
_scope_to_target = {
|
|
36
|
+
"protein": "pdb_protein_sequence",
|
|
37
|
+
"rna": "pdb_rna_sequence",
|
|
38
|
+
"dna": "pdb_dna_sequence",
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Query(metaclass=abc.ABCMeta):
    """
    A representation of a JSON query for the RCSB search API.

    This is the abstract base class for all queries.

    Two queries can be combined into a :class:`CompositeQuery` using
    the ``&`` (logical *and*) and ``|`` (logical *or*) operators.
    """

    @abc.abstractmethod
    def get_content(self):
        """
        Get the query content, i.e. the data belonging to the
        ``'query'`` attribute in the RCSB search API.

        This content is converted into JSON by the :func:`search`
        and :func:`count` functions.

        Returns
        -------
        content : dict
            The content dictionary for the ``'query'`` attributes.
        """
        pass

    def __and__(self, query):
        # Let Python raise the usual 'unsupported operand' TypeError
        # right away, instead of building a composite that would only
        # fail later when its content is requested
        if not isinstance(query, Query):
            return NotImplemented
        return CompositeQuery([self, query], "and")

    def __or__(self, query):
        # See '__and__()' for the reason of this check
        if not isinstance(query, Query):
            return NotImplemented
        return CompositeQuery([self, query], "or")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class SingleQuery(Query, metaclass=abc.ABCMeta):
    """
    A terminal query node for the RCSB search API.

    Multiple :class:`SingleQuery` objects can be combined to
    :class:`CompositeQuery` objects using the ``|`` and ``&`` operators.

    This is the abstract base class for all queries that are
    terminal nodes.
    """

    @abc.abstractmethod
    def get_content(self):
        """
        Get the skeleton of the query content.

        Although this method is abstract, it provides an implementation
        that subclasses call via ``super()`` and then fill with their
        own entries.

        Returns
        -------
        content : dict
            A new dictionary containing only an empty ``'parameters'``
            dictionary.
        """
        skeleton = {"parameters": {}}
        return skeleton
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class CompositeQuery(Query):
    """
    A group query node for the RCSB search API.

    A composite query is a combination of other queries, combined
    either with the `'and'` or `'or'` operator.
    Usually, a :class:`CompositeQuery` will not be created by calling
    its constructor, but by combining queries using the ``|`` or ``&``
    operator.

    Parameters
    ----------
    queries : iterable object of Query
        The queries to be combined.
    operator : {'or', 'and'}
        The type of combination.

    Raises
    ------
    ValueError
        If `operator` is neither ``'or'`` nor ``'and'``.
    """

    def __init__(self, queries, operator):
        if operator not in ("or", "and"):
            raise ValueError(f"Operator must be 'or' or 'and', not '{operator}'")
        # Materialize the iterable: a one-shot iterable (e.g. a
        # generator) would otherwise be exhausted by the first call of
        # 'get_content()', leaving all later calls with empty 'nodes'
        self._queries = list(queries)
        self._operator = operator

    def get_content(self):
        """
        A dictionary representation of the query.
        This dictionary is the content of the ``'query'`` key in the
        JSON query.

        Returns
        -------
        content : dict
            The dictionary representation of the query.
        """
        content = {
            "type": "group",
            "logical_operator": self._operator,
            # Each child query contributes its own content as node
            "nodes": [query.get_content() for query in self._queries],
        }
        return content
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class BasicQuery(SingleQuery):
    """
    A text query for searching for a given term across all available
    fields.

    Parameters
    ----------
    term : str
        The search term.
        If the term contains multiple words, the query will return
        results where the entire term is present.
        The matching is not case-sensitive.
        Logic combinations of terms is described
        `here <https://search.rcsb.org/#basic-queries>`_.

    Examples
    --------

    >>> query = BasicQuery("Miniprotein Construct")
    >>> print(sorted(search(query)))
    ['1L2Y']
    """

    def __init__(self, term):
        super().__init__()
        self._term = term

    def get_content(self):
        """
        Get the content of this full text query.

        Returns
        -------
        content : dict
            The content dictionary, where the term is wrapped in
            double quotes.
        """
        content = super().get_content()
        content.update(type="terminal", service="full_text")
        # The term is sent enclosed in double quotes
        quoted_term = f'"{self._term}"'
        content["parameters"]["value"] = quoted_term
        return content
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class FieldQuery(SingleQuery):
    """
    A text query for searching for values in a given field using the
    given operator.

    The operators are keyword arguments of the constructor and the
    search value is the value given to the respective parameter.
    The operators are mutually exclusive.
    If none is given, the search will return results where the given
    field exists.

    A :class:`FieldQuery` is negated using the ``~`` operator.

    Parameters
    ----------
    field : str
        The field to search in.
    molecular_definition : bool, optional
        If set true, this query searches in fields
        associated with
        `molecular definitions <https://search.rcsb.org/chemical-search-attributes.html>`_.
        If false (default), this query searches in fields
        associated with `PDB structures <https://search.rcsb.org/structure-search-attributes.html>`_.
    case_sensitive : bool, optional
        If set to true, searches are case sensitive.
        By default matching is case-insensitive.
    exact_match : str, optional
        Operator for returning results whose field exactly matches the
        given value.
    contains_words, contains_phrase : str, optional
        Operator for returning results whose field matches
        individual words from the given value or the value as exact
        phrase, respectively.
    greater, less, greater_or_equal, less_or_equal, equals : int or float or datetime, optional
        Operator for returning results whose field values are larger,
        smaller or equal to the given value.
    range, range_closed : tuple(int, int) or tuple(float, float) or tuple(datetime, datetime), optional
        Operator for returning results whose field matches values within
        the given range.
        `range_closed` includes the interval limits.
    is_in : tuple of str or list of str, optional
        Operator for returning results whose field matches any of the
        values in the given list.

    Notes
    -----
    A complete list of the available fields and their supported
    operators is documented at
    `<https://search.rcsb.org/structure-search-attributes.html>`_
    and
    `<https://search.rcsb.org/chemical-search-attributes.html>`_.

    Examples
    --------

    >>> query = FieldQuery("reflns.d_resolution_high", less_or_equal=0.6)
    >>> print(sorted(search(query)))
    ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H']
    """

    def __init__(
        self, field, molecular_definition=False, case_sensitive=False, **kwargs
    ):
        super().__init__()
        self._negation = False
        self._field = field
        self._mol_definition = molecular_definition
        self._case_sensitive = case_sensitive

        # The single keyword argument (if any) selects the operator
        if len(kwargs) > 1:
            raise TypeError("Only one operator must be given")
        if kwargs:
            ((operator, value),) = kwargs.items()
        else:
            # No operator is given
            operator, value = "exists", None

        supported_operators = (
            "exact_match",
            "contains_words",
            "contains_phrase",
            "greater",
            "less",
            "greater_or_equal",
            "less_or_equal",
            "equals",
            "range",
            "range_closed",
            "is_in",
            "exists",
        )
        if operator not in supported_operators:
            raise TypeError(
                f"Constructor got an unexpected keyword argument '{operator}'"
            )

        # Dates are converted into ISO 8601,
        # either the value itself or each element of a sequence value
        if isinstance(value, datetime):
            value = _to_isoformat(value)
        elif isinstance(value, (tuple, list, np.ndarray)):
            value = [
                _to_isoformat(element) if isinstance(element, datetime) else element
                for element in value
            ]

        if operator in ("range", "range_closed"):
            # Both variants use the 'range' operator name in the query
            # ('range_closed' exists for backwards compatibility)
            # and differ only in whether the limits are included
            include_limits = operator == "range_closed"
            value = {
                "from": value[0],
                "include_lower": include_limits,
                "to": value[1],
                "include_upper": include_limits,
            }
            operator = "range"
        elif operator == "is_in":
            # 'in' is not an available parameter name in Python
            operator = "in"

        self._operator = operator
        self._value = value

    def get_content(self):
        """
        Get the content of this field query.

        Returns
        -------
        content : dict
            The content dictionary.
            The ``'value'`` parameter is omitted, if no value is set.
        """
        content = super().get_content()
        content["type"] = "terminal"
        content["service"] = "text_chem" if self._mol_definition else "text"
        parameters = content["parameters"]
        parameters.update(
            attribute=self._field,
            operator=self._operator,
            negation=self._negation,
            case_sensitive=self._case_sensitive,
        )
        if self._value is not None:
            parameters["value"] = self._value
        return content

    def __invert__(self):
        # Negation gives a new query object, 'self' stays untouched
        negated = copy.deepcopy(self)
        negated._negation = not negated._negation
        return negated
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
class SequenceQuery(SingleQuery):
|
|
318
|
+
"""
|
|
319
|
+
A query for protein/DNA/RNA molecules with a sequence similar to a
|
|
320
|
+
given input sequence using
|
|
321
|
+
`MMseqs2 <https://github.com/soedinglab/mmseqs2>`_.
|
|
322
|
+
|
|
323
|
+
Parameters
|
|
324
|
+
----------
|
|
325
|
+
sequence : Sequence or str
|
|
326
|
+
The input sequence.
|
|
327
|
+
If `sequence` is a :class:`NucleotideSequence` and the `scope`
|
|
328
|
+
is ``'rna'``, ``'T'`` is automatically replaced by ``'U'``.
|
|
329
|
+
scope : {'protein', 'dna', 'rna'}
|
|
330
|
+
The type of molecule to find.
|
|
331
|
+
min_identity : float, optional
|
|
332
|
+
A match is only returned, if the sequence identity between
|
|
333
|
+
the match and the input sequence exceeds this value.
|
|
334
|
+
Must be between 0 and 1.
|
|
335
|
+
By default, the sequence identity is ignored.
|
|
336
|
+
max_expect_value : float, optional
|
|
337
|
+
A match is only returned, if the *expect value* (E-value) does
|
|
338
|
+
not exceed this value.
|
|
339
|
+
By default, the value is effectively ignored.
|
|
340
|
+
|
|
341
|
+
Notes
|
|
342
|
+
-----
|
|
343
|
+
*MMseqs2* is run on the RCSB servers.
|
|
344
|
+
|
|
345
|
+
Examples
|
|
346
|
+
--------
|
|
347
|
+
|
|
348
|
+
>>> sequence = "NLYIQWLKDGGPSSGRPPPS"
|
|
349
|
+
>>> query = SequenceQuery(sequence, scope="protein", min_identity=0.95)
|
|
350
|
+
>>> print(sorted(search(query)))
|
|
351
|
+
['1L2Y', '2LDJ', '9G22', '9G2N', '9G2O', '9G31', '9G32', '9GDL', '9GDN', '9GDT', '9GDU', '9GE1']
|
|
352
|
+
"""
|
|
353
|
+
|
|
354
|
+
def __init__(self, sequence, scope, min_identity=0.0, max_expect_value=10000000.0):
|
|
355
|
+
super().__init__()
|
|
356
|
+
self._target = _scope_to_target.get(scope.lower())
|
|
357
|
+
if self._target is None:
|
|
358
|
+
raise ValueError(f"'{scope}' is an invalid scope")
|
|
359
|
+
|
|
360
|
+
if isinstance(sequence, NucleotideSequence) and scope.lower() == "rna":
|
|
361
|
+
self._sequence = str(sequence).replace("T", "U")
|
|
362
|
+
else:
|
|
363
|
+
self._sequence = str(sequence)
|
|
364
|
+
|
|
365
|
+
self._min_identity = min_identity
|
|
366
|
+
self._max_expect_value = max_expect_value
|
|
367
|
+
|
|
368
|
+
def get_content(self):
    """
    Extend the content of the parent class by the attributes of a
    terminal ``'sequence'`` service node.

    Returns
    -------
    content : dict
        The content dictionary obtained from the parent class, with
        ``'type'``, ``'service'`` and the sequence search parameters
        filled in.
    """
    content = super().get_content()
    content.update(type="terminal", service="sequence")
    # Keyword order equals the insertion order of the parameters
    content["parameters"].update(
        value=self._sequence,
        target=self._target,
        identity_cutoff=self._min_identity,
        evalue_cutoff=self._max_expect_value,
    )
    return content
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
class MotifQuery(SingleQuery):
    """
    A query for protein/DNA/RNA molecules containing the given sequence
    motif.

    Parameters
    ----------
    pattern : str
        The sequence pattern.
    pattern_type : {'simple', 'prosite', 'regex'}
        The type of the pattern.
    scope : {'protein', 'dna', 'rna'}
        The type of molecule to find.
        The value is matched case-insensitively.

    Raises
    ------
    ValueError
        If `scope` has no entry in ``_scope_to_target``.

    Examples
    --------

    >>> query = MotifQuery(
    ...     "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H.",
    ...     "prosite",
    ...     "protein"
    ... )
    """

    def __init__(self, pattern, pattern_type, scope):
        super().__init__()
        self._pattern = pattern
        self._pattern_type = pattern_type
        self._target = _scope_to_target.get(scope.lower())
        # Fail early instead of sending 'target: null' to the server,
        # analogous to the check in SequenceQuery
        if self._target is None:
            raise ValueError(f"'{scope}' is an invalid scope")

    def get_content(self):
        """
        Extend the content of the parent class by the attributes of a
        terminal ``'seqmotif'`` service node.

        Returns
        -------
        content : dict
            The content dictionary obtained from the parent class, with
            ``'type'``, ``'service'`` and the motif search parameters
            filled in.
        """
        content = super().get_content()
        content["type"] = "terminal"
        content["service"] = "seqmotif"
        content["parameters"]["value"] = self._pattern
        content["parameters"]["pattern_type"] = self._pattern_type
        content["parameters"]["target"] = self._target
        return content
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class StructureQuery(SingleQuery):
    """
    A query for protein/DNA/RNA molecules with structural similarity
    to the query structure.

    Either the chain or assembly ID of the query structure must be
    specified.

    Parameters
    ----------
    pdb_id : str
        The PDB ID of the query structure.
    chain : str, optional
        The chain ID (more exactly ``asym_id``) of the query structure.
    assembly : str, optional
        The assembly ID (``assembly_id``) of the query structure.
    strict : bool, optional
        If true, structure comparison is strict, otherwise it is
        relaxed.

    Raises
    ------
    TypeError
        If neither or both of `chain` and `assembly` are given.

    Examples
    --------

    >>> query = StructureQuery("1L2Y", chain="A")
    >>> print(sorted(search(query)))
    ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2M7D', '7MQS', '9DPF']
    """

    def __init__(self, pdb_id, chain=None, assembly=None, strict=True):
        super().__init__()

        # Exactly one of `chain` and `assembly` must be given
        if (chain is None) == (assembly is None):
            raise TypeError("Either the chain ID or assembly ID must be set")

        if chain is None:
            # The key was formerly misspelled as 'asssembly_id'
            self._value = {"entry_id": pdb_id, "assembly_id": assembly}
        else:
            self._value = {"entry_id": pdb_id, "asym_id": chain}

        self._operator = "strict_shape_match" if strict else "relaxed_shape_match"

    def get_content(self):
        """
        Extend the content of the parent class by the attributes of a
        terminal ``'structure'`` service node.

        Returns
        -------
        content : dict
            The content dictionary obtained from the parent class, with
            ``'type'``, ``'service'``, the query structure and the
            comparison operator filled in.
        """
        content = super().get_content()
        content["type"] = "terminal"
        content["service"] = "structure"
        content["parameters"]["value"] = self._value
        content["parameters"]["operator"] = self._operator
        return content
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class Sorting:
    """
    The combination of a field name and a sort direction.

    Parameters
    ----------
    field : str
        The name of the field to sort by.
    descending : bool, optional
        If true, the direction is ``'desc'``, otherwise ``'asc'``.
    """

    def __init__(self, field, descending=True):
        self._field, self._descending = field, descending

    @property
    def field(self):
        return self._field

    @property
    def descending(self):
        return self._descending

    def get_content(self):
        """
        Build the sorting content, i.e. the data belonging to the
        ``'sort'`` and ``'ranking_criteria_type'`` attributes in the
        RCSB search API.

        The :func:`search` function converts this content into JSON.

        Returns
        -------
        content : dict
            The content dictionary for the ``'sort'`` and
            ``'ranking_criteria_type'`` attributes.
        """
        if self._descending:
            direction = "desc"
        else:
            direction = "asc"
        return dict(sort_by=self._field, direction=direction)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
class Grouping(metaclass=abc.ABCMeta):
    """
    Abstract base class for the JSON grouping options of the RCSB
    search API.

    Parameters
    ----------
    sort_by : str or Sorting, optional
        If specified, the returned PDB IDs within each group are sorted
        by the values of the given field name.
        A complete list of the available fields is documented at
        `<https://search.rcsb.org/structure-search-attributes.html>`_.
        and
        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
        If a string is given, sorting is performed in descending order.
        To choose the order a :class:`Sorting` object needs to be
        provided.
    """

    def __init__(self, sort_by=None):
        # ``None`` and ready-made Sorting objects are stored as they are,
        # everything else is taken as field name
        if sort_by is None or isinstance(sort_by, Sorting):
            self._sorting = sort_by
        else:
            self._sorting = Sorting(sort_by)

    @abc.abstractmethod
    def get_content(self):
        """
        Get the grouping content, i.e. the data belonging to the
        ``'group_by'`` attribute in the RCSB search API.

        The :func:`search` and :func:`count` functions convert this
        content into JSON.

        ABSTRACT: Override when inheriting.

        Returns
        -------
        content : dict
            The content dictionary for the ``'group_by'`` attributes.
        """
        if self._sorting is None:
            return {}
        return {"ranking_criteria_type": self._sorting.get_content()}

    @abc.abstractmethod
    def is_compatible_return_type(self, return_type):
        """
        Check whether this :class:`Grouping` is compatible with the
        RCSB search API ``return_type``.

        ABSTRACT: Override when inheriting.

        Parameters
        ----------
        return_type : str
            The ``return_type`` attribute to be checked.

        Returns
        -------
        is_compatible : bool
            True, if this :class:`Grouping` is compatible with the
            `return_type`, false otherwise.
        """
        pass
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
class DepositGrouping(Grouping):
    """
    A grouping of PDB entries that were deposited together as a
    collection.
    Such a group usually contains the same protein with e.g. a
    different bound molecule.

    This :class:`Grouping` is only applicable, if the
    :func:`count()`/:func:`search()` return type is set to ``entry``.

    Parameters
    ----------
    sort_by : str or Sorting, optional
        If specified, the returned PDB IDs within each group are sorted
        by the values of the given field name.
        A complete list of the available fields is documented at
        `<https://search.rcsb.org/structure-search-attributes.html>`_.
        and
        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
        If a string is given, sorting is performed in descending order.
        To choose the order a :class:`Sorting` object needs to be
        provided.
    """

    def get_content(self):
        # The parent class contributes the optional ranking criteria
        group_options = super().get_content()
        group_options.update(aggregation_method="matching_deposit_group_id")
        return group_options

    def is_compatible_return_type(self, return_type):
        # Deposit groups consist of entire entries
        return return_type == "entry"
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
class IdentityGrouping(Grouping):
    """
    A grouping of protein chains that share a given sequence identity
    with each other.

    This :class:`Grouping` is only applicable, if the
    :func:`count()`/:func:`search()` return type is set to
    ``polymer_entity``.

    Parameters
    ----------
    similarity_cutoff : {100, 95, 90, 70, 50, 30}
        The sequence identity in percent at which the structures are
        grouped.
        In other words, a returned group contains sequences that have
        `similarity_cutoff` sequence identity with each other.
        Since the PDB uses precalculated clusters, only certain values
        are available.
    sort_by : str or Sorting, optional
        If specified, the returned PDB IDs within each group are sorted
        by the values of the given field name.
        A complete list of the available fields is documented at
        `<https://search.rcsb.org/structure-search-attributes.html>`_.
        and
        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
        If a string is given, sorting is performed in descending order.
        To choose the order a :class:`Sorting` object needs to be
        provided.
    """

    def __init__(self, similarity_cutoff, sort_by=None):
        super().__init__(sort_by)
        # The only identity levels accepted by this class
        supported_cutoffs = (100, 95, 90, 70, 50, 30)
        if similarity_cutoff not in supported_cutoffs:
            raise ValueError(
                f"A similarity cutoff of {similarity_cutoff}% is not supported"
            )
        self._similarity_cutoff = similarity_cutoff

    def get_content(self):
        # The parent class contributes the optional ranking criteria
        group_options = super().get_content()
        group_options.update(
            aggregation_method="sequence_identity",
            similarity_cutoff=self._similarity_cutoff,
        )
        return group_options

    def is_compatible_return_type(self, return_type):
        return return_type == "polymer_entity"
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
class UniprotGrouping(Grouping):
    """
    A grouping of protein chains that refer to the same *Uniprot*
    accession ID.

    This :class:`Grouping` is only applicable, if the
    :func:`count()`/:func:`search()` return type is set to
    ``polymer_entity``.

    Parameters
    ----------
    sort_by : str or Sorting, optional
        If specified, the returned PDB IDs within each group are sorted
        by the values of the given field name.
        A complete list of the available fields is documented at
        `<https://search.rcsb.org/structure-search-attributes.html>`_.
        and
        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
        If a string is given, sorting is performed in descending order.
        To choose the order a :class:`Sorting` object needs to be
        provided.
    """

    def get_content(self):
        # The parent class contributes the optional ranking criteria
        group_options = super().get_content()
        group_options.update(aggregation_method="matching_uniprot_accession")
        return group_options

    def is_compatible_return_type(self, return_type):
        return return_type == "polymer_entity"
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def count(query, return_type="entry", group_by=None, content_types=("experimental",)):
    """
    Count PDB entries that meet the given query requirements,
    via the RCSB search API.

    This function requires an internet connection.

    Parameters
    ----------
    query : Query
        The search query.
    return_type : {'entry', 'assembly', 'polymer_entity', 'non_polymer_entity', 'polymer_instance'}, optional
        The type of the counted identifiers:

        - ``'entry'``: All matching PDB entries are counted.
        - ``'assembly'``: All matching assemblies are counted.
        - ``'polymer_entity'``: All matching polymeric entities are
          counted.
        - ``'non_polymer_entity'``: All matching non-polymeric entities
          are counted.
        - ``'polymer_instance'``: All matching chains are counted.
    group_by : Grouping
        If this parameter is set, the number of groups is returned
        instead.
    content_types : iterable of {"experimental", "computational"}, optional
        Specify whether experimental and computational structures should
        be included.
        At least one of them needs to be specified.
        By default only experimental structures are included.
        Note, that identifiers for computational structures cannot be
        downloaded via :func:`biotite.database.rcsb.fetch()` as they
        point to *AlphaFold DB* and *ModelArchive*.

    Returns
    -------
    count : int
        The total number of PDB IDs (or groups) that would be returned
        by calling :func:`search()` using the same parameters.

    Raises
    ------
    RequestError
        If the server responds with a status code other than 200
        or 204.

    Notes
    -----
    If `group_by` is set, the number of results may be lower than in an
    ungrouped query, as grouping is not applicable to all structures.
    For example a DNA structure has no associated *Uniprot* accession
    and hence is omitted by :class:`UniprotGrouping`.

    Examples
    --------

    >>> query = FieldQuery("reflns.d_resolution_high", less_or_equal=0.6)
    >>> print(count(query))
    9
    >>> ids = search(query)
    >>> print(sorted(ids))
    ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H']
    """
    query_dict = _initialize_query_dict(query, return_type, group_by, content_types)

    query_dict["request_options"]["return_counts"] = True

    r = requests.get(_search_url, params={"json": json.dumps(query_dict)})

    if r.status_code == 200:
        # Grouped queries report the number of groups in a separate field
        count_key = "total_count" if group_by is None else "group_by_count"
        return r.json()[count_key]
    if r.status_code == 204:
        # Search did not return any results
        return 0

    try:
        message = r.json()["message"]
    except (ValueError, KeyError, TypeError):
        # The error response has no parsable message:
        # ValueError covers all JSON decoding errors, including the one
        # raised when 'requests' uses 'simplejson', KeyError/TypeError
        # cover a JSON body without a 'message' entry
        raise RequestError(f"Error {r.status_code}") from None
    raise RequestError(f"Error {r.status_code}: {message}")
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
def search(
    query,
    return_type="entry",
    range=None,
    sort_by=None,
    group_by=None,
    return_groups=False,
    content_types=("experimental",),
):
    """
    Get all PDB IDs that meet the given query requirements,
    via the RCSB search API.

    This function requires an internet connection.

    Parameters
    ----------
    query : Query
        The search query.
    return_type : {'entry', 'assembly', 'polymer_entity', 'non_polymer_entity', 'polymer_instance'}, optional
        The type of the returned identifiers:

        - ``'entry'``: Only the PDB ID is returned (e.g. ``'XXXX'``).
          These can be used directly as input to :func:`fetch()`.
        - ``'assembly'``: The PDB ID appended with assembly ID is
          returned (e.g. ``'XXXX-1'``).
        - ``'polymer_entity'``: The PDB ID appended with entity ID of
          polymers is returned (e.g. ``'XXXX_1'``).
        - ``'non_polymer_entity'``: The PDB ID appended with entity ID
          of non-polymeric entities is returned (e.g. ``'XXXX_1'``).
        - ``'polymer_instance'``: The PDB ID appended with chain ID
          (more exactly ``'asym_id'``) is returned (e.g. ``'XXXX.A'``).

    range : tuple(int, int), optional
        If this parameter is specified, only PDB IDs in this range
        are selected from all matching PDB IDs and returned
        (pagination).
        The range is zero-indexed and the stop value is exclusive.
    sort_by : str or Sorting, optional
        If specified, the returned PDB IDs are sorted by the values
        of the given field name.
        A complete list of the available fields is documented at
        `<https://search.rcsb.org/structure-search-attributes.html>`_.
        and
        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
        If a string is given sorting is performed in descending order.
        To choose the order, a :class:`Sorting` object needs to be
        provided.
    group_by : Grouping
        If this parameter is set, the PDB IDs that meet the query
        requirements, are grouped according to the given criterion.
    return_groups : boolean, optional
        Only has effect, if `group_by` is set.
        By default the representative with the highest rank in each
        group is returned.
        The rank is determined by the `sort_by` parameter of
        :class:`Grouping` provided in `group_by`.
        If set to true, groups containing all structures belonging to
        the group are returned instead.
    content_types : iterable of {"experimental", "computational"}, optional
        Specify whether experimental and computational structures should
        be included.
        At least one of them needs to be specified.
        By default only experimental structures are included.
        Note, that identifiers for computational structures cannot be
        downloaded via :func:`biotite.database.rcsb.fetch()` as they
        point to *AlphaFold DB* and *ModelArchive*.

    Returns
    -------
    ids : list of str or dict (str -> list of str)
        If `return_groups` is false (default case), a list of strings
        containing all PDB IDs that meet the query requirements is
        returned.
        If `return_groups` is set to true a dictionary of groups is
        returned.
        This dictionary maps group identifiers to a list of all PDB IDs
        belonging to this group.
        If the search has no results, the list or dictionary,
        respectively, is empty.

    Raises
    ------
    ValueError
        If the stop value of `range` is not greater than its start
        value.
    RequestError
        If the server responds with a status code other than 200
        or 204.

    Notes
    -----
    If `group_by` is set, the number of results may be lower than in an
    ungrouped query, as grouping is not applicable to all structures.
    For example a DNA structure has no associated *Uniprot* accession
    and hence is omitted by :class:`UniprotGrouping`.

    Also note that `sort_by` does not affect the order within a group.
    This order is determined by the `sort_by` parameter of the
    :class:`Grouping`.

    Examples
    --------

    >>> query = FieldQuery("reflns.d_resolution_high", less_or_equal=0.6)
    >>> print(sorted(search(query)))
    ['1EJG', '1I0T', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H']
    >>> print(search(query, sort_by="rcsb_accession_info.initial_release_date"))
    ['7R0H', '7ATG', '5NW3', '5D8V', '4JLJ', '3P4J', '3NIR', '1I0T', '1EJG']
    >>> print(search(
    ...     query, range=(1,4), sort_by="rcsb_accession_info.initial_release_date"
    ... ))
    ['7ATG', '5NW3', '5D8V']
    >>> print(sorted(search(query, return_type="polymer_instance")))
    ['1EJG.A', '1I0T.A', '1I0T.B', '3NIR.A', '3P4J.A', '3P4J.B', '4JLJ.A', '4JLJ.B', '5D8V.A', '5NW3.A', '7ATG.A', '7ATG.B', '7R0H.A']
    >>> print(search(
    ...     query, return_type="polymer_entity", return_groups=True,
    ...     group_by=UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"),
    ... ))
    {'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']}
    """
    query_dict = _initialize_query_dict(query, return_type, group_by, content_types)

    # A dictionary of groups is only returned, if a grouping is given
    groups_requested = group_by is not None and return_groups

    if group_by is not None:
        query_dict["request_options"]["group_by_return_type"] = (
            "groups" if return_groups else "representatives"
        )

    if sort_by is not None:
        sorting = sort_by if isinstance(sort_by, Sorting) else Sorting(sort_by)
        query_dict["request_options"]["sort"] = [sorting.get_content()]

    if range is None:
        query_dict["request_options"]["return_all_hits"] = True
    elif range[1] <= range[0]:
        raise ValueError("Range stop must be greater than range start")
    else:
        # The API expects a start index and a number of rows
        query_dict["request_options"]["paginate"] = {
            "start": int(range[0]),
            "rows": int(range[1]) - int(range[0]),
        }

    r = requests.get(_search_url, params={"json": json.dumps(query_dict)})

    if r.status_code == 200:
        if not groups_requested:
            return [result["identifier"] for result in r.json()["result_set"]]
        return {
            group["identifier"]: [
                result["identifier"] for result in group["result_set"]
            ]
            for group in r.json()["group_set"]
        }
    if r.status_code == 204:
        # Search did not return any results:
        # Keep the return type consistent with the successful case
        return {} if groups_requested else []

    try:
        message = r.json()["message"]
    except (ValueError, KeyError, TypeError):
        # The error response has no parsable message:
        # ValueError covers all JSON decoding errors, including the one
        # raised when 'requests' uses 'simplejson', KeyError/TypeError
        # cover a JSON body without a 'message' entry
        raise RequestError(f"Error {r.status_code}") from None
    raise RequestError(f"Error {r.status_code}: {message}")
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
def _initialize_query_dict(query, return_type, group_by, content_types):
    """
    Initialize the request parameter dictionary with attributes that
    `count()` and `search()` have in common.

    Parameters
    ----------
    query : Query
        The query, whose ``get_content()`` gives the ``'query'`` entry.
    return_type : str
        The ``'return_type'`` of the request.
    group_by : Grouping or None
        If not ``None``, its content becomes the ``'group_by'`` request
        option.
    content_types : iterable of {"experimental", "computational"}
        The content types to be included.
        Any iterable is accepted, including generators and sets.

    Returns
    -------
    query_dict : dict
        The dictionary with the keys ``'query'``, ``'return_type'`` and
        ``'request_options'``.

    Raises
    ------
    ValueError
        If `return_type` or one of the `content_types` is unknown,
        if `content_types` is empty or if `group_by` is not compatible
        with `return_type`.
    """
    if return_type not in [
        "entry",
        "polymer_instance",
        "assembly",
        "polymer_entity",
        "non_polymer_entity",
    ]:
        raise ValueError(f"'{return_type}' is an invalid return type")

    request_options = {}

    # Materialize the iterable: generators have no length and neither
    # generators nor sets can be serialized into JSON
    content_types = list(content_types)
    if not content_types:
        raise ValueError("At least one content type must be specified")
    for content_type in content_types:
        if content_type not in ("experimental", "computational"):
            raise ValueError(f"Unknown content type '{content_type}'")
    request_options["results_content_type"] = content_types

    if group_by is not None:
        if not group_by.is_compatible_return_type(return_type):
            raise ValueError(
                f"Return type '{return_type}' is not compatible with the given Grouping"
            )
        request_options["group_by"] = group_by.get_content()

    query_dict = {
        "query": query.get_content(),
        "return_type": return_type,
        "request_options": request_options,
    }
    return query_dict
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def _to_isoformat(object):
    """
    Convert a datetime into the specific ISO 8601 format required by the RCSB.

    The returned string always carries the ``Z`` (UTC) suffix.
    Therefore, timezone-aware datetimes are converted to UTC before
    formatting.
    Naive datetimes and plain dates are formatted as they are.

    Parameters
    ----------
    object : date or datetime
        The date or datetime to be formatted.

    Returns
    -------
    formatted : str
        The date in the format ``YYYY-MM-DDThh:mm:ssZ``.
    """
    from datetime import timezone

    # Plain `date` objects have no `astimezone()` and naive datetimes
    # have no UTC offset: both are left untouched
    astimezone = getattr(object, "astimezone", None)
    if astimezone is not None and object.utcoffset() is not None:
        object = astimezone(timezone.utc)
    return object.strftime("%Y-%m-%dT%H:%M:%SZ")
|