rcsb-api 0.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb_api-0.1.0.dist-info/LICENSE +21 -0
- rcsb_api-0.1.0.dist-info/METADATA +485 -0
- rcsb_api-0.1.0.dist-info/RECORD +10 -0
- rcsb_api-0.1.0.dist-info/WHEEL +6 -0
- rcsb_api-0.1.0.dist-info/top_level.txt +1 -0
- rcsbapi/__init__.py +7 -0
- rcsbapi/data/__init__.py +0 -0
- rcsbapi/data/query.py +122 -0
- rcsbapi/data/schema.py +650 -0
- rcsbapi/resources/data_api_schema.json +34805 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 RCSB PDB
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: rcsb-api
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python package interface for RCSB.org API services
|
|
5
|
+
Home-page: https://github.com/rcsb/py-rcsb-api
|
|
6
|
+
Author: Dennis Piehl
|
|
7
|
+
Author-email: dennis.piehl@rcsb.org
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Classifier: Natural Language :: English
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: requests >=2.0.0
|
|
22
|
+
Requires-Dist: rustworkx
|
|
23
|
+
Requires-Dist: networkx
|
|
24
|
+
Requires-Dist: rcsbsearchapi
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: check-manifest ; extra == 'dev'
|
|
27
|
+
Provides-Extra: docs
|
|
28
|
+
Requires-Dist: sphinx ; extra == 'docs'
|
|
29
|
+
Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
|
|
30
|
+
Requires-Dist: myst-parser ; extra == 'docs'
|
|
31
|
+
Provides-Extra: test
|
|
32
|
+
Requires-Dist: coverage ; extra == 'test'
|
|
33
|
+
Provides-Extra: tests
|
|
34
|
+
Requires-Dist: tox ; extra == 'tests'
|
|
35
|
+
Requires-Dist: pylint ; extra == 'tests'
|
|
36
|
+
Requires-Dist: black >=21.5b1 ; extra == 'tests'
|
|
37
|
+
Requires-Dist: flake8 ; extra == 'tests'
|
|
38
|
+
|
|
39
|
+
# py-rcsb-api
|
|
40
|
+
Python interface for RCSB PDB API services at RCSB.org.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
Get it from PyPI:
|
|
45
|
+
|
|
46
|
+
pip install rcsb-api
|
|
47
|
+
|
|
48
|
+
Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api)
|
|
49
|
+
|
|
50
|
+
## Jupyter Notebooks
|
|
51
|
+
A notebook briefly summarizing the README is available in [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb), or can be run online using binder:
|
|
52
|
+
[]()
|
|
53
|
+
|
|
54
|
+
Another notebook using both Search and Data API packages for a COVID-19 related example is available in [notebooks/search_data_workflow.ipynb](notebooks/search_data_workflow.ipynb), or can be run online using binder:
|
|
55
|
+
[]()
|
|
56
|
+
|
|
57
|
+
## Background
|
|
58
|
+
The [RCSB PDB Data API](https://data.rcsb.org/#data-organization) supports requests using [GraphQL](https://graphql.org/), a language for API queries. This package simplifies generating queries in GraphQL syntax.
|
|
59
|
+
|
|
60
|
+
GraphQL is built on "types" and their associated "fields". All types and their fields are defined in a "schema". An example of a type in our schema is "CoreEntry" and a field under CoreEntry is "exptl" (experimental). Upon initialization, the Data API package fetches the schema from the RCSB PDB website (See [Implementation Details](#implementation-details) for more).
|
|
61
|
+
|
|
62
|
+
In GraphQL, you must begin your query at specific fields. These are fields like entry, polymer_entity, and polymer_entity_instance (see full list [here](#input_types)). Each field can return a scalar (e.g. string, integer) or a type. Every query must ultimately request scalar value(s), which can be seen in the example query below. As shown in the example, only fields are explicitly included in queries while types are implicit. Types are named in CamelCase (CoreEntry) while fields are in snake case (exptl or audit_author).
|
|
63
|
+
|
|
64
|
+
This is a query in GraphQL syntax requesting the experimental method of a structure with PDB ID 4HHB (Hemoglobin).
|
|
65
|
+
```
|
|
66
|
+
{
|
|
67
|
+
entry(entry_id: "4HHB") { # returns type "CoreEntry"
|
|
68
|
+
exptl { # returns type "Exptl"
|
|
69
|
+
method # returns a scalar (string)
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
Data is returned in JSON format
|
|
76
|
+
```json
|
|
77
|
+
{
|
|
78
|
+
"data": {
|
|
79
|
+
"entry": {
|
|
80
|
+
"exptl": [
|
|
81
|
+
{
|
|
82
|
+
"method": "X-RAY DIFFRACTION"
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
To generate the same query in this package, you would create a Query object. From the object, you can access the Data API response, get an interactive editor link, or access the arguments used to create the query.
|
|
91
|
+
```python
|
|
92
|
+
Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["Exptl.method"])
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
One way this package simplifies making requests is by auto-populating fields that return scalars if you request a field that returns a type.
|
|
96
|
+
```python
|
|
97
|
+
Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["exptl"])
|
|
98
|
+
```
|
|
99
|
+
This creates a valid query even though "exptl" doesn't return a scalar. However, the resulting query will be more verbose (see [return_data_list](#return_data_list)).
|
|
100
|
+
|
|
101
|
+
## Query Objects
|
|
102
|
+
Constructing a query object requires three inputs. The JSON response to a query is stored in the `response` attribute of a Query object and can be accessed using the `get_response()` method.
|
|
103
|
+
```python
|
|
104
|
+
# constructing the Query object
|
|
105
|
+
query = Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["Exptl.method"])
|
|
106
|
+
|
|
107
|
+
# accessing the response
|
|
108
|
+
print(query.get_response())
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### input_ids
|
|
112
|
+
|
|
113
|
+
Specifies which entry, entity, etc you would like to request data for.
|
|
114
|
+
|
|
115
|
+
This can be a dictionary or a list. Dictionaries must be passed with specific keys corresponding to the arguments required in the GraphQL schema. These keys are viewable in the [GraphiQL editor](https://data.rcsb.org/graphql/index.html) Docs menu or by running the `get_input_id_dict(input_type)` method (see [Helpful Methods](#get_input_id_dict)). Lists must be passed in PDB identifier format.
|
|
116
|
+
|
|
117
|
+
|Type|Format|Example|
|
|
118
|
+
|---|---|---|
|
|
119
|
+
|polymer, branched, or non-polymer entities|[entry_id]_[entity_id]|4HHB_1|
|
|
120
|
+
|polymer, branched, or non-polymer entity instances|[entry_id].[asym_id]|4HHB.A|
|
|
121
|
+
|biological assemblies|[entry_id]-[assembly_id]|4HHB-1|
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
Dictionaries and Lists will be treated equivalently for the input_ids argument. For example, these input_ids arguments are equivalent.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# input_type is polymer_entity_instance
|
|
128
|
+
input_ids=["4HHB.A"]
|
|
129
|
+
input_ids={"entry_id":"4HHB", "asym_id":"A"}
|
|
130
|
+
```
|
|
131
|
+
```python
|
|
132
|
+
# input_type is polymer_entity_instances (plural)
|
|
133
|
+
input_ids=["4HHB.A","4HHB.B"]
|
|
134
|
+
input_ids={"instance_ids":["4HHB.A","4HHB.B"]}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### input_types
|
|
138
|
+
Specifies which field you are starting your query from.
|
|
139
|
+
|
|
140
|
+
input_types, also called "root fields", are designated points where you can begin querying. This includes entry, polymer_entity, polymer_entity_instance, etc. For the full list see below:
|
|
141
|
+
|
|
142
|
+
<details>
|
|
143
|
+
<summary>Full list of input_types</summary>
|
|
144
|
+
|
|
145
|
+
- entry
|
|
146
|
+
- entries
|
|
147
|
+
- polymer_entity
|
|
148
|
+
- polymer_entities
|
|
149
|
+
- branched_entity
|
|
150
|
+
- branched_entities
|
|
151
|
+
- nonpolymer_entity
|
|
152
|
+
- nonpolymer_entities
|
|
153
|
+
- polymer_entity_instance
|
|
154
|
+
- polymer_entity_instances
|
|
155
|
+
- nonpolymer_entity_instance
|
|
156
|
+
- nonpolymer_entity_instances
|
|
157
|
+
- branched_entity_instance
|
|
158
|
+
- branched_entity_instances
|
|
159
|
+
- assembly
|
|
160
|
+
- assemblies
|
|
161
|
+
- interface
|
|
162
|
+
- interfaces
|
|
163
|
+
- chem_comps
|
|
164
|
+
- uniprot
|
|
165
|
+
- pubmed
|
|
166
|
+
- chem_comp
|
|
167
|
+
- entry_group
|
|
168
|
+
- entry_groups
|
|
169
|
+
- polymer_entity_group
|
|
170
|
+
- polymer_entity_groups
|
|
171
|
+
- group_provenance
|
|
172
|
+
|
|
173
|
+
</details>
|
|
174
|
+
|
|
175
|
+
### return_data_list
|
|
176
|
+
These are the data that you are requesting (or "fields").
|
|
177
|
+
|
|
178
|
+
In GraphQL syntax, the final requested data must be a "scalar" type (string, integer, boolean). However, if you request non-scalar data, the package will auto-populate the query to include all fields under the specified data until scalars are reached. Once you receive the query response and understand what specific data you would like to request, you can refine your query by requesting more specific fields.
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
Query(input_ids={"entry_id":"4HHB"}, input_type="entry", return_data_list=["exptl"])
|
|
182
|
+
```
|
|
183
|
+
```json
|
|
184
|
+
{
|
|
185
|
+
"data": {
|
|
186
|
+
"entry": {
|
|
187
|
+
"exptl": [
|
|
188
|
+
{
|
|
189
|
+
"details": null,
|
|
190
|
+
"crystals_number": null,
|
|
191
|
+
"method_details": null,
|
|
192
|
+
"method": "X-RAY DIFFRACTION"
|
|
193
|
+
}
|
|
194
|
+
]
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
This query can be made more concise by specifying a field, like `"method"`. In this case, the field name "method" is redundant because it appears under other types and must be further specified using dot notation. For more details see [ValueError: Not a unique field](#valueerror-not-a-unique-field)
|
|
200
|
+
```python
|
|
201
|
+
Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["Exptl.method"])
|
|
202
|
+
```
|
|
203
|
+
```json
|
|
204
|
+
{
|
|
205
|
+
"data": {
|
|
206
|
+
"entry": {
|
|
207
|
+
"exptl": [
|
|
208
|
+
{
|
|
209
|
+
"method": "X-RAY DIFFRACTION"
|
|
210
|
+
}
|
|
211
|
+
]
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Helpful Methods
|
|
218
|
+
There are several methods included to make working with query objects easier. These methods can also help you further understand the GraphQL syntax and refine your queries to request exactly and only what you want.
|
|
219
|
+
|
|
220
|
+
### get_editor_link()
|
|
221
|
+
This method returns the link to a [GraphiQL](https://data.rcsb.org/graphql/index.html) window with the query. From the window, you can use the user interface to explore other fields and refine your query. Method of Query class.
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
query = Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["exptl"])
|
|
225
|
+
print(query.get_editor_link())
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### get_unique_fields() <!--Should this be moved outside the schema method?-->
|
|
229
|
+
Given a redundant field, this method returns a list of matching fields in dot notation. You can look through the list to identify your intended field. Method of Schema class.
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
SCHEMA.get_unique_fields("id")
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### find_field_names()
|
|
236
|
+
Given a string, this method will return all fields containing that string, along with a description of each field.
|
|
237
|
+
|
|
238
|
+
```python
|
|
239
|
+
SCHEMA.find_field_names("exptl")
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### get_input_id_dict()
|
|
243
|
+
Given a valid input_type, returns a dictionary with the corresponding keys and descriptions of each key. Method of Schema class.
|
|
244
|
+
```python
|
|
245
|
+
SCHEMA.get_input_id_dict("polymer_entity_instance")
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Trouble-shooting
|
|
249
|
+
### ValueError: Not a unique field
|
|
250
|
+
Some fields are redundant within our GraphQL Data API schema. For example, "id" appears over 50 times. To allow for specific querying, redundant fields are identified by the syntax `<type>.<field name>`. If you request a redundant field without this syntax, a `ValueError` will be returned stating that the field exists, but is redundant. You can then use `get_unique_fields("<field name>")` to find notation that would specify a unique field for the given name.
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
# querying a redundant field
|
|
254
|
+
Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["id"])
|
|
255
|
+
```
|
|
256
|
+
```
|
|
257
|
+
ValueError: Not a unique field, must specify further. To find valid fields with this name, run: get_unique_fields(id)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
# Run get_unique_fields("<field name>")
|
|
262
|
+
print(SCHEMA.get_unique_fields("id"))
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
```
|
|
266
|
+
['PdbxStructSpecialSymmetry.id',
|
|
267
|
+
'RcsbBirdCitation.id',
|
|
268
|
+
'ChemComp.id',
|
|
269
|
+
'Entry.id',
|
|
270
|
+
...
|
|
271
|
+
'RcsbUniprotKeyword.id',
|
|
272
|
+
'RcsbPolymerInstanceAnnotationAnnotationLineage.id',
|
|
273
|
+
'RcsbPolymerStructConn.id']
|
|
274
|
+
```
|
|
275
|
+
```python
|
|
276
|
+
# valid Query
|
|
277
|
+
Query(input_ids={"entry_id":"4HHB"},input_type="entry", return_data_list=["Entry.id"])
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## Implementation Details
|
|
281
|
+
### Parsing Schema
|
|
282
|
+
Upon initialization of the package, the GraphQL schema is fetched from the RCSB PDB website. After fetching the file, the Python package parses the schema and creates a graph object to represent it within the package. This graph representation of how fields and types connect is key to how queries are automatically constructed using a shortest path algorithm. By default the graph is constructed as a directed graph in [rustworkx](https://www.rustworkx.org/), but if an `ImportError` is encountered, a `NetworkX` directed graph is created instead.
|
|
283
|
+
|
|
284
|
+
### Constructing queries
|
|
285
|
+
Queries are constructed by finding the shortest path from an `input_type` to each item in the `return_data_list`. The name of each field in the path is found and used to construct a GraphQL query. Currently, constructing queries is not implemented using NetworkX and only rustworkx is supported.
|
|
286
|
+
|
|
287
|
+
### Error Handling
|
|
288
|
+
In GraphQL, all requests return HTTP status code 200; errors are instead reported in the JSON that is returned. The package will parse these errors, raising a `ValueError` and displaying the corresponding error message or messages. To access the full query and return JSON in an interactive editor, you can use the `get_editor_link()` method on the Query object. (see [Helpful Methods](#get_editor_link))
|
|
289
|
+
|
|
290
|
+
## Additional examples
|
|
291
|
+
Examples come from [RCSB PDB Data API documentation](https://data.rcsb.org/#examples)
|
|
292
|
+
|
|
293
|
+
### Entries
|
|
294
|
+
Fetch information about structure title and experimental method for PDB entries:
|
|
295
|
+
```
|
|
296
|
+
{
|
|
297
|
+
entries(entry_ids: ["1STP", "2JEF", "1CDG"]) {
|
|
298
|
+
rcsb_id
|
|
299
|
+
struct {
|
|
300
|
+
title
|
|
301
|
+
}
|
|
302
|
+
exptl {
|
|
303
|
+
method
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
```
|
|
308
|
+
```python
|
|
309
|
+
query = Query(input_ids={"entry_ids": ["1STP","2JEF","1CDG"]},input_type="entries", return_data_list=["CoreEntry.rcsb_id", "Struct.title", "Exptl.method"])
|
|
310
|
+
```
|
|
311
|
+
To find more about the return_data_list dot notation, see [ValueError: Not a unique field](#valueerror-not-a-unique-field)
|
|
312
|
+
|
|
313
|
+
### Primary Citation
|
|
314
|
+
Fetch primary citation information (structure authors, PubMed ID, DOI) and release date for PDB entries:
|
|
315
|
+
|
|
316
|
+
```
|
|
317
|
+
{
|
|
318
|
+
entries(entry_ids: ["1STP", "2JEF", "1CDG"]) {
|
|
319
|
+
rcsb_id
|
|
320
|
+
rcsb_accession_info {
|
|
321
|
+
initial_release_date
|
|
322
|
+
}
|
|
323
|
+
audit_author {
|
|
324
|
+
name
|
|
325
|
+
}
|
|
326
|
+
rcsb_primary_citation {
|
|
327
|
+
pdbx_database_id_PubMed
|
|
328
|
+
pdbx_database_id_DOI
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
```
|
|
333
|
+
```python
|
|
334
|
+
query = Query(input_ids={"entry_ids": ["1STP","2JEF","1CDG"]},input_type="entries", return_data_list=["CoreEntry.rcsb_id", "RcsbAccessionInfo.initial_release_date", "AuditAuthor.name", "RcsbPrimaryCitation.pdbx_database_id_PubMed", "RcsbPrimaryCitation.pdbx_database_id_DOI"])
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
### Polymer Entities
|
|
338
|
+
Fetch taxonomy information and information about membership in the sequence clusters for polymer entities:
|
|
339
|
+
|
|
340
|
+
```
|
|
341
|
+
{
|
|
342
|
+
polymer_entities(entity_ids:["2CPK_1","3WHM_1","2D5Z_1"]) {
|
|
343
|
+
rcsb_id
|
|
344
|
+
rcsb_entity_source_organism {
|
|
345
|
+
ncbi_taxonomy_id
|
|
346
|
+
ncbi_scientific_name
|
|
347
|
+
}
|
|
348
|
+
rcsb_cluster_membership {
|
|
349
|
+
cluster_id
|
|
350
|
+
identity
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
```
|
|
355
|
+
```python
|
|
356
|
+
query = Query(input_ids={"entity_ids":["2CPK_1","3WHM_1","2D5Z_1"]},input_type="polymer_entities", return_data_list=["CorePolymerEntity.rcsb_id", "RcsbEntitySourceOrganism.ncbi_taxonomy_id", "RcsbEntitySourceOrganism.ncbi_scientific_name", "cluster_id", "identity"])
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
### Polymer Instances
|
|
360
|
+
Fetch information about the domain assignments for polymer entity instances:
|
|
361
|
+
|
|
362
|
+
```
|
|
363
|
+
{
|
|
364
|
+
polymer_entity_instances(instance_ids: ["4HHB.A", "12CA.A", "3PQR.A"]) {
|
|
365
|
+
rcsb_id
|
|
366
|
+
rcsb_polymer_instance_annotation {
|
|
367
|
+
annotation_id
|
|
368
|
+
name
|
|
369
|
+
type
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
```
|
|
374
|
+
```python
|
|
375
|
+
query = Query(input_ids={"instance_ids":["4HHB.A", "12CA.A", "3PQR.A"]},input_type="polymer_entity_instances", return_data_list=["CorePolymerEntityInstance.rcsb_id", "RcsbPolymerInstanceAnnotation.annotation_id", "RcsbPolymerInstanceAnnotation.name", "RcsbPolymerInstanceAnnotation.type"])
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
### Carbohydrates
|
|
379
|
+
Query branched entities (sugars or oligosaccharides) for commonly used linear descriptors:
|
|
380
|
+
|
|
381
|
+
```
|
|
382
|
+
{
|
|
383
|
+
branched_entities(entity_ids:["5FMB_2", "6L63_3"]) {
|
|
384
|
+
pdbx_entity_branch {
|
|
385
|
+
type
|
|
386
|
+
}
|
|
387
|
+
pdbx_entity_branch_descriptor {
|
|
388
|
+
type
|
|
389
|
+
descriptor
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
```
|
|
394
|
+
```python
|
|
395
|
+
query = Query(input_ids={"entity_ids":["5FMB_2", "6L63_3"]},input_type="branched_entities", return_data_list=["PdbxEntityBranch.type","PdbxEntityBranchDescriptor.type","PdbxEntityBranchDescriptor.descriptor"])
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
### Sequence Positional Features
|
|
399
|
+
|
|
400
|
+
Sequence positional features describe regions or sites of interest in the PDB sequences, such as binding sites, active sites, linear motifs, local secondary structure, structural and functional domains, etc. Positional annotations include depositor-provided information available in the PDB archive as well as annotations integrated from external resources (e.g. UniProtKB).
|
|
401
|
+
|
|
402
|
+
This example queries 'polymer_entity_instances' positional features. The query returns features of different types: for example, CATH and SCOP classification assignments integrated from UniProtKB data, or the secondary structure annotations from the PDB archive data calculated by the data-processing program called MAXIT (Macromolecular Exchange and Input Tool) that is based on an earlier ProMotif implementation.
|
|
403
|
+
|
|
404
|
+
```
|
|
405
|
+
{
|
|
406
|
+
polymer_entity_instances(instance_ids: ["1NDO.A"]) {
|
|
407
|
+
rcsb_id
|
|
408
|
+
rcsb_polymer_instance_feature {
|
|
409
|
+
type
|
|
410
|
+
feature_positions {
|
|
411
|
+
beg_seq_id
|
|
412
|
+
end_seq_id
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
```
|
|
418
|
+
```python
|
|
419
|
+
query = Query(input_ids={"instance_ids":["1NDO.A"]},input_type="polymer_entity_instances", return_data_list=["CorePolymerEntityInstance.rcsb_id", "RcsbPolymerInstanceFeature.type", "RcsbPolymerInstanceFeatureFeaturePositions.beg_seq_id", "RcsbPolymerInstanceFeatureFeaturePositions.end_seq_id"])
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
### Reference Sequence Identifiers
|
|
423
|
+
This example shows how to access identifiers related to entries (cross-references) and found in data collections other than PDB. Each cross-reference is described by the database name and the database accession. A single entry can have cross-references to several databases, e.g. UniProt and GenBank in 7NHM, or no cross-references, e.g. 5L2G:
|
|
424
|
+
```
|
|
425
|
+
{
|
|
426
|
+
entries(entry_ids:["7NHM", "5L2G"]){
|
|
427
|
+
polymer_entities {
|
|
428
|
+
rcsb_id
|
|
429
|
+
rcsb_polymer_entity_container_identifiers {
|
|
430
|
+
reference_sequence_identifiers {
|
|
431
|
+
database_accession
|
|
432
|
+
database_name
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
```
|
|
439
|
+
```python
|
|
440
|
+
query = Query(input_ids={"entry_ids": ["7NHM", "5L2G"]}, input_type="entries", return_data_list=["CoreEntry.rcsb_id", "RcsbPolymerEntityContainerIdentifiersReferenceSequenceIdentifiers.database_accession", "RcsbPolymerEntityContainerIdentifiersReferenceSequenceIdentifiers.database_name"])
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
### Chemical Components
|
|
444
|
+
Query for specific items in the chemical component dictionary based on a given list of CCD ids:
|
|
445
|
+
|
|
446
|
+
```
|
|
447
|
+
{
|
|
448
|
+
chem_comps(comp_ids:["NAG", "EBW"]) {
|
|
449
|
+
rcsb_id
|
|
450
|
+
chem_comp {
|
|
451
|
+
type
|
|
452
|
+
formula_weight
|
|
453
|
+
name
|
|
454
|
+
formula
|
|
455
|
+
}
|
|
456
|
+
rcsb_chem_comp_info {
|
|
457
|
+
initial_release_date
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
```
|
|
462
|
+
```python
|
|
463
|
+
query = Query(input_ids={"comp_ids":["NAG", "EBW"]}, input_type="chem_comps", return_data_list=["CoreChemComp.rcsb_id","ChemComp.type","ChemComp.formula_weight","ChemComp.name","ChemComp.formula","RcsbChemCompInfo.initial_release_date"])
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
### Computed Structure Models
|
|
467
|
+
This example shows how to get a list of global Model Quality Assessment metrics for AlphaFold structure of Hemoglobin subunit beta:
|
|
468
|
+
|
|
469
|
+
```
|
|
470
|
+
{
|
|
471
|
+
entries(entry_ids: ["AF_AFP68871F1"]) {
|
|
472
|
+
rcsb_ma_qa_metric_global {
|
|
473
|
+
ma_qa_metric_global {
|
|
474
|
+
type
|
|
475
|
+
value
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
```
|
|
481
|
+
```python
|
|
482
|
+
query = Query(input_ids={"entry_ids": ["AF_AFP68871F1"]}, input_type="entries", return_data_list=["RcsbMaQaMetricGlobalMaQaMetricGlobal.type", "RcsbMaQaMetricGlobalMaQaMetricGlobal.value"])
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
rcsbapi/__init__.py,sha256=2o-SSg314fRdacwvS2y9rSwKJDZCtCkZzXXsWdYXUsQ,210
|
|
2
|
+
rcsbapi/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
rcsbapi/data/query.py,sha256=eqe2HOz77ij1Td2pyjCLplozDcD2wqwZioCAETTypI4,5280
|
|
4
|
+
rcsbapi/data/schema.py,sha256=NtOHJsPfs0cDLM9KlwihqKI2O-rXc7J_6j1a4z7qnlk,28112
|
|
5
|
+
rcsbapi/resources/data_api_schema.json,sha256=IvL7hB6--kzk2Uy6vzejsTKc6xvmSjapzIr2E4Zjs8E,1771007
|
|
6
|
+
rcsb_api-0.1.0.dist-info/LICENSE,sha256=jFdfp4-j1dWjHcGjkMswp9D4sXxRsjz01ixAIQRo3OY,1065
|
|
7
|
+
rcsb_api-0.1.0.dist-info/METADATA,sha256=GICo8qWtW3j6YTVyd24oOmeF8tTS5MseZdp28DCgD8I,18381
|
|
8
|
+
rcsb_api-0.1.0.dist-info/WHEEL,sha256=ANi2y8tYx-p70pj7MSdqQMJNFJEUgAOyfPpHzqz0w84,109
|
|
9
|
+
rcsb_api-0.1.0.dist-info/top_level.txt,sha256=jcE11C-JZ-lfORcm-78iwwnXdiSWltaHWesYk9dzeuE,8
|
|
10
|
+
rcsb_api-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
rcsbapi
|
rcsbapi/__init__.py
ADDED
rcsbapi/data/__init__.py
ADDED
|
File without changes
|
rcsbapi/data/query.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import urllib.parse
|
|
3
|
+
import re
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Union, List, Dict
|
|
6
|
+
import requests
|
|
7
|
+
from rcsbapi.data import schema
|
|
8
|
+
|
|
9
|
+
# GraphQL endpoint of the RCSB PDB Data API; queries are POSTed here and the
# GraphiQL editor link is built from it.
PDB_URL = "https://data.rcsb.org/graphql"
|
|
10
|
+
# Schema object built once at import time from the endpoint above; Query uses
# it to construct query strings and to look up root-field arguments.
SCHEMA = schema.Schema(PDB_URL)
|
|
11
|
+
# Called without a name, so this is the root logger.
# NOTE(review): not referenced by the code visible here, which calls
# logging.warning() directly.
logger = logging.getLogger()
|
|
12
|
+
# Configures root logging at import time (INFO level, "[LEVEL]: message").
logging.basicConfig(level=logging.INFO, format="[%(levelname)s]: %(message)s")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Query:
    """Build a Data API GraphQL query, post it, and keep the JSON response.

    The query string is produced by ``SCHEMA.construct_query`` and posted to
    ``PDB_URL`` as soon as the object is created. Plural input types with more
    than 50 IDs are posted in batches and the partial responses are merged.
    """

    def __init__(self, input_ids: Union[List[str], Dict[Any, Any]], input_type: str, return_data_list: List[str]):
        """Construct the query and immediately post it.

        Args:
            input_ids: Either a list of identifiers or a dictionary mapping
                root-field argument names to their values.
            input_type: Name of the root field the query starts from.
            return_data_list: Fields to request.

        A ``ValueError`` raised while posting (for example GraphQL errors
        reported by ``parse_gql_error``) is not propagated: the response is set
        to ``None``, a warning is logged and the error text is printed.
        """
        input_id_limit = 200
        if isinstance(input_ids, list):
            id_collections = [input_ids]
        elif isinstance(input_ids, dict):
            # Only list values are collections of IDs. A scalar value such as
            # "4HHB" is a single ID, so its string length must not be compared
            # with the limit (and non-sized values must not raise TypeError).
            id_collections = [value for value in input_ids.values() if isinstance(value, list)]
        else:
            id_collections = []
        for id_collection in id_collections:
            if len(id_collection) > input_id_limit:
                logging.warning("More than %d input_ids. For a more readable response, reduce number of ids.", input_id_limit)

        self.__input_ids = input_ids
        self.__input_type = input_type
        self.__return_data_list = return_data_list
        self.__query = SCHEMA.construct_query(input_ids, input_type, return_data_list)

        # NOTE(review): presumably the first entry of root_dict[input_type]
        # describes the root field's (only) argument; confirm against Schema.
        root_argument = SCHEMA.root_dict[input_type][0]
        self.__plural_input = root_argument["kind"] == "LIST"
        # Always defined so get_input_ids_list() cannot raise AttributeError
        # for singular input types; it stays empty unless the input is plural.
        self.__input_ids_list: List[str] = []
        if self.__plural_input:
            if isinstance(input_ids, dict):
                self.__input_ids_list = input_ids[root_argument["name"]]
            elif isinstance(input_ids, list):
                self.__input_ids_list = input_ids

        try:
            self.__response = self.post_query()
        except ValueError as error:
            self.__response = None
            logging.warning("Was not able to post query due to the following error.")
            print(str(error))

    def get_input_ids(self):
        """Return the ``input_ids`` argument exactly as it was passed in."""
        return self.__input_ids

    def get_input_type(self):
        """Return the root field name the query starts from."""
        return self.__input_type

    def get_return_data_list(self):
        """Return the list of requested fields as it was passed in."""
        return self.__return_data_list

    def get_query(self):
        """Return the GraphQL query string built by ``SCHEMA.construct_query``."""
        return self.__query

    def get_input_ids_list(self):
        """Return the ID list used for batching (empty for singular input types)."""
        return self.__input_ids_list

    def get_response(self):
        """Return the stored JSON response, or ``None`` if posting failed."""
        return self.__response

    def get_editor_link(self):
        """Return a GraphiQL editor URL with the URL-quoted query attached."""
        editor_base_link = PDB_URL + "/index.html?query="
        return editor_base_link + urllib.parse.quote(self.__query)

    @staticmethod
    def _post_graphql(query: str) -> Dict[str, Any]:
        """POST one GraphQL query string to ``PDB_URL`` and return the decoded JSON body."""
        return requests.post(headers={"Content-Type": "application/graphql"}, data=query, url=PDB_URL, timeout=10).json()

    def post_query(self):
        """Post the query, batching the IDs of large plural inputs.

        Returns:
            The JSON response as a dictionary. For batched requests the lists
            under ``data[<input_type>]`` of all batches are concatenated.

        Raises:
            ValueError: If a response contains GraphQL errors.
        """
        # Function-scope import keeps this class self-contained without
        # touching the module's import block.
        import json

        batch_size = 50
        if self.__plural_input and len(self.__input_ids_list) > batch_size:
            response_json: Dict[str, Any] = {}
            for id_batch in self.batch_ids(batch_size):
                # json.dumps yields a correctly quoted and escaped list literal.
                id_literal = json.dumps(id_batch)
                # A callable replacement keeps backslashes in IDs literal, and
                # count=1 limits the substitution to the first bracketed list.
                # NOTE(review): assumes the root-field ID argument is the first
                # bracketed list in the constructed query; confirm.
                query = re.sub(r"\[([^]]+)\]", lambda _match, literal=id_literal: literal, self.__query, count=1)
                part_response = self._post_graphql(query)
                self.parse_gql_error(part_response)
                # Short pause between consecutive batch requests.
                time.sleep(0.2)
                if not response_json:
                    response_json = part_response
                else:
                    response_json = self.merge_response(response_json, part_response)
        else:
            response_json = self._post_graphql(self.__query)
            self.parse_gql_error(response_json)

        if "data" in response_json:
            # "data" may be null or lack the root field; treat both as
            # "no results" instead of raising KeyError/TypeError.
            query_response = (response_json["data"] or {}).get(self.__input_type)
            no_results = query_response is None or (isinstance(query_response, list) and not query_response)
            if no_results:
                logging.warning("Input produced no results. Check that input ids are valid")
        return response_json

    def parse_gql_error(self, response_json):
        """Raise ``ValueError`` when the response reports GraphQL errors.

        Args:
            response_json: Decoded JSON response of a GraphQL request.

        Raises:
            ValueError: With one numbered line per error message, followed by
                a hint to use ``get_editor_link()``.
        """
        errors = response_json.get("errors")
        if not errors:
            # A missing, null or empty "errors" entry means nothing to report.
            return
        numbered_messages = "".join(f"{number}. {error_dict['message']}\n" for number, error_dict in enumerate(errors, start=1))
        raise ValueError(f"{numbered_messages}Run <query object name>.get_editor_link() to get a link to GraphiQL editor with query")

    def batch_ids(self, batch_size) -> List[List[str]]:  # assumes that plural types have only one arg, which is true right now
        """Split the stored ID list into consecutive chunks.

        Args:
            batch_size: Maximum number of IDs per chunk; must be at least 1.

        Returns:
            A list of ID lists, each holding at most ``batch_size`` items, in
            the original order. Empty when there are no IDs.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 (a non-positive
                size can never consume the list).
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        ids = self.__input_ids_list
        return [ids[start:start + batch_size] for start in range(0, len(ids), batch_size)]

    def merge_response(self, merge_into_response, to_merge_response):
        """Append the results of one batch response to an accumulated response.

        Args:
            merge_into_response: Accumulated response; updated in place.
            to_merge_response: Response of the next batch.

        Returns:
            ``merge_into_response`` with ``data[<input_type>]`` extended by the
            results of ``to_merge_response``. Missing or null result lists on
            either side are treated as empty.
        """
        result_key = self.__input_type
        target_data = merge_into_response.get("data")
        if target_data is None:
            target_data = {}
            merge_into_response["data"] = target_data
        incoming_results = (to_merge_response.get("data") or {}).get(result_key) or []
        merged_results = list(target_data.get(result_key) or [])
        merged_results.extend(incoming_results)
        target_data[result_key] = merged_results
        return merge_into_response
|