rcsb.exdb 1.31__py3-none-any.whl → 1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/exdb/tree/TreeNodeListWorker.py +72 -49
- {rcsb_exdb-1.31.dist-info → rcsb_exdb-1.33.dist-info}/METADATA +2 -2
- {rcsb_exdb-1.31.dist-info → rcsb_exdb-1.33.dist-info}/RECORD +5 -42
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +0 -19
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +0 -12
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -104
- rcsb/exdb/tests/fixturePdbxLoader.py +0 -298
- rcsb/exdb/tests/test-data/components-abbrev.cif +0 -2739
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +0 -9171
- rcsb/exdb/tests/testAnnotationExtractor.py +0 -79
- rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -81
- rcsb/exdb/tests/testChemRefLoader.py +0 -106
- rcsb/exdb/tests/testChemRefMappingProvider.py +0 -95
- rcsb/exdb/tests/testCitationAdapter.py +0 -97
- rcsb/exdb/tests/testCitationExtractor.py +0 -93
- rcsb/exdb/tests/testCitationUtils.py +0 -92
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -70
- rcsb/exdb/tests/testEntryInfoProvider.py +0 -97
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -70
- rcsb/exdb/tests/testGlycanProvider.py +0 -98
- rcsb/exdb/tests/testGlycanUtils.py +0 -64
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -90
- rcsb/exdb/tests/testObjectExtractor.py +0 -342
- rcsb/exdb/tests/testObjectTransformer.py +0 -83
- rcsb/exdb/tests/testObjectUpdater.py +0 -120
- rcsb/exdb/tests/testPolymerEntityExtractor.py +0 -93
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -124
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -134
- rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -155
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -123
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +0 -106
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -121
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -122
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +0 -117
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +0 -94
- rcsb/exdb/tests/testTaxonomyExtractor.py +0 -75
- rcsb/exdb/tests/testTreeNodeListWorker.py +0 -111
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -99
- rcsb/exdb/tests/testUniProtExtractor.py +0 -77
- {rcsb_exdb-1.31.dist-info → rcsb_exdb-1.33.dist-info}/WHEEL +0 -0
- {rcsb_exdb-1.31.dist-info → rcsb_exdb-1.33.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,342 +0,0 @@
|
|
|
1
|
-
##
|
|
2
|
-
# File: ObjectExtractorTests.py
|
|
3
|
-
# Author: J. Westbrook
|
|
4
|
-
# Date: 25-Apr-2019
|
|
5
|
-
#
|
|
6
|
-
# Updates:
|
|
7
|
-
#
|
|
8
|
-
##
|
|
9
|
-
"""
|
|
10
|
-
Tests for extractor selected values from collections (limited tests from mock-data repos)
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
__docformat__ = "google en"
|
|
14
|
-
__author__ = "John Westbrook"
|
|
15
|
-
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
-
__license__ = "Apache 2.0"
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
import logging
|
|
20
|
-
import os
|
|
21
|
-
import platform
|
|
22
|
-
import resource
|
|
23
|
-
import pprint
|
|
24
|
-
import time
|
|
25
|
-
import unittest
|
|
26
|
-
from collections import defaultdict
|
|
27
|
-
|
|
28
|
-
from rcsb.db.mongo.Connection import Connection
|
|
29
|
-
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
30
|
-
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
31
|
-
from rcsb.utils.io.TimeUtil import TimeUtil
|
|
32
|
-
|
|
33
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
34
|
-
logger = logging.getLogger()
|
|
35
|
-
|
|
36
|
-
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
37
|
-
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class ObjectExtractorTests(unittest.TestCase):
|
|
41
|
-
def __init__(self, methodName="runTest"):
|
|
42
|
-
super(ObjectExtractorTests, self).__init__(methodName)
|
|
43
|
-
self.__verbose = False
|
|
44
|
-
|
|
45
|
-
def setUp(self):
|
|
46
|
-
#
|
|
47
|
-
self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
|
|
48
|
-
configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
|
|
49
|
-
#
|
|
50
|
-
configName = "site_info_configuration"
|
|
51
|
-
self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
|
|
52
|
-
#
|
|
53
|
-
self.__resourceName = "MONGO_DB"
|
|
54
|
-
self.__workPath = os.path.join(TOPDIR, "CACHE", "exdb")
|
|
55
|
-
#
|
|
56
|
-
self.__testEntryCacheKwargs = {"fmt": "json", "indent": 3}
|
|
57
|
-
self.__objectLimitTest = 5
|
|
58
|
-
#
|
|
59
|
-
self.__startTime = time.time()
|
|
60
|
-
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
|
|
61
|
-
|
|
62
|
-
def tearDown(self):
|
|
63
|
-
unitS = "MB" if platform.system() == "Darwin" else "GB"
|
|
64
|
-
rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
65
|
-
logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS)
|
|
66
|
-
endTime = time.time()
|
|
67
|
-
logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
|
|
68
|
-
|
|
69
|
-
def testCreateMultipleConnections(self):
|
|
70
|
-
"""Test case - multiple connection creation"""
|
|
71
|
-
try:
|
|
72
|
-
for _ in range(5):
|
|
73
|
-
with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
|
|
74
|
-
self.assertNotEqual(client, None)
|
|
75
|
-
except Exception as e:
|
|
76
|
-
logger.exception("Failing with %s", str(e))
|
|
77
|
-
self.fail()
|
|
78
|
-
|
|
79
|
-
def testExtractDrugbankMapping(self):
|
|
80
|
-
"""Test case - extract Drugbank mapping"""
|
|
81
|
-
try:
|
|
82
|
-
obEx = ObjectExtractor(
|
|
83
|
-
self.__cfgOb,
|
|
84
|
-
databaseName="dw",
|
|
85
|
-
collectionName="core_chem_comp",
|
|
86
|
-
cacheFilePath=os.path.join(self.__workPath, "drugbank-mapping-cache.json"),
|
|
87
|
-
useCache=False,
|
|
88
|
-
cacheKwargs=self.__testEntryCacheKwargs,
|
|
89
|
-
keyAttribute="chem_comp",
|
|
90
|
-
uniqueAttributes=["rcsb_id"],
|
|
91
|
-
selectionQuery={"rcsb_chem_comp_container_identifiers.drugbank_id": {"$exists": True}},
|
|
92
|
-
selectionList=["rcsb_id", "rcsb_chem_comp_container_identifiers", "rcsb_chem_comp_related"],
|
|
93
|
-
)
|
|
94
|
-
eCount = obEx.getCount()
|
|
95
|
-
logger.info("Component count ifs %d", eCount)
|
|
96
|
-
self.assertGreaterEqual(eCount, 3)
|
|
97
|
-
except Exception as e:
|
|
98
|
-
logger.exception("Failing with %s", str(e))
|
|
99
|
-
self.fail()
|
|
100
|
-
|
|
101
|
-
def testExtractEntriesBefore(self):
|
|
102
|
-
"""Test case - extract entries subject to date restriction"""
|
|
103
|
-
try:
|
|
104
|
-
tU = TimeUtil()
|
|
105
|
-
tS = tU.getTimestamp(useUtc=True, before={"days": 365 * 7})
|
|
106
|
-
tD = tU.getDateTimeObj(tS)
|
|
107
|
-
obEx = ObjectExtractor(
|
|
108
|
-
self.__cfgOb,
|
|
109
|
-
databaseName="pdbx_core",
|
|
110
|
-
collectionName="pdbx_core_entry",
|
|
111
|
-
useCache=False,
|
|
112
|
-
keyAttribute="entry",
|
|
113
|
-
uniqueAttributes=["rcsb_id"],
|
|
114
|
-
selectionQuery={"rcsb_accession_info.initial_release_date": {"$gt": tD}},
|
|
115
|
-
selectionList=["rcsb_id", "rcsb_accession_info"],
|
|
116
|
-
)
|
|
117
|
-
eD = obEx.getObjects()
|
|
118
|
-
eCount = obEx.getCount()
|
|
119
|
-
logger.info("Entry count is %d", eCount)
|
|
120
|
-
logger.info("Entries are %r", list(eD.keys()))
|
|
121
|
-
self.assertGreaterEqual(eCount, 5)
|
|
122
|
-
except Exception as e:
|
|
123
|
-
logger.exception("Failing with %s", str(e))
|
|
124
|
-
self.fail()
|
|
125
|
-
|
|
126
|
-
def testExtractEntries(self):
|
|
127
|
-
"""Test case - extract entries"""
|
|
128
|
-
try:
|
|
129
|
-
obEx = ObjectExtractor(
|
|
130
|
-
self.__cfgOb,
|
|
131
|
-
databaseName="pdbx_core",
|
|
132
|
-
collectionName="pdbx_core_entry",
|
|
133
|
-
cacheFilePath=os.path.join(self.__workPath, "entry-data-test-cache.json"),
|
|
134
|
-
useCache=False,
|
|
135
|
-
keyAttribute="entry",
|
|
136
|
-
uniqueAttributes=["rcsb_id"],
|
|
137
|
-
cacheKwargs=self.__testEntryCacheKwargs,
|
|
138
|
-
objectLimit=self.__objectLimitTest,
|
|
139
|
-
)
|
|
140
|
-
eCount = obEx.getCount()
|
|
141
|
-
logger.info("Entry count is %d", eCount)
|
|
142
|
-
self.assertGreaterEqual(eCount, self.__objectLimitTest)
|
|
143
|
-
|
|
144
|
-
objD = obEx.getObjects()
|
|
145
|
-
for _, obj in objD.items():
|
|
146
|
-
# obEx.genPathList(obj["software"], path=["software"])
|
|
147
|
-
obEx.genPathList(obj, path=None)
|
|
148
|
-
|
|
149
|
-
#
|
|
150
|
-
pL = obEx.getPathList(filterList=True)
|
|
151
|
-
obEx.setPathList(pL)
|
|
152
|
-
if self.__verbose:
|
|
153
|
-
for ky, obj in objD.items():
|
|
154
|
-
obEx.genValueList(obj, path=None)
|
|
155
|
-
tD = obEx.getValues()
|
|
156
|
-
logger.debug("Index object %r %s", ky, pprint.pformat(tD, indent=3, width=120))
|
|
157
|
-
|
|
158
|
-
except Exception as e:
|
|
159
|
-
logger.exception("Failing with %s", str(e))
|
|
160
|
-
self.fail()
|
|
161
|
-
|
|
162
|
-
def testExtractEntities(self):
|
|
163
|
-
"""Test case - extract entities"""
|
|
164
|
-
try:
|
|
165
|
-
obEx = ObjectExtractor(
|
|
166
|
-
self.__cfgOb,
|
|
167
|
-
databaseName="pdbx_core",
|
|
168
|
-
collectionName="pdbx_core_polymer_entity",
|
|
169
|
-
cacheFilePath=os.path.join(self.__workPath, "entity-data-test-cache.json"),
|
|
170
|
-
useCache=False,
|
|
171
|
-
keyAttribute="entity",
|
|
172
|
-
uniqueAttributes=["rcsb_id"],
|
|
173
|
-
cacheKwargs=self.__testEntryCacheKwargs,
|
|
174
|
-
objectLimit=self.__objectLimitTest,
|
|
175
|
-
)
|
|
176
|
-
eCount = obEx.getCount()
|
|
177
|
-
logger.info("Entity count is %d", eCount)
|
|
178
|
-
self.assertGreaterEqual(eCount, self.__objectLimitTest)
|
|
179
|
-
|
|
180
|
-
objD = obEx.getObjects()
|
|
181
|
-
for _, obj in objD.items():
|
|
182
|
-
obEx.genPathList(obj, path=None)
|
|
183
|
-
#
|
|
184
|
-
pL = obEx.getPathList(filterList=False)
|
|
185
|
-
logger.debug("Path list (unfiltered) %r", pL)
|
|
186
|
-
#
|
|
187
|
-
pL = obEx.getPathList()
|
|
188
|
-
logger.debug("Path list %r", pL)
|
|
189
|
-
obEx.setPathList(pL)
|
|
190
|
-
if self.__verbose:
|
|
191
|
-
for ky, obj in objD.items():
|
|
192
|
-
obEx.genValueList(obj, path=None)
|
|
193
|
-
tD = obEx.getValues()
|
|
194
|
-
logger.info("Index object %r %s", ky, pprint.pformat(tD, indent=3, width=120))
|
|
195
|
-
|
|
196
|
-
except Exception as e:
|
|
197
|
-
logger.exception("Failing with %s", str(e))
|
|
198
|
-
self.fail()
|
|
199
|
-
|
|
200
|
-
def testExtractSelectedEntityContent(self):
|
|
201
|
-
"""Test case - extract selected entity content
|
|
202
|
-
|
|
203
|
-
"reference_sequence_identifiers": [
|
|
204
|
-
{
|
|
205
|
-
"database_name": "UniProt",
|
|
206
|
-
"database_accession": "Q5SHN1",
|
|
207
|
-
"provenance_source": "SIFTS"
|
|
208
|
-
},
|
|
209
|
-
{
|
|
210
|
-
"database_name": "UniProt",
|
|
211
|
-
"database_accession": "Q5SHN1",
|
|
212
|
-
"provenance_source": "PDB"
|
|
213
|
-
}
|
|
214
|
-
]
|
|
215
|
-
"""
|
|
216
|
-
try:
|
|
217
|
-
obEx = ObjectExtractor(
|
|
218
|
-
self.__cfgOb,
|
|
219
|
-
databaseName="pdbx_core",
|
|
220
|
-
collectionName="pdbx_core_polymer_entity",
|
|
221
|
-
cacheFilePath=os.path.join(self.__workPath, "entity-selected-content-test-cache.json"),
|
|
222
|
-
useCache=False,
|
|
223
|
-
keyAttribute="entity",
|
|
224
|
-
uniqueAttributes=["rcsb_id"],
|
|
225
|
-
cacheKwargs=self.__testEntryCacheKwargs,
|
|
226
|
-
# objectLimit=self.__objectLimitTest,
|
|
227
|
-
objectLimit=None,
|
|
228
|
-
selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
|
|
229
|
-
selectionList=["rcsb_id", "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"],
|
|
230
|
-
)
|
|
231
|
-
eCount = obEx.getCount()
|
|
232
|
-
logger.info("Entity count is %d", eCount)
|
|
233
|
-
#
|
|
234
|
-
#
|
|
235
|
-
if self.__objectLimitTest is not None:
|
|
236
|
-
self.assertGreaterEqual(eCount, self.__objectLimitTest)
|
|
237
|
-
objD = obEx.getObjects()
|
|
238
|
-
for _, obj in objD.items():
|
|
239
|
-
obEx.genPathList(obj, path=None)
|
|
240
|
-
#
|
|
241
|
-
pL = obEx.getPathList(filterList=False)
|
|
242
|
-
logger.debug("Path list (unfiltered) %r", pL)
|
|
243
|
-
#
|
|
244
|
-
pL = obEx.getPathList()
|
|
245
|
-
logger.debug("Path list %r", pL)
|
|
246
|
-
obEx.setPathList(pL)
|
|
247
|
-
if self.__verbose:
|
|
248
|
-
for ky, obj in objD.items():
|
|
249
|
-
obEx.genValueList(obj, path=None)
|
|
250
|
-
tD = obEx.getValues()
|
|
251
|
-
logger.info("Index object %r %s", ky, pprint.pformat(tD, indent=3, width=120))
|
|
252
|
-
|
|
253
|
-
objD = obEx.getObjects()
|
|
254
|
-
# logger.info("objD.keys() %r", list(objD.keys()))
|
|
255
|
-
totCount = 0
|
|
256
|
-
difCount = 0
|
|
257
|
-
pdbUnpIdD = defaultdict(int)
|
|
258
|
-
siftsUnpIdD = defaultdict(int)
|
|
259
|
-
pdbDifUnpIdD = defaultdict(int)
|
|
260
|
-
for entityKey, eD in objD.items():
|
|
261
|
-
try:
|
|
262
|
-
siftsS = set()
|
|
263
|
-
pdbS = set()
|
|
264
|
-
for tD in eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]:
|
|
265
|
-
if tD["database_name"] == "UniProt":
|
|
266
|
-
if tD["provenance_source"] == "SIFTS":
|
|
267
|
-
siftsS.add(tD["database_accession"])
|
|
268
|
-
siftsUnpIdD[tD["database_accession"]] += 1
|
|
269
|
-
elif tD["provenance_source"] == "PDB":
|
|
270
|
-
pdbS.add(tD["database_accession"])
|
|
271
|
-
pdbUnpIdD[tD["database_accession"]] += 1
|
|
272
|
-
else:
|
|
273
|
-
logger.debug("No UniProt for %r", eD["rcsb_polymer_entity_container_identifiers"])
|
|
274
|
-
logger.debug("PDB assigned sequence length %d", len(pdbS))
|
|
275
|
-
logger.debug("SIFTS assigned sequence length %d", len(siftsS))
|
|
276
|
-
|
|
277
|
-
if pdbS and siftsS:
|
|
278
|
-
totCount += 1
|
|
279
|
-
if pdbS != siftsS:
|
|
280
|
-
difCount += 1
|
|
281
|
-
for idV in pdbS:
|
|
282
|
-
pdbDifUnpIdD[idV] += 1
|
|
283
|
-
|
|
284
|
-
except Exception as e:
|
|
285
|
-
logger.warning("No identifiers for %s with %s", entityKey, str(e))
|
|
286
|
-
logger.info("Total %d differences %d", totCount, difCount)
|
|
287
|
-
logger.info("Unique UniProt ids PDB %d SIFTS %d", len(pdbUnpIdD), len(siftsUnpIdD))
|
|
288
|
-
logger.info("Unique UniProt differences %d ", len(pdbDifUnpIdD))
|
|
289
|
-
except Exception as e:
|
|
290
|
-
logger.exception("Failing with %s", str(e))
|
|
291
|
-
self.fail()
|
|
292
|
-
|
|
293
|
-
def testExtractEntityTaxonomyContent(self):
|
|
294
|
-
"""Test case - extract unique entity source and host taxonomies"""
|
|
295
|
-
try:
|
|
296
|
-
obEx = ObjectExtractor(
|
|
297
|
-
self.__cfgOb,
|
|
298
|
-
databaseName="pdbx_core",
|
|
299
|
-
collectionName="pdbx_core_polymer_entity",
|
|
300
|
-
cacheFilePath=os.path.join(self.__workPath, "entity-taxonomy-test-cache.json"),
|
|
301
|
-
useCache=False,
|
|
302
|
-
keyAttribute="entity",
|
|
303
|
-
uniqueAttributes=["rcsb_id"],
|
|
304
|
-
cacheKwargs=self.__testEntryCacheKwargs,
|
|
305
|
-
# objectLimit=self.__objectLimitTest,
|
|
306
|
-
objectLimit=None,
|
|
307
|
-
selectionQuery=None,
|
|
308
|
-
selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
|
|
309
|
-
)
|
|
310
|
-
eCount = obEx.getCount()
|
|
311
|
-
logger.info("Polymer entity count is %d", eCount)
|
|
312
|
-
taxIdS = set()
|
|
313
|
-
objD = obEx.getObjects()
|
|
314
|
-
for _, eD in objD.items():
|
|
315
|
-
try:
|
|
316
|
-
for tD in eD["rcsb_entity_source_organism"]:
|
|
317
|
-
taxIdS.add(tD["ncbi_taxonomy_id"])
|
|
318
|
-
except Exception:
|
|
319
|
-
pass
|
|
320
|
-
try:
|
|
321
|
-
for tD in eD["rcsb_entity_host_organism"]:
|
|
322
|
-
taxIdS.add(tD["ncbi_taxonomy_id"])
|
|
323
|
-
except Exception:
|
|
324
|
-
pass
|
|
325
|
-
|
|
326
|
-
logger.info("Unique taxons %d", len(taxIdS))
|
|
327
|
-
except Exception as e:
|
|
328
|
-
logger.exception("Failing with %s", str(e))
|
|
329
|
-
self.fail()
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
def objectExtractorSuite():
|
|
333
|
-
suiteSelect = unittest.TestSuite()
|
|
334
|
-
suiteSelect.addTest(ObjectExtractorTests("testExtractEntries"))
|
|
335
|
-
suiteSelect.addTest(ObjectExtractorTests("testExtractEntities"))
|
|
336
|
-
suiteSelect.addTest(ObjectExtractorTests("testExtractSelectedEntityContent"))
|
|
337
|
-
return suiteSelect
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
if __name__ == "__main__":
|
|
341
|
-
mySuite = objectExtractorSuite()
|
|
342
|
-
unittest.TextTestRunner(verbosity=2).run(mySuite)
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
##
|
|
2
|
-
# File: ObjectTransformerTests.py
|
|
3
|
-
# Author: J. Westbrook
|
|
4
|
-
# Date: 25-Apr-2019
|
|
5
|
-
#
|
|
6
|
-
# Updates:
|
|
7
|
-
#
|
|
8
|
-
##
|
|
9
|
-
"""
|
|
10
|
-
Tests for extractor and updater or selected values from collections (limited tests from mock-data repos)
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
__docformat__ = "google en"
|
|
14
|
-
__author__ = "John Westbrook"
|
|
15
|
-
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
-
__license__ = "Apache 2.0"
|
|
17
|
-
|
|
18
|
-
import logging
|
|
19
|
-
import os
|
|
20
|
-
import platform
|
|
21
|
-
import resource
|
|
22
|
-
import time
|
|
23
|
-
import unittest
|
|
24
|
-
|
|
25
|
-
from rcsb.exdb.utils.ObjectTransformer import ObjectTransformer
|
|
26
|
-
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
27
|
-
|
|
28
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
29
|
-
logger = logging.getLogger()
|
|
30
|
-
|
|
31
|
-
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
32
|
-
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class ObjectTransformerTests(unittest.TestCase):
|
|
36
|
-
def __init__(self, methodName="runTest"):
|
|
37
|
-
super(ObjectTransformerTests, self).__init__(methodName)
|
|
38
|
-
self.__verbose = True
|
|
39
|
-
|
|
40
|
-
def setUp(self):
|
|
41
|
-
#
|
|
42
|
-
self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
|
|
43
|
-
configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
|
|
44
|
-
#
|
|
45
|
-
configName = "site_info_configuration"
|
|
46
|
-
self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
|
|
47
|
-
#
|
|
48
|
-
self.__fetchLimit = 5
|
|
49
|
-
#
|
|
50
|
-
self.__startTime = time.time()
|
|
51
|
-
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
|
|
52
|
-
|
|
53
|
-
def tearDown(self):
|
|
54
|
-
unitS = "MB" if platform.system() == "Darwin" else "GB"
|
|
55
|
-
rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
56
|
-
logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS)
|
|
57
|
-
endTime = time.time()
|
|
58
|
-
logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
|
|
59
|
-
|
|
60
|
-
def testTranformEntityProteinContent(self):
|
|
61
|
-
"""Test case - transform selected entity protein documents"""
|
|
62
|
-
try:
|
|
63
|
-
databaseName = "pdbx_core"
|
|
64
|
-
collectionName = "pdbx_core_polymer_entity"
|
|
65
|
-
obTr = ObjectTransformer(self.__cfgOb)
|
|
66
|
-
ok = obTr.doTransform(
|
|
67
|
-
databaseName=databaseName, collectionName=collectionName, fetchLimit=self.__fetchLimit, selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"}
|
|
68
|
-
)
|
|
69
|
-
self.assertTrue(ok)
|
|
70
|
-
except Exception as e:
|
|
71
|
-
logger.exception("Failing with %s", str(e))
|
|
72
|
-
self.fail()
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def objectTransformerSuite():
|
|
76
|
-
suiteSelect = unittest.TestSuite()
|
|
77
|
-
suiteSelect.addTest(ObjectTransformerTests("testTransformEntityProteinContent"))
|
|
78
|
-
return suiteSelect
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
if __name__ == "__main__":
|
|
82
|
-
mySuite = objectTransformerSuite()
|
|
83
|
-
unittest.TextTestRunner(verbosity=2).run(mySuite)
|
|
@@ -1,120 +0,0 @@
|
|
|
1
|
-
##
|
|
2
|
-
# File: ObjectUpdaterTests.py
|
|
3
|
-
# Author: J. Westbrook
|
|
4
|
-
# Date: 25-Apr-2019
|
|
5
|
-
#
|
|
6
|
-
# Updates:
|
|
7
|
-
#
|
|
8
|
-
##
|
|
9
|
-
"""
|
|
10
|
-
Tests for extractor and updater or selected values from collections (limited tests from mock-data repos)
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
__docformat__ = "google en"
|
|
14
|
-
__author__ = "John Westbrook"
|
|
15
|
-
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
-
__license__ = "Apache 2.0"
|
|
17
|
-
|
|
18
|
-
import logging
|
|
19
|
-
import os
|
|
20
|
-
import platform
|
|
21
|
-
import resource
|
|
22
|
-
import time
|
|
23
|
-
import unittest
|
|
24
|
-
|
|
25
|
-
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
26
|
-
from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
|
|
27
|
-
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
28
|
-
|
|
29
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
30
|
-
logger = logging.getLogger()
|
|
31
|
-
|
|
32
|
-
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
33
|
-
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class ObjectUpdaterTests(unittest.TestCase):
|
|
37
|
-
def __init__(self, methodName="runTest"):
|
|
38
|
-
super(ObjectUpdaterTests, self).__init__(methodName)
|
|
39
|
-
self.__verbose = True
|
|
40
|
-
|
|
41
|
-
def setUp(self):
|
|
42
|
-
#
|
|
43
|
-
self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
|
|
44
|
-
configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
|
|
45
|
-
#
|
|
46
|
-
configName = "site_info_configuration"
|
|
47
|
-
self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
|
|
48
|
-
#
|
|
49
|
-
self.__workPath = os.path.join(TOPDIR, "CACHE", "exdb")
|
|
50
|
-
self.__testEntryCacheKwargs = {"fmt": "json", "indent": 3}
|
|
51
|
-
self.__objectLimitTest = 5
|
|
52
|
-
#
|
|
53
|
-
self.__startTime = time.time()
|
|
54
|
-
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
|
|
55
|
-
|
|
56
|
-
def tearDown(self):
|
|
57
|
-
unitS = "MB" if platform.system() == "Darwin" else "GB"
|
|
58
|
-
rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
59
|
-
logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS)
|
|
60
|
-
endTime = time.time()
|
|
61
|
-
logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
|
|
62
|
-
|
|
63
|
-
def testUpdateSelectedEntityContent(self):
|
|
64
|
-
"""Test case - update of selected entity reference sequence content"""
|
|
65
|
-
try:
|
|
66
|
-
databaseName = "pdbx_core"
|
|
67
|
-
collectionName = "pdbx_core_polymer_entity"
|
|
68
|
-
obEx = ObjectExtractor(
|
|
69
|
-
self.__cfgOb,
|
|
70
|
-
databaseName=databaseName,
|
|
71
|
-
collectionName=collectionName,
|
|
72
|
-
cacheFilePath=os.path.join(self.__workPath, "entity-selected-content-test-cache.json"),
|
|
73
|
-
useCache=False,
|
|
74
|
-
keyAttribute="entity",
|
|
75
|
-
uniqueAttributes=["rcsb_id"],
|
|
76
|
-
cacheKwargs=self.__testEntryCacheKwargs,
|
|
77
|
-
objectLimit=self.__objectLimitTest,
|
|
78
|
-
# objectLimit=None,
|
|
79
|
-
selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
|
|
80
|
-
selectionList=["rcsb_id", "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"],
|
|
81
|
-
)
|
|
82
|
-
eCount = obEx.getCount()
|
|
83
|
-
logger.info("Entity count is %d", eCount)
|
|
84
|
-
objD = obEx.getObjects()
|
|
85
|
-
updateDL = []
|
|
86
|
-
for entityKey, eD in objD.items():
|
|
87
|
-
try:
|
|
88
|
-
selectD = {"rcsb_id": entityKey}
|
|
89
|
-
tL = (
|
|
90
|
-
eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
|
|
91
|
-
if "reference_sequence_identifiers" in eD["rcsb_polymer_entity_container_identifiers"]
|
|
92
|
-
else []
|
|
93
|
-
)
|
|
94
|
-
tL.append({"database_accession": "1111111", "database_name": "PDB", "provenance_source": "RCSB"})
|
|
95
|
-
#
|
|
96
|
-
updateD = {"rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers": tL}
|
|
97
|
-
updateDL.append({"selectD": selectD, "updateD": updateD})
|
|
98
|
-
except Exception as e:
|
|
99
|
-
logger.exception("Failing with %s", str(e))
|
|
100
|
-
for ii, uD in enumerate(updateDL):
|
|
101
|
-
logger.debug(" >>>> (%d) selectD %r updateD %r", ii, uD["selectD"], uD["updateD"])
|
|
102
|
-
#
|
|
103
|
-
obUpd = ObjectUpdater(self.__cfgOb)
|
|
104
|
-
numUpd = obUpd.update(databaseName, collectionName, updateDL)
|
|
105
|
-
self.assertGreaterEqual(numUpd, len(updateDL))
|
|
106
|
-
logger.info("Update count is %d", numUpd)
|
|
107
|
-
except Exception as e:
|
|
108
|
-
logger.exception("Failing with %s", str(e))
|
|
109
|
-
self.fail()
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def objectUpdaterSuite():
|
|
113
|
-
suiteSelect = unittest.TestSuite()
|
|
114
|
-
suiteSelect.addTest(ObjectUpdaterTests("testUpdateSelectedEntityContent"))
|
|
115
|
-
return suiteSelect
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
if __name__ == "__main__":
|
|
119
|
-
mySuite = objectUpdaterSuite()
|
|
120
|
-
unittest.TextTestRunner(verbosity=2).run(mySuite)
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
##
|
|
2
|
-
# File: PolymerEntityExtractorTests.py
|
|
3
|
-
# Author: J. Westbrook
|
|
4
|
-
# Date: 5-Dec-2020
|
|
5
|
-
#
|
|
6
|
-
# Updates:
|
|
7
|
-
#
|
|
8
|
-
##
|
|
9
|
-
"""
|
|
10
|
-
Tests for extraction of polymer entity sequence details from the ExDB core collections.
|
|
11
|
-
"""
|
|
12
|
-
__docformat__ = "google en"
|
|
13
|
-
__author__ = "John Westbrook"
|
|
14
|
-
__email__ = "jwest@rcsb.rutgers.edu"
|
|
15
|
-
__license__ = "Apache 2.0"
|
|
16
|
-
|
|
17
|
-
import logging
|
|
18
|
-
import os
|
|
19
|
-
import platform
|
|
20
|
-
import resource
|
|
21
|
-
import time
|
|
22
|
-
import unittest
|
|
23
|
-
|
|
24
|
-
from rcsb.exdb.seq.PolymerEntityExtractor import PolymerEntityExtractor
|
|
25
|
-
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
29
|
-
logger = logging.getLogger()
|
|
30
|
-
|
|
31
|
-
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
32
|
-
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class PolymerEntityExtractorTests(unittest.TestCase):
|
|
36
|
-
def __init__(self, methodName="runTest"):
|
|
37
|
-
super(PolymerEntityExtractorTests, self).__init__(methodName)
|
|
38
|
-
self.__verbose = True
|
|
39
|
-
|
|
40
|
-
def setUp(self):
|
|
41
|
-
#
|
|
42
|
-
self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
|
|
43
|
-
configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
|
|
44
|
-
configName = "site_info_configuration"
|
|
45
|
-
self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
|
|
46
|
-
#
|
|
47
|
-
self.__fastaPath = os.path.join(HERE, "test-output", "CACHE", "pdb-protein-entity.fa")
|
|
48
|
-
self.__taxonPath = os.path.join(HERE, "test-output", "CACHE", "pdb-protein-entity-taxon.tdd")
|
|
49
|
-
self.__detailsPath = os.path.join(HERE, "test-output", "CACHE", "pdb-protein-entity-details.json")
|
|
50
|
-
#
|
|
51
|
-
self.__startTime = time.time()
|
|
52
|
-
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
|
|
53
|
-
|
|
54
|
-
def tearDown(self):
|
|
55
|
-
unitS = "MB" if platform.system() == "Darwin" else "GB"
|
|
56
|
-
rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
57
|
-
logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS)
|
|
58
|
-
endTime = time.time()
|
|
59
|
-
logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
|
|
60
|
-
|
|
61
|
-
def testGetProteinEntityDetails(self):
|
|
62
|
-
"""Test case - get protein entity sequences and essential details"""
|
|
63
|
-
try:
|
|
64
|
-
pEx = PolymerEntityExtractor(self.__cfgOb)
|
|
65
|
-
pD, _ = pEx.getProteinSequenceDetails()
|
|
66
|
-
#
|
|
67
|
-
self.assertGreaterEqual(len(pD), 70)
|
|
68
|
-
logger.info("Polymer entity count %d", len(pD))
|
|
69
|
-
except Exception as e:
|
|
70
|
-
logger.exception("Failing with %s", str(e))
|
|
71
|
-
self.fail()
|
|
72
|
-
|
|
73
|
-
def testExportProteinEntityFasta(self):
|
|
74
|
-
"""Test case - export protein entity sequence Fasta"""
|
|
75
|
-
try:
|
|
76
|
-
pEx = PolymerEntityExtractor(self.__cfgOb)
|
|
77
|
-
ok = pEx.exportProteinEntityFasta(self.__fastaPath, self.__taxonPath, self.__detailsPath)
|
|
78
|
-
self.assertTrue(ok)
|
|
79
|
-
except Exception as e:
|
|
80
|
-
logger.exception("Failing with %s", str(e))
|
|
81
|
-
self.fail()
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def extractorSuite():
|
|
85
|
-
suiteSelect = unittest.TestSuite()
|
|
86
|
-
suiteSelect.addTest(PolymerEntityExtractorTests("testGetProteinEntityDetails"))
|
|
87
|
-
suiteSelect.addTest(PolymerEntityExtractorTests("testExportProteinEntityFasta"))
|
|
88
|
-
return suiteSelect
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
if __name__ == "__main__":
|
|
92
|
-
mySuite = extractorSuite()
|
|
93
|
-
unittest.TextTestRunner(verbosity=2).run(mySuite)
|