rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ObjectExtractor.py
|
|
3
|
+
# Date: 26-Jun-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to extract document features from the document object server.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 27-Jun-2019 jdw add JSON path tracking utilities.
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
import copy
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
from rcsb.db.mongo.Connection import Connection
|
|
21
|
+
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
|
|
22
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ObjectExtractor(object):
    """Utilities to extract document features from the document object server.

    The constructor runs the document selection (or loads a previously cached
    result) and retains the resulting object dictionary.  Companion methods
    enumerate JSON paths over the cached objects and harvest values at
    registered paths.
    """

    def __init__(self, cfgOb, **kwargs):
        """
        Args:
            cfgOb (object): configuration object providing MongoDB connection details
            **kwargs: selection and caching options forwarded to the cache rebuild
                step (databaseName, collectionName, selectionQuery, selectionList,
                uniqueAttributes, cacheFilePath, cacheKwargs, useCache,
                keyAttribute, objectLimit, stripObjectId, logIncrement)
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        #
        # Selection runs (or the cache is loaded) eagerly at construction time.
        self.__objectD = self.__rebuildCache(**kwargs)
        self.__objPathD = {}  # JSON path -> occurrence count (or True via setPathList)
        self.__stringPathList = []
        self.__objValD = {}  # JSON path -> harvested value(s)
        #

    def getObjects(self):
        """Return the dictionary of selected objects keyed by the unique attribute(s)."""
        return self.__objectD

    def getPathList(self, filterList=True):
        """Return the sorted list of JSON paths accumulated by genPathList()/setPathList().

        Args:
            filterList (bool, optional): when True, suppress "_id", bare list markers,
                and any path that is a substring of a longer recorded path, keeping
                only the most specific leaf paths. Defaults to True.

        Returns:
            list: sorted JSON path strings
        """
        if not filterList:
            return sorted(self.__objPathD.keys())
        # Candidate paths: dotted or underscore-prefixed keys, excluding the
        # Mongo document id and bare list markers ("...[]").
        tL = [ky for ky in self.__objPathD if ky and (ky.find(".") != -1 or ky.startswith("_")) and ky not in ["_id"] and not ky.endswith("[]")]
        # Keep only paths that are not substrings of another candidate path.
        kL = [ky for ky in tL if not any((ky in tky and ky != tky) for tky in tL)]
        return sorted(kL)

    def getValues(self):
        """Return the dictionary of values harvested by genValueList()."""
        return self.__objValD

    def setPathList(self, stringPathList):
        """Register the JSON paths for which genValueList() should harvest values."""
        self.__objPathD = {k: True for k in stringPathList}
        return True

    def getCount(self):
        """Return the number of selected objects."""
        return len(self.__objectD)

    def __rebuildCache(self, **kwargs):
        """Load the selection result from cache when available, otherwise run the
        object selection and optionally persist the result.

        Returns:
            dict: {<unique attribute key>: <document>, ...} (empty on failure)
        """
        cacheFilePath = kwargs.get("cacheFilePath", None)
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        keyAttribute = kwargs.get("keyAttribute", "entry")
        selectL = kwargs.get("selectionList", [])
        #
        cD = {keyAttribute: {}}
        try:
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                tD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
                # Guard against a failed or malformed cache import -- previously a bad
                # cache file could make the return below raise outside this handler.
                if isinstance(tD, dict) and keyAttribute in tD:
                    cD = tD
                else:
                    logger.warning("Ignoring malformed cache file %s", cacheFilePath)
            else:
                objectD = self.__select(**kwargs) if selectL else self.__selectObjects(**kwargs)
                cD[keyAttribute] = objectD
                if cacheFilePath:
                    pth, _ = os.path.split(cacheFilePath)
                    self.__mU.mkdir(pth)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved object results (%d) status %r in %s", len(objectD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD[keyAttribute]

    def __selectObjects(self, **kwargs):
        """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit).

        Documents are first selected by "_id" and then fetched one at a time to
        bound the memory footprint for large collections.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        #
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        stripObjectId = kwargs.get("stripObjectId", False)
        logIncrement = kwargs.get("logIncrement", 10000)
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    numDoc = len(dL) if dL else 0
                    logger.info("Selection %r fetch result count %d", selectL, numDoc)
                    #
                    for ii, dD in enumerate(dL, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        if stripObjectId and rObj and "_id" in rObj:
                            rObj.pop("_id")
                        else:
                            # Stringify the ObjectId so downstream serialization works.
                            rObj["_id"] = str(rObj["_id"])
                        #
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
                        logger.debug("Saving %d %s", ii, stKey)
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Extracting object (%d of %d)", ii, numDoc)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
        #

    def __select(self, **kwargs):
        """Return a dictionary of object content satisfying the input conditions
        (e.g. method, resolution limit) and selection options.

        Unlike __selectObjects(), only the attributes in "selectionList" are
        fetched, in a single bulk query with the Mongo "_id" suppressed.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        selectL = kwargs.get("selectionList", [])
        stripObjectId = kwargs.get("stripObjectId", False)
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD, suppressId=True)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for ii, rObj in enumerate(dL, 1):
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        if stripObjectId and rObj and "_id" in rObj:
                            rObj.pop("_id")
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
        #

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values of corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notation

        Returns:
            tuple: tuple of values corresponding to the input key names
        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames, str(e))
        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested)."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    # Missing segment anywhere along the path yields None.
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))
        return None

    def __toJsonPathString(self, path):
        """Render a path list (keys and [] list markers) as a dotted path string."""
        pL = [ky if ky else "[]" for ky in path]
        sp = ".".join(pL)
        # Attach list markers directly to the parent key ("a.[]" -> "a[]").
        sp = sp.replace(".[", "[")
        return sp

    def __pathCallBack(self, path, value):
        """Walk callback: count occurrences of each JSON path; value passes through."""
        sp = self.__toJsonPathString(path)
        self.__objPathD[sp] = self.__objPathD.get(sp, 0) + 1
        return value

    def __saveCallBack(self, path, value):
        """Walk callback: record values at registered JSON paths; value passes through."""
        sP = self.__toJsonPathString(path)
        if sP in self.__objPathD:
            ky = sP.replace("[]", "")
            if sP.find("[") != -1:  # multivalued
                if isinstance(value, list):
                    self.__objValD.setdefault(ky, []).extend(value)
                else:
                    self.__objValD.setdefault(ky, []).append(value)
            else:
                self.__objValD[ky] = value
        return value

    def genPathList(self, dObj, path=None):
        """Accumulate the JSON paths of dObj into the internal path dictionary."""
        return self.__walk(dObj, jsonPath=path, funct=self.__pathCallBack)

    def genValueList(self, dObj, path=None, clear=True):
        """Harvest values of dObj at the registered JSON paths (optionally clearing prior results)."""
        self.__objValD = {} if clear else self.__objValD
        return self.__walk(dObj, jsonPath=path, funct=self.__saveCallBack)

    def __walk(self, jsonObj, jsonPath=None, funct=None):
        """Walk JSON data types. An optional funct() is called to mutate
        the value of each element. The jsonPath is updated at each element.
        """
        if jsonPath is None:
            jsonPath = []

        if isinstance(jsonObj, dict):
            value = {k: self.__walk(v, jsonPath + [k], funct) for k, v in jsonObj.items()}
        elif isinstance(jsonObj, list):
            # List elements are marked with an empty-list sentinel in the path.
            value = [self.__walk(elem, jsonPath + [[]], funct) for elem in jsonObj]
        else:
            value = jsonObj

        if funct is None:
            return value
        else:
            return funct(jsonPath, value)

    def __toPath(self, path):
        """Convert path strings into path lists."""
        if isinstance(path, list):
            return path  # already in list format

        def _iterPath(path):
            for parts in path.split("[]"):
                for part in parts.strip(".").split("."):
                    yield part
                yield []

        # Drop the trailing list marker emitted after the final segment.
        return list(_iterPath(path))[:-1]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ObjectTransformer.py
|
|
3
|
+
# Date: 17-Oct-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to extract and update object from the document object server.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
##
|
|
10
|
+
__docformat__ = "google en"
|
|
11
|
+
__author__ = "John Westbrook"
|
|
12
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
13
|
+
__license__ = "Apache 2.0"
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
from rcsb.db.mongo.Connection import Connection
|
|
18
|
+
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
|
|
19
|
+
from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
|
|
20
|
+
from rcsb.db.utils.TimeUtil import TimeUtil
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ObjectTransformer(object):
    """Utilities to extract and update object from the document object server."""

    def __init__(self, cfgOb, objectAdapter=None, **kwargs):
        """
        Args:
            cfgOb (object): configuration object providing MongoDB connection details
            objectAdapter (object, optional): adapter exposing filter(obj) -> (bool, obj)
                that is applied to each fetched document before it is stored back.
        """
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []

    def doTransform(self, **kwargs):
        """Select documents, apply the object adapter to each, replace the stored
        copies, and record the exchange status for the update.

        Returns:
            bool: True when the transform and status update both succeed
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList
        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query."""
        # Initialize before the try block: previously a missing collection or an
        # early exception left dL unbound and the return raised UnboundLocalError.
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
        #

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=10000):
        """Fetch, filter, and replace each selected document.

        Returns:
            bool: True when every replacement succeeds, False otherwise
        """
        #
        ok = True
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        # The Mongo id stays in dD for the replace() selection below.
                        del rObj["_id"]
                        #
                        fOk = True
                        if self.__oAdapt:
                            fOk, rObj = self.__oAdapt.filter(rObj)
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName, rObj, dD)
                            if rOk is None:
                                tId = rObj["rcsb_id"] if "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                logger.debug("rObj.keys() %r", list(rObj.keys()))
                                logger.debug("rObj.items() %s", rObj.items())
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getLoadStatus(self):
        """Return the accumulated data exchange status records."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a data exchange status record for this update cycle."""
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ObjectUpdater.py
|
|
3
|
+
# Date: 9-Oct-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to update document features from the document object server.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
from rcsb.db.mongo.Connection import Connection
|
|
19
|
+
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ObjectUpdater(object):
    """Utilities to update document features from the document object server."""

    def __init__(self, cfgOb, **kwargs):
        """
        Args:
            cfgOb (object): configuration object providing MongoDB connection details
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        #

    def update(self, databaseName, collectionName, updateDL):
        """Update documents satisfying the selection details with the content of updateDL.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            updateDL (list): [{selectD: ..., updateD: ...}, ...] where
                selectD = {'ky1': 'val1', 'ky2': 'val2', ...} selects the documents and
                updateD = {'key1.subkey1...': 'val1', 'key2.subkey2..': 'val2', ...}
                carries the replacement content (dot notation keys)

        Returns:
            int: number of updated documents (0 on failure)
        """
        numUpdated = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    logger.debug("%s %s document count is %d", databaseName, collectionName, mongoUtil.count(databaseName, collectionName))
                    for itemD in updateDL:
                        # Upsert so that documents missing from the selection are created.
                        numUpdated += mongoUtil.update(databaseName, collectionName, itemD["updateD"], itemD["selectD"], upsertFlag=True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numUpdated

    def count(self, databaseName, collectionName):
        """Return the document count for the collection (0 when absent or on failure)."""
        numTotal = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    numTotal = mongoUtil.count(databaseName, collectionName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numTotal

    def createCollection(self, databaseName, collectionName, indexAttributeNames=None, indexName="primary", checkExists=False, bsonSchema=None):
        """Create collection and optionally set index attributes for the named index and validation schema for a new collection.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): list of attribute names for the 'primary' index. Defaults to None.
            indexName (str, optional): name of the index to create. Defaults to "primary".
            checkExists (bool, optional): reuse an existing collection if True. Defaults to False.
            bsonSchema (object, optional): BSON compatable validation schema. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        try:
            logger.debug("Create database %s collection %s", databaseName, collectionName)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                reuse = checkExists and mongoUtil.databaseExists(databaseName) and mongoUtil.collectionExists(databaseName, collectionName)
                createOk = True if reuse else mongoUtil.createCollection(databaseName, collectionName, bsonSchema=bsonSchema)
                dbOk = mongoUtil.databaseExists(databaseName)
                collOk = mongoUtil.collectionExists(databaseName, collectionName)
                indexOk = True
                if indexAttributeNames:
                    indexOk = mongoUtil.createIndex(databaseName, collectionName, indexAttributeNames, indexName=indexName, indexType="DESCENDING", uniqueFlag=False)
                return createOk and dbOk and collOk and indexOk
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def delete(self, databaseName, collectionName, selectD):
        """Remove documents satisfying the input selection details.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            selectD (dict): {'ky1': 'val1', 'ky2': 'val2', ...} selection query

        Returns:
            int: number of deleted documents (0 on failure)
        """
        numDeleted = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mongoUtil.count(databaseName, collectionName))
                    numDeleted = mongoUtil.delete(databaseName, collectionName, selectD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numDeleted
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ObjectValidator.py
|
|
3
|
+
# Date: 17-Oct-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to extract and update object from the document object server including validation.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
##
|
|
10
|
+
__docformat__ = "google en"
|
|
11
|
+
__author__ = "John Westbrook"
|
|
12
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
13
|
+
__license__ = "Apache 2.0"
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
from jsonschema import Draft4Validator
|
|
18
|
+
from jsonschema import FormatChecker
|
|
19
|
+
|
|
20
|
+
from rcsb.db.mongo.Connection import Connection
|
|
21
|
+
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
|
|
22
|
+
from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
|
|
23
|
+
from rcsb.db.utils.SchemaProvider import SchemaProvider
|
|
24
|
+
from rcsb.db.utils.TimeUtil import TimeUtil
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ObjectValidator(object):
    """Utilities to extract and update object from the document object server with validation."""

    def __init__(self, cfgOb, objectAdapter=None, cachePath=".", useCache=True, **kwargs):
        """
        Args:
            cfgOb (object): configuration object providing MongoDB connection details
            objectAdapter (object, optional): adapter exposing filter(obj) -> (bool, obj)
                applied to each fetched document before it is stored back.
            cachePath (str, optional): path used by the schema provider cache. Defaults to ".".
            useCache (bool, optional): reuse cached schema artifacts. Defaults to True.
        """
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, cachePath, useCache=useCache)
        self.__valInst = None  # lazily built per-collection validator

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        """Build a Draft4 JSON schema validator for the named database/collection."""
        _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        """Log each schema violation for rObj and return the violation count."""
        eCount = 0
        try:
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount

    def doTransform(self, **kwargs):
        """Select documents, validate and filter each through the object adapter,
        replace the stored copies, and record exchange status for the update.

        Returns:
            bool: True when the transform and status update both succeed
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        #
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList

        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query."""
        # Initialize before the try block: previously a missing collection or an
        # early exception left dL unbound and the return raised UnboundLocalError.
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
        #

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=100):
        """Fetch, validate, filter, and replace each selected document.

        Returns:
            bool: True when every replacement succeeds, False otherwise
        """
        #
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        # The Mongo id stays in dD for the replace() selection below.
                        del rObj["_id"]
                        #
                        fOk = True

                        if self.__oAdapt:
                            # Validate before and after filtering to surface adapter regressions.
                            self.__validateObj(databaseName, collectionName, rObj, label="Original")
                            fOk, rObj = self.__oAdapt.filter(rObj)
                            self.__validateObj(databaseName, collectionName, rObj, label="Updated")
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName, rObj, dD)
                            if rOk is None:
                                tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                # logger.info("rObj.keys() %r", list(rObj.keys()))
                                # logger.info("rObj.items() %s", rObj.items())
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getLoadStatus(self):
        """Return the accumulated data exchange status records."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a data exchange status record for this update cycle."""
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
|