rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,286 @@
1
+ ##
2
+ # File: ObjectExtractor.py
3
+ # Date: 26-Jun-2019 jdw
4
+ #
5
+ # Utilities to extract document features from the document object server.
6
+ #
7
+ # Updates:
8
+ # 27-Jun-2019 jdw add JSON path tracking utilities.
9
+ #
10
+ ##
11
+ __docformat__ = "google en"
12
+ __author__ = "John Westbrook"
13
+ __email__ = "jwest@rcsb.rutgers.edu"
14
+ __license__ = "Apache 2.0"
15
+
16
+ import copy
17
+ import logging
18
+ import os
19
+
20
+ from rcsb.db.mongo.Connection import Connection
21
+ from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
22
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class ObjectExtractor(object):
    """Utilities to extract document features from the document object server.

    On construction the selected objects are loaded either from a cache file or
    from the document store, and kept in memory keyed by the joined values of
    the configured unique attributes.

    Keyword arguments read by this class:
        cacheFilePath (str, optional): path of the cache file. Defaults to None (no caching).
        cacheKwargs (dict, optional): serialization options passed to MarshalUtil. Defaults to {"fmt": "pickle"}.
        useCache (bool, optional): read an existing readable cache file instead of querying. Defaults to True.
        keyAttribute (str, optional): top-level key of the cached dictionary. Defaults to "entry".
        selectionList (list, optional): attributes to fetch; when empty whole objects are fetched.
        databaseName (str, optional): defaults to "pdbx_core".
        collectionName (str, optional): defaults to "pdbx_core_entry".
        selectionQuery (dict, optional): query restricting the selected documents.
        uniqueAttributes (list, optional): attributes joined with "." to form the object key. Defaults to ["rcsb_id"].
        objectLimit (int, optional): stop after this many fetched documents.
        stripObjectId (bool, optional): remove "_id" from the returned objects. Defaults to False.
        logIncrement (int, optional): progress logging interval. Defaults to 10000.
    """

    def __init__(self, cfgOb, **kwargs):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        # Path and value tracking state filled in by genPathList()/genValueList()
        self.__objPathD = {}
        self.__stringPathList = []
        self.__objValD = {}
        #
        self.__objectD = self.__rebuildCache(**kwargs)

    def getObjects(self):
        """Return the dictionary of extracted objects."""
        return self.__objectD

    def getPathList(self, filterList=True):
        """Return the sorted list of JSON path strings that have been recorded.

        Args:
            filterList (bool, optional): if True keep only paths that contain a "." or start
                with "_" (excluding "_id"), that do not end with "[]", and that do not occur
                as a substring of another retained path. Defaults to True.

        Returns:
            list: sorted path strings
        """
        if not filterList:
            return sorted(self.__objPathD)
        #
        candL = [ky for ky in self.__objPathD if ky and ("." in ky or ky.startswith("_")) and ky != "_id" and not ky.endswith("[]")]
        # Drop any path that is contained in another (longer) candidate path
        return sorted(ky for ky in candL if not any(ky != tky and ky in tky for tky in candL))

    def getValues(self):
        """Return the dictionary of values collected by the last genValueList() pass(es)."""
        return self.__objValD

    def setPathList(self, stringPathList):
        """Replace the recorded paths with the input list of path strings. Always returns True."""
        self.__objPathD = {k: True for k in stringPathList}
        return True

    def getCount(self):
        """Return the number of extracted objects."""
        return len(self.__objectD)

    def __rebuildCache(self, **kwargs):
        """Load the object dictionary from the cache file or rebuild it from the document store.

        Returns:
            dict: extracted objects, or an empty dictionary if nothing usable could be loaded
        """
        cacheFilePath = kwargs.get("cacheFilePath", None)
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        keyAttribute = kwargs.get("keyAttribute", "entry")
        selectL = kwargs.get("selectionList", [])
        #
        cD = {keyAttribute: {}}
        try:
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                # NOTE(review): the default cache format is pickle - only point cacheFilePath at trusted files.
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                objectD = self.__select(**kwargs) if selectL else self.__selectObjects(**kwargs)
                cD[keyAttribute] = objectD
                if cacheFilePath:
                    pth, _ = os.path.split(cacheFilePath)
                    if pth:
                        # A bare file name has no directory component to create
                        self.__mU.mkdir(pth)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved object results (%d) status %r in %s", len(objectD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        # An unreadable or foreign cache file may import as None or without the expected key
        if not isinstance(cD, dict) or keyAttribute not in cD:
            logger.error("Cache content from %r is missing key %r", cacheFilePath, keyAttribute)
            return {}
        return cD[keyAttribute]

    def __selectObjects(self, **kwargs):
        """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit)

        The matching document identifiers are fetched first and each full object is then
        retrieved individually. On any exception the objects collected so far are returned.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        #
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        stripObjectId = kwargs.get("stripObjectId", False)
        logIncrement = kwargs.get("logIncrement", 10000)
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    # Treat an empty/None fetch result as no documents
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD) or []
                    numDoc = len(dL)
                    logger.info("Selection %r fetch result count %d", selectL, numDoc)
                    #
                    for ii, dD in enumerate(dL, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        if rObj:
                            if stripObjectId:
                                rObj.pop("_id", None)
                            elif "_id" in rObj:
                                # Keep the identifier but in a plain string form
                                rObj["_id"] = str(rObj["_id"])
                            #
                            stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                            objectD[stKey] = copy.copy(rObj)
                            logger.debug("Saving %d %s", ii, stKey)
                        else:
                            logger.warning("%s %s no object returned for _id %r", databaseName, collectionName, dD["_id"])
                        if objLimit and ii >= objLimit:
                            break
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Extracting object (%d of %d)", ii, numDoc)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
        #

    def __select(self, **kwargs):
        """Return a dictionary of object content satisfying the input conditions
        (e.g. method, resolution limit) and selection options.

        Only the attributes in the "selectionList" keyword are fetched for each document.
        On any exception the objects collected so far are returned.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        selectL = kwargs.get("selectionList", [])
        stripObjectId = kwargs.get("stripObjectId", False)
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    # Treat an empty/None fetch result as no documents
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD, suppressId=True) or []
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for ii, rObj in enumerate(dL, 1):
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        if stripObjectId:
                            rObj.pop("_id", None)
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
        #

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values of corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notation

        Returns:
            tuple: tuple of values corresponding to the input key names (None for a path that cannot be resolved)

        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames, str(e))

        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested).

        None is returned when any step of the path is missing or crosses a value that cannot be indexed by key.
        """
        try:
            for key in keyName.split("."):
                try:
                    dct = dct[key]
                except (KeyError, TypeError, IndexError):
                    # Missing key, or the path continues through a non-dictionary value
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))

        return None

    def __toJsonPathString(self, path):
        """Convert a path list into a dotted path string in which list levels appear as "[]"."""
        pL = [ky if ky else "[]" for ky in path]
        # Attach list markers directly to the preceding key (e.g. "a.[]" -> "a[]")
        return ".".join(pL).replace(".[", "[")

    def __pathCallBack(self, path, value):
        """Walk callback that counts the occurrences of each path string. Returns the value unchanged."""
        sp = self.__toJsonPathString(path)
        self.__objPathD[sp] = self.__objPathD.get(sp, 0) + 1
        return value

    def __saveCallBack(self, path, value):
        """Walk callback that stores the values found at the recorded paths. Returns the value unchanged.

        Values are stored under the path string with the "[]" markers removed. Values found
        below a list level are accumulated in a list, other values are stored as found.
        """
        sP = self.__toJsonPathString(path)
        if sP in self.__objPathD:
            ky = sP.replace("[]", "")
            if "[" in sP:  # multivalued
                if isinstance(value, list):
                    self.__objValD.setdefault(ky, []).extend(value)
                else:
                    self.__objValD.setdefault(ky, []).append(value)
            else:
                self.__objValD[ky] = value
        return value

    def genPathList(self, dObj, path=None):
        """Record the path string of every element of the input object and return the walked copy of the object."""
        return self.__walk(dObj, jsonPath=path, funct=self.__pathCallBack)

    def genValueList(self, dObj, path=None, clear=True):
        """Collect the values of the input object at the recorded paths and return the walked copy of the object.

        Args:
            dObj (object): JSON-like object (nested dictionaries and lists)
            path (list, optional): starting path list. Defaults to None (the root).
            clear (bool, optional): discard previously collected values first. Defaults to True.
        """
        if clear:
            self.__objValD = {}
        return self.__walk(dObj, jsonPath=path, funct=self.__saveCallBack)

    def __walk(self, jsonObj, jsonPath=None, funct=None):
        """Walk JSON data types. An optional funct() is called to mutate
        the value of each element. The jsonPath is updated at each element.

        Children are visited before their parent and list levels are represented
        in the path by an empty list element.
        """
        if jsonPath is None:
            jsonPath = []

        if isinstance(jsonObj, dict):
            value = {k: self.__walk(v, jsonPath + [k], funct) for k, v in jsonObj.items()}
        elif isinstance(jsonObj, list):
            value = [self.__walk(elem, jsonPath + [[]], funct) for elem in jsonObj]
        else:
            value = jsonObj

        return value if funct is None else funct(jsonPath, value)

    def __toPath(self, path):
        """Convert path strings into path lists.

        For example "a.b[].c" becomes ["a", "b", [], "c"] and "a[]" becomes ["a", []].
        """
        if isinstance(path, list):
            return path  # already in list format

        def _iterPath(pathS):
            for parts in pathS.split("[]"):
                for part in parts.strip(".").split("."):
                    if part:
                        # skip the empty fragment left after a trailing or repeated "[]"
                        yield part
                yield []

        # The final list marker is an artifact of the split and is dropped
        return list(_iterPath(path))[:-1]
@@ -0,0 +1,124 @@
1
+ ##
2
+ # File: ObjectTransformer.py
3
+ # Date: 17-Oct-2019 jdw
4
+ #
5
+ # Utilities to extract and update object from the document object server.
6
+ #
7
+ # Updates:
8
+ #
9
+ ##
10
+ __docformat__ = "google en"
11
+ __author__ = "John Westbrook"
12
+ __email__ = "jwest@rcsb.rutgers.edu"
13
+ __license__ = "Apache 2.0"
14
+
15
+ import logging
16
+
17
+ from rcsb.db.mongo.Connection import Connection
18
+ from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
19
+ from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
20
+ from rcsb.db.utils.TimeUtil import TimeUtil
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ObjectTransformer(object):
    """Utilities to extract and update object from the document object server.

    Each selected object is fetched, optionally passed through the filter() method of
    the input object adapter, and written back in place of the stored document.
    """

    def __init__(self, cfgOb, objectAdapter=None, **kwargs):
        """
        Args:
            cfgOb (object): configuration object passed to the database connection
            objectAdapter (object, optional): adapter providing filter(obj) -> (status, obj). Defaults to None.
        """
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []

    def doTransform(self, **kwargs):
        """Transform the selected objects and record the status of the operation.

        Keyword Args:
            databaseName (str, optional): defaults to "pdbx_core".
            collectionName (str, optional): defaults to "pdbx_core_entry".
            selectionQuery (dict, optional): query restricting the selected documents.
            fetchLimit (int, optional): process at most this many selected documents.
            updateId (str, optional): status identifier. Defaults to the current week signature.

        Returns:
            (bool): True if both the transformation and the status update succeed
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        if fetchLimit:
            docSelectList = docSelectList[:fetchLimit]
        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query.

        An empty list is returned if the collection does not exist or the selection fails.
        """
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    # Treat an empty/None fetch result as no documents
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD) or []
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
        #

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=10000):
        """Fetch, filter and replace each of the selected objects.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            docSelectList (list): selection dictionaries each carrying the "_id" of a document
            logIncrement (int, optional): progress logging interval. Defaults to 10000.

        Returns:
            (bool): False if any object cannot be fetched or replaced or if an exception
                    is raised. A missing collection leaves the status True.
        """
        #
        ok = True
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        if rObj is None:
                            logger.error("%r %r no object found for _id %r", databaseName, collectionName, dD["_id"])
                            ok = False
                        else:
                            rObj.pop("_id", None)
                            #
                            fOk = True
                            if self.__oAdapt:
                                fOk, rObj = self.__oAdapt.filter(rObj)
                            if fOk:
                                rOk = mg.replace(databaseName, collectionName, rObj, dD)
                                if rOk is None:
                                    tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                    logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                    if rObj:
                                        logger.debug("rObj.keys() %r", list(rObj.keys()))
                                        logger.debug("rObj.items() %s", rObj.items())
                                    rOk = False
                                ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
                        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # An interrupted pass must not be reported as a success
            ok = False
        return ok

    def getLoadStatus(self):
        """Return the list of status records accumulated by doTransform()."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a status record for the operation to the internal status list.

        Returns:
            (bool): True if the status record was created or False otherwise
        """
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
@@ -0,0 +1,121 @@
1
+ ##
2
+ # File: ObjectUpdater.py
3
+ # Date: 9-Oct-2019 jdw
4
+ #
5
+ # Utilities to update document features from the document object server.
6
+ #
7
+ # Updates:
8
+ #
9
+ #
10
+ ##
11
+ __docformat__ = "google en"
12
+ __author__ = "John Westbrook"
13
+ __email__ = "jwest@rcsb.rutgers.edu"
14
+ __license__ = "Apache 2.0"
15
+
16
+ import logging
17
+
18
+ from rcsb.db.mongo.Connection import Connection
19
+ from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ObjectUpdater(object):
    """Utilities to update document features from the document object server."""

    def __init__(self, cfgOb, **kwargs):
        """
        Args:
            cfgOb (object): configuration object passed to the database connection
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        #

    def update(self, databaseName, collectionName, updateDL):
        """Update documents satisfying the selection details with the content of updateDL.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            updateDL = [{selectD: ..., updateD: ... }, ....]
                        selectD = {'ky1': 'val1', 'ky2': 'val2',  ...}
                        updateD = {'key1.subkey1...': 'val1', 'key2.subkey2..': 'val2', ...}

        Returns:
            (int): sum of the counts returned by the individual update operations
                   (0 if there is nothing to update or the collection does not exist)
        """
        numUpdated = 0
        if not updateDL:
            # Nothing to do - avoid opening a connection
            return numUpdated
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.debug("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    for updateD in updateDL:
                        num = mg.update(databaseName, collectionName, updateD["updateD"], updateD["selectD"], upsertFlag=True)
                        if num is None:
                            # NOTE(review): guards against a None result (presumably a failed update) - confirm against MongoDbUtil.update
                            logger.error("%s %s update failing for selection %r", databaseName, collectionName, updateD["selectD"])
                            continue
                        numUpdated += num
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numUpdated

    def count(self, databaseName, collectionName):
        """Return the document count of the collection (0 if it does not exist or the count fails)."""
        numTotal = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numTotal = mg.count(databaseName, collectionName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numTotal

    def createCollection(self, databaseName, collectionName, indexAttributeNames=None, indexName="primary", checkExists=False, bsonSchema=None):
        """Create collection and optionally set index attributes for the named index and validation schema for a new collection.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): list of attribute names for the named index. Defaults to None.
            indexName (str, optional): name of the index to create. Defaults to "primary".
            checkExists (bool, optional): reuse an existing collection if True. Defaults to False.
            bsonSchema (object, optional): BSON compatible validation schema. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        try:
            logger.debug("Create database %s collection %s", databaseName, collectionName)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if checkExists and mg.databaseExists(databaseName) and mg.collectionExists(databaseName, collectionName):
                    ok1 = True
                else:
                    ok1 = mg.createCollection(databaseName, collectionName, bsonSchema=bsonSchema)
                # Confirm that both the database and the collection are now visible
                ok2 = mg.databaseExists(databaseName)
                ok3 = mg.collectionExists(databaseName, collectionName)
                okI = True
                if indexAttributeNames:
                    okI = mg.createIndex(databaseName, collectionName, indexAttributeNames, indexName=indexName, indexType="DESCENDING", uniqueFlag=False)

            return ok1 and ok2 and ok3 and okI
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def delete(self, databaseName, collectionName, selectD):
        """Remove documents satisfying the input selection details.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            selectD = {'ky1': 'val1', 'ky2': 'val2',  ...}

        Returns:
            result of the delete operation (0 if the collection does not exist or the deletion fails)
        """
        numDeleted = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    numDeleted = mg.delete(databaseName, collectionName, selectD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numDeleted
@@ -0,0 +1,160 @@
1
+ ##
2
+ # File: ObjectValidator.py
3
+ # Date: 17-Oct-2019 jdw
4
+ #
5
+ # Utilities to extract and update object from the document object server including validation.
6
+ #
7
+ # Updates:
8
+ #
9
+ ##
10
+ __docformat__ = "google en"
11
+ __author__ = "John Westbrook"
12
+ __email__ = "jwest@rcsb.rutgers.edu"
13
+ __license__ = "Apache 2.0"
14
+
15
+ import logging
16
+
17
+ from jsonschema import Draft4Validator
18
+ from jsonschema import FormatChecker
19
+
20
+ from rcsb.db.mongo.Connection import Connection
21
+ from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
22
+ from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
23
+ from rcsb.db.utils.SchemaProvider import SchemaProvider
24
+ from rcsb.db.utils.TimeUtil import TimeUtil
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class ObjectValidator(object):
    """Utilities to extract and update object from the document object server with validation.

    Each selected object is fetched, optionally passed through the filter() method of the
    input object adapter, and written back in place of the stored document. When an adapter
    is provided the object is checked against the collection JSON schema before and after
    filtering and any schema errors are logged.
    """

    def __init__(self, cfgOb, objectAdapter=None, cachePath=".", useCache=True, **kwargs):
        """
        Args:
            cfgOb (object): configuration object passed to the database connection and schema provider
            objectAdapter (object, optional): adapter providing filter(obj) -> (status, obj). Defaults to None.
            cachePath (str, optional): cache path passed to the schema provider. Defaults to ".".
            useCache (bool, optional): cache flag passed to the schema provider. Defaults to True.
        """
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, cachePath, useCache=useCache)
        # Validator instance is created for the target collection at the start of each transformation
        self.__valInst = None

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        """Return a Draft4 JSON schema validator for the input collection.

        Raises:
            exceptions from Draft4Validator.check_schema() if the generated schema is not valid
        """
        _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        return Draft4Validator(cD, format_checker=FormatChecker())

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        """Log the schema errors for the input object and return the number of errors found.

        The errors counted before any exception are returned if the validation itself fails.
        """
        eCount = 0
        try:
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount

    def doTransform(self, **kwargs):
        """Transform the selected objects and record the status of the operation.

        Keyword Args:
            databaseName (str, optional): defaults to "pdbx_core".
            collectionName (str, optional): defaults to "pdbx_core_entry".
            selectionQuery (dict, optional): query restricting the selected documents.
            fetchLimit (int, optional): process at most this many selected documents.
            updateId (str, optional): status identifier. Defaults to the current week signature.

        Returns:
            (bool): True if both the transformation and the status update succeed
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        #
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        if fetchLimit:
            docSelectList = docSelectList[:fetchLimit]

        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query.

        An empty list is returned if the collection does not exist or the selection fails.
        """
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    # Treat an empty/None fetch result as no documents
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD) or []
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
        #

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=100):
        """Fetch, validate, filter and replace each of the selected objects.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            docSelectList (list): selection dictionaries each carrying the "_id" of a document
            logIncrement (int, optional): progress logging interval. Defaults to 100.

        Returns:
            (bool): False if any object cannot be fetched or replaced or if an exception
                    is raised. A missing collection leaves the status True. Schema
                    errors are logged but do not change the status.
        """
        #
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        if rObj is None:
                            logger.error("%r %r no object found for _id %r", databaseName, collectionName, dD["_id"])
                            ok = False
                        else:
                            rObj.pop("_id", None)
                            #
                            fOk = True
                            if self.__oAdapt:
                                self.__validateObj(databaseName, collectionName, rObj, label="Original")
                                fOk, rObj = self.__oAdapt.filter(rObj)
                                self.__validateObj(databaseName, collectionName, rObj, label="Updated")
                            if fOk:
                                rOk = mg.replace(databaseName, collectionName, rObj, dD)
                                if rOk is None:
                                    tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                    logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                    rOk = False
                                ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
                        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # An interrupted pass must not be reported as a success
            ok = False
        return ok

    def getLoadStatus(self):
        """Return the list of status records accumulated by doTransform()."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a status record for the operation to the internal status list.

        Returns:
            (bool): True if the status record was created or False otherwise
        """
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False