rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,176 @@
1
+ ##
2
+ # File: EntityInstanceExtractorTests.py
3
+ # Author: J. Westbrook
4
+ # Date: 25-Mar-2019
5
+ #
6
+ # Updates:
7
+ # 21-Apr-2019 jdw Tests for full cache construction and processiong
8
+ #
9
+ ##
10
+ """
11
+ Tests for extractor of selected values from entity polymer collections (full cache)
12
+
13
+ """
14
+
15
+ __docformat__ = "google en"
16
+ __author__ = "John Westbrook"
17
+ __email__ = "jwest@rcsb.rutgers.edu"
18
+ __license__ = "Apache 2.0"
19
+
20
+
21
+ import logging
22
+ import os
23
+ import time
24
+ import unittest
25
+
26
+
27
+ from rcsb.exdb.seq.EntityPolymerExtractor import EntityPolymerExtractor
28
+ from rcsb.utils.config.ConfigUtil import ConfigUtil
29
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
30
+ from rcsb.utils.taxonomy.TaxonomyUtils import TaxonomyUtils
31
+
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
34
+ logger = logging.getLogger()
35
+
36
+ HERE = os.path.abspath(os.path.dirname(__file__))
37
+ TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
38
+
39
+
40
class EntityPolymerExtractorFullTests(unittest.TestCase):
    """Tests for EntityPolymerExtractor backed by a full, preserved on-disk cache
    of extracted entity polymer data (see self.__workPath below).

    NOTE(review): these tests depend on a site-specific remote configuration
    section ("site_info_remote") and a pre-built cache directory; they are not
    hermetic unit tests.
    """

    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorFullTests, self).__init__(methodName)
        # Retained for parity with sibling test modules; not read in this class.
        self.__verbose = True

    def setUp(self):
        #
        # Mock data tree and example configuration shipped with the repository.
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        # Caution: this is very site specific setting
        #
        configName = "site_info_remote"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        if configName != "site_info_configuration":
            # Alias the standard section name onto the site-specific one.
            self.__cfgOb.replaceSectionName("site_info_configuration", configName)
        #
        # Preserved cache directory holding the full entity polymer data cache.
        self.__workPath = os.path.join(HERE, "test-cache-preserve")
        #
        self.__fullCacheKwargs = {"fmt": "pickle"}
        self.__fullEntitySaveCachePath = os.path.join(self.__workPath, "entity-polymer-data-cache.pic")
        #
        self.__mU = MarshalUtil()
        # Entry limit applied only when rebuilding the cache (testRebuildCache).
        self.__entryLimitFull = 50
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    @unittest.skip("rebuild cache")
    def testRebuildCache(self):
        """Test case - extract entity polymer info - rebuild full cache of extracted entity polymer data -"""
        try:
            # useCache=False forces a rebuild, limited to self.__entryLimitFull entries.
            epe = EntityPolymerExtractor(
                self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=False, saveCacheKwargs=self.__fullCacheKwargs, entryLimit=self.__entryLimitFull
            )
            eCount = epe.getEntryCount()
            if self.__entryLimitFull is not None:
                self.assertGreaterEqual(eCount, self.__entryLimitFull)
            else:
                self.assertGreaterEqual(eCount, 10)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access cached entity polymer info from full cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitFull)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            # Per-entity accession lookup kept as a disabled example probe.
            testOp = False
            if testOp:
                for entryId in ["1CP9"]:
                    for entityId in ["1", "2"]:
                        uL = epe.getEntityRefSeqAccessions("UNP", entryId, entityId)
                        logger.debug("UNP for %s %s %r", entryId, entityId, uL)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access cached entity polymer info from full cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            #
            logger.info("UNP reference sequences per entity %r", dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r", dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            # NOTE(review): this repeats the assertion above; an assertion on
            # len(unpL) was likely intended here - confirm against upstream.
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyEntityPolymerReadCache(self):
        """Test case - evaluate taxonomy - from full cache"""
        try:
            taxIdList = [562, 9606, 3701]
            for taxId in taxIdList:
                # NOTE(review): TaxonomyUtils is re-constructed on every
                # iteration although its argument never changes.
                tU = TaxonomyUtils(taxDirPath=self.__workPath)
                tL = tU.getLineage(taxId)
                logger.info("Taxonomy lineage for %d %r", taxId, tL)
            #
            #
            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
            rD = epe.countRefSeqAccessionByTaxon(dbNameList=["UNP"])
            logger.info("Unique taxons %d", len(list(rD.keys())))
            #
            # NOTE(review): tU and taxId below are the leaked values from the
            # loop above (taxId == 3701, the last list element) - so only the
            # final taxon's lineage is matched here; confirm this is intended.
            numT = 0
            for tId, aL in rD.items():
                tL = tU.getLineage(tId)
                if taxId in tL:
                    tc = len(set(aL))
                    logger.info("Matched %5d %s (%r)", tc, tU.getScientificName(tId), tId)
                    numT += tc
            logger.info("Total matched accessions %d ", numT)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
163
+
164
+
165
def entityPolymerExtractFullSuite():
    """Assemble the full-cache test suite in its intended run order."""
    suite = unittest.TestSuite()
    # "testRebuildCache" is intentionally excluded (cache rebuild is manual).
    for testName in (
        "testAccessEntityPolymerFeatures",
        "testAccessEntityPolymerReadCache",
        "testTaxonomyEntityPolymerReadCache",
    ):
        suite.addTest(EntityPolymerExtractorFullTests(testName))
    return suite


if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(entityPolymerExtractFullSuite())
@@ -0,0 +1,449 @@
1
+ ##
2
+ # File: ReferenceSequenceAssignmentUpdater.py
3
+ # Date: 8-Oct-2019 jdw
4
+ #
5
+ # Selected utilities to update reference sequence assignments information
6
+ # in the core_entity collection.
7
+ #
8
+ # Updates:
9
+ #
10
+ ##
11
+ __docformat__ = "google en"
12
+ __author__ = "John Westbrook"
13
+ __email__ = "jwest@rcsb.rutgers.edu"
14
+ __license__ = "Apache 2.0"
15
+
16
+ import logging
17
+ import os
18
+ from collections import defaultdict
19
+
20
+ from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
21
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
22
+ from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
23
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
24
+ from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider
25
+ from rcsb.utils.seq.UniProtUtils import UniProtUtils
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class ReferenceSequenceAssignmentUpdater(object):
31
+ """Selected utilities to update reference sequence assignments information
32
+ in the core_entity collection.
33
+
34
+ """
35
+
36
+ def __init__(self, cfgOb, databaseName="pdbx_core", collectionName="pdbx_core_polymer_entity", polymerType="Protein", referenceDatabaseName="UniProt", provSource="PDB", **kwargs):
37
+ self.__cfgOb = cfgOb
38
+ self.__polymerType = polymerType
39
+ self.__mU = MarshalUtil()
40
+ #
41
+ self.__databaseName = databaseName
42
+ self.__collectionName = collectionName
43
+ self.__statusList = []
44
+ #
45
+ self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
46
+ self.__assignRefD, self.__refD, self.__matchD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs)
47
+
48
+ def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs):
49
+ assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, **kwargs)
50
+ # get refIdD = {refId: [entity_id, ....], }
51
+ refIdD, _ = self.__getUniqueAssignments(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
52
+ #
53
+ refD, matchD = self.__rebuildReferenceCache(referenceDatabaseName, list(refIdD.keys()), **kwargs)
54
+ return assignRefD, refD, matchD
55
+
56
+ def doUpdate(self, updateId, updateLimit=None):
57
+ desp = DataExchangeStatus()
58
+ statusStartTimestamp = desp.setStartTime()
59
+ #
60
+ numUpd = 0
61
+ updateDL = self.__buildUpdate(self.__assignRefD)
62
+ if updateDL:
63
+ if updateLimit:
64
+ numUpd = self.__doUpdate(self.__cfgOb, updateDL[:updateLimit], self.__databaseName, self.__collectionName)
65
+ else:
66
+ numUpd = self.__doUpdate(self.__cfgOb, updateDL, self.__databaseName, self.__collectionName)
67
+ self.__updateStatus(updateId, self.__databaseName, self.__collectionName, True, statusStartTimestamp)
68
+ return len(updateDL), numUpd
69
+
70
+ def __doUpdate(self, cfgOb, updateDL, databaseName, collectionName):
71
+ obUpd = ObjectUpdater(cfgOb)
72
+ numUpd = obUpd.update(databaseName, collectionName, updateDL)
73
+ logger.info("Update count is %d", numUpd)
74
+
75
+ return numUpd
76
+
77
+ def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, **kwargs):
78
+ """Get all accessions assigned to input reference sequence database for the input polymerType.
79
+
80
+ Returns:
81
+ (dict): {"1abc_1": "rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
82
+ "rcsb_polymer_entity_align": [],
83
+ "rcsb_entity_source_organism"" {"ncbi_taxonomy_id": []}
84
+ """
85
+ cachePath = kwargs.get("cachePath", ".")
86
+ exDbDir = "exdb"
87
+ cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
88
+ useCache = kwargs.get("useCache", True)
89
+ fetchLimit = kwargs.get("fetchLimit", None)
90
+ cacheFilePath = os.path.join(cachePath, exDbDir, "entity-poly-ref-seq-assign-cache.json")
91
+ #
92
+ try:
93
+ obEx = ObjectExtractor(
94
+ self.__cfgOb,
95
+ databaseName=databaseName,
96
+ collectionName=collectionName,
97
+ cacheFilePath=cacheFilePath,
98
+ useCache=useCache,
99
+ keyAttribute="entity",
100
+ uniqueAttributes=["rcsb_id"],
101
+ cacheKwargs=cacheKwargs,
102
+ objectLimit=fetchLimit,
103
+ selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
104
+ selectionList=[
105
+ "rcsb_id",
106
+ "rcsb_entity_container_identifiers.reference_sequence_identifiers",
107
+ "rcsb_entity_container_identifiers.auth_asym_ids",
108
+ "rcsb_polymer_entity_align",
109
+ "rcsb_entity_source_organism.ncbi_taxonomy_id",
110
+ ],
111
+ )
112
+ eCount = obEx.getCount()
113
+ logger.info("Entity count is %d", eCount)
114
+ objD = obEx.getObjects()
115
+ logger.info("Reading polymer entity entity count %d ref accession length %d ", eCount, len(objD))
116
+ #
117
+ except Exception as e:
118
+ logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
119
+ return objD
120
+
121
+ def __getUniqueAssignments(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
122
+ refIdD = defaultdict(list)
123
+ taxIdD = defaultdict(list)
124
+ numMissing = 0
125
+ for entityKey, eD in objD.items():
126
+ try:
127
+ accS = set()
128
+ for ii, tD in enumerate(eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]):
129
+ if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
130
+ accS.add(tD["database_accession"])
131
+ refIdD[tD["database_accession"]].append(entityKey)
132
+ #
133
+ # pick up the corresponding taxonomy -
134
+ try:
135
+ taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
136
+ except Exception:
137
+ logger.warning("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
138
+
139
+ logger.debug("PDB assigned sequences length %d", len(accS))
140
+ except Exception as e:
141
+ numMissing += 1
142
+ logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
143
+ #
144
+ for refId, taxIdL in taxIdD.items():
145
+ taxIdL = list(set(taxIdL))
146
+ if len(taxIdL) > 1:
147
+ logger.info("Multitple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
148
+
149
+ logger.info("Unique %s accession assignments by %s %d (missing %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
150
+ return refIdD, taxIdD
151
+
152
+ def __reMapAccessions(self, rsiDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
153
+ """Internal method to re-map accessions for the input databae and assignment source
154
+
155
+ Args:
156
+ rsiDL (list): list of accession
157
+ databaseName (str, optional): resource database name. Defaults to 'UniProt'.
158
+ provSource (str, optional): assignment provenance. Defaults to 'PDB'.
159
+
160
+ Returns:
161
+ bool, list: flag for mapping success, and remapped (and unmapped) accessions in the input object list
162
+ """
163
+ isMatched = False
164
+ unMapped = 0
165
+ matched = 0
166
+ excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
167
+ provSourceL = provSourceL if provSourceL else []
168
+ retDL = []
169
+ for rsiD in rsiDL:
170
+ if rsiD["database_name"] in excludeReferenceDatabases:
171
+ unMapped += 1
172
+ continue
173
+ if rsiD["database_name"] == referenceDatabaseName and rsiD["provenance_source"] in provSourceL:
174
+ try:
175
+ if len(self.__matchD[rsiD["database_accession"]]["matchedIds"]) == 1:
176
+ rsiD["database_accession"] = self.__matchD[rsiD["database_accession"]]["matchedIds"][0]
177
+ matched += 1
178
+ else:
179
+ logger.info("Skipping mapping to multiple superseding accessions %s", rsiD["database_accession"])
180
+ #
181
+ except Exception:
182
+ unMapped += 1
183
+ retDL.append(rsiD)
184
+ if matched == len(retDL):
185
+ isMatched = True
186
+ return not unMapped, isMatched, retDL
187
+
188
+ def __reMapAlignments(self, alignDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
189
+ """Internal method to re-map alignments for the input databae and assignment source
190
+
191
+ Args:
192
+ alignDL (list): list of aligned regions
193
+ databaseName (str, optional): resource database name. Defaults to 'UniProt'.
194
+ provSourceL (list, optional): assignment provenance. Defaults to 'PDB'.
195
+
196
+ Returns:
197
+ bool, list: flag for mapping success, and remapped (and unmapped) accessions in the input align list
198
+ """
199
+ isMatched = False
200
+ unMapped = 0
201
+ matched = 0
202
+ excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
203
+ retDL = []
204
+ provSourceL = provSourceL if provSourceL else []
205
+ for alignD in alignDL:
206
+ if alignD["reference_database_name"] in excludeReferenceDatabases:
207
+ unMapped += 1
208
+ continue
209
+ if alignD["reference_database_name"] == referenceDatabaseName and alignD["provenance_code"] in provSourceL:
210
+ try:
211
+ if len(self.__matchD[alignD["reference_database_accession"]]["matchedIds"]) == 1:
212
+ alignD["reference_database_accession"] = self.__matchD[alignD["reference_database_accession"]]["matchedIds"][0]
213
+ matched += 1
214
+ else:
215
+ logger.info("Skipping alignment mapping to multiple superseding accessions %s", alignD["reference_database_accession"])
216
+ except Exception:
217
+ unMapped += 1
218
+ retDL.append(alignD)
219
+ if matched == len(retDL):
220
+ isMatched = True
221
+ #
222
+ return not unMapped, isMatched, retDL
223
+
224
+ def __getSiftsAccessions(self, entityKey, authAsymIdL):
225
+ retL = []
226
+ saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
227
+ for (_, dbAccession), _ in saoLD.items():
228
+ retL.append({"database_name": "UniProt", "database_accession": dbAccession, "provenance_source": "SIFTS"})
229
+ return retL
230
+
231
+ def __getSiftsAlignments(self, entityKey, authAsymIdL):
232
+ retL = []
233
+ saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
234
+ for (_, dbAccession), saoL in saoLD.items():
235
+ dD = {"reference_database_name": "UniProt", "reference_database_accession": dbAccession, "provenance_code": "SIFTS", "aligned_regions": []}
236
+ for sao in saoL:
237
+ dD["aligned_regions"].append({"ref_beg_seq_id": sao.getDbSeqIdBeg(), "entity_beg_seq_id": sao.getEntitySeqIdBeg(), "length": sao.getEntityAlignLength()})
238
+ retL.append(dD)
239
+ return retL
240
+
241
+ def __buildUpdate(self, assignRefD):
242
+ #
243
+ updateDL = []
244
+ for entityKey, eD in assignRefD.items():
245
+ selectD = {"rcsb_id": entityKey}
246
+ try:
247
+ updateD = {}
248
+ authAsymIdL = []
249
+ ersDL = (
250
+ eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"] if "reference_sequence_identifiers" in eD["rcsb_entity_container_identifiers"] else None
251
+ )
252
+ #
253
+ #
254
+ if ersDL:
255
+ authAsymIdL = eD["rcsb_entity_container_identifiers"]["auth_asym_ids"]
256
+ isMapped, isMatched, updErsDL = self.__reMapAccessions(ersDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
257
+ #
258
+ if not isMapped or not isMatched:
259
+ tL = self.__getSiftsAccessions(entityKey, authAsymIdL)
260
+ if tL:
261
+ logger.debug("Using SIFTS accession mapping for %s", entityKey)
262
+ else:
263
+ logger.info("No alternative SIFTS accession mapping for %s", entityKey)
264
+ updErsDL = tL if tL else []
265
+ #
266
+ if len(updErsDL) < len(ersDL):
267
+ logger.info("Incomplete reference sequence mapping update for %s", entityKey)
268
+ updateD["rcsb_entity_container_identifiers.reference_sequence_identifiers"] = updErsDL
269
+ #
270
+ alignDL = eD["rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else None
271
+ if alignDL and authAsymIdL:
272
+ isMapped, isMatched, updAlignDL = self.__reMapAlignments(alignDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
273
+ #
274
+ if not isMapped or not isMatched:
275
+ tL = self.__getSiftsAlignments(entityKey, authAsymIdL)
276
+ if tL:
277
+ logger.debug("Using SIFTS alignment mapping for %s", entityKey)
278
+ else:
279
+ logger.info("No alternative SIFTS alignment mapping for %s", entityKey)
280
+ updAlignDL = tL if tL else updAlignDL
281
+ #
282
+ if len(updAlignDL) < len(alignDL):
283
+ logger.info("Incomplete alignment mapping update for %s", entityKey)
284
+ updateD["rcsb_polymer_entity_align"] = updAlignDL
285
+ #
286
+ if updateD:
287
+ updateDL.append({"selectD": selectD, "updateD": updateD})
288
+ except Exception as e:
289
+ logger.exception("Mapping error for %s with %s", entityKey, str(e))
290
+ #
291
+ return updateDL
292
+
293
+ def __rebuildReferenceCache(self, refDbName, idList, **kwargs):
294
+ """ """
295
+ dD = {}
296
+ cachePath = kwargs.get("cachePath", ".")
297
+ dirPath = os.path.join(cachePath, "exdb")
298
+ cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
299
+ useCache = kwargs.get("useCache", True)
300
+ fetchLimit = kwargs.get("fetchLimit", None)
301
+ saveText = kwargs.get("saveText", False)
302
+ #
303
+ ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
304
+ fn = "ref-sequence-data-cache" + "." + ext
305
+ cacheFilePath = os.path.join(dirPath, fn)
306
+ #
307
+ self.__mU.mkdir(dirPath)
308
+ if not useCache:
309
+ for fp in [cacheFilePath]:
310
+ try:
311
+ os.remove(fp)
312
+ except Exception:
313
+ pass
314
+ #
315
+ if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
316
+ dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
317
+ # Check for completeness -
318
+ missingS = set(dD["refDbCache"].keys()) - set(idList)
319
+ if missingS:
320
+ logger.info("Reference sequence cache missing %d accessions", len(missingS))
321
+ extraD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
322
+ dD["refDbCache"].update(extraD["refDbCache"])
323
+ dD["matchInfo"].update(extraD["matchInfo"])
324
+ if cacheFilePath and cacheKwargs:
325
+ self.__mU.mkdir(dirPath)
326
+ ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
327
+ logger.info("Cache updated with status %r", ok)
328
+ #
329
+ else:
330
+ dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
331
+ if cacheFilePath and cacheKwargs:
332
+ self.__mU.mkdir(dirPath)
333
+ ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
334
+ logger.info("Cache save status %r", ok)
335
+
336
+ return dD["refDbCache"], dD["matchInfo"]
337
+
338
+ def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
339
+ """Fetch database entries from the input reference sequence database name."""
340
+ dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}
341
+
342
+ try:
343
+ idList = idList[:fetchLimit] if fetchLimit else idList
344
+ logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
345
+ if refDbName == "UniProt":
346
+ fobj = UniProtUtils(saveText=saveText)
347
+ refD, matchD = fobj.fetchList(idList)
348
+ dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
349
+
350
+ except Exception as e:
351
+ logger.exception("Failing with %s", str(e))
352
+
353
+ return dD
354
+
355
+ def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
356
+ abbreviated = kwargs.get("siftsAbbreviated", "PROD")
357
+ cachePath = kwargs.get("cachePath", ".")
358
+ cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
359
+ useCache = kwargs.get("useCache", True)
360
+ #
361
+ siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
362
+ # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
363
+ if siftsSummaryDataPath.lower().startswith("http"):
364
+ srcDirPath = siftsSummaryDataPath
365
+ else:
366
+ srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
367
+ cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
368
+ logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
369
+ ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
370
+ logger.info("ssP entry count %d", ssP.getEntryCount())
371
+ return ssP
372
+
373
+ def __dumpEntries(self, refD):
374
+ for (eId, eDict) in refD.items():
375
+ logger.info("------ Reference id %s", eId)
376
+ for k, v in eDict.items():
377
+ logger.info("%-15s = %r", k, v)
378
+
379
+ def __getUpdateAssignmentCandidates(self, objD):
380
+ totCount = 0
381
+ difCount = 0
382
+ pdbUnpIdD = defaultdict(list)
383
+ siftsUnpIdD = defaultdict(list)
384
+ assignIdDifD = defaultdict(list)
385
+ #
386
+ for entityKey, eD in objD.items():
387
+ try:
388
+ siftsS = set()
389
+ pdbS = set()
390
+ for tD in eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]:
391
+ if tD["database_name"] == "UniProt":
392
+ if tD["provenance_source"] == "SIFTS":
393
+ siftsS.add(tD["database_accession"])
394
+ siftsUnpIdD[tD["database_accession"]].append(entityKey)
395
+ elif tD["provenance_source"] == "PDB":
396
+ pdbS.add(tD["database_accession"])
397
+ pdbUnpIdD[tD["database_accession"]].append(entityKey)
398
+ else:
399
+ logger.debug("No UniProt for %r", eD["rcsb_entity_container_identifiers"])
400
+ logger.debug("PDB assigned sequence length %d", len(pdbS))
401
+ logger.debug("SIFTS assigned sequence length %d", len(siftsS))
402
+
403
+ if pdbS and siftsS:
404
+ totCount += 1
405
+ if pdbS != siftsS:
406
+ difCount += 1
407
+ for idV in pdbS:
408
+ assignIdDifD[idV].append(entityKey)
409
+
410
+ except Exception as e:
411
+ logger.warning("No identifiers for %s with %s", entityKey, str(e))
412
+ #
413
+ logger.info("Total %d differences %d", totCount, difCount)
414
+ logger.info("Unique UniProt accession assignments PDB %d SIFTS %d", len(pdbUnpIdD), len(siftsUnpIdD))
415
+ logger.info("Current unique overalapping assignment differences %d ", len(assignIdDifD))
416
+ logger.info("Current unique overalapping assignment differences %r ", assignIdDifD)
417
+ return assignIdDifD, pdbUnpIdD, siftsUnpIdD
418
+
419
+ def getReferenceAccessionAlignSummary(self):
420
+ """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
421
+ numPrimary = 0
422
+ numSecondary = 0
423
+ numNone = 0
424
+ for _, mD in self.__matchD.items():
425
+ if mD["matched"] == "primary":
426
+ numPrimary += 1
427
+ elif mD["matched"] == "secondary":
428
+ numSecondary += 1
429
+ else:
430
+ numNone += 1
431
+ logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
432
+ return numPrimary, numSecondary, numNone
433
+
434
+ def getLoadStatus(self):
435
+ return self.__statusList
436
+
437
+ def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
438
+ try:
439
+ sFlag = "Y" if status else "N"
440
+ desp = DataExchangeStatus()
441
+ desp.setStartTime(tS=startTimestamp)
442
+ desp.setObject(databaseName, collectionName)
443
+ desp.setStatus(updateId=updateId, successFlag=sFlag)
444
+ desp.setEndTime()
445
+ self.__statusList.append(desp.getStatus())
446
+ return True
447
+ except Exception as e:
448
+ logger.exception("Failing with %s", str(e))
449
+ return False