rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
rcsb/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
rcsb/exdb/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
@@ -0,0 +1,82 @@
1
+ ##
2
+ # File: BranchedEntityExtractor.py
3
+ # Date: 24-May-2021 jdw
4
+ #
5
+ # Utilities to extract selected details from the core branched entity collections.
6
+ #
7
+ #
8
+ # Updates:
9
+ #
10
+ ##
11
+ __docformat__ = "google en"
12
+ __author__ = "John Westbrook"
13
+ __email__ = "jwest@rcsb.rutgers.edu"
14
+ __license__ = "Apache 2.0"
15
+
16
+ import logging
17
+
18
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
19
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class BranchedEntityExtractor(object):
    """Utilities to extract selected details from the core branched entity collections."""

    def __init__(self, cfgOb):
        # Configuration object handed unchanged to ObjectExtractor.
        self.__cfgOb = cfgOb

    def exportBranchedEntityDetails(self, filePath, fmt="json"):
        """Export branched entity details (BIRD mapping and WURCS descriptors).

        Args:
            filePath (str): output file path
            fmt (str, optional): export format passed to MarshalUtil.doExport(). Defaults to "json".

        Returns:
            status value returned by MarshalUtil.doExport()
        """
        rD = self.getBranchedDetails()
        mU = MarshalUtil()
        ok = mU.doExport(filePath, rD, fmt=fmt, indent=3)
        logger.info("Exporting (%d) branched entities status %r", len(rD), ok)
        return ok

    def getBranchedDetails(self):
        """Get branched entity details (BIRD mapping and WURCS descriptors).

        Documents carrying neither a BIRD (prd_id) identifier nor a WURCS descriptor
        are omitted, as are documents lacking an "rcsb_id".

        Returns:
            dict: {rcsbId: {"prdId": <value or None>, "wurcs": <value or None>}, ...}
                  (empty or partial if the extraction fails; the failure is logged)
        """
        rD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_branched_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=["rcsb_id", "pdbx_entity_branch_descriptor", "rcsb_branched_entity_container_identifiers"],
            )
            objD = obEx.getObjects()
            for eD in objD.values():
                rcsbId, prdId, wurcs = self._extractDetails(eD)
                if rcsbId is None:
                    # A single malformed document should not abort the whole extraction.
                    logger.debug("Skipping branched entity document without rcsb_id")
                    continue
                if prdId or wurcs:
                    rD[rcsbId] = {"prdId": prdId, "wurcs": wurcs}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return rD

    @staticmethod
    def _extractDetails(eD):
        """Pull (rcsb_id, prd_id, WURCS descriptor) out of one branched entity document.

        Args:
            eD (dict): branched entity document

        Returns:
            tuple: (rcsbId, prdId, wurcs) where any missing element is None
        """
        if not isinstance(eD, dict):
            return None, None, None
        rcsbId = eD.get("rcsb_id")
        #
        containerD = eD.get("rcsb_branched_entity_container_identifiers")
        prdId = containerD.get("prd_id") if isinstance(containerD, dict) else None
        #
        wurcs = None
        descriptorL = eD.get("pdbx_entity_branch_descriptor")
        if isinstance(descriptorL, (list, tuple)):
            for tD in descriptorL:
                # When several WURCS descriptors are present the last one wins.
                if isinstance(tD, dict) and tD.get("type") == "WURCS" and "descriptor" in tD:
                    wurcs = tD["descriptor"]
        return rcsbId, prdId, wurcs
@@ -0,0 +1,116 @@
1
+ ##
2
+ # File: GlycanProvider.py
3
+ # Date: 24-May-2021 jdw
4
+ #
5
+ # Updated:
6
+ #
7
+ ##
8
+ """
9
+ Accessors for glycan mapped annotations.
10
+
11
+ """
12
+
13
+ import logging
14
+ import os.path
15
+ import time
16
+
17
+ from rcsb.exdb.branch.GlycanUtils import GlycanUtils
18
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
19
+ from rcsb.utils.io.StashableBase import StashableBase
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class GlycanProvider(StashableBase):
    """Accessors and generators for entity glycan mapped identifiers.

    dirPath -> CACHE/glycan/
                     branched_entity_glycan_identifier_map.json
                     accession-wurcs-mapping.json
               stash/entity_glycan_mapped_identifiers.tar.gz

    Keyword Args:
        cachePath (str): top-level cache directory. Defaults to ".".
        useCache (bool): read an existing mapping file when present. Defaults to True.
    """

    def __init__(self, **kwargs):
        self.__version = "0.50"
        self.__dirName = "glycan"
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(GlycanProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__glyD = self.__reload(fmt="json", useCache=useCache)

    def testCache(self, minCount=1):
        """Check that at least minCount identifier entries are loaded.

        Args:
            minCount (int, optional): required number of entries; 0 disables the check. Defaults to 1.

        Returns:
            (bool): True if the requirement is satisfied or False otherwise
        """
        if minCount == 0:
            return True
        if not (self.__glyD and minCount):
            return False
        if "identifiers" not in self.__glyD:
            return False
        if len(self.__glyD["identifiers"]) >= minCount:
            logger.info("Glycan identifiers (%d)", len(self.__glyD["identifiers"]))
            return True
        return False

    def getIdentifiers(self):
        """Return a dictionary of related identifiers organized by branched entity id.

        Returns:
            (dict): {entityId: {'idType1': ids, 'idType2': ids}, ... } (empty on any failure)
        """
        try:
            identifierD = self.__glyD["identifiers"]
            return identifierD if identifierD else {}
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def __getMappingFilePath(self, fmt="json"):
        # Any format other than "json" is stored with the ".pic" extension.
        suffix = ".json" if fmt == "json" else ".pic"
        return os.path.join(self.__dirPath, "branched_entity_glycan_identifier_map" + suffix)

    def update(self, cfgOb, fmt="json", indent=3):
        """Update branched entity glycan accession mapping cache.

        Args:
            cfgOb (object): ConfigInfo() object instance
            fmt (str, optional): export format. Defaults to "json".
            indent (int, optional): indentation applied only to json exports. Defaults to 3.

        Returns:
            (bool): True for success or False otherwise
        """
        try:
            entityAccessionD = GlycanUtils(cfgOb, self.__dirPath).updateEntityAccessionMap()
            logger.info("Got branched entity glycan accession map (%d)", len(entityAccessionD))
            #
            createdS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            self.__glyD = {"version": self.__version, "created": createdS, "identifiers": entityAccessionD}
            #
            exportArgs = {}
            if fmt == "json":
                exportArgs["indent"] = indent
            return self.__mU.doExport(self.__getMappingFilePath(fmt=fmt), self.__glyD, fmt=fmt, **exportArgs)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def reload(self):
        """Reload from the current cache file.

        Returns:
            (bool): True if a mapping object was obtained or False otherwise
        """
        try:
            self.__glyD = self.__reload(fmt="json", useCache=True)
            return self.__glyD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def __reload(self, fmt="json", useCache=True):
        # Prefer the cached mapping file; otherwise start from an empty identifier map.
        mappingFilePath = self.__getMappingFilePath(fmt=fmt)
        if useCache and self.__mU.exists(mappingFilePath):
            logger.info("reading cached path %r", mappingFilePath)
            return self.__mU.doImport(mappingFilePath, fmt=fmt)
        createdS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        return {"version": self.__version, "created": createdS, "identifiers": {}}
@@ -0,0 +1,114 @@
1
+ ##
2
+ # File: GlycanUtils.py
3
+ # Date: 24-May-2021 jdw
4
+ #
5
+ # Updated:
6
+ ##
7
+ """
8
+ Utilities for fetching and mapping glycan accessions.
9
+ """
10
+
11
+ import logging
12
+ import os.path
13
+
14
+ from rcsb.exdb.branch.BranchedEntityExtractor import BranchedEntityExtractor
15
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
16
+ from rcsb.utils.io.UrlRequestUtil import UrlRequestUtil
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class GlycanUtils:
    """Utilities for fetching and mapping glycan annotations.

    Args:
        cfgOb (object): configuration object handed to BranchedEntityExtractor
        dirPath (str): directory holding the raw details and accession mapping files
    """

    def __init__(self, cfgOb, dirPath):
        self.__cfgOb = cfgOb
        self.__dirPath = dirPath
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        #

    def __getRawGlycanDetailsPath(self):
        return os.path.join(self.__dirPath, "pdb-raw-branched-entity-details.json")

    def getBranchedEntityDetails(self):
        """For branched entities, get BIRD mapping and WURCS details.

        The extracted details are also stored as JSON in the working directory.

        Returns:
            dict: branched entity details as returned by BranchedEntityExtractor.getBranchedDetails()
                  (empty if the extraction fails; the failure is logged)
        """
        # Initialised up front so a failure inside the try block cannot leave the
        # return value unbound.
        branchedEntityD = {}
        try:
            bEx = BranchedEntityExtractor(self.__cfgOb)
            branchedEntityD = bEx.getBranchedDetails()
            logger.info("Branched entity descriptor details count %d", len(branchedEntityD))
            detailsPath = self.__getRawGlycanDetailsPath()
            # Export the dictionary already in hand; calling
            # bEx.exportBranchedEntityDetails() would repeat the whole extraction.
            ok = self.__mU.doExport(detailsPath, branchedEntityD, fmt="json", indent=3)
            logger.info("Store raw branched entity data (%r) %s", ok, detailsPath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return branchedEntityD

    def __getGlycanAccessionMapPath(self):
        return os.path.join(self.__dirPath, "accession-wurcs-mapping.json")

    def fetchGlycanAccessionMap(self):
        """Read the stored WURCS to accession mapping.

        Returns:
            dict: {wurcs: [accession, ...], ...} or an empty dictionary if no usable file exists
        """
        mapD = {}
        accessionMapPath = self.__getGlycanAccessionMapPath()
        if self.__mU.exists(accessionMapPath):
            importedD = self.__mU.doImport(accessionMapPath, fmt="json")
            # Guard against a failed or malformed import so callers always get a dict.
            if isinstance(importedD, dict):
                mapD = importedD
        return mapD

    def storeGlycanAccessionMap(self, mapD):
        """Store the WURCS to accession mapping as JSON.

        Args:
            mapD (dict): {wurcs: [accession, ...], ...}

        Returns:
            status value returned by MarshalUtil.doExport()
        """
        accessionMapPath = self.__getGlycanAccessionMapPath()
        ok = self.__mU.doExport(accessionMapPath, mapD, fmt="json", indent=3)
        return ok

    def updateEntityAccessionMap(self):
        """Update entity to glycan accession mapping

        Descriptors missing from the stored accession map are fetched (once per
        distinct descriptor) and the extended map is stored.

        Returns:
            dict: {entityId: {'glyTouCanId':... , 'prdId': ..., }, ... }
        """
        entityAccessionMapD = {}
        accessionMapD = self.fetchGlycanAccessionMap()
        branchedEntityD = self.getBranchedEntityDetails()
        #
        wurcsTupL = []
        pendingS = set()
        for entityId, iD in branchedEntityD.items():
            wurcs = iD.get("wurcs")
            if wurcs and wurcs not in accessionMapD and wurcs not in pendingS:
                pendingS.add(wurcs)
                wurcsTupL.append((entityId, wurcs))
        if wurcsTupL:
            accessionMapD.update(self.getAccessionMapping(wurcsTupL))
            self.storeGlycanAccessionMap(accessionMapD)
        #
        for entityId, iD in branchedEntityD.items():
            wurcs = iD.get("wurcs")
            accessionL = accessionMapD.get(wurcs) if wurcs else None
            # An empty accession list (e.g. from an edited cache file) is treated as unmapped.
            if accessionL:
                entityAccessionMapD[entityId] = {"glyTouCanId": accessionL[0], "prdId": iD.get("prdId")}
        return entityAccessionMapD

    def getAccessionMapping(self, wurcsTupL):
        """Fetch GlyTouCan accessions for the input WURCS descriptor list

        Args:
            wurcsTupL (list): [(entityId, wurcs), ...]

        Returns:
            dict: {wurcs: [accession, ...], ...} for the descriptors with at least one returned "id"
        """
        accessionMapD = {}
        logger.info("Fetching (%d) WURCS descriptors", len(wurcsTupL))
        baseUrl = "https://api.glycosmos.org"
        endPoint = "glytoucan/sparql/wurcs2gtcids"
        numDescriptors = len(wurcsTupL)
        for ii, (entityId, wurcs) in enumerate(wurcsTupL, 1):
            # A failure for one descriptor is logged and does not stop the remaining requests.
            try:
                pD = {"wurcs": wurcs}
                uR = UrlRequestUtil()
                rDL, retCode = uR.post(baseUrl, endPoint, pD, returnContentType="JSON")
                logger.debug(" %r wurcs fetch result (%r) %r", entityId, retCode, rDL)
                if rDL:
                    for rD in rDL:
                        if "id" in rD:
                            accessionMapD.setdefault(wurcs, []).append(rD["id"])
                else:
                    logger.info("%r fetch fails (%r) (%r) %r", entityId, retCode, wurcs, rDL)
                if ii % 5 == 0:
                    logger.info("Fetched %d/%d", ii, numDescriptors)
            except Exception as e:
                logger.exception("Failing for (%r) wurcs (%r) with %s", entityId, wurcs, str(e))
        return accessionMapD
File without changes
@@ -0,0 +1,118 @@
1
+ ##
2
+ # File: ChemRefEtlWorker.py
3
+ # Date: 2-Jul-2018 jdw
4
+ #
5
+ # ETL utilities for processing chemical reference data and related data integration.
6
+ #
7
+ # Updates:
8
+ # 9-Dec-2018 jdw add validation methods
9
+ # 3-Sep-2019 jdw move to rcsb.exdb.chemref
10
+ # 7-Aug-2025 dwp change target DB and collection from "drugbank_core" to "dw" and "core_drugbank" (as part of transition to DW);
11
+ # make use of configuration file for loading drugbank collection and setting indexed fields
12
+ #
13
+ ##
14
+ __docformat__ = "google en"
15
+ __author__ = "John Westbrook"
16
+ __email__ = "jwest@rcsb.rutgers.edu"
17
+ __license__ = "Apache 2.0"
18
+
19
+ import logging
20
+
21
+ from rcsb.db.mongo.DocumentLoader import DocumentLoader
22
+ from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
23
+ from rcsb.db.utils.SchemaProvider import SchemaProvider
24
+ from rcsb.exdb.chemref.ChemRefExtractor import ChemRefExtractor
25
+ from rcsb.utils.chemref.DrugBankProvider import DrugBankProvider
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class ChemRefEtlWorker(object):
    """Prepare and load chemical reference data collections.

    Args:
        cfgOb (object): configuration object
        cachePath (str): cache directory path
        useCache (bool, optional): passed to SchemaProvider and DrugBankProvider. Defaults to True.
        numProc (int, optional): passed to DocumentLoader. Defaults to 2.
        chunkSize (int, optional): passed to DocumentLoader. Defaults to 10.
        maxStepLength (int, optional): passed to DocumentLoader. Defaults to 2000.
        readBackCheck (bool, optional): passed to DocumentLoader. Defaults to False.
        documentLimit (int, optional): passed to DocumentLoader. Defaults to None.
        verbose (bool, optional): passed to DocumentLoader. Defaults to False.
    """

    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__maxStepLength = maxStepLength
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)
        #

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a DataExchangeStatus record for the load operation to the status list.

        Returns:
            bool: True if the record was appended or False otherwise
        """
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def load(self, updateId, extResource, loadType="full"):
        """Load chemical reference integrated data for the input external resource.

        Args:
            updateId (str): update identifier recorded in the load status
            extResource (str): external resource name (only "DrugBank" is handled)
            loadType (str, optional): load type passed to DocumentLoader.load(). Defaults to "full".

        Returns:
            bool: status of the document load, or False for an unsupported resource or any failure
        """
        try:
            self.__statusList = []
            if extResource != "DrugBank":
                # Nothing is loaded for other resources, so do not report success.
                logger.warning("Unsupported external resource %r - nothing loaded", extResource)
                return False
            #
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            addValues = {}
            collectionGroupName = "core_drugbank"
            #
            databaseNameMongo = self.__schP.getDatabaseMongoName(collectionGroupName=collectionGroupName)
            configName = self.__cfgOb.getDefaultSectionName()
            user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
            pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
            #
            dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=self.__useCache, username=user, password=pw)
            #
            crExt = ChemRefExtractor(self.__cfgOb)
            idD = crExt.getChemCompAccessionMapping(extResource)
            dList = dbP.getDocuments(mapD=idD)
            #
            logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
            logger.debug("Objects %r", dList[:2])
            _, _, collectionList, docIndexD = self.__schP.getSchemaInfo(collectionGroupName=collectionGroupName)
            collectionName = collectionList[0] if collectionList else "unassigned"
            indexDL = docIndexD.get(collectionName, [])
            logger.info("Database %r collection %r index attributes %r", databaseNameMongo, collectionName, indexDL)
            #
            # NOTE(review): 'addValues' is deliberately left empty here; it was previously being
            # overwritten with an empty dict (https://github.com/rcsb/py-rcsb_exdb/commit/26bd79e9a2fffc97c034b4116dece9248d1c1f39).
            # Whether the schema version values should be added is still to be decided
            # (see the similar logic in UniProtCoreEtlWorker.py).
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                maxStepLength=self.__maxStepLength,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseNameMongo, collectionName, loadType=loadType, documentList=dList, keyNames=None, addValues=addValues, indexDL=indexDL)
            self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
            # Report the real outcome of the load rather than an unconditional True.
            return bool(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False

    def getLoadStatus(self):
        """Return the list of status records collected by the most recent load()."""
        return self.__statusList
@@ -0,0 +1,70 @@
1
+ ##
2
+ # File: ChemRefExtractor.py
3
+ # Date: 2-Jul-2018 jdw
4
+ #
5
+ # Selected utilities to extract data from chemical component core collections.
6
+ #
7
+ # Updates:
8
+ # 7-Jan-2019 jdw moved from ChemRefEtlWorker.
9
+ # 3-Sep-2019 jdw moved again to module rcsb.exdb.chemref
10
+ # 14-Aug-2025 dwp rename bird_chem_comp_core to core_chem_comp
11
+ #
12
+ ##
13
+ __docformat__ = "google en"
14
+ __author__ = "John Westbrook"
15
+ __email__ = "jwest@rcsb.rutgers.edu"
16
+ __license__ = "Apache 2.0"
17
+
18
+ import logging
19
+
20
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ChemRefExtractor(object):
    """Selected utilities to extract data from chemical component core collections."""

    def __init__(self, cfgOb):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        #

    def getChemCompAccessionMapping(self, referenceResourceName):
        """Get the accession code mapping between chemical component identifiers and identifier(s) for the
        input external reference resource.

        Args:
            referenceResourceName (str): resource name (e.g. DrugBank, ChEMBL, CCDC)

        Returns:
            dict: {referenceResourceId: [chem_comp/bird_id, ...], referenceResourceId: [chem_comp/bird_id, ...], ... }
                  (empty if the extraction fails; the failure is logged)

        """
        idD = {}
        try:
            databaseName = "dw"
            collectionName = "core_chem_comp"
            selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
            selectionList = ["rcsb_id", "rcsb_chem_comp_related"]
            logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                keyAttribute="rcsb_id",
                uniqueAttributes=["rcsb_id"],
                selectionQuery=selectD,
                selectionList=selectionList,
                stripObjectId=True,
            )
            logger.info("Reference data object count %d", obEx.getCount())
            objD = obEx.getObjects()
            idD = self._collectAccessionMapping(objD.values(), referenceResourceName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return idD

    @staticmethod
    def _collectAccessionMapping(docIter, referenceResourceName):
        """Build {accessionCode: [compId, ...]} from the related-resource records of the input documents.

        Records for other resources, and records lacking a resource name, accession code
        or component identifier, are skipped so one malformed record cannot truncate the mapping.

        Args:
            docIter (iterable): documents that may carry an "rcsb_chem_comp_related" list
            referenceResourceName (str): resource name to select

        Returns:
            dict: {accessionCode: [compId, ...], ...}
        """
        idD = {}
        for doc in docIter:
            if not isinstance(doc, dict):
                continue
            for dD in doc.get("rcsb_chem_comp_related") or []:
                if not isinstance(dD, dict) or dD.get("resource_name") != referenceResourceName:
                    continue
                accessionCode = dD.get("resource_accession_code")
                compId = dD.get("comp_id")
                if accessionCode is None or compId is None:
                    continue
                idD.setdefault(accessionCode, []).append(compId)
        return idD
@@ -0,0 +1,139 @@
1
+ ##
2
+ # File: ChemRefMappingProvider.py
3
+ # Date: 18-Jun-2021 jdw
4
+ #
5
+ # Updated:
6
+ #
7
+ ##
8
+ """
9
+ Accessors for chemical reference identifier mapping data.
10
+ """
11
+
12
+ import datetime
13
+ import logging
14
+ import os.path
15
+ import time
16
+
17
+ from rcsb.utils.io.FileUtil import FileUtil
18
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
19
+ from rcsb.utils.io.StashableBase import StashableBase
20
+ from rcsb.exdb.chemref.ChemRefExtractor import ChemRefExtractor
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ChemRefMappingProvider(StashableBase):
    """Accessors for chemical reference identifier mapping data.

    Args:
        cachePath (str): top-level cache directory
        useCache (bool, optional): read an existing mapping file when present. Defaults to True.
    """

    def __init__(self, cachePath, useCache=True):
        #
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "chemref-mapping"
        super(ChemRefMappingProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        # Reverse index {RESOURCE: {localId: [refId, ...]}} built lazily by getReferenceIds()
        self.__rD = {}
        self.__mapD = self.__reload(self.__dirPath, useCache)
        #

    def __getMapping(self):
        # Forward mapping {RESOURCE: {refId: [localId, ...]}}; empty when nothing usable is loaded.
        mappingD = self.__mapD.get("mapping") if isinstance(self.__mapD, dict) else None
        return mappingD if isinstance(mappingD, dict) else {}

    def testCache(self, minCount=0):
        """Check that at least minCount reference resources are present in the loaded mapping.

        Args:
            minCount (int, optional): required number of mapping entries; 0 disables the check. Defaults to 0.

        Returns:
            bool: True if the requirement is satisfied or False otherwise
        """
        mappingD = self.__getMapping()
        logger.info("Mapping count %d", len(mappingD))
        if minCount == 0:
            return True
        hasMapping = isinstance(self.__mapD, dict) and "mapping" in self.__mapD
        return hasMapping and len(mappingD) >= minCount

    def getReferenceIds(self, referenceResourceName, localId):
        """Get the identifiers in the reference resource corresponding to input local
        identifiers (Chemical Component or BIRD).

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            localId (str): local identifier for a Chemical Component or BIRD definition

        Returns:
            list: list of reference identifiers (empty if there is no match or no mapping is loaded)
        """
        try:
            if not self.__rD:
                self.__rD = self.__buildReverseIndex()
            return self.__rD[referenceResourceName.upper()][localId]
        except Exception:
            # Same best-effort contract as getLocalIds(): any lookup problem yields no identifiers.
            return []

    def __buildReverseIndex(self):
        # Invert {RESOURCE: {refId: [localId, ...]}} into {RESOURCE: {localId: [refId, ...]}}.
        reverseIndexD = {}
        for rN, forwardD in self.__getMapping().items():
            reverseD = {}
            for refId, rcsbIdL in forwardD.items():
                for rId in rcsbIdL:
                    reverseD.setdefault(rId, []).append(refId)
            reverseIndexD[rN] = reverseD
        return reverseIndexD

    def getLocalIds(self, referenceResourceName, referenceId):
        """Get the local identifiers (Chemical Component or BIRD) corresponding to identifiers in
        chemical reference resource.

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            referenceId (str): identifier in the chemical reference resource

        Returns:
            list: list of local Chemical Component or BIRD identifiers (empty if there is no match)
        """
        try:
            return self.__mapD["mapping"][referenceResourceName.upper()][referenceId]
        except Exception:
            return []

    def __getMappingDataPath(self):
        return os.path.join(self.__dirPath, "chemref-mapping-data.json")

    def reload(self):
        """Reload the mapping from the current cache file.

        Returns:
            bool: True
        """
        self.__mapD = self.__reload(self.__dirPath, useCache=True)
        # Drop the reverse index so it is rebuilt from the reloaded mapping.
        self.__rD = {}
        return True

    def __reload(self, dirPath, useCache):
        # Read the cached mapping file if allowed and present; otherwise make sure the
        # cache directory exists and return an empty dictionary.
        startTime = time.time()
        fD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            importedD = self.__mU.doImport(mappingPath, fmt="json")
            # Only a dictionary is a usable mapping; anything else is reported as a failed reload.
            if isinstance(importedD, dict):
                fD = importedD
                ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def fetchChemRefMapping(self, cfgOb, referenceResourceNameList=None):
        """Fetch reference resource mapping for chemical component and BIRD definitions

        The mapping is written to the cache file only; the data held by this instance
        changes when reload() is called.

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()
            referenceResourceNameList (list, optional): list of chemical reference resources. Defaults to [DrugBank, ChEMBL].

        Returns:
            bool: True for success or False otherwise
        """
        try:
            rnL = referenceResourceNameList if referenceResourceNameList is not None else ["DrugBank", "ChEMBL"]
            mD = {}
            crExt = ChemRefExtractor(cfgOb)
            for referenceResourceName in rnL:
                idD = crExt.getChemCompAccessionMapping(referenceResourceName=referenceResourceName)
                logger.info("%s mapping dictionary (%d)", referenceResourceName, len(idD))
                mD[referenceResourceName.upper()] = idD
            #
            fp = self.__getMappingDataPath()
            # A single clock reading keeps the version date and creation timestamp consistent.
            now = datetime.datetime.now()
            tS = now.isoformat()
            vS = now.strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "mapping": mD}, fmt="json", indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False