rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
rcsb/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Declare "rcsb" as a pkgutil-style namespace package: extend this package's
# search path with every same-named package directory found on sys.path so that
# separately installed distributions can contribute subpackages.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
|
rcsb/exdb/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# pkgutil-style namespace package declaration: extend this package's search
# path with every same-named package directory found on sys.path.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: BranchedEntityExtractor.py
|
|
3
|
+
# Date: 24-May-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to extract selected details from the core branched entity collections.
|
|
6
|
+
#
|
|
7
|
+
#
|
|
8
|
+
# Updates:
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
19
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BranchedEntityExtractor(object):
    """Utilities to extract selected details from the core branched entity collections."""

    def __init__(self, cfgOb):
        """Keep the configuration object that is handed to ObjectExtractor() on each extraction.

        Args:
            cfgOb (object): configuration object passed through to ObjectExtractor()
        """
        self.__cfgOb = cfgOb

    def exportBranchedEntityDetails(self, filePath, fmt="json"):
        """Extract the branched entity details (BIRD mapping and WURCS descriptors) and write them to a file.

        Args:
            filePath (str): destination file path
            fmt (str, optional): serialization format handed to MarshalUtil.doExport(). Defaults to "json".

        Returns:
            the status value returned by MarshalUtil.doExport()
        """
        detailsD = self.getBranchedDetails()
        status = MarshalUtil().doExport(filePath, detailsD, fmt=fmt, indent=3)
        logger.info("Exporting (%d) branched entities status %r", len(detailsD), status)
        return status

    def getBranchedDetails(self):
        """Get branched entity details (BIRD mapping and WURCS descriptors).

        Returns:
            dict: {rcsb_id: {"prdId": <value or None>, "wurcs": <value or None>}, ...} for the
                  entities that carry at least one of the two values.  Any failure is logged
                  and whatever was collected up to that point is returned.
        """
        detailsD = {}
        try:
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_branched_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=["rcsb_id", "pdbx_entity_branch_descriptor", "rcsb_branched_entity_container_identifiers"],
            )
            for entityD in extractor.getObjects().values():
                entityId = entityD["rcsb_id"]
                prdId = self.__getPrdId(entityD)
                wurcs = self.__getWurcs(entityD)
                # Entities with neither a BIRD identifier nor a WURCS descriptor are not reported.
                if not (prdId or wurcs):
                    continue
                detailsD[entityId] = {"prdId": prdId, "wurcs": wurcs}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return detailsD

    @staticmethod
    def __getPrdId(entityD):
        """Return the 'prd_id' container identifier of the entity document, or None when unavailable."""
        try:
            return entityD["rcsb_branched_entity_container_identifiers"]["prd_id"]
        except Exception:
            # Best effort: a missing or malformed identifier section just means no BIRD mapping.
            return None

    @staticmethod
    def __getWurcs(entityD):
        """Return the descriptor of the last type "WURCS" entry seen in the entity document, or None."""
        wurcs = None
        try:
            for descriptorD in entityD["pdbx_entity_branch_descriptor"]:
                if descriptorD["type"] == "WURCS":
                    wurcs = descriptorD["descriptor"]
        except Exception:
            # Best effort: keep whatever was found before the malformed/missing record.
            pass
        return wurcs
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: GlycanProvider.py
|
|
3
|
+
# Date: 24-May-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Updated:
|
|
6
|
+
#
|
|
7
|
+
##
|
|
8
|
+
"""
|
|
9
|
+
Accessors for glycan mapped annotations.
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os.path
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
from rcsb.exdb.branch.GlycanUtils import GlycanUtils
|
|
18
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
19
|
+
from rcsb.utils.io.StashableBase import StashableBase
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class GlycanProvider(StashableBase):
    """Accessors and generators for entity glycan mapped identifiers.

    dirPath -> CACHE/glycan/
                     branched_entity_glycan_identifier_map.json
                     accession-wurcs-mapping.json
               stash/entity_glycan_mapped_identifiers.tar.gz

    """

    def __init__(self, **kwargs):
        """Set up the cache directory paths and load any cached identifier mapping.

        Keyword Args:
            cachePath (str, optional): top-level cache directory. Defaults to ".".
            useCache (bool, optional): read an existing mapping file if present. Defaults to True.
        """
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        self.__version = "0.50"
        self.__dirName = "glycan"
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(GlycanProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__glyD = self.__reload(fmt="json", useCache=useCache)

    def testCache(self, minCount=1):
        """Check that at least minCount identifier entries are loaded.

        Args:
            minCount (int, optional): minimum number of identifier entries. Defaults to 1.
                                      A value of 0 always succeeds.

        Returns:
            bool: True if the loaded mapping satisfies the minimum count or False otherwise
        """
        if minCount == 0:
            return True
        if not (self.__glyD and minCount):
            return False
        if "identifiers" not in self.__glyD:
            return False
        if len(self.__glyD["identifiers"]) >= minCount:
            logger.info("Glycan identifiers (%d)", len(self.__glyD["identifiers"]))
            return True
        return False

    def getIdentifiers(self):
        """Return a dictionary of related identifiers organized by branched entity id.

        Returns:
            (dict): {entityId: {'idType1': ids, 'idType2': ids}, ... } or an empty dictionary
                    when nothing is loaded
        """
        try:
            return self.__glyD["identifiers"] or {}
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def __getMappingFilePath(self, fmt="json"):
        """Return the mapping file path in the glycan cache directory (".json" for fmt "json", otherwise ".pic")."""
        if fmt == "json":
            fileName = "branched_entity_glycan_identifier_map" + ".json"
        else:
            fileName = "branched_entity_glycan_identifier_map" + ".pic"
        return os.path.join(self.__dirPath, fileName)

    def update(self, cfgOb, fmt="json", indent=3):
        """Update branched entity glycan accession mapping cache.

        Args:
            cfgOb (object): ConfigInfo() object instance
            fmt (str, optional): export format for the mapping file. Defaults to "json".
            indent (int, optional): indentation passed to the exporter for the "json" format only. Defaults to 3.

        Returns:
            (bool): True for success or False otherwise
        """
        status = False
        try:
            entityAccessionD = GlycanUtils(cfgOb, self.__dirPath).updateEntityAccessionMap()
            logger.info("Got branched entity glycan accession map (%d)", len(entityAccessionD))
            #
            self.__glyD = {
                "version": self.__version,
                "created": time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                "identifiers": entityAccessionD,
            }
            #
            exportArgs = {}
            if fmt == "json":
                exportArgs["indent"] = indent
            status = self.__mU.doExport(self.__getMappingFilePath(fmt=fmt), self.__glyD, fmt=fmt, **exportArgs)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return status

    def reload(self):
        """Reload from the current cache file.

        Returns:
            (bool): True if the reloaded mapping is not None, False otherwise (including on any exception)
        """
        try:
            self.__glyD = self.__reload(fmt="json", useCache=True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False
        return self.__glyD is not None

    def __reload(self, fmt="json", useCache=True):
        """Import the cached mapping file if allowed and present; otherwise return an empty mapping record."""
        mappingFilePath = self.__getMappingFilePath(fmt=fmt)
        if useCache and self.__mU.exists(mappingFilePath):
            logger.info("reading cached path %r", mappingFilePath)
            return self.__mU.doImport(mappingFilePath, fmt=fmt)
        # No usable cache -> start from an empty record stamped with the current local time.
        return {
            "version": self.__version,
            "created": time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            "identifiers": {},
        }
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: GlycanUtils.py
|
|
3
|
+
# Date: 24-May-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Updated:
|
|
6
|
+
##
|
|
7
|
+
"""
|
|
8
|
+
Utilities for fetching and mapping glycan accessions.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os.path
|
|
13
|
+
|
|
14
|
+
from rcsb.exdb.branch.BranchedEntityExtractor import BranchedEntityExtractor
|
|
15
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
16
|
+
from rcsb.utils.io.UrlRequestUtil import UrlRequestUtil
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class GlycanUtils:
    """Utilities for fetching and mapping glycan annotations.

    Working files kept in dirPath:
        pdb-raw-branched-entity-details.json   (extracted branched entity details)
        accession-wurcs-mapping.json           (WURCS descriptor -> GlyTouCan accession list)
    """

    def __init__(self, cfgOb, dirPath):
        """
        Args:
            cfgOb (object): configuration object passed to BranchedEntityExtractor()
            dirPath (str): directory holding the working files listed in the class docstring
        """
        self.__cfgOb = cfgOb
        self.__dirPath = dirPath
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        #

    def __getRawGlycanDetailsPath(self):
        """Return the path of the raw branched entity details file."""
        return os.path.join(self.__dirPath, "pdb-raw-branched-entity-details.json")

    def getBranchedEntityDetails(self):
        """For branched entities, get BIRD mapping and WURCS details and store a raw copy.

        Returns:
            dict: {entityId: {"prdId": ..., "wurcs": ...}, ...} or an empty dictionary on failure
        """
        # Bound before the try block so that a failure in the extractor yields an
        # empty result rather than an UnboundLocalError at the return statement.
        branchedEntityD = {}
        try:
            bEx = BranchedEntityExtractor(self.__cfgOb)
            branchedEntityD = bEx.getBranchedDetails()
            logger.info("Branched entity descriptor details count %d", len(branchedEntityD))
            detailsPath = self.__getRawGlycanDetailsPath()
            # Export the dictionary already in hand; calling bEx.exportBranchedEntityDetails()
            # would run the whole extraction a second time.
            ok = self.__mU.doExport(detailsPath, branchedEntityD, fmt="json", indent=3)
            logger.info("Store raw branched entity data (%r) %s", ok, detailsPath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return branchedEntityD

    def __getGlycanAccessionMapPath(self):
        """Return the path of the stored WURCS to accession mapping file."""
        return os.path.join(self.__dirPath, "accession-wurcs-mapping.json")

    def fetchGlycanAccessionMap(self):
        """Read the stored WURCS to accession mapping.

        Returns:
            dict: {wurcs: [accession, ...], ...} or an empty dictionary if no usable file exists
        """
        mapD = {}
        accessionMapPath = self.__getGlycanAccessionMapPath()
        if self.__mU.exists(accessionMapPath):
            importD = self.__mU.doImport(accessionMapPath, fmt="json")
            # Callers do membership tests and update() on the result, so anything
            # other than a dictionary is treated as "no stored mapping".
            if isinstance(importD, dict):
                mapD = importD
            else:
                logger.warning("Unable to read accession mapping from %r", accessionMapPath)
        return mapD

    def storeGlycanAccessionMap(self, mapD):
        """Write the WURCS to accession mapping file.

        Args:
            mapD (dict): {wurcs: [accession, ...], ...}

        Returns:
            the status value returned by MarshalUtil.doExport()
        """
        accessionMapPath = self.__getGlycanAccessionMapPath()
        ok = self.__mU.doExport(accessionMapPath, mapD, fmt="json", indent=3)
        return ok

    def updateEntityAccessionMap(self):
        """Update entity to glycan accession mapping

        Descriptors that are not yet in the stored accession mapping are looked up
        (once per distinct descriptor) and the stored mapping is extended.

        Returns:
            dict: {entityId: {'glyTouCanId':... , 'prdId': ..., }, ... }
        """
        entityAccessionMapD = {}
        accessionMapD = self.fetchGlycanAccessionMap()
        branchedEntityD = self.getBranchedEntityDetails()
        #
        # Collect one (entityId, wurcs) pair for each descriptor lacking a stored accession.
        wurcsTupL = []
        pendingWurcs = set()
        for entityId, iD in branchedEntityD.items():
            wurcs = iD["wurcs"]
            if wurcs and wurcs not in accessionMapD and wurcs not in pendingWurcs:
                pendingWurcs.add(wurcs)
                wurcsTupL.append((entityId, wurcs))
        if wurcsTupL:
            accessionMapD.update(self.getAccessionMapping(wurcsTupL))
            if not self.storeGlycanAccessionMap(accessionMapD):
                logger.warning("Failed to store the updated glycan accession mapping")
        #
        for entityId, iD in branchedEntityD.items():
            wurcs = iD["wurcs"]
            accessionL = accessionMapD.get(wurcs) if wurcs else None
            # Skip descriptors without any accession (including an empty stored list).
            if accessionL:
                entityAccessionMapD[entityId] = {"glyTouCanId": accessionL[0], "prdId": iD["prdId"]}
        return entityAccessionMapD

    def getAccessionMapping(self, wurcsTupL):
        """Fetch GlyTouCan accessions for the input WURCS descriptor list

        Args:
            wurcsTupL (list): [(entityId, wurcs), ...]; entityId is used only in log messages

        Returns:
            dict: {wurcs: [accession, ...], ...} for the descriptors with at least one returned "id"
        """
        accessionMapD = {}
        logger.info("Fetching (%d) WURCS descriptors", len(wurcsTupL))
        baseUrl = "https://api.glycosmos.org"
        endPoint = "glytoucan/sparql/wurcs2gtcids"
        numDescriptors = len(wurcsTupL)
        # The request helper does not depend on the descriptor, so create it once.
        uR = UrlRequestUtil()
        for ii, (entityId, wurcs) in enumerate(wurcsTupL, 1):
            try:
                rDL, retCode = uR.post(baseUrl, endPoint, {"wurcs": wurcs}, returnContentType="JSON")
                logger.debug(" %r wurcs fetch result (%r) %r", entityId, retCode, rDL)
                if rDL:
                    for rD in rDL:
                        if "id" in rD:
                            accessionMapD.setdefault(wurcs, []).append(rD["id"])
                else:
                    logger.info("%r fetch fails (%r) (%r) %r", entityId, retCode, wurcs, rDL)
                if ii % 5 == 0:
                    logger.info("Fetched %d/%d", ii, numDescriptors)
            except Exception as e:
                # A failure for one descriptor must not abort the remaining requests.
                logger.exception("Failing for (%r) wurcs (%r) with %s", entityId, wurcs, str(e))
        return accessionMapD
|
|
File without changes
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ChemRefEtlWorker.py
|
|
3
|
+
# Date: 2-Jul-2018 jdw
|
|
4
|
+
#
|
|
5
|
+
# ETL utilities for processing chemical reference data and related data integration.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 9-Dec-2018 jdw add validation methods
|
|
9
|
+
# 3-Sep-2019 jdw move to rcsb.exdb.chemref
|
|
10
|
+
# 7-Aug-2025 dwp change target DB and collection from "drugbank_core" to "dw" and "core_drugbank" (as part of transition to DW);
|
|
11
|
+
# make use of configuration file for loading drugbank collection and setting indexed fields
|
|
12
|
+
#
|
|
13
|
+
##
|
|
14
|
+
__docformat__ = "google en"
|
|
15
|
+
__author__ = "John Westbrook"
|
|
16
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
17
|
+
__license__ = "Apache 2.0"
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
|
|
21
|
+
from rcsb.db.mongo.DocumentLoader import DocumentLoader
|
|
22
|
+
from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
|
|
23
|
+
from rcsb.db.utils.SchemaProvider import SchemaProvider
|
|
24
|
+
from rcsb.exdb.chemref.ChemRefExtractor import ChemRefExtractor
|
|
25
|
+
from rcsb.utils.chemref.DrugBankProvider import DrugBankProvider
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ChemRefEtlWorker(object):
    """Prepare and load chemical reference data collections."""

    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, verbose=False):
        """
        Args:
            cfgOb (object): configuration object (queried for the DrugBank credentials in load())
            cachePath (str): cache directory passed to SchemaProvider, DrugBankProvider and DocumentLoader
            useCache (bool, optional): passed to SchemaProvider and DrugBankProvider. Defaults to True.
            numProc (int, optional): passed to DocumentLoader. Defaults to 2.
            chunkSize (int, optional): passed to DocumentLoader. Defaults to 10.
            maxStepLength (int, optional): passed to DocumentLoader. Defaults to 2000.
            readBackCheck (bool, optional): passed to DocumentLoader. Defaults to False.
            documentLimit (int, optional): passed to DocumentLoader. Defaults to None.
            verbose (bool, optional): passed to DocumentLoader. Defaults to False.
        """
        # Configuration and cache settings
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        # Settings forwarded to DocumentLoader() in load()
        self.__resourceName = "MONGO_DB"
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__maxStepLength = maxStepLength
        self.__documentLimit = documentLimit
        self.__readBackCheck = readBackCheck
        self.__verbose = verbose
        # Status records accumulated by the most recent load()
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a DataExchangeStatus record ("Y"/"N" success flag) for one collection load.

        Returns:
            bool: True if the record was appended or False if an exception was caught
        """
        try:
            successFlag = "N"
            if status:
                successFlag = "Y"
            statusOb = DataExchangeStatus()
            statusOb.setStartTime(tS=startTimestamp)
            statusOb.setObject(databaseName, collectionName)
            statusOb.setStatus(updateId=updateId, successFlag=successFlag)
            statusOb.setEndTime()
            self.__statusList.append(statusOb.getStatus())
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False
        return True

    def load(self, updateId, extResource, loadType="full"):
        """Load chemical reference integrated data for the input external resource.

        Only the "DrugBank" resource is processed; for any other resource name nothing is loaded.

        Args:
            updateId (str): update identifier recorded in the load status
            extResource (str): external resource name
            loadType (str, optional): load type passed to DocumentLoader.load(). Defaults to "full".

        Returns:
            bool: True if processing completed without an exception, False otherwise

        NOTE(review): the status returned by DocumentLoader.load() is recorded in the status
        list (see getLoadStatus()) but is not reflected in the return value -- confirm intended.
        """
        try:
            self.__statusList = []
            startTimestamp = DataExchangeStatus().setStartTime()
            extraValuesD = {}
            groupName = "core_drugbank"
            #
            if extResource != "DrugBank":
                return True
            #
            mongoDatabaseName = self.__schP.getDatabaseMongoName(collectionGroupName=groupName)
            sectionName = self.__cfgOb.getDefaultSectionName()
            userName = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=sectionName)
            password = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=sectionName)
            #
            provider = DrugBankProvider(cachePath=self.__cachePath, useCache=self.__useCache, username=userName, password=password)
            #
            accessionD = ChemRefExtractor(self.__cfgOb).getChemCompAccessionMapping(extResource)
            documentL = provider.getDocuments(mapD=accessionD)
            #
            logger.info("Resource %r extracted mapped document length %d", extResource, len(documentL))
            logger.debug("Objects %r", documentL[:2])
            _, _, collectionNameL, indexD = self.__schP.getSchemaInfo(collectionGroupName=groupName)
            if collectionNameL:
                collectionName = collectionNameL[0]
            else:
                collectionName = "unassigned"
            if collectionName in indexD:
                indexL = indexD[collectionName]
            else:
                indexL = []
            logger.info("Database %r collection %r index attributes %r", mongoDatabaseName, collectionName, indexL)
            #
            # The added values are left empty here.  An earlier revision overwrote them with an empty dict
            # (https://github.com/rcsb/py-rcsb_exdb/commit/26bd79e9a2fffc97c034b4116dece9248d1c1f39).
            # Still to be reviewed: whether {"_schema_version": <collection version>} should be added
            # (see the similar logic in UniProtCoreEtlWorker.py).
            #
            loader = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                maxStepLength=self.__maxStepLength,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            loadStatus = loader.load(mongoDatabaseName, collectionName, loadType=loadType, documentList=documentL, keyNames=None, addValues=extraValuesD, indexDL=indexL)
            self.__updateStatus(updateId, mongoDatabaseName, collectionName, loadStatus, startTimestamp)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        """Return the list of status records accumulated by the most recent load()."""
        return self.__statusList
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ChemRefExtractor.py
|
|
3
|
+
# Date: 2-Jul-2018 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to extract data from chemical component core collections.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 7-Jan-2019 jdw moved from ChemRefEtlWorker.
|
|
9
|
+
# 3-Sep-2019 jdw moved again to module rcsb.exdb.chemref
|
|
10
|
+
# 14-Aug-2025 dwp rename bird_chem_comp_core to core_chem_comp
|
|
11
|
+
#
|
|
12
|
+
##
|
|
13
|
+
__docformat__ = "google en"
|
|
14
|
+
__author__ = "John Westbrook"
|
|
15
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
+
__license__ = "Apache 2.0"
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ChemRefExtractor(object):
    """Selected utilities to extract data from chemical component core collections."""

    def __init__(self, cfgOb):
        """
        Args:
            cfgOb (object): configuration object passed through to ObjectExtractor()
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"

    def getChemCompAccessionMapping(self, referenceResourceName):
        """Get the accession code mapping between chemical component identifiers and identifier(s) for the
        input external reference resource.

        Args:
            referenceResourceName (str):  resource name (e.g. DrugBank, ChEMBL, CCDC)

        Returns:
            dict: {referenceResourceId: [chem_comp/bird_id, ...], referenceResourceId: [chem_comp/bird_id, ...], ... }
                  Any failure is logged and whatever was collected up to that point is returned.
        """
        idD = {}
        try:
            databaseName = "dw"
            collectionName = "core_chem_comp"
            relatedKey = "rcsb_chem_comp_related"
            selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
            logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                keyAttribute="rcsb_id",
                uniqueAttributes=["rcsb_id"],
                selectionQuery=selectD,
                selectionList=["rcsb_id", relatedKey],
                stripObjectId=True,
            )
            logger.info("Reference data object count %d", extractor.getCount())
            for doc in extractor.getObjects().values():
                if relatedKey not in doc:
                    continue
                for relatedD in doc[relatedKey]:
                    # A document lists related records for every resource; keep the requested one only.
                    if relatedD["resource_name"] != referenceResourceName:
                        continue
                    if "resource_accession_code" not in relatedD:
                        continue
                    idD.setdefault(relatedD["resource_accession_code"], []).append(relatedD["comp_id"])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return idD
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ChemRefMappingProvider.py
|
|
3
|
+
# Date: 18-Jun-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Updated:
|
|
6
|
+
#
|
|
7
|
+
##
|
|
8
|
+
"""
|
|
9
|
+
Accessors for chemical reference identifier mapping data.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import datetime
|
|
13
|
+
import logging
|
|
14
|
+
import os.path
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
from rcsb.utils.io.FileUtil import FileUtil
|
|
18
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
19
|
+
from rcsb.utils.io.StashableBase import StashableBase
|
|
20
|
+
from rcsb.exdb.chemref.ChemRefExtractor import ChemRefExtractor
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ChemRefMappingProvider(StashableBase):
    """Accessors for chemical reference identifier mapping data.

    The mapping file (chemref-mapping-data.json in <cachePath>/chemref-mapping) holds
    {"version": ..., "created": ..., "mapping": {RESOURCE_NAME: {referenceId: [localId, ...]}}}
    with resource names stored in upper case.
    """

    def __init__(self, cachePath, useCache=True):
        """
        Args:
            cachePath (str): top-level cache directory
            useCache (bool, optional): read an existing mapping file if present. Defaults to True.
        """
        #
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "chemref-mapping"
        super(ChemRefMappingProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        # Lazily built reverse index {RESOURCE_NAME: {localId: [referenceId, ...]}}
        self.__rD = {}
        self.__mapD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=0):
        """Check that at least minCount reference resources are present in the loaded mapping.

        Args:
            minCount (int, optional): minimum number of entries in the "mapping" section.
                                      Defaults to 0 (always succeeds).

        Returns:
            bool: True if the loaded mapping satisfies the minimum count or False otherwise
        """
        # Tolerate a missing/failed load instead of raising on the membership test.
        mapD = self.__mapD if isinstance(self.__mapD, dict) else {}
        hasMapping = "mapping" in mapD
        mappingCount = len(mapD["mapping"] or {}) if hasMapping else 0
        logger.info("Mapping count %d", mappingCount)
        if minCount == 0:
            return True
        return hasMapping and mappingCount >= minCount

    def getReferenceIds(self, referenceResourceName, localId):
        """Get the identifiers in the reference resource corresponding to input local
        identifiers (Chemical Component or BIRD).

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            localId (str): local identifier for a Chemical Component or BIRD definition

        Returns:
            list: list of reference identifiers (empty if there is no mapping)
        """
        try:
            # The index is built inside the try block so that an empty or missing mapping
            # gives an empty result, matching getLocalIds(), instead of raising KeyError.
            if not self.__rD:
                self.__rD = self.__buildReverseMapping()
            return self.__rD[referenceResourceName.upper()][localId]
        except Exception:
            return []

    def __buildReverseMapping(self):
        """Invert the loaded mapping into {RESOURCE_NAME: {localId: [referenceId, ...]}}."""
        reverseMapD = {}
        mapD = self.__mapD if isinstance(self.__mapD, dict) else {}
        for rN, forwardD in (mapD.get("mapping") or {}).items():
            # forwardD: {refId :[lId, lId, ...], ...}
            reverseD = {}
            for refId, rcsbIdL in forwardD.items():
                for rId in rcsbIdL:
                    reverseD.setdefault(rId, []).append(refId)
            reverseMapD[rN] = reverseD
        return reverseMapD

    def getLocalIds(self, referenceResourceName, referenceId):
        """Get the local identifiers (Chemical Component or BIRD) corresponding to identifiers in
        chemical reference resource.

        Args:
            referenceResourceName (str): chemical reference resource name (DrugBank, ChEMBL, ChEBI, PubChem, ...)
            referenceId (str): identifier in the chemical reference resource

        Returns:
            list: list of local Chemical Component or BIRD identifiers (empty if there is no mapping)
        """
        try:
            return self.__mapD["mapping"][referenceResourceName.upper()][referenceId]
        except Exception:
            return []

    def __getMappingDataPath(self):
        """Return the path of the mapping data file."""
        return os.path.join(self.__dirPath, "chemref-mapping-data.json")

    def reload(self):
        """Reload the mapping from the current cache file.

        Returns:
            bool: always True
        """
        self.__mapD = self.__reload(self.__dirPath, useCache=True)
        # Drop the reverse index derived from the previous mapping so that
        # getReferenceIds() rebuilds it from the reloaded data.
        self.__rD = {}
        return True

    def __reload(self, dirPath, useCache):
        """Import the cached mapping file if allowed and present; otherwise make sure dirPath exists.

        Returns:
            dict: imported mapping record or an empty dictionary
        """
        startTime = time.time()
        fD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            importD = self.__mU.doImport(mappingPath, fmt="json")
            # Report success only when the import produced a mapping record.
            ok = isinstance(importD, dict)
            if ok:
                fD = importD
            else:
                logger.warning("Unable to read mapping data from %r", mappingPath)
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload with status (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def fetchChemRefMapping(self, cfgOb, referenceResourceNameList=None):
        """Fetch reference resource mapping for chemical component and BIRD definitions
        and write it to the mapping data file.

        The in-memory mapping is not changed; call reload() to pick up the new file.

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()
            referenceResourceNameList (list, optional): list of chemical reference resources.  Defaults to [DrugBank, ChEMBL].

        Returns:
            bool: True for success or False otherwise
        """
        try:
            rnL = referenceResourceNameList if referenceResourceNameList is not None else ["DrugBank", "ChEMBL"]
            mD = {}
            crExt = ChemRefExtractor(cfgOb)
            for referenceResourceName in rnL:
                idD = crExt.getChemCompAccessionMapping(referenceResourceName=referenceResourceName)
                logger.info("%s mapping dictionary (%d)", referenceResourceName, len(idD))
                # Resource names are stored in upper case; the accessors upper-case their input.
                mD[referenceResourceName.upper()] = idD
            #
            fp = self.__getMappingDataPath()
            # Sample the clock once so that "version" and "created" cannot disagree.
            now = datetime.datetime.now()
            tS = now.isoformat()
            vS = now.strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "mapping": mD}, fmt="json", indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
|