rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: CitationAdapter.py
|
|
3
|
+
# Date: 21-Nov-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to update entry citations in the core_entry collection.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
##
|
|
10
|
+
__docformat__ = "google en"
|
|
11
|
+
__author__ = "John Westbrook"
|
|
12
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
13
|
+
__license__ = "Apache 2.0"
|
|
14
|
+
|
|
15
|
+
import copy
|
|
16
|
+
import logging
|
|
17
|
+
from string import capwords
|
|
18
|
+
|
|
19
|
+
from rcsb.exdb.utils.ObjectAdapterBase import ObjectAdapterBase
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class CitationAdapter(ObjectAdapterBase):
    """Selected utilities to update entry citations in the core_entry collection.

    For the primary citation of an entry object the adapter computes a revised
    journal abbreviation using the two reference data providers supplied at
    construction time.
    """

    def __init__(self, citationReferenceProvider, journalTitleAbbreviationProvider):
        """Store the reference data providers used to revise journal abbreviations.

        Args:
            citationReferenceProvider (object): provider called here through
                getMedlineJournalAbbreviation() and getCrossRefJournalTitle()
            journalTitleAbbreviationProvider (object): provider called here through
                getJournalAbbreviation()
        """
        super(CitationAdapter, self).__init__()
        #
        self.__crP = citationReferenceProvider
        self.__jtaP = journalTitleAbbreviationProvider

    def filter(self, obj, **kwargs):
        """Apply the citation filter to the input object.

        Test mode is hard-wired on, so the filter runs against a deep copy of
        the input and the input object is returned untouched with a True status.
        The keyword arguments are accepted but not used.

        Args:
            obj (dict): entry object (expected to carry "rcsb_id" and optionally "citation")

        Returns:
            (bool, dict): status flag and the (unmodified) input object
        """
        isTestMode = True
        if isTestMode:
            # Run on a copy so that any processing side effects cannot reach the caller's object.
            self.__filter(copy.deepcopy(obj))
            return True, obj
        else:
            return self.__filter(obj)

    def __filter(self, obj):
        """Compute the revised journal abbreviation for each primary citation in obj.

        The revised abbreviation is only logged at debug level; it is not written
        back into the citation record.

        Args:
            obj (dict): entry object

        Returns:
            (bool, dict): False status if any exception was raised, and the input object
        """
        ok = True
        try:
            rcsbId = obj["rcsb_id"]
            if "citation" in obj:
                for citObj in obj["citation"]:
                    # Only the primary citation is examined.
                    if citObj["id"].upper() != "PRIMARY":
                        continue
                    issn = citObj["journal_id_ISSN"] if "journal_id_ISSN" in citObj else None
                    curAbbrev = citObj["journal_abbrev"] if "journal_abbrev" in citObj else None
                    revAbbrev = self.__updateJournalAbbreviation(rcsbId, issn, curAbbrev)
                    logger.debug("%s: revised: %r current: %r", rcsbId, revAbbrev, curAbbrev)

        except Exception as e:
            ok = False
            logger.exception("Filter adapter failing with error with %s", str(e))
        #
        return ok, obj

    @staticmethod
    def __normalizeAbbreviation(abbrev):
        """Replace periods by spaces and capitalize each word; missing values pass through unchanged."""
        if not abbrev:
            return abbrev
        return capwords(abbrev.replace(".", " "))

    def __updateJournalAbbreviation(self, rcsbId, issn, curAbbrev):
        """Return a revised journal abbreviation for the input citation details.

        With an ISSN, the Medline abbreviation is preferred, then an abbreviation
        derived from the CrossRef journal title, then the normalized current
        abbreviation.  Without an ISSN, the current abbreviation is mapped to
        "To be published" or "Thesis" where it matches, and normalized otherwise.

        Args:
            rcsbId (str): entry identifier (used for logging only)
            issn (str): journal ISSN or None
            curAbbrev (str): current journal abbreviation or None

        Returns:
            (str): revised abbreviation, or the (empty/None) input abbreviation when
                   nothing can be derived
        """
        # Bound before the try block so the final return is always defined.
        revAbbrev = None
        try:
            if issn:
                medlineAbbrev = self.__crP.getMedlineJournalAbbreviation(issn)
                # medlineIsoAbbrev = self.__crP.getMedlineJournalIsoAbbreviation(issn)
                # The CrossRef lookup is keyed on the ISSN without the hyphen.
                crIssn = issn.replace("-", "")
                crTitle = self.__crP.getCrossRefJournalTitle(crIssn)
                #
                revAbbrev = medlineAbbrev
                if not medlineAbbrev and not crTitle:
                    logger.debug("%s: missing information for issn %r curAbbrev %r", rcsbId, issn, curAbbrev)
                    revAbbrev = self.__normalizeAbbreviation(curAbbrev)
                elif not medlineAbbrev:
                    revAbbrev = self.__jtaP.getJournalAbbreviation(crTitle, usePunctuation=False)
            elif curAbbrev:
                upAbbrev = curAbbrev.upper()
                if upAbbrev in ["TO BE PUBLISHED", "IN PREPARATION"]:
                    revAbbrev = "To be published"
                elif upAbbrev.startswith("THESIS"):
                    revAbbrev = "Thesis"
                else:
                    revAbbrev = self.__normalizeAbbreviation(curAbbrev)
                    logger.debug("%r: missing issn and non-standard abbrev for %r", rcsbId, curAbbrev)
            else:
                # Neither ISSN nor abbreviation: report it instead of failing on None.upper().
                revAbbrev = curAbbrev
                logger.info("%r: missing issn and journal abbrev", rcsbId)
            #
            logger.debug("%s: revised: %r current: %r", rcsbId, revAbbrev, curAbbrev)
        except Exception as e:
            logger.exception("Failing on %r %r %r with %r", rcsbId, issn, curAbbrev, str(e))

        return revAbbrev
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: CitationExtractor.py
|
|
3
|
+
# Date: 19-Feb-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to extract citation data from the core_entry exchange database schema.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
# import copy
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
21
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CitationExtractor(object):
    """Utilities to extract citation related data from the core_entry collection."""

    def __init__(self, cfgOb, **kwargs):
        """Load (or extract and cache) entry citation data and tally summary indices.

        Args:
            cfgOb (object): configuration object handed to ObjectExtractor()
            useCache (bool, optional): read an existing cache file if readable (default: True)
            exdbDirPath (str, optional): directory holding the cache file (default: ".")
            cacheKwargs (dict, optional): marshalling options including "fmt" (default: {"fmt": "pickle"})
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__databaseName = "pdbx_core"
        self.__collectionName = "pdbx_core_entry"
        #
        self.__mU = MarshalUtil()
        #
        self.__entryD = self.__rebuildCache(**kwargs)
        self.__idxD = self.__buildIndices(self.__entryD)
        #

    def __rebuildCache(self, **kwargs):
        """Return the entry citation dictionary from the cache file or from a fresh extraction.

        A fresh extraction is written back to the cache file.

        Returns:
            (dict): entry citation data keyed by entry identifier (empty on failure)
        """
        useCache = kwargs.get("useCache", True)
        # An explicit exdbDirPath=None must not reach os.path.join(); treat it like the default.
        dirPath = kwargs.get("exdbDirPath") or "."
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "entry-citation-extracted-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)

        cD = {"entryD": {}}
        try:
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                logger.info("Using cached entry citation file %s", cacheFilePath)
                # NOTE(review): with fmt "pickle" this presumably unpickles the file - only use trusted cache directories.
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                entryD = self.__extractCitations()
                cD["entryD"] = entryD
                if cacheFilePath:
                    self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved entry citation results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # A failed or malformed import must not raise here - fall back to an empty result.
        entryD = cD.get("entryD") if isinstance(cD, dict) else None
        return entryD if entryD is not None else {}

    def __buildIndices(self, entryD):
        """Tally journal, DOI, PubMed and ISSN statistics over the first citation of each entry.

        The statistics are reported through the logger.  Records with missing
        details are counted as missing rather than interrupting the tally.

        Args:
            entryD (dict): entry citation data keyed by entry identifier

        Returns:
            (dict): occurrence count keyed by journal abbreviation

        Example:
         "entryD": {
                  "5KAL": {
                     "citation": [
                        {
                           "country": "UK",
                           "id": "primary",
                           "journal_abbrev": "Nucleic Acids Res.",
                           "journal_id_ASTM": "NARHAD",
                           "journal_id_CSD": "0389",
                           "journal_id_ISSN": "1362-4962",
                           "journal_volume": "44",
                           "page_first": "10862",
                           "page_last": "10878",
                           "title": "RNA Editing TUTase 1: structural foundation of substrate recognition, complex interactions and drug targeting.",
                           "year": 2016,
                           "pdbx_database_id_DOI": "10.1093/nar/gkw917",
                           "pdbx_database_id_PubMed": 27744351,
                           "rcsb_authors": [
                              "Rajappa-Titu, L.",
                              "Suematsu, T.",
                              "Munoz-Tello, P.",
                              "Long, M.",
                              "Demir, O.",
                              "Cheng, K.J.",
                              "Stagno, J.R.",
                              "Luecke, H.",
                              "Amaro, R.E.",
                              "Aphasizheva, I.",
                              "Aphasizhev, R.",
                              "Thore, S."
                           ]
                        }
                     ],
                     "_entry_id": "5KAL"
                  },
        """
        indD = {}
        missingCitationCount = 0
        missingJournalName = 0
        numPubMed = 0
        numDOI = 0
        numCitations = 0
        mD = {}
        issnD = {}
        missingISSNCount = 0
        missingPubMedCount = 0
        try:
            for entryId, eD in entryD.items():
                cDL = eD["citation"] if "citation" in eD else None
                if not cDL:
                    continue
                # Only the first citation record of each entry is tallied.
                for rawD in cDL[:1]:
                    # An empty/None record is treated as a record without any details.
                    cD = rawD if rawD else {}
                    hasJournal = "journal_abbrev" in cD
                    jAbbrev = cD["journal_abbrev"] if hasJournal else None
                    if hasJournal:
                        indD[jAbbrev] = indD.get(jAbbrev, 0) + 1
                    else:
                        logger.info("Missing journal name in entryId %s %r ", entryId, rawD)
                        missingJournalName += 1
                    if "pdbx_database_id_DOI" in cD:
                        numDOI += 1

                    if "pdbx_database_id_PubMed" in cD:
                        numPubMed += 1
                    else:
                        missingPubMedCount += 1
                        # The per-journal tally needs a journal name; skip it when there is none.
                        if hasJournal:
                            mD[jAbbrev] = mD.get(jAbbrev, 0) + 1

                    issn = cD.get("journal_id_ISSN")
                    if issn and len(issn) > 7:
                        issnD[issn] = issnD.get(issn, 0) + 1
                    else:
                        missingISSNCount += 1

                    if cD:
                        numCitations += 1
                    else:
                        missingCitationCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        logger.info("Number of citatons %d", numCitations)
        logger.info("Number of PubMed ids %d", numPubMed)
        logger.info("Number of DOIs %d", numDOI)
        logger.info("No citation category count %d missing journal name %d", missingCitationCount, missingJournalName)
        #
        logger.info("Journal index name length %d", len(indD))
        # logger.info("Journal name length %r",indD.items())
        #
        logger.info("Missing pubmed index length %d", len(mD))
        logger.info("Missing pubmed length %d", missingPubMedCount)
        logger.info("Missing PubMed %r", mD.items())
        #
        logger.info("ISSN dictionary length %d", len(issnD))
        logger.info("ISSN missing length %d", missingISSNCount)
        #
        return indD

    def getEntryCount(self):
        """Return the number of entries in the loaded citation data."""
        return len(self.__entryD)

    def __extractCitations(self):
        """Extract the "rcsb_id" and "citation" content of every object in the entry collection.

        Returns:
            (dict): extracted objects as returned by ObjectExtractor.getObjects() (empty on failure)
        """
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=self.__databaseName,
                collectionName=self.__collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entry",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=None,
                selectionQuery={},
                selectionList=["rcsb_id", "citation"],
            )
            eCount = obEx.getCount()
            logger.info("Entry count is %d", eCount)
            objD = obEx.getObjects()
            # for ky, eD in objD.items():
            #    logger.info("%s: %r", ky, eD)
            return objD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return {}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: CitationUtils.py
|
|
3
|
+
# Date: 19-Feb-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to process and normalize PDB citation data.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
from rcsb.exdb.citation.CitationExtractor import CitationExtractor
|
|
19
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CitationUtils(object):
    """Utilities to process and normalize PDB citation data."""

    def __init__(self, cfgOb, **kwargs):
        """Create the underlying citation extractor.

        Args:
            cfgOb (object): configuration object handed to CitationExtractor()
            exdbDirPath (str, optional): directory holding the citation cache file (default: ".")
            cacheKwargs (dict, optional): marshalling options (default: {"fmt": "pickle"})
            useCache (bool, optional): reuse an existing cache file (default: True)
            entryLimit (int, optional): forwarded to CitationExtractor() (default: None)
        """
        self.__cfgOb = cfgOb
        self.__mU = MarshalUtil()
        #
        self.__ce = self.__getEntryCitations(**kwargs)

    def getCitationEntryCount(self):
        """Return the number of entries with extracted citation data (0 if extraction failed)."""
        # The extractor is None when its construction raised - report an empty result rather than fail.
        if self.__ce is None:
            return 0
        return self.__ce.getEntryCount()

    def __getEntryCitations(self, **kwargs):
        """Extract entry citations

        Returns:
            (CitationExtractor): extractor instance or None if construction failed
        """
        ce = None
        # A missing or None directory falls back to the current directory so a usable path is always forwarded.
        exdbDirPath = kwargs.get("exdbDirPath") or "."
        saveKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        # No limit unless the caller supplies one.
        entryLimit = kwargs.get("entryLimit", None)
        try:
            ce = CitationExtractor(self.__cfgOb, exdbDirPath=exdbDirPath, useCache=useCache, cacheKwargs=saveKwargs, entryLimit=entryLimit)
            eCount = ce.getEntryCount()
            logger.info("Using citation data for %d entries", eCount)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ce
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: EntryInfoProvider.py
|
|
3
|
+
# Date: 22-Sep-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Updated:
|
|
6
|
+
#
|
|
7
|
+
##
|
|
8
|
+
"""
|
|
9
|
+
Accessors for entry-level annotations.
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os.path
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
18
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
19
|
+
from rcsb.utils.io.StashableBase import StashableBase
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EntryInfoProvider(StashableBase):
    """Accessors and generators for entry-level annotations."""

    def __init__(self, **kwargs):
        """Set up the cache location and load any existing cached entry annotations.

        Args:
            cachePath (str, optional): top-level cache directory (default: ".")
            useCache (bool, optional): read the existing cache file if present (default: True)
        """
        #
        self.__version = "0.50"
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "rcsb_entry_info"
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(EntryInfoProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__entryInfoD = self.__reload(fmt="json", useCache=useCache)
        #

    def testCache(self, minCount=1):
        """Return True if the loaded data holds at least minCount entries.

        Args:
            minCount (int, optional): minimum number of entries required; 0 always succeeds (default: 1)

        Returns:
            (bool): True if the requirement is satisfied or False otherwise
        """
        if minCount == 0:
            return True
        if self.__entryInfoD and minCount and "entryInfo" in self.__entryInfoD and len(self.__entryInfoD["entryInfo"]) >= minCount:
            logger.info("Entry annotations for (%d) entries", len(self.__entryInfoD["entryInfo"]))
            return True
        return False

    def getEntryInfo(self, entryId):
        """Return a dictionary of entry-level annotations.

        Args:
            entryId (str): entry identifier (matched in upper case)

        Returns:
            (dict): of entry-level annotations (empty if the entry is not found)
        """
        try:
            return self.__entryInfoD["entryInfo"].get(entryId.upper(), {})
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def getEntriesByPolymerEntityCount(self, count):
        """Return the entry identifiers whose "polymer_entity_count" equals the input count.

        Entries without a "polymer_entity_count" value are skipped.

        Args:
            count (int): polymer entity count to match

        Returns:
            (list): matching entry identifiers
        """
        oL = []
        try:
            for entryId, eD in self.__entryInfoD["entryInfo"].items():
                # Tolerate entries lacking the count instead of abandoning the scan.
                if eD and eD.get("polymer_entity_count") == count:
                    oL.append(entryId)
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return oL

    def __getEntryInfoFilePath(self, fmt="json"):
        """Return the cache file path for the input format (".json" for json, ".pic" otherwise)."""
        baseFileName = "entry_info_details"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, baseFileName + fExt)
        return fp

    def update(self, cfgOb, fmt="json", indent=3):
        """Update the entry-level annotation cache from the entry collection.

        Args:
            cfgOb (object): configuration object handed to ObjectExtractor()
            fmt (str, optional): cache file format (default: "json")
            indent (int, optional): indentation applied to json output (default: 3)

        Returns:
            (bool): True for success for False otherwise
        """
        ok = False
        try:
            entryInfoD = self.__updateEntryInfo(cfgOb)

            logger.info("Got entry_info for (%d)", len(entryInfoD))
            #
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            self.__entryInfoD = {"version": self.__version, "created": tS, "entryInfo": entryInfoD}
            #
            infoFilePath = self.__getEntryInfoFilePath(fmt=fmt)
            kwargs = {"indent": indent} if fmt == "json" else {}
            ok = self.__mU.doExport(infoFilePath, self.__entryInfoD, fmt=fmt, **kwargs)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reload(self):
        """Reload from the current cache file.

        Returns:
            (bool): True if data was loaded (not None) or False otherwise
        """
        ok = False
        try:
            self.__entryInfoD = self.__reload(fmt="json", useCache=True)
            ok = self.__entryInfoD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __reload(self, fmt="json", useCache=True):
        """Return the cached entry annotation data, or an empty data set if there is no cache file.

        Returns:
            (dict): data with keys "version", "created" and "entryInfo" when no cache is read,
                    otherwise whatever the cache file import returns
        """
        entryInfoFilePath = self.__getEntryInfoFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        # The fallback uses the same "entryInfo" key that update() writes and the accessors read.
        pcD = {"version": self.__version, "created": tS, "entryInfo": {}}

        if useCache and self.__mU.exists(entryInfoFilePath):
            logger.info("Reading entry-info cached path %r", entryInfoFilePath)
            pcD = self.__mU.doImport(entryInfoFilePath, fmt=fmt)
        return pcD

    def __updateEntryInfo(self, cfgOb):
        """Get entry_info data

        Args:
            cfgOb (object): configuration object handed to ObjectExtractor()

        Returns:
            (dict): "rcsb_entry_info" content keyed by "rcsb_id" (results gathered before any failure are kept)
        """
        rD = {}
        try:
            obEx = ObjectExtractor(
                cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_entry",
                useCache=False,
                keyAttribute="entry",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=["rcsb_id", "rcsb_entry_info.polymer_entity_count"],
            )
            #
            eCount = obEx.getCount()
            logger.info("Entry count is %d", eCount)

            objD = obEx.getObjects()
            for eD in objD.values():
                rcsbId = eD["rcsb_id"]
                # Entries without entry info are skipped explicitly rather than via a swallowed exception.
                if "rcsb_entry_info" in eD:
                    rD[rcsbId] = eD["rcsb_entry_info"]
                else:
                    logger.debug("Missing rcsb_entry_info for %r", rcsbId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return rD
|
|
File without changes
|