rcsb.exdb 1.32__py3-none-any.whl → 1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/exdb/seq/PolymerEntityExtractor.py +3 -2
- rcsb/exdb/tree/TreeNodeListWorker.py +72 -49
- {rcsb_exdb-1.32.dist-info → rcsb_exdb-1.34.dist-info}/METADATA +2 -2
- {rcsb_exdb-1.32.dist-info → rcsb_exdb-1.34.dist-info}/RECORD +6 -6
- {rcsb_exdb-1.32.dist-info → rcsb_exdb-1.34.dist-info}/WHEEL +0 -0
- {rcsb_exdb-1.32.dist-info → rcsb_exdb-1.34.dist-info}/licenses/LICENSE +0 -0
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
# 9-Jan-2024 dwp Turn off use of uniprot_exdb DB for enriching protein entity details file (data not used)
|
|
10
10
|
# 10-Dec-2024 dwp Sort extracted polymer entity sequence data by entity ID (alphabetically), to ensure consistent
|
|
11
11
|
# ordering between coasts (order of sequence data influences results of mmseqs2 sequence searching)
|
|
12
|
+
# 2-Feb-2026 dwp Handle case of missing 'rcsb_entity_source_organism.source_type'
|
|
12
13
|
#
|
|
13
14
|
##
|
|
14
15
|
__docformat__ = "google en"
|
|
@@ -115,7 +116,7 @@ class PolymerEntityExtractor(object):
|
|
|
115
116
|
begSeqNum = 1
|
|
116
117
|
endSeqNum = seqLen
|
|
117
118
|
srcId = tD["pdbx_src_id"]
|
|
118
|
-
srcType = tD["source_type"]
|
|
119
|
+
srcType = tD["source_type"] if "source_type" in tD else None
|
|
119
120
|
taxId = tD["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in tD else -1
|
|
120
121
|
if srcName and taxId == -1:
|
|
121
122
|
missingSrcD.setdefault(srcName, []).append(rId)
|
|
@@ -138,7 +139,7 @@ class PolymerEntityExtractor(object):
|
|
|
138
139
|
pD = eD["rcsb_polymer_entity"]
|
|
139
140
|
taxCount = pD["rcsb_source_taxonomy_count"]
|
|
140
141
|
except Exception:
|
|
141
|
-
if srcType == "synthetic":
|
|
142
|
+
if srcType is not None and srcType == "synthetic":
|
|
142
143
|
taxCount = 0
|
|
143
144
|
else:
|
|
144
145
|
logger.warning("%s (srcName %r) no source taxonomy count type %r", rId, srcName, srcType)
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
# 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
|
|
13
13
|
# 7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
|
|
14
14
|
# Make use of configuration file for loading tree node lists and setting indexed fields
|
|
15
|
+
# 6-Jan-2026 dwp Raise error if a tree node list is empty or fails to load;
|
|
16
|
+
# Add support for providing a manual list of tree node lists to load (for testing)
|
|
15
17
|
#
|
|
16
18
|
##
|
|
17
19
|
__docformat__ = "google en"
|
|
@@ -41,7 +43,20 @@ logger = logging.getLogger(__name__)
|
|
|
41
43
|
class TreeNodeListWorker(object):
|
|
42
44
|
"""Prepare and load repository holdings and repository update data."""
|
|
43
45
|
|
|
44
|
-
def __init__(
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
cfgOb,
|
|
49
|
+
cachePath,
|
|
50
|
+
numProc=1,
|
|
51
|
+
chunkSize=10,
|
|
52
|
+
maxStepLength=4000,
|
|
53
|
+
readBackCheck=False,
|
|
54
|
+
documentLimit=None,
|
|
55
|
+
verbose=False,
|
|
56
|
+
useCache=False,
|
|
57
|
+
useFilteredLists=False,
|
|
58
|
+
treeCollectionList=None,
|
|
59
|
+
):
|
|
45
60
|
self.__cfgOb = cfgOb
|
|
46
61
|
self.__cachePath = os.path.abspath(cachePath)
|
|
47
62
|
self.__readBackCheck = readBackCheck
|
|
@@ -55,6 +70,7 @@ class TreeNodeListWorker(object):
|
|
|
55
70
|
self.__statusList = []
|
|
56
71
|
self.__useCache = useCache
|
|
57
72
|
self.__useFilteredLists = useFilteredLists
|
|
73
|
+
self.__treeCollectionList = treeCollectionList if treeCollectionList else [] # Manually specify tree collections to load
|
|
58
74
|
|
|
59
75
|
def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
|
|
60
76
|
try:
|
|
@@ -95,43 +111,49 @@ class TreeNodeListWorker(object):
|
|
|
95
111
|
ATTRIBUTE_NAMES:
|
|
96
112
|
- parents
|
|
97
113
|
"""
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
114
|
+
ok = True
|
|
115
|
+
useCache = self.__useCache
|
|
116
|
+
#
|
|
117
|
+
logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
|
|
118
|
+
#
|
|
119
|
+
self.__statusList = []
|
|
120
|
+
desp = DataExchangeStatus()
|
|
121
|
+
statusStartTimestamp = desp.setStartTime()
|
|
122
|
+
dl = DocumentLoader(
|
|
123
|
+
self.__cfgOb,
|
|
124
|
+
self.__cachePath,
|
|
125
|
+
self.__resourceName,
|
|
126
|
+
numProc=self.__numProc,
|
|
127
|
+
chunkSize=self.__chunkSize,
|
|
128
|
+
maxStepLength=self.__maxStepLength,
|
|
129
|
+
documentLimit=self.__documentLimit,
|
|
130
|
+
verbose=self.__verbose,
|
|
131
|
+
readBackCheck=self.__readBackCheck,
|
|
132
|
+
)
|
|
133
|
+
#
|
|
134
|
+
sectionName = "tree_node_lists_configuration"
|
|
135
|
+
databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
|
|
136
|
+
collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
|
|
137
|
+
collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
|
|
138
|
+
# databaseNameMongo = 'dw'
|
|
139
|
+
# collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
|
|
140
|
+
# collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
|
|
141
|
+
|
|
142
|
+
if len(self.__treeCollectionList) > 0:
|
|
143
|
+
collectionNameList = [col for col in self.__treeCollectionList]
|
|
144
|
+
|
|
145
|
+
# collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
|
|
146
|
+
# addValues = {"_schema_version": collectionVersion}
|
|
147
|
+
addValues = None
|
|
148
|
+
|
|
149
|
+
for collectionName in collectionNameList:
|
|
150
|
+
nL = self.__getTreeDocList(collectionName, useCache)
|
|
151
|
+
if doLoad:
|
|
152
|
+
if not nL or len(nL) == 0:
|
|
153
|
+
logger.error("Empty node list returned for collectionName %r", collectionName)
|
|
154
|
+
ok = False
|
|
155
|
+
else:
|
|
156
|
+
okL = dl.load(
|
|
135
157
|
databaseNameMongo,
|
|
136
158
|
collectionName,
|
|
137
159
|
loadType=loadType,
|
|
@@ -140,18 +162,19 @@ class TreeNodeListWorker(object):
|
|
|
140
162
|
addValues=addValues,
|
|
141
163
|
schemaLevel=None,
|
|
142
164
|
indexDL=collectionIndexList
|
|
143
|
-
)
|
|
165
|
+
)
|
|
166
|
+
ok = okL and ok
|
|
144
167
|
self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
return
|
|
168
|
+
|
|
169
|
+
logger.info(
|
|
170
|
+
"Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
|
|
171
|
+
databaseNameMongo, collectionName, len(nL), okL
|
|
172
|
+
)
|
|
173
|
+
# ---
|
|
174
|
+
logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
|
|
175
|
+
if not ok:
|
|
176
|
+
raise ValueError("Failed to load at least one tree node list.")
|
|
177
|
+
return ok
|
|
155
178
|
|
|
156
179
|
def __checkTaxonNodeList(self, nL):
|
|
157
180
|
eCount = 0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb.exdb
|
|
3
|
-
Version: 1.32
|
|
3
|
+
Version: 1.34
|
|
4
4
|
Summary: RCSB Python ExDB data extraction and loading workflows
|
|
5
5
|
Project-URL: Homepage, https://github.com/rcsb/py-rcsb_exdb
|
|
6
6
|
Author-email: John Westbrook <john.westbrook@rcsb.org>
|
|
@@ -26,7 +26,7 @@ Requires-Dist: rcsb-utils-ec>=0.25
|
|
|
26
26
|
Requires-Dist: rcsb-utils-go>=0.18
|
|
27
27
|
Requires-Dist: rcsb-utils-io>=1.48
|
|
28
28
|
Requires-Dist: rcsb-utils-seq>=0.82
|
|
29
|
-
Requires-Dist: rcsb-utils-struct>=0.
|
|
29
|
+
Requires-Dist: rcsb-utils-struct>=0.51
|
|
30
30
|
Requires-Dist: rcsb-utils-targets>=0.82
|
|
31
31
|
Requires-Dist: rcsb-utils-taxonomy>=0.43
|
|
32
32
|
Provides-Extra: tests
|
|
@@ -33,7 +33,7 @@ rcsb/exdb/examples-seq/testReferenceSequenceUtils.py,sha256=JIpQsyVfU9sx-50ludlD
|
|
|
33
33
|
rcsb/exdb/seq/AnnotationExtractor.py,sha256=8iCE8LJR7QH8ilSWDkpKYQSfVuSvorlnM9VHIIYcRoY,2741
|
|
34
34
|
rcsb/exdb/seq/LigandNeighborMappingExtractor.py,sha256=PpRWunQHZ6j8s6MCu1tuY59cocEoUvM3bIUcsvQ-Ijo,3431
|
|
35
35
|
rcsb/exdb/seq/LigandNeighborMappingProvider.py,sha256=vzy2tJGB6IgDNgSHyzGU3TuTOV9cfWI_eLRkO3apQtk,3741
|
|
36
|
-
rcsb/exdb/seq/PolymerEntityExtractor.py,sha256=
|
|
36
|
+
rcsb/exdb/seq/PolymerEntityExtractor.py,sha256=7Vf50e4aCwBJ_0nMuPrlMLU3dyQMD_T2w-YtoawKInc,15144
|
|
37
37
|
rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py,sha256=VXStOXoZcbFGYOm5eYiqn599BNksG87h3kzK_aQz2Xc,31160
|
|
38
38
|
rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py,sha256=2fbNeH1VkxEPpny0lfaTJ-UGEXQjxTdQbUoeiTRbJuU,9560
|
|
39
39
|
rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py,sha256=P-t4w4uvnx4qv_vBAJwKuBGjo6wOIYZ8IKzGaw2bCQ8,25935
|
|
@@ -43,7 +43,7 @@ rcsb/exdb/seq/TaxonomyExtractor.py,sha256=I7jsb5Kanrnh4X-znl9kZPZMJ7o2dp4fsnp2IW
|
|
|
43
43
|
rcsb/exdb/seq/UniProtCoreEtlWorker.py,sha256=-fEojXF3lAJ1tbMsPIxT9In6ooiPThuKSoIRQ0YlZ1s,7590
|
|
44
44
|
rcsb/exdb/seq/UniProtExtractor.py,sha256=pR_A9e82YvbQ813M8rNPu1bCPOHMjGnCqJmLDMM23Qo,2695
|
|
45
45
|
rcsb/exdb/seq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
|
-
rcsb/exdb/tree/TreeNodeListWorker.py,sha256=
|
|
46
|
+
rcsb/exdb/tree/TreeNodeListWorker.py,sha256=jaCe21zVfj5h5g_n5ibwG33JoaX0nBqXCSLiw8DSF0A,10903
|
|
47
47
|
rcsb/exdb/tree/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
48
|
rcsb/exdb/utils/ObjectAdapterBase.py,sha256=w-MGvs-TFQXzfgOfAX3aNyCfaN9gY8WP-7MU2FcMAYs,466
|
|
49
49
|
rcsb/exdb/utils/ObjectExtractor.py,sha256=fAJ-WW_-80h_s_XSDdZYi1I2pltb-uQ3teOtCkcyznk,11057
|
|
@@ -55,7 +55,7 @@ rcsb/exdb/wf/EntryInfoEtlWorkflow.py,sha256=YVr75Wz1BPjLr_satd28B9BeD3QL6HwmkR17
|
|
|
55
55
|
rcsb/exdb/wf/GlycanEtlWorkflow.py,sha256=oJ6wf438K2e-eLmy8Ni3MCPxjAKgVJY38SWO885gnmg,2820
|
|
56
56
|
rcsb/exdb/wf/PubChemEtlWorkflow.py,sha256=fNX3A6kf0S1XiJMz7ywNpFuuua5lT3XaUFjcCJtvQsU,11235
|
|
57
57
|
rcsb/exdb/wf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
|
-
rcsb_exdb-1.
|
|
59
|
-
rcsb_exdb-1.32.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
60
|
-
rcsb_exdb-1.32.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
61
|
-
rcsb_exdb-1.32.dist-info/RECORD,,
|
|
58
|
+
rcsb_exdb-1.34.dist-info/METADATA,sha256=nfmbDBYh52noNUURQXRnqS8ksTzLd_46fINrsstMmkc,3845
|
|
59
|
+
rcsb_exdb-1.34.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
60
|
+
rcsb_exdb-1.34.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
61
|
+
rcsb_exdb-1.34.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|