rcsb.exdb 1.32__py3-none-any.whl → 1.34__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@
9
9
  # 9-Jan-2024 dwp Turn off use of uniprot_exdb DB for enriching protein entity details file (data not used)
10
10
  # 10-Dec-2024 dwp Sort extracted polymer entity sequence data by entity ID (alphabetically), to ensure consistent
11
11
  # ordering between coasts (order of sequence data influences results of mmseqs2 sequence searching)
12
+ # 2-Feb-2026 dwp Handle case of missing 'rcsb_entity_source_organism.source_type'
12
13
  #
13
14
  ##
14
15
  __docformat__ = "google en"
@@ -115,7 +116,7 @@ class PolymerEntityExtractor(object):
115
116
  begSeqNum = 1
116
117
  endSeqNum = seqLen
117
118
  srcId = tD["pdbx_src_id"]
118
- srcType = tD["source_type"]
119
+ srcType = tD["source_type"] if "source_type" in tD else None
119
120
  taxId = tD["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in tD else -1
120
121
  if srcName and taxId == -1:
121
122
  missingSrcD.setdefault(srcName, []).append(rId)
@@ -138,7 +139,7 @@ class PolymerEntityExtractor(object):
138
139
  pD = eD["rcsb_polymer_entity"]
139
140
  taxCount = pD["rcsb_source_taxonomy_count"]
140
141
  except Exception:
141
- if srcType == "synthetic":
142
+ if srcType is not None and srcType == "synthetic":
142
143
  taxCount = 0
143
144
  else:
144
145
  logger.warning("%s (srcName %r) no source taxonomy count type %r", rId, srcName, srcType)
@@ -12,6 +12,8 @@
12
12
  # 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
13
13
  # 7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
14
14
  # Make use of configuration file for loading tree node lists and setting indexed fields
15
+ # 6-Jan-2026 dwp Raise error if a tree node list is empty or fails to load;
16
+ # Add support for providing a manual list of tree node lists to load (for testing)
15
17
  #
16
18
  ##
17
19
  __docformat__ = "google en"
@@ -41,7 +43,20 @@ logger = logging.getLogger(__name__)
41
43
  class TreeNodeListWorker(object):
42
44
  """Prepare and load repository holdings and repository update data."""
43
45
 
44
- def __init__(self, cfgOb, cachePath, numProc=1, chunkSize=10, maxStepLength=4000, readBackCheck=False, documentLimit=None, verbose=False, useCache=False, useFilteredLists=False):
46
+ def __init__(
47
+ self,
48
+ cfgOb,
49
+ cachePath,
50
+ numProc=1,
51
+ chunkSize=10,
52
+ maxStepLength=4000,
53
+ readBackCheck=False,
54
+ documentLimit=None,
55
+ verbose=False,
56
+ useCache=False,
57
+ useFilteredLists=False,
58
+ treeCollectionList=None,
59
+ ):
45
60
  self.__cfgOb = cfgOb
46
61
  self.__cachePath = os.path.abspath(cachePath)
47
62
  self.__readBackCheck = readBackCheck
@@ -55,6 +70,7 @@ class TreeNodeListWorker(object):
55
70
  self.__statusList = []
56
71
  self.__useCache = useCache
57
72
  self.__useFilteredLists = useFilteredLists
73
+ self.__treeCollectionList = treeCollectionList if treeCollectionList else [] # Manually specify tree collections to load
58
74
 
59
75
  def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
60
76
  try:
@@ -95,43 +111,49 @@ class TreeNodeListWorker(object):
95
111
  ATTRIBUTE_NAMES:
96
112
  - parents
97
113
  """
98
- try:
99
- useCache = self.__useCache
100
- #
101
- logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
102
- #
103
- self.__statusList = []
104
- desp = DataExchangeStatus()
105
- statusStartTimestamp = desp.setStartTime()
106
- dl = DocumentLoader(
107
- self.__cfgOb,
108
- self.__cachePath,
109
- self.__resourceName,
110
- numProc=self.__numProc,
111
- chunkSize=self.__chunkSize,
112
- maxStepLength=self.__maxStepLength,
113
- documentLimit=self.__documentLimit,
114
- verbose=self.__verbose,
115
- readBackCheck=self.__readBackCheck,
116
- )
117
- #
118
- sectionName = "tree_node_lists_configuration"
119
- databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
120
- collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
121
- collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
122
- # databaseNameMongo = 'dw'
123
- # collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
124
- # collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
125
-
126
- # collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
127
- # addValues = {"_schema_version": collectionVersion}
128
- addValues = None
129
-
130
- ok = True
131
- for collectionName in collectionNameList:
132
- nL = self.__getTreeDocList(collectionName, useCache)
133
- if nL and doLoad:
134
- ok = dl.load(
114
+ ok = True
115
+ useCache = self.__useCache
116
+ #
117
+ logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
118
+ #
119
+ self.__statusList = []
120
+ desp = DataExchangeStatus()
121
+ statusStartTimestamp = desp.setStartTime()
122
+ dl = DocumentLoader(
123
+ self.__cfgOb,
124
+ self.__cachePath,
125
+ self.__resourceName,
126
+ numProc=self.__numProc,
127
+ chunkSize=self.__chunkSize,
128
+ maxStepLength=self.__maxStepLength,
129
+ documentLimit=self.__documentLimit,
130
+ verbose=self.__verbose,
131
+ readBackCheck=self.__readBackCheck,
132
+ )
133
+ #
134
+ sectionName = "tree_node_lists_configuration"
135
+ databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
136
+ collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
137
+ collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
138
+ # databaseNameMongo = 'dw'
139
+ # collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
140
+ # collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
141
+
142
+ if len(self.__treeCollectionList) > 0:
143
+ collectionNameList = [col for col in self.__treeCollectionList]
144
+
145
+ # collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
146
+ # addValues = {"_schema_version": collectionVersion}
147
+ addValues = None
148
+
149
+ for collectionName in collectionNameList:
150
+ nL = self.__getTreeDocList(collectionName, useCache)
151
+ if doLoad:
152
+ if not nL or len(nL) == 0:
153
+ logger.error("Empty node list returned for collectionName %r", collectionName)
154
+ ok = False
155
+ else:
156
+ okL = dl.load(
135
157
  databaseNameMongo,
136
158
  collectionName,
137
159
  loadType=loadType,
@@ -140,18 +162,19 @@ class TreeNodeListWorker(object):
140
162
  addValues=addValues,
141
163
  schemaLevel=None,
142
164
  indexDL=collectionIndexList
143
- ) and ok
165
+ )
166
+ ok = okL and ok
144
167
  self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
145
- logger.info(
146
- "Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
147
- databaseNameMongo, collectionName, len(nL), ok
148
- )
149
- # ---
150
- logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
151
- return True
152
- except Exception as e:
153
- logger.exception("Failing with %s", str(e))
154
- return False
168
+
169
+ logger.info(
170
+ "Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
171
+ databaseNameMongo, collectionName, len(nL), okL
172
+ )
173
+ # ---
174
+ logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
175
+ if not ok:
176
+ raise ValueError("Failed to load at least one tree node list.")
177
+ return ok
155
178
 
156
179
  def __checkTaxonNodeList(self, nL):
157
180
  eCount = 0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb.exdb
3
- Version: 1.32
3
+ Version: 1.34
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Project-URL: Homepage, https://github.com/rcsb/py-rcsb_exdb
6
6
  Author-email: John Westbrook <john.westbrook@rcsb.org>
@@ -26,7 +26,7 @@ Requires-Dist: rcsb-utils-ec>=0.25
26
26
  Requires-Dist: rcsb-utils-go>=0.18
27
27
  Requires-Dist: rcsb-utils-io>=1.48
28
28
  Requires-Dist: rcsb-utils-seq>=0.82
29
- Requires-Dist: rcsb-utils-struct>=0.47
29
+ Requires-Dist: rcsb-utils-struct>=0.51
30
30
  Requires-Dist: rcsb-utils-targets>=0.82
31
31
  Requires-Dist: rcsb-utils-taxonomy>=0.43
32
32
  Provides-Extra: tests
@@ -33,7 +33,7 @@ rcsb/exdb/examples-seq/testReferenceSequenceUtils.py,sha256=JIpQsyVfU9sx-50ludlD
33
33
  rcsb/exdb/seq/AnnotationExtractor.py,sha256=8iCE8LJR7QH8ilSWDkpKYQSfVuSvorlnM9VHIIYcRoY,2741
34
34
  rcsb/exdb/seq/LigandNeighborMappingExtractor.py,sha256=PpRWunQHZ6j8s6MCu1tuY59cocEoUvM3bIUcsvQ-Ijo,3431
35
35
  rcsb/exdb/seq/LigandNeighborMappingProvider.py,sha256=vzy2tJGB6IgDNgSHyzGU3TuTOV9cfWI_eLRkO3apQtk,3741
36
- rcsb/exdb/seq/PolymerEntityExtractor.py,sha256=YPyK0F90mjHCBc_vqZdAvBcDecAcOHUax4TxGToEzmg,15004
36
+ rcsb/exdb/seq/PolymerEntityExtractor.py,sha256=7Vf50e4aCwBJ_0nMuPrlMLU3dyQMD_T2w-YtoawKInc,15144
37
37
  rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py,sha256=VXStOXoZcbFGYOm5eYiqn599BNksG87h3kzK_aQz2Xc,31160
38
38
  rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py,sha256=2fbNeH1VkxEPpny0lfaTJ-UGEXQjxTdQbUoeiTRbJuU,9560
39
39
  rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py,sha256=P-t4w4uvnx4qv_vBAJwKuBGjo6wOIYZ8IKzGaw2bCQ8,25935
@@ -43,7 +43,7 @@ rcsb/exdb/seq/TaxonomyExtractor.py,sha256=I7jsb5Kanrnh4X-znl9kZPZMJ7o2dp4fsnp2IW
43
43
  rcsb/exdb/seq/UniProtCoreEtlWorker.py,sha256=-fEojXF3lAJ1tbMsPIxT9In6ooiPThuKSoIRQ0YlZ1s,7590
44
44
  rcsb/exdb/seq/UniProtExtractor.py,sha256=pR_A9e82YvbQ813M8rNPu1bCPOHMjGnCqJmLDMM23Qo,2695
45
45
  rcsb/exdb/seq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- rcsb/exdb/tree/TreeNodeListWorker.py,sha256=VLd7MWCxw9fONoC3xYbjvARp2O2V8Vyy-kUZnwQWi30,10233
46
+ rcsb/exdb/tree/TreeNodeListWorker.py,sha256=jaCe21zVfj5h5g_n5ibwG33JoaX0nBqXCSLiw8DSF0A,10903
47
47
  rcsb/exdb/tree/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
48
  rcsb/exdb/utils/ObjectAdapterBase.py,sha256=w-MGvs-TFQXzfgOfAX3aNyCfaN9gY8WP-7MU2FcMAYs,466
49
49
  rcsb/exdb/utils/ObjectExtractor.py,sha256=fAJ-WW_-80h_s_XSDdZYi1I2pltb-uQ3teOtCkcyznk,11057
@@ -55,7 +55,7 @@ rcsb/exdb/wf/EntryInfoEtlWorkflow.py,sha256=YVr75Wz1BPjLr_satd28B9BeD3QL6HwmkR17
55
55
  rcsb/exdb/wf/GlycanEtlWorkflow.py,sha256=oJ6wf438K2e-eLmy8Ni3MCPxjAKgVJY38SWO885gnmg,2820
56
56
  rcsb/exdb/wf/PubChemEtlWorkflow.py,sha256=fNX3A6kf0S1XiJMz7ywNpFuuua5lT3XaUFjcCJtvQsU,11235
57
57
  rcsb/exdb/wf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
- rcsb_exdb-1.32.dist-info/METADATA,sha256=ZKtnT7xFqpDHcUGISba091bqV1ckkjVPBKd-ot1ICJ4,3845
59
- rcsb_exdb-1.32.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
60
- rcsb_exdb-1.32.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
61
- rcsb_exdb-1.32.dist-info/RECORD,,
58
+ rcsb_exdb-1.34.dist-info/METADATA,sha256=nfmbDBYh52noNUURQXRnqS8ksTzLd_46fINrsstMmkc,3845
59
+ rcsb_exdb-1.34.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
60
+ rcsb_exdb-1.34.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
61
+ rcsb_exdb-1.34.dist-info/RECORD,,