debase 0.1.18__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.18"
3
+ __version__ = "0.1.19"
@@ -927,12 +927,77 @@ Ignore locations that contain data for other campaigns.
927
927
  # ------------------------------------------------------------------
928
928
  # 6.3 Extract metrics in batch
929
929
  # ------------------------------------------------------------------
930
+
931
+ def _validate_location_exists(self, ref: str) -> bool:
932
+ """Verify that the referenced location actually exists in the document."""
933
+ # Search for the actual reference in the document
934
+ for page_num in range(len(self.doc)):
935
+ page = self.doc[page_num]
936
+ text = page.get_text()
937
+
938
+ # Look for table references like "Table 1", "Table S1", etc.
939
+ if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
940
+ return True
941
+
942
+ return False
943
+
944
def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
    """Decide whether ``snippet`` is worth sending for metric extraction.

    The snippet is accepted only if it (1) has at least 50 characters after
    stripping surrounding whitespace, (2) contains at least one name from
    ``enzyme_list`` (case-insensitive substring match), and (3) contains a
    performance-related keyword or a percent sign. Every rejection is
    logged as a warning.

    Args:
        snippet: Text context gathered for the referenced location.
        enzyme_list: Enzyme names whose metrics are being sought.
        ref: Location label; used only in log messages.

    Returns:
        True if all three checks pass, otherwise False.
    """
    if not snippet or len(snippet.strip()) < 50:
        LOGGER.warning("Insufficient context for extraction from %s - skipping", ref)
        return False

    # Lower-case once instead of once per enzyme and once per keyword.
    snippet_lc = snippet.lower()

    # Check if context actually mentions the enzymes we're looking for
    enzyme_mentions = sum(1 for enzyme in enzyme_list if enzyme.lower() in snippet_lc)
    if enzyme_mentions == 0:
        LOGGER.warning("No enzyme mentions found in context for %s - skipping", ref)
        return False

    # Long keywords are specific enough for substring matching (this also
    # keeps "yields" and "enantioselectivity" matching as before).
    substring_keywords = ('yield', 'selectivity', 'conversion', 'percent', '%')
    # Short abbreviations must match as whole words. As plain substrings,
    # "er", "ee" and "ton" fire inside ordinary words ("there", "between",
    # "proton"), which made this check pass for almost any text. A trailing
    # "s" is allowed so plurals such as "TTNs" still count.
    abbreviation_pattern = r'\b(?:ee|er|ttn|ton|tof)s?\b'

    has_performance_data = (
        any(keyword in snippet_lc for keyword in substring_keywords)
        or re.search(abbreviation_pattern, snippet_lc) is not None
    )

    if not has_performance_data:
        LOGGER.warning("No performance metrics found in context for %s - skipping", ref)
        return False

    LOGGER.info("Context validated for %s: %d chars, %d enzyme mentions", ref, len(snippet), enzyme_mentions)
    return True
966
+
967
def _validate_response(self, data: Dict, enzyme_list: List[str], ref: str) -> bool:
    """Check that a parsed response carries at least one usable metric.

    A response is usable when it is a non-empty dict and, for at least one
    name in ``enzyme_list``, ``data[name]`` is a dict with a non-None value
    under one of the known metric keys. The outcome is logged either way.

    Args:
        data: Parsed response, expected to map enzyme names to metric dicts.
        enzyme_list: Enzyme names that were requested.
        ref: Location label; used only in log messages.

    Returns:
        True if at least one requested enzyme has a non-None metric,
        otherwise False.
    """
    if not isinstance(data, dict) or not data:
        LOGGER.warning("Invalid response format from %s - skipping", ref)
        return False

    metric_keys = ('yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity')

    def _carries_metric(entry) -> bool:
        # Non-dict entries never count; an empty dict has no metric either.
        if not isinstance(entry, dict):
            return False
        return any(entry.get(key) is not None for key in metric_keys)

    # Count the requested enzymes that came back with a real value.
    enzymes_with_data = sum(
        1 for name in enzyme_list if _carries_metric(data.get(name, {}))
    )

    if not enzymes_with_data:
        LOGGER.warning("No valid metrics found in response from %s - skipping", ref)
        return False

    LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
    return True
930
990
 
931
991
  def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
932
992
  """Extract performance metrics for multiple enzymes from the identified location in batch."""
933
993
  ref_lc = ref.lower()
934
994
  image_b64: Optional[str] = None
935
995
 
996
+ # First, validate that the location actually exists in the document
997
+ if not self._validate_location_exists(ref):
998
+ LOGGER.warning("Location %s not found in document - skipping", ref)
999
+ return []
1000
+
936
1001
  # Add campaign context if available
937
1002
  campaign_context = ""
938
1003
  if self.campaign_filter:
@@ -953,6 +1018,10 @@ Ignore locations that contain data for other campaigns.
953
1018
  else:
954
1019
  snippet = self._page_with_reference(ref) or ""
955
1020
 
1021
+ # Validate context before sending to Gemini
1022
+ if not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
1023
+ return []
1024
+
956
1025
  enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
957
1026
 
958
1027
  if image_b64:
@@ -977,6 +1046,10 @@ Ignore locations that contain data for other campaigns.
977
1046
  image_b64=image_b64
978
1047
  )
979
1048
 
1049
+ # Validate response has meaningful data
1050
+ if not self._validate_response(data, enzyme_list, ref):
1051
+ return []
1052
+
980
1053
  # Handle the response format - expecting a dict with enzyme names as keys
981
1054
  results = []
982
1055
  if isinstance(data, dict):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.18
3
+ Version: 0.1.19
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=Qd1kKsssesKE5FvJnDdAuZsx_BrxTSJJyt68SK99D54,50
4
+ debase/_version.py,sha256=VbYiJzmzValsIDmCyQWPabFFsmy_TQ_Qp35j2mo-UKc,50
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
7
  debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
8
8
  debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
9
- debase/reaction_info_extractor.py,sha256=W9CS0puFTdhJ_T2Fpy931EgnjOCsHHjbtU6RdnzDlhw,113140
9
+ debase/reaction_info_extractor.py,sha256=otj8D3MnrThhUR_xOCc3sSVIw8hrCKnB4OY6y6NnaWA,116674
10
10
  debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
11
11
  debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
12
- debase-0.1.18.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.18.dist-info/METADATA,sha256=XvSrveJ0Y40c53JYUfiveaQNJ3qoEkxaQ61n3_--1cQ,10790
14
- debase-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.18.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.18.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.18.dist-info/RECORD,,
12
+ debase-0.1.19.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.19.dist-info/METADATA,sha256=i1dFEB8kPkfTt8q8hJpAAAkZA29T2kb1bzPFMjzPdJU,10790
14
+ debase-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.19.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.19.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.19.dist-info/RECORD,,