debase 0.1.17__tar.gz → 0.1.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.1.17 → debase-0.1.19}/PKG-INFO +1 -1
- {debase-0.1.17 → debase-0.1.19}/src/debase/_version.py +1 -1
- {debase-0.1.17 → debase-0.1.19}/src/debase/reaction_info_extractor.py +76 -2
- {debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/PKG-INFO +1 -1
- {debase-0.1.17 → debase-0.1.19}/.gitignore +0 -0
- {debase-0.1.17 → debase-0.1.19}/CONTRIBUTING.md +0 -0
- {debase-0.1.17 → debase-0.1.19}/LICENSE +0 -0
- {debase-0.1.17 → debase-0.1.19}/MANIFEST.in +0 -0
- {debase-0.1.17 → debase-0.1.19}/README.md +0 -0
- {debase-0.1.17 → debase-0.1.19}/docs/README.md +0 -0
- {debase-0.1.17 → debase-0.1.19}/docs/examples/README.md +0 -0
- {debase-0.1.17 → debase-0.1.19}/environment.yml +0 -0
- {debase-0.1.17 → debase-0.1.19}/pyproject.toml +0 -0
- {debase-0.1.17 → debase-0.1.19}/setup.cfg +0 -0
- {debase-0.1.17 → debase-0.1.19}/setup.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/__init__.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/PIPELINE_FLOW.md +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/__init__.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/__main__.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/build_db.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/enzyme_lineage_extractor.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/lineage_format.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase/wrapper.py +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/top_level.txt +0 -0
@@ -927,12 +927,77 @@ Ignore locations that contain data for other campaigns.
|
|
927
927
|
# ------------------------------------------------------------------
|
928
928
|
# 6.3 Extract metrics in batch
|
929
929
|
# ------------------------------------------------------------------
|
930
|
+
|
931
|
+
def _validate_location_exists(self, ref: str) -> bool:
    """Return True if ``ref`` occurs as a standalone reference in ``self.doc``.

    The page text of every page in ``self.doc`` is searched
    case-insensitively for ``ref``.

    Matching rules:
      * ``ref`` must not be embedded in a longer word or number, so
        "Table 1" does not match inside "Table 12".
      * Any run of whitespace inside ``ref`` matches any run of whitespace
        in the page text, so a reference broken across a line
        ("Table\\n1") is still found.
      * References that start or end with punctuation ("Scheme 2(a)")
        are supported; a plain ``\\b`` anchor cannot match after a
        non-word character unless a word character follows it.

    Args:
        ref: Location label to look for, e.g. "Table 1" or "Figure S2".

    Returns:
        True if at least one page contains the reference, False otherwise
        (including when ``ref`` is empty or whitespace-only).
    """
    tokens = ref.split() if ref else []
    if not tokens:
        # An empty pattern would match every page; treat it as "not found".
        return False

    # Build the pattern once instead of re-deriving it for every page.
    # Lookarounds are used instead of \b so that the boundary test works
    # regardless of whether the reference starts/ends with a word character.
    pattern = re.compile(
        r'(?<!\w)' + r'\s+'.join(re.escape(token) for token in tokens) + r'(?!\w)',
        re.IGNORECASE,
    )

    # Index-based access mirrors the original code and only relies on
    # ``len()`` and ``[]`` being available on ``self.doc``.
    return any(
        pattern.search(self.doc[page_num].get_text()) is not None
        for page_num in range(len(self.doc))
    )
|
943
|
+
|
944
|
+
def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
    """Decide whether a text snippet is worth sending for metric extraction.

    The snippet passes only if all three checks succeed:

    1. it holds at least 50 characters after stripping surrounding
       whitespace,
    2. it mentions at least one name from ``enzyme_list``
       (case-insensitive substring match; blank names are ignored), and
    3. it contains a performance-related keyword.

    Args:
        snippet: Context text gathered for ``ref``.
        enzyme_list: Enzyme names expected to appear in the context.
        ref: Location label; used only in log messages.

    Returns:
        True when every check passes; otherwise False, after logging a
        warning that names the failed check.
    """
    if not snippet or len(snippet.strip()) < 50:
        LOGGER.warning("Insufficient context for extraction from %s - skipping", ref)
        return False

    # Lower-case once; both checks below are case-insensitive.
    snippet_lc = snippet.lower()

    # Check if context actually mentions the enzymes we're looking for.
    # Blank names are skipped: the empty string is a substring of any text
    # and would otherwise count as a mention.
    enzyme_mentions = sum(
        1
        for enzyme in enzyme_list
        if enzyme and enzyme.strip() and enzyme.lower() in snippet_lc
    )
    if enzyme_mentions == 0:
        LOGGER.warning("No enzyme mentions found in context for %s - skipping", ref)
        return False

    # Check for performance-related keywords.
    # Full words may match as substrings (so "yields" counts), but the short
    # abbreviations must match as whole words: as raw substrings, "er" and
    # "ee" occur in almost any English text ("under", "been") and would make
    # this check pass unconditionally.
    substring_keywords = ('yield', 'selectivity', 'conversion', 'percent', '%')
    has_performance_data = (
        any(keyword in snippet_lc for keyword in substring_keywords)
        or re.search(r'\b(?:ee|er|ttn|ton|tof)s?\b', snippet_lc) is not None
    )

    if not has_performance_data:
        LOGGER.warning("No performance metrics found in context for %s - skipping", ref)
        return False

    LOGGER.info("Context validated for %s: %d chars, %d enzyme mentions", ref, len(snippet), enzyme_mentions)
    return True
|
966
|
+
|
967
|
+
def _validate_response(self, data: Dict, enzyme_list: List[str], ref: str) -> bool:
    """Check that a parsed response carries usable metrics for some enzyme.

    Args:
        data: Parsed response; expected to be a non-empty dict keyed by
            enzyme name, each value being a dict of metrics.
        enzyme_list: Enzyme names that were requested.
        ref: Location label; used only in log messages.

    Returns:
        True if at least one requested enzyme maps to a non-empty dict in
        which one of the known metric keys has a non-None value; otherwise
        False, after logging a warning.
    """
    if not (data and isinstance(data, dict)):
        LOGGER.warning("Invalid response format from %s - skipping", ref)
        return False

    # Metric keys that count as evidence of real data.
    metric_keys = ('yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity')

    def _carries_metric(entry) -> bool:
        # Only a non-empty dict with at least one non-null metric qualifies.
        if not isinstance(entry, dict) or not entry:
            return False
        return any(entry.get(key) is not None for key in metric_keys)

    # Count the requested enzymes for which the response holds data.
    populated = sum(1 for name in enzyme_list if _carries_metric(data.get(name, {})))

    if populated == 0:
        LOGGER.warning("No valid metrics found in response from %s - skipping", ref)
        return False

    LOGGER.info("Response validated for %s: %d enzymes with data", ref, populated)
    return True
|
930
990
|
|
931
991
|
def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
|
932
992
|
"""Extract performance metrics for multiple enzymes from the identified location in batch."""
|
933
993
|
ref_lc = ref.lower()
|
934
994
|
image_b64: Optional[str] = None
|
935
995
|
|
996
|
+
# First, validate that the location actually exists in the document
|
997
|
+
if not self._validate_location_exists(ref):
|
998
|
+
LOGGER.warning("Location %s not found in document - skipping", ref)
|
999
|
+
return []
|
1000
|
+
|
936
1001
|
# Add campaign context if available
|
937
1002
|
campaign_context = ""
|
938
1003
|
if self.campaign_filter:
|
@@ -953,6 +1018,10 @@ Ignore locations that contain data for other campaigns.
|
|
953
1018
|
else:
|
954
1019
|
snippet = self._page_with_reference(ref) or ""
|
955
1020
|
|
1021
|
+
# Validate context before sending to Gemini
|
1022
|
+
if not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
|
1023
|
+
return []
|
1024
|
+
|
956
1025
|
enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
|
957
1026
|
|
958
1027
|
if image_b64:
|
@@ -961,8 +1030,9 @@ Ignore locations that contain data for other campaigns.
|
|
961
1030
|
LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
|
962
1031
|
tag = f"extract_metrics_batch_vision"
|
963
1032
|
else:
|
964
|
-
# Add enzyme names to prompt for batch extraction
|
965
|
-
|
1033
|
+
# Add enzyme names to prompt for batch extraction with explicit format requirement
|
1034
|
+
format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
|
1035
|
+
prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
|
966
1036
|
LOGGER.info("Gemini: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
|
967
1037
|
tag = f"extract_metrics_batch"
|
968
1038
|
|
@@ -976,6 +1046,10 @@ Ignore locations that contain data for other campaigns.
|
|
976
1046
|
image_b64=image_b64
|
977
1047
|
)
|
978
1048
|
|
1049
|
+
# Validate response has meaningful data
|
1050
|
+
if not self._validate_response(data, enzyme_list, ref):
|
1051
|
+
return []
|
1052
|
+
|
979
1053
|
# Handle the response format - expecting a dict with enzyme names as keys
|
980
1054
|
results = []
|
981
1055
|
if isinstance(data, dict):
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|