debase 0.1.4.tar.gz → 0.1.6.tar.gz
This diff shows the changes between package versions as published to a supported public registry. It is provided for informational purposes only and reflects the released packages exactly as they appear in that registry.
- {debase-0.1.4 → debase-0.1.6}/PKG-INFO +1 -1
- {debase-0.1.4 → debase-0.1.6}/src/debase/_version.py +1 -1
- {debase-0.1.4 → debase-0.1.6}/src/debase/enzyme_lineage_extractor.py +115 -7
- {debase-0.1.4 → debase-0.1.6}/src/debase.egg-info/PKG-INFO +1 -1
- {debase-0.1.4 → debase-0.1.6}/.gitignore +0 -0
- {debase-0.1.4 → debase-0.1.6}/CONTRIBUTING.md +0 -0
- {debase-0.1.4 → debase-0.1.6}/LICENSE +0 -0
- {debase-0.1.4 → debase-0.1.6}/MANIFEST.in +0 -0
- {debase-0.1.4 → debase-0.1.6}/README.md +0 -0
- {debase-0.1.4 → debase-0.1.6}/docs/README.md +0 -0
- {debase-0.1.4 → debase-0.1.6}/docs/examples/README.md +0 -0
- {debase-0.1.4 → debase-0.1.6}/environment.yml +0 -0
- {debase-0.1.4 → debase-0.1.6}/pyproject.toml +0 -0
- {debase-0.1.4 → debase-0.1.6}/setup.cfg +0 -0
- {debase-0.1.4 → debase-0.1.6}/setup.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/__init__.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/PIPELINE_FLOW.md +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/__init__.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/__main__.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/build_db.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/lineage_format.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/reaction_info_extractor.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase/wrapper.py +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.4 → debase-0.1.6}/src/debase.egg-info/top_level.txt +0 -0
src/debase/enzyme_lineage_extractor.py (+115 -7):

```diff
@@ -1024,6 +1024,38 @@ IMPORTANT: Only extract variants that belong to this specific campaign.
 
 
 # ---- 6.3 Helper for location-based extraction -----------------------------
+def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
+    """Check if a found pattern is likely a table of contents entry."""
+    # Find the line containing this position
+    line_start = text.rfind('\n', 0, position)
+    line_end = text.find('\n', position)
+
+    if line_start == -1:
+        line_start = 0
+    else:
+        line_start += 1
+
+    if line_end == -1:
+        line_end = len(text)
+
+    line = text[line_start:line_end]
+
+    # TOC indicators:
+    # 1. Line contains dots (...) followed by page number
+    # 2. Line ends with just a page number
+    # 3. Line has "Table S12:" or similar followed by title and page
+    if '...' in line or re.search(r'\.\s*\d+\s*$', line) or re.search(r':\s*[^:]+\s+\d+\s*$', line):
+        return True
+
+    # Check if this is in a contents/TOC section
+    # Look backwards up to 500 chars for "Contents" or "Table of Contents"
+    context_start = max(0, position - 500)
+    context = text[context_start:position].lower()
+    if 'contents' in context or 'table of contents' in context:
+        return True
+
+    return False
+
 def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
     """Extract text around identified locations."""
     if not locations:
```
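The new `_is_toc_entry` guard is a pure-text heuristic, so its line-level checks are easy to exercise in isolation. A minimal sketch (the sample lines are invented for illustration; the real helper additionally scans the preceding 500 characters for a Contents heading):

```python
import re

def looks_like_toc_line(line: str) -> bool:
    # Dot leaders, a bare trailing page number, or "Label: title  page"
    # all suggest a table-of-contents entry rather than the section itself.
    return ('...' in line
            or re.search(r'\.\s*\d+\s*$', line) is not None
            or re.search(r':\s*[^:]+\s+\d+\s*$', line) is not None)

print(looks_like_toc_line("Table S12: Lineage of evolved variants ....... 14"))  # True
print(looks_like_toc_line("Table S12: Lineage of evolved variants"))             # False
```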
```diff
@@ -1082,11 +1114,25 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
         pos = -1
         used_pattern = None
         for pattern in page_patterns:
-            temp_pos = text_lower.find(pattern.lower())
-            if temp_pos != -1:
+            search_pos = 0
+            while search_pos < len(text_lower):
+                temp_pos = text_lower.find(pattern.lower(), search_pos)
+                if temp_pos == -1:
+                    break
+
+                # Check if this is a TOC entry
+                if _is_toc_entry(text, temp_pos, pattern):
+                    log.debug("Skipping TOC entry for pattern '%s' at position %d", pattern, temp_pos)
+                    search_pos = temp_pos + len(pattern)
+                    continue
+
+                # Found non-TOC entry
                 pos = temp_pos
                 used_pattern = pattern
-                log.debug("Found pattern '%s' at position %d", pattern, pos)
+                log.debug("Found pattern '%s' at position %d (not TOC)", pattern, pos)
+                break
+
+            if pos != -1:
                 break
 
         if pos != -1:
```
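The switch from a single `find()` to a scanning loop matters because in supplementary PDFs the first occurrence of a pattern is often the contents line, not the section itself. A small illustration with invented SI text:

```python
# Invented SI text: the first "Table S12" hit is the TOC line,
# the second is the real table header the extractor wants.
text = (
    "Table of Contents\n"
    "Table S12: Variant lineage ....... 14\n"
    "\n"
    "Table S12: Variant lineage\n"
    "WT, M1, G2 ...\n"
)
pattern = "table s12"
first = text.lower().find(pattern)                        # lands in the TOC
second = text.lower().find(pattern, first + len(pattern)) # the real header
print(first, second)  # the loop skips ahead from the first to the second
```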
```diff
@@ -1275,7 +1321,9 @@ def get_lineage(
 
             # Use text-based extraction (works for tables and text sections)
             # Extract from full text, not caption text - use only primary location
-            focused_text = _extract_text_at_locations(full_text, [primary_location])
+            # Use more context for tables since they often span multiple pages
+            context_size = 15000 if location_type == 'table' else 5000
+            focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
             log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                      len(full_text), len(focused_text),
                      primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
```
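A quick sketch of the new context-size choice; the location-record shape here is an assumption based on the `primary_location.get('location', ...)` call in the logging line:

```python
# Hypothetical location record, mirroring the .get(...) usage above.
primary_location = {"location": "Table S12", "type": "table"}
location_type = primary_location.get("type", "text")

# Tables often span multiple pages, so they get a 3x larger window.
context_size = 15000 if location_type == "table" else 5000
print(context_size)  # 15000
```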
```diff
@@ -1864,7 +1912,7 @@ def _infer_generations(variants: List[Variant]) -> None:
 
 
 def _merge_lineage_and_sequences(
-    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str]
+    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str], model=None
 ) -> pd.DataFrame:
     """Return a tidy DataFrame with one row per variant."""
 
```
```diff
@@ -1895,6 +1943,65 @@ def _merge_lineage_and_sequences(
     # 2. Outer merge keeps every lineage entry and adds sequence cols when present
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
 
+    # 2a. If we have unmatched sequences and a model, use Gemini to match them
+    if model and len(df_seq) > 0:
+        # Find lineage entries without sequences
+        missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
+        unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+
+        # Find sequences that weren't matched
+        matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
+        unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
+
+        if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
+            log.info(f"Using Gemini to match variants")
+
+            # Build prompt for Gemini to match variants
+            prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+
+Lineage variant IDs (need sequences):
+{json.dumps(unmatched_lineage)}
+
+Sequence variant IDs (have sequences):
+{json.dumps(unmatched_seqs['variant_id'].tolist())}
+
+These lists contain variant identifiers from the same paper but may use different naming conventions.
+Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
+
+Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+"""
+
+            try:
+                response = model.generate_content(prompt)
+                text = _extract_text(response).strip()
+
+                # Parse JSON response
+                if text.startswith("```"):
+                    text = text.split("```")[1].strip()
+                    if text.startswith("json"):
+                        text = text[4:].strip()
+
+                matches = json.loads(text)
+
+                # Apply the matches
+                for lineage_id, seq_id in matches.items():
+                    if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
+                        # Get the sequence data
+                        seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
+
+                        # Update the dataframe
+                        mask = df['variant_id'] == lineage_id
+                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                        df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
+                        df.loc[mask, 'truncated'] = seq_data['truncated']
+
+                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
+
+            except Exception as e:
+                log.warning(f"Failed to match variants using Gemini: {e}")
+
     # 3. If generation missing after user input, try inference
     if df["generation"].isna().any():
         _infer_generations(lineage)  # mutates in place
```
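The response handling above assumes Gemini may wrap its JSON in a code fence. A self-contained sketch of that parse path, with an invented reply and variant names:

```python
import json

# Invented model reply: JSON wrapped in a json-tagged fence.
reply = '```json\n{"Round 4 variant": "P411-R4"}\n```'

text = reply.strip()
if text.startswith("```"):
    text = text.split("```")[1].strip()  # keep the fenced body
    if text.startswith("json"):
        text = text[4:].strip()          # drop the language tag

matches = json.loads(text)
print(matches)  # {'Round 4 variant': 'P411-R4'}
```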
```diff
@@ -1920,6 +2027,7 @@ def merge_and_score(
     lineage: List[Variant],
     seqs: List[SequenceBlock],
     doi: Optional[str] = None,
+    model=None,
 ) -> pd.DataFrame:
     """User-facing helper imported by the pipeline orchestrator.
 
```
```diff
@@ -1932,7 +2040,7 @@ def merge_and_score(
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
 
     # If no sequences found, still build a DataFrame so caller can decide what to do.
-    df = _merge_lineage_and_sequences(lineage, seqs, doi)
+    df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
 
     # Basic sanity: warn if many missing sequences
     missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
```
```diff
@@ -2115,7 +2223,7 @@ def run_pipeline(
 
     # 5. Merge & score (Section 8) --------------------------------------------
     doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi)
+    df_final = merge_and_score(lineage, sequences, doi, model)
 
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
```
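End to end, the threaded `model` parameter is whatever object exposes the `generate_content()` call used in `_merge_lineage_and_sequences`. A usage sketch assuming the `google-generativeai` client, whose `GenerativeModel` provides exactly that method; the model name, API key, and DOI are placeholders, and `lineage`/`sequences` come from the earlier extraction steps:

```python
import google.generativeai as genai
from debase.enzyme_lineage_extractor import merge_and_score

genai.configure(api_key="YOUR_API_KEY")            # placeholder key
model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model name

# lineage: List[Variant] and sequences: List[SequenceBlock] are produced
# by the extractor's earlier lineage and sequence steps.
df = merge_and_score(lineage, sequences, doi="10.1234/example", model=model)
```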