debase 0.1.4-py3-none-any.whl → 0.1.6-py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.1.4"
+__version__ = "0.1.6"
debase/enzyme_lineage_extractor.py CHANGED
@@ -1024,6 +1024,38 @@ IMPORTANT: Only extract variants that belong to this specific campaign.
 
 # ---- 6.3 Helper for location-based extraction -----------------------------
 
+def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
+    """Check if a found pattern is likely a table of contents entry."""
+    # Find the line containing this position
+    line_start = text.rfind('\n', 0, position)
+    line_end = text.find('\n', position)
+
+    if line_start == -1:
+        line_start = 0
+    else:
+        line_start += 1
+
+    if line_end == -1:
+        line_end = len(text)
+
+    line = text[line_start:line_end]
+
+    # TOC indicators:
+    # 1. Line contains dots (...) followed by page number
+    # 2. Line ends with just a page number
+    # 3. Line has "Table S12:" or similar followed by title and page
+    if '...' in line or re.search(r'\.\s*\d+\s*$', line) or re.search(r':\s*[^:]+\s+\d+\s*$', line):
+        return True
+
+    # Check if this is in a contents/TOC section
+    # Look backwards up to 500 chars for "Contents" or "Table of Contents"
+    context_start = max(0, position - 500)
+    context = text[context_start:position].lower()
+    if 'contents' in context or 'table of contents' in context:
+        return True
+
+    return False
+
 def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
     """Extract text around identified locations."""
     if not locations:
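Note (illustrative, not part of the package): the TOC heuristic above keys on dot leaders, trailing page numbers, and a nearby "Contents" heading. A minimal sketch with hypothetical strings, assuming `re` is imported at module scope as the helper's own regex calls imply:

    import re

    toc_line  = "Table S12: Directed evolution lineage ......... 14"
    body_line = "as shown in Table S12, the lineage was constructed stepwise"

    # Dot leaders plus a trailing page number trip the first check:
    re.search(r'\.\s*\d+\s*$', toc_line)    # match -> flagged as a TOC entry
    re.search(r'\.\s*\d+\s*$', body_line)   # None  -> kept as body text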
@@ -1082,11 +1114,25 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
         pos = -1
         used_pattern = None
         for pattern in page_patterns:
-            temp_pos = text_lower.find(pattern.lower())
-            if temp_pos != -1:
+            search_pos = 0
+            while search_pos < len(text_lower):
+                temp_pos = text_lower.find(pattern.lower(), search_pos)
+                if temp_pos == -1:
+                    break
+
+                # Check if this is a TOC entry
+                if _is_toc_entry(text, temp_pos, pattern):
+                    log.debug("Skipping TOC entry for pattern '%s' at position %d", pattern, temp_pos)
+                    search_pos = temp_pos + len(pattern)
+                    continue
+
+                # Found non-TOC entry
                 pos = temp_pos
                 used_pattern = pattern
-                log.debug("Found pattern '%s' at position %d", pattern, pos)
+                log.debug("Found pattern '%s' at position %d (not TOC)", pattern, pos)
+                break
+
+            if pos != -1:
                 break
 
         if pos != -1:
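Note (illustrative): with the new `search_pos` cursor, a pattern whose first occurrence sits in the contents listing is no longer lost. A hypothetical walk-through of the loop above:

    text = ("Contents\n"
            "Table S12: Variant sequences ..... 3\n"
            "Main text\n"
            "Table S12: Variant sequences are listed below\n")
    pattern = "table s12"

    first = text.lower().find(pattern)                         # inside the dotted TOC line
    # _is_toc_entry(text, first, pattern) is True, so the loop advances past it:
    second = text.lower().find(pattern, first + len(pattern))  # body occurrence, accepted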
@@ -1275,7 +1321,9 @@ def get_lineage(
 
         # Use text-based extraction (works for tables and text sections)
         # Extract from full text, not caption text - use only primary location
-        focused_text = _extract_text_at_locations(full_text, [primary_location])
+        # Use more context for tables since they often span multiple pages
+        context_size = 15000 if location_type == 'table' else 5000
+        focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
         log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                  len(full_text), len(focused_text),
                  primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
@@ -1864,7 +1912,7 @@ def _infer_generations(variants: List[Variant]) -> None:
 
 
 def _merge_lineage_and_sequences(
-    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str]
+    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str], model=None
 ) -> pd.DataFrame:
     """Return a tidy DataFrame with one row per variant."""
 
@@ -1895,6 +1943,65 @@ def _merge_lineage_and_sequences(
     # 2. Outer merge keeps every lineage entry and adds sequence cols when present
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
 
+    # 2a. If we have unmatched sequences and a model, use Gemini to match them
+    if model and len(df_seq) > 0:
+        # Find lineage entries without sequences
+        missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
+        unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+
+        # Find sequences that weren't matched
+        matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
+        unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
+
+        if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
+            log.info(f"Using Gemini to match variants")
+
+            # Build prompt for Gemini to match variants
+            prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+
+Lineage variant IDs (need sequences):
+{json.dumps(unmatched_lineage)}
+
+Sequence variant IDs (have sequences):
+{json.dumps(unmatched_seqs['variant_id'].tolist())}
+
+These lists contain variant identifiers from the same paper but may use different naming conventions.
+Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
+
+Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+"""
+
+            try:
+                response = model.generate_content(prompt)
+                text = _extract_text(response).strip()
+
+                # Parse JSON response
+                if text.startswith("```"):
+                    text = text.split("```")[1].strip()
+                    if text.startswith("json"):
+                        text = text[4:].strip()
+
+                matches = json.loads(text)
+
+                # Apply the matches
+                for lineage_id, seq_id in matches.items():
+                    if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
+                        # Get the sequence data
+                        seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
+
+                        # Update the dataframe
+                        mask = df['variant_id'] == lineage_id
+                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                        df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
+                        df.loc[mask, 'truncated'] = seq_data['truncated']
+
+                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
+
+            except Exception as e:
+                log.warning(f"Failed to match variants using Gemini: {e}")
+
     # 3. If generation missing after user input, try inference
     if df["generation"].isna().any():
         _infer_generations(lineage) # mutates in place
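Note (illustrative): Gemini often wraps JSON answers in a fenced block, which is what the fence-stripping branch above handles. A self-contained sketch with a hypothetical reply and hypothetical variant names:

    import json

    reply = '```json\n{"Round 2 parent": "P411-A82", "2B6": "P411-2B6"}\n```'
    text = reply
    if text.startswith("```"):
        text = text.split("```")[1].strip()  # drops the fences -> 'json\n{...}'
        if text.startswith("json"):
            text = text[4:].strip()          # drops the language tag -> '{...}'
    matches = json.loads(text)               # {"Round 2 parent": "P411-A82", "2B6": "P411-2B6"}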
@@ -1920,6 +2027,7 @@ def merge_and_score(
     lineage: List[Variant],
     seqs: List[SequenceBlock],
     doi: Optional[str] = None,
+    model=None,
 ) -> pd.DataFrame:
     """User-facing helper imported by the pipeline orchestrator.
 
@@ -1932,7 +2040,7 @@ def merge_and_score(
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
 
     # If no sequences found, still build a DataFrame so caller can decide what to do.
-    df = _merge_lineage_and_sequences(lineage, seqs, doi)
+    df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
 
     # Basic sanity: warn if many missing sequences
     missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
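Note (illustrative): the new `model` parameter threads an optional Gemini client from the pipeline down to `_merge_lineage_and_sequences`. A usage sketch, assuming the google-generativeai client and a placeholder model name; `lineage` and `seqs` stand for the outputs of the earlier extraction steps:

    import google.generativeai as genai

    genai.configure(api_key="YOUR_KEY")                # placeholder
    model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model name
    df = merge_and_score(lineage, seqs, doi="10.1234/example", model=model)
    # Passing model=None (the default) keeps the old behaviour:
    # unmatched lineage entries simply remain without sequences.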
@@ -2115,7 +2223,7 @@ def run_pipeline(
 
     # 5. Merge & score (Section 8) --------------------------------------------
     doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi)
+    df_final = merge_and_score(lineage, sequences, doi, model)
 
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
{debase-0.1.4.dist-info → debase-0.1.6.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.4
+Version: 0.1.6
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team
{debase-0.1.4.dist-info → debase-0.1.6.dist-info}/RECORD RENAMED
@@ -1,17 +1,17 @@
 debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=mcDHWqAxAKwMNAAyHmpWVDTK-zafQ1kQjmiwnsZbUD4,49
+debase/_version.py,sha256=0-ypEiBe2kUuZ71tMon-TVN7tQNbXA_-yM2NzlyMWuk,49
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
 debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=s1kPOomvJjfMSN5odxeyXNmxiaOzXyOZICr4YUWU6j8,89288
+debase/enzyme_lineage_extractor.py,sha256=_kVsuOXOR8qhrOIy3mKNJuac3joK6goke628nAJoj88,94183
 debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
 debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
 debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
 debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.4.dist-info/METADATA,sha256=fZwXCP1i1s0VNq7Ds5bd2ys3pONgaV1XCe_edUkQdRU,10789
-debase-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.4.dist-info/RECORD,,
+debase-0.1.6.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.6.dist-info/METADATA,sha256=mLzMY4LQx3SoOsirAiKhcqMofiMZ_D7jjB68ohUplFI,10789
+debase-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.6.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.6.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.6.dist-info/RECORD,,