debase 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. debase-0.4.3/.gitignore +177 -0
  2. {debase-0.4.1/src/debase.egg-info → debase-0.4.3}/PKG-INFO +1 -1
  3. debase-0.4.3/environment.yml +21 -0
  4. {debase-0.4.1 → debase-0.4.3}/src/debase/_version.py +1 -1
  5. {debase-0.4.1 → debase-0.4.3}/src/debase/cleanup_sequence.py +151 -1
  6. {debase-0.4.1 → debase-0.4.3}/src/debase/enzyme_lineage_extractor.py +114 -20
  7. {debase-0.4.1 → debase-0.4.3}/src/debase/lineage_format.py +335 -56
  8. {debase-0.4.1 → debase-0.4.3}/src/debase/reaction_info_extractor.py +60 -32
  9. {debase-0.4.1 → debase-0.4.3}/src/debase/substrate_scope_extractor.py +366 -93
  10. {debase-0.4.1 → debase-0.4.3}/src/debase/wrapper.py +37 -11
  11. {debase-0.4.1 → debase-0.4.3/src/debase.egg-info}/PKG-INFO +1 -1
  12. {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/SOURCES.txt +3 -0
  13. debase-0.4.3/src/debase.egg-info/dependency_links.txt +1 -0
  14. {debase-0.4.1 → debase-0.4.3}/LICENSE +0 -0
  15. {debase-0.4.1 → debase-0.4.3}/MANIFEST.in +0 -0
  16. {debase-0.4.1 → debase-0.4.3}/README.md +0 -0
  17. {debase-0.4.1 → debase-0.4.3}/pyproject.toml +0 -0
  18. {debase-0.4.1 → debase-0.4.3}/setup.cfg +0 -0
  19. {debase-0.4.1 → debase-0.4.3}/setup.py +0 -0
  20. /debase-0.4.1/src/debase.egg-info/dependency_links.txt → /debase-0.4.3/src/__init__.py +0 -0
  21. {debase-0.4.1 → debase-0.4.3}/src/debase/__init__.py +0 -0
  22. {debase-0.4.1 → debase-0.4.3}/src/debase/__main__.py +0 -0
  23. {debase-0.4.1 → debase-0.4.3}/src/debase/build_db.py +0 -0
  24. {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/entry_points.txt +0 -0
  25. {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/requires.txt +0 -0
  26. {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/top_level.txt +0 -0
@@ -0,0 +1,177 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Jupyter Notebook
53
+ .ipynb_checkpoints
54
+
55
+ # IPython
56
+ profile_default/
57
+ ipython_config.py
58
+
59
+ # pyenv
60
+ .python-version
61
+
62
+ # pipenv
63
+ Pipfile.lock
64
+
65
+ # poetry
66
+ poetry.lock
67
+
68
+ # pdm
69
+ .pdm.toml
70
+
71
+ # PEP 582
72
+ __pypackages__/
73
+
74
+ # Celery stuff
75
+ celerybeat-schedule
76
+ celerybeat.pid
77
+
78
+ # SageMath parsed files
79
+ *.sage.py
80
+
81
+ # Environments
82
+ .env
83
+ .venv
84
+ env/
85
+ venv/
86
+ ENV/
87
+ env.bak/
88
+ venv.bak/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # Pyre type checker
106
+ .pyre/
107
+
108
+ # pytype static type analyzer
109
+ .pytype/
110
+
111
+ # Cython debug symbols
112
+ cython_debug/
113
+
114
+ # PyCharm
115
+ .idea/
116
+
117
+ # VS Code
118
+ .vscode/
119
+
120
+ # macOS
121
+ .DS_Store
122
+ .AppleDouble
123
+ .LSOverride
124
+
125
+ # Windows
126
+ Thumbs.db
127
+ Thumbs.db:encryptable
128
+ ehthumbs.db
129
+ ehthumbs_vista.db
130
+ *.stackdump
131
+ [Dd]esktop.ini
132
+ $RECYCLE.BIN/
133
+ *.cab
134
+ *.msi
135
+ *.msix
136
+ *.msm
137
+ *.msp
138
+ *.lnk
139
+
140
+ # Linux
141
+ *~
142
+
143
+ # Temporary files
144
+ *.tmp
145
+ *.temp
146
+ *.log
147
+ .temp_*/
148
+ .cache/
149
+
150
+ # DEBase specific
151
+ enzyme_pipeline*.log
152
+ temp_merged_input.csv
153
+ *.egg-info/
154
+
155
+ # Project data and examples
156
+ data/
157
+ examples/
158
+ # Keep test.csv as example output
+ !examples/test.csv
159
+
160
+ # Cache files
161
+ *.pkl
162
+ *_cache.pkl
163
+
164
+ # Large database files
165
+ *.db
166
+
167
+ # PDFs and Excel files
168
+ *.pdf
169
+ *.xlsx
170
+
171
+ # Backup files
172
+ *_backup.py
173
+ lineage_format_backup.py
174
+
175
+ # Temporary directories
176
+ .temp_*
177
+ enzyme_analysis_*
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,21 @@
1
+ name: debase
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - python=3.9
7
+ - pandas>=1.0.0
8
+ - numpy>=1.19.0
9
+ - matplotlib>=3.3.0
10
+ - seaborn>=0.11.0
11
+ - jupyter>=1.0.0
12
+ - jupyterlab>=3.0.0
13
+ - openpyxl>=3.0.0
14
+ - biopython>=1.78
15
+ - requests>=2.25.0
16
+ - tqdm>=4.60.0
17
+ - rdkit>=2020.03.1
18
+ - pip
19
+ - pip:
20
+ - PyMuPDF>=1.18.0
21
+ - google-generativeai>=0.3.0
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.4.1"
3
+ __version__ = "0.4.3"
@@ -11,6 +11,7 @@ Usage:
11
11
 
12
12
  import argparse
13
13
  import logging
14
+ import os
14
15
  import re
15
16
  import sys
16
17
  from dataclasses import dataclass, field
@@ -19,11 +20,20 @@ from typing import Dict, List, Optional, Set, Tuple, Union
19
20
 
20
21
  import pandas as pd
21
22
 
23
+ try:
24
+ import google.generativeai as genai # type: ignore
25
+ GEMINI_OK = True
26
+ except ImportError: # pragma: no cover
27
+ GEMINI_OK = False
28
+
22
29
 
23
30
  # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
24
31
 
25
32
  VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*") # Include * for stop codons
26
33
 
34
+ # Gemini API configuration
35
+ GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
36
+
27
37
  # Configure module logger
28
38
  log = logging.getLogger(__name__)
29
39
 
@@ -565,7 +575,136 @@ class SequenceGenerator:
565
575
  return None
566
576
 
567
577
 
568
- # === 7. MAIN PROCESSOR === ---------------------------------------------------
578
+ # === 7. GEMINI PARENT IDENTIFICATION === ------------------------------------
579
+
580
+ def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
581
+ """Use Gemini API to identify parent enzymes for entries with missing parent information."""
582
+ if not GEMINI_OK:
583
+ log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
584
+ return df
585
+
586
+ if not GEMINI_API_KEY:
587
+ log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
588
+ return df
589
+
590
+ try:
591
+ genai.configure(api_key=GEMINI_API_KEY)
592
+ model = genai.GenerativeModel('gemini-1.5-flash')
593
+ except Exception as e:
594
+ log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
595
+ return df
596
+
597
+ # Find entries with empty sequences but missing parent information
598
+ entries_needing_parents = []
599
+ for idx, row in df.iterrows():
600
+ protein_seq = str(row.get("protein_sequence", "")).strip()
601
+ parent_id = str(row.get("parent_enzyme_id", "")).strip()
602
+
603
+ # Only process entries that have empty sequences AND no parent info
604
+ if (not protein_seq or protein_seq == "nan") and (not parent_id or parent_id == "nan"):
605
+ enzyme_id = str(row.get("enzyme_id", ""))
606
+ campaign_id = str(row.get("campaign_id", ""))
607
+ generation = str(row.get("generation", ""))
608
+
609
+ entries_needing_parents.append({
610
+ "idx": idx,
611
+ "enzyme_id": enzyme_id,
612
+ "campaign_id": campaign_id,
613
+ "generation": generation
614
+ })
615
+
616
+ if not entries_needing_parents:
617
+ log.info("No entries need parent identification from Gemini")
618
+ return df
619
+
620
+ log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
621
+
622
+ # Create a lookup of all available enzyme IDs for context
623
+ available_enzymes = {}
624
+ for idx, row in df.iterrows():
625
+ enzyme_id = str(row.get("enzyme_id", ""))
626
+ campaign_id = str(row.get("campaign_id", ""))
627
+ protein_seq = str(row.get("protein_sequence", "")).strip()
628
+ generation = str(row.get("generation", ""))
629
+
630
+ if enzyme_id and enzyme_id != "nan":
631
+ available_enzymes[enzyme_id] = {
632
+ "campaign_id": campaign_id,
633
+ "has_sequence": bool(protein_seq and protein_seq != "nan"),
634
+ "generation": generation
635
+ }
636
+
637
+ identified_count = 0
638
+ for entry in entries_needing_parents:
639
+ enzyme_id = entry["enzyme_id"]
640
+ campaign_id = entry["campaign_id"]
641
+ generation = entry["generation"]
642
+
643
+ # Create context for Gemini
644
+ context_info = []
645
+ context_info.append(f"Enzyme ID: {enzyme_id}")
646
+ context_info.append(f"Campaign ID: {campaign_id}")
647
+ if generation:
648
+ context_info.append(f"Generation: {generation}")
649
+
650
+ # Add available enzymes from the same campaign for context
651
+ campaign_enzymes = []
652
+ for enz_id, enz_data in available_enzymes.items():
653
+ if enz_data["campaign_id"] == campaign_id:
654
+ status = "with sequence" if enz_data["has_sequence"] else "without sequence"
655
+ gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
656
+ campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")
657
+
658
+ if campaign_enzymes:
659
+ context_info.append("Available enzymes in same campaign:")
660
+ context_info.extend(campaign_enzymes[:10]) # Limit to first 10 for context
661
+
662
+ context_text = "\n".join(context_info)
663
+
664
+ prompt = f"""
665
+ Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
666
+
667
+ {context_text}
668
+
669
+ This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
670
+
671
+ Please provide your response in this format:
672
+ Parent: [parent_enzyme_id or "Unknown"]
673
+
674
+ If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
675
+ """
676
+
677
+ try:
678
+ response = model.generate_content(prompt)
679
+ response_text = response.text.strip()
680
+
681
+ # Parse the response
682
+ parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
683
+
684
+ if parent_match:
685
+ parent = parent_match.group(1).strip()
686
+ if parent and parent != "Unknown" and parent != "No parent identified":
687
+ # Verify the parent exists in our available enzymes
688
+ if parent in available_enzymes:
689
+ df.at[entry["idx"], "parent_enzyme_id"] = parent
690
+ identified_count += 1
691
+ log.info(f"Identified parent for {enzyme_id}: {parent}")
692
+ else:
693
+ log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
694
+
695
+ except Exception as e:
696
+ log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
697
+ continue
698
+
699
+ if identified_count > 0:
700
+ log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
701
+ else:
702
+ log.info("No parent enzymes were identified using Gemini API")
703
+
704
+ return df
705
+
706
+
707
+ # === 8. MAIN PROCESSOR === ---------------------------------------------------
569
708
 
570
709
  class SequenceProcessor:
571
710
  """Main processor for handling the complete workflow."""
@@ -866,6 +1005,17 @@ class SequenceProcessor:
866
1005
  self.process_remaining()
867
1006
  self.backward_pass()
868
1007
 
1008
+ # Use Gemini to identify parent enzymes for entries with missing sequences
1009
+ log.info(f"Identifying parents with Gemini for campaign: {campaign_id}")
1010
+ self.df = identify_parents_with_gemini(self.df)
1011
+
1012
+ # Rebuild relationships after parent identification
1013
+ self.generator = SequenceGenerator(self.df)
1014
+
1015
+ # Try to fill sequences again after parent identification
1016
+ log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
1017
+ self.process_remaining()
1018
+
869
1019
  # Update the original dataframe with results
870
1020
  original_df.loc[campaign_mask, :] = self.df
871
1021
 
@@ -142,21 +142,36 @@ def extract_text(pdf_path: str | Path | bytes) -> str:
142
142
 
143
143
 
144
144
  def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
145
- """Extract figure/table captions using the improved regex.
145
+ """Extract ALL figure/table captions with extensive surrounding context.
146
146
 
147
147
  The function scans every text line on every page and keeps lines whose first
148
148
  token matches `_CAPTION_PREFIX_RE`. This covers labels such as:
149
- * Fig. 1, Figure 2A, Extended Data Fig 3
149
+ * Fig. 1, Figure 2A, Figure 2B, Figure 2C (ALL sub-captions)
150
150
  * Table S1, Table 4, Scheme 2, Chart 1B
151
- * Supplementary Fig. S5, Supp Table 2
151
+ * Supplementary Fig. S5A, S5B, S5C (ALL variations)
152
+
153
+ For SI documents, includes extensive context since understanding what each
154
+ section contains is crucial for accurate location identification.
152
155
  """
153
156
 
154
157
  doc = _open_doc(pdf_path)
155
158
  captions: list[str] = []
156
159
  try:
157
- for page in doc:
160
+ for page_num, page in enumerate(doc):
158
161
  page_dict = page.get_text("dict")
162
+
163
+ # Get all text blocks on this page for broader context
164
+ page_text_blocks = []
159
165
  for block in page_dict.get("blocks", []):
166
+ block_text = ""
167
+ for line in block.get("lines", []):
168
+ text_line = "".join(span["text"] for span in line.get("spans", []))
169
+ if text_line.strip():
170
+ block_text += text_line.strip() + " "
171
+ if block_text.strip():
172
+ page_text_blocks.append(block_text.strip())
173
+
174
+ for block_idx, block in enumerate(page_dict.get("blocks", [])):
160
175
  # Get all lines in this block
161
176
  block_lines = []
162
177
  for line in block.get("lines", []):
@@ -166,21 +181,94 @@ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -
166
181
  # Check if any line starts with a caption prefix
167
182
  for i, line in enumerate(block_lines):
168
183
  if _CAPTION_PREFIX_RE.match(line):
169
- # Found a caption start - collect this line and subsequent lines
170
- # until we hit an empty line or the end of the block
184
+ context_parts = []
185
+
186
+ # Add page context for SI documents (more critical there)
187
+ context_parts.append(f"Page {page_num + 1}")
188
+
189
+ # Add extensive context before the caption (5-7 lines for SI context)
190
+ context_before = []
191
+
192
+ # First try to get context from current block
193
+ for k in range(max(0, i-7), i):
194
+ if k < len(block_lines) and block_lines[k].strip():
195
+ if not _CAPTION_PREFIX_RE.match(block_lines[k]):
196
+ context_before.append(block_lines[k])
197
+
198
+ # If not enough context, look at previous text blocks on the page
199
+ if len(context_before) < 3 and block_idx > 0:
200
+ prev_block_text = page_text_blocks[block_idx - 1] if block_idx < len(page_text_blocks) else ""
201
+ if prev_block_text:
202
+ # Get last few sentences from previous block
203
+ sentences = prev_block_text.split('. ')
204
+ context_before = sentences[-2:] + context_before if len(sentences) > 1 else [prev_block_text] + context_before
205
+
206
+ if context_before:
207
+ # Include more extensive context for better understanding
208
+ context_text = " ".join(context_before[-5:]) # Last 5 lines/sentences of context
209
+ context_parts.append("Context: " + context_text)
210
+
211
+ # Extract the COMPLETE caption including all sub-parts
171
212
  caption_parts = [line]
172
- for j in range(i + 1, len(block_lines)):
213
+ j = i + 1
214
+
215
+ # Continue collecting caption text until we hit a clear break
216
+ while j < len(block_lines):
173
217
  next_line = block_lines[j]
174
- if not next_line: # Empty line signals end of caption
175
- break
176
- # Check if next line is a new caption
218
+
219
+ # Stop if we hit an empty line followed by non-caption text
220
+ if not next_line:
221
+ # Check if the line after empty is a new caption
222
+ if j + 1 < len(block_lines) and _CAPTION_PREFIX_RE.match(block_lines[j + 1]):
223
+ break
224
+ # If next non-empty line is not a caption, continue collecting
225
+ elif j + 1 < len(block_lines):
226
+ j += 1
227
+ continue
228
+ else:
229
+ break
230
+
231
+ # Stop if we hit a new caption
177
232
  if _CAPTION_PREFIX_RE.match(next_line):
178
233
  break
234
+
235
+ # Include this line as part of the caption
179
236
  caption_parts.append(next_line)
237
+ j += 1
180
238
 
181
- # Join the parts with spaces
239
+ # Join the caption parts
182
240
  full_caption = " ".join(caption_parts)
183
- captions.append(full_caption)
241
+ context_parts.append("Caption: " + full_caption)
242
+
243
+ # Add extensive context after the caption (especially important for SI)
244
+ context_after = []
245
+
246
+ # Look for descriptive text following the caption
247
+ for k in range(j, min(len(block_lines), j + 10)): # Look ahead up to 10 lines
248
+ if k < len(block_lines) and block_lines[k].strip():
249
+ if not _CAPTION_PREFIX_RE.match(block_lines[k]):
250
+ context_after.append(block_lines[k])
251
+
252
+ # If not enough context, look at next text blocks
253
+ if len(context_after) < 3 and block_idx + 1 < len(page_text_blocks):
254
+ next_block_text = page_text_blocks[block_idx + 1]
255
+ if next_block_text:
256
+ # Get first few sentences from next block
257
+ sentences = next_block_text.split('. ')
258
+ context_after.extend(sentences[:3] if len(sentences) > 1 else [next_block_text])
259
+
260
+ if context_after:
261
+ # Include extensive following context
262
+ following_text = " ".join(context_after[:7]) # First 7 lines of following context
263
+ context_parts.append("Following: " + following_text)
264
+
265
+ # For SI documents, add section context if this appears to be a section header
266
+ if any(keyword in full_caption.lower() for keyword in ['supplementary', 'supporting', 'si ', 's1', 's2', 's3']):
267
+ context_parts.append("SI_SECTION: This appears to be supplementary material content")
268
+
269
+ # Combine all parts with proper separation
270
+ full_caption_with_context = " | ".join(context_parts)
271
+ captions.append(full_caption_with_context)
184
272
  finally:
185
273
  doc.close()
186
274
 
@@ -645,11 +733,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
645
733
  came from which parent and what mutations were introduced).
646
734
 
647
735
  Respond with a JSON array of objects, each containing:
648
- - "location": the identifier (e.g. "Table S1", "Figure 2B", "p. 6")
736
+ - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
649
737
  - "type": one of "table", "figure", "text", "section"
650
738
  - "confidence": your confidence score (0-100) that this location contains lineage data
651
739
  - "reason": brief explanation of why this location likely contains lineage
652
740
 
741
+ IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
742
+
653
743
  Order by confidence score (highest first). Tables showing complete variant lineages or
654
744
  mutation lists should be ranked higher than figures showing complete variant lineages.
655
745
  Text sections are used when no suitable tables/figures exist.
@@ -747,7 +837,7 @@ def identify_campaigns(
747
837
  debug_dir: str | Path | None = None,
748
838
  ) -> List[Campaign]:
749
839
  """Identify distinct directed evolution campaigns in the manuscript."""
750
- prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text[:30_000])
840
+ prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
751
841
  campaigns_data: List[dict] = []
752
842
  try:
753
843
  campaigns_data = generate_json_with_retry(
@@ -825,7 +915,7 @@ def identify_evolution_locations(
825
915
 
826
916
  # Include TOC before the main text
827
917
  combined_text = toc_text + text if toc_text else text
828
- prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
918
+ prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
829
919
  locs: List[dict] = []
830
920
  try:
831
921
  locs = generate_json_with_retry(
@@ -1306,7 +1396,7 @@ def get_lineage(
1306
1396
  5. Return both variants and campaigns.
1307
1397
  """
1308
1398
  # First, identify campaigns in the manuscript
1309
- campaigns = identify_campaigns(full_text[:50_000], model, debug_dir=debug_dir)
1399
+ campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
1310
1400
 
1311
1401
  if campaigns:
1312
1402
  log.info(f"Identified {len(campaigns)} distinct campaigns")
@@ -1364,7 +1454,7 @@ def get_lineage(
1364
1454
  context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
1365
1455
  locations_with_context.append({
1366
1456
  'location': loc,
1367
- 'context': context_text[:1000] # First 1000 chars of extracted context
1457
+ 'context': context_text # Full extracted context
1368
1458
  })
1369
1459
 
1370
1460
  # For each campaign, ask Gemini to select the best location
@@ -1554,13 +1644,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
1554
1644
  Look for table of contents entries or section listings that mention sequences.
1555
1645
  Return a JSON array where each element has:
1556
1646
  - "section": the section heading or description
1557
- - "page": the page number shown in the table of contents for this section, to your best judgement.
1647
+ - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
1558
1648
 
1559
1649
  Focus on:
1560
1650
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
1561
- - Return the EXACT notation as shown.
1651
+ - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
1562
1652
  - Prioritize sections that mention "protein" or "amino acid" sequences
1563
1653
 
1654
+ CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
1655
+ - Correct: "53", "S12", "147"
1656
+ - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
1657
+
1564
1658
  Return [] if no sequence sections are found.
1565
1659
  Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
1566
1660
 
@@ -1572,7 +1666,7 @@ TEXT (truncated):
1572
1666
 
1573
1667
  def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
1574
1668
  """Ask Gemini for promising places to look for sequences."""
1575
- prompt = _SEQ_LOC_PROMPT.format(chunk=text[:15_000])
1669
+ prompt = _SEQ_LOC_PROMPT.format(chunk=text)
1576
1670
  try:
1577
1671
  locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
1578
1672
  return locs if isinstance(locs, list) else []