debase 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.0"
3
+ __version__ = "0.1.2"
@@ -685,7 +685,7 @@ Ignore locations that contain data for other campaigns.
685
685
  'confidence': 95
686
686
  }
687
687
 
688
- def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
688
+ def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
689
689
  """Find the model reaction for a specific lineage group."""
690
690
  # Gather relevant text near this location
691
691
  page_text = self._page_with_reference(location) or ""
@@ -693,6 +693,7 @@ Ignore locations that contain data for other campaigns.
693
693
  # Also check manuscript introduction for model reaction info
694
694
  intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
695
695
 
696
+ # Build the prompt with location and context
696
697
  prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
697
698
  location=location,
698
699
  group_context=group_context
@@ -700,6 +701,22 @@ Ignore locations that contain data for other campaigns.
700
701
  prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
701
702
  prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
702
703
 
704
+ # If we have model reaction locations, include text from those locations too
705
+ if model_reaction_locations:
706
+ # Add text from model reaction location
707
+ if model_reaction_locations.get("model_reaction_location", {}).get("location"):
708
+ model_loc = model_reaction_locations["model_reaction_location"]["location"]
709
+ model_text = self._get_text_around_location(model_loc)
710
+ if model_text:
711
+ prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
712
+
713
+ # Add text from conditions location (often contains reaction details)
714
+ if model_reaction_locations.get("conditions_location", {}).get("location"):
715
+ cond_loc = model_reaction_locations["conditions_location"]["location"]
716
+ cond_text = self._get_text_around_location(cond_loc)
717
+ if cond_text:
718
+ prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
719
+
703
720
  try:
704
721
  data = generate_json_with_retry(
705
722
  self.model,
@@ -1790,8 +1807,16 @@ TEXT FROM MANUSCRIPT:
1790
1807
  if location.get('caption'):
1791
1808
  location_context += f"\nCaption: {location['caption']}"
1792
1809
 
1793
- # Try to find model reaction for this specific lineage
1794
- location_model_reaction = self.find_lineage_model_reaction(location['location'], location_context)
1810
+ # First find model reaction locations for this campaign/enzyme group
1811
+ location_enzymes = df_location['enzyme'].unique().tolist()
1812
+ model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
1813
+
1814
+ # Try to find model reaction for this specific lineage, passing the locations
1815
+ location_model_reaction = self.find_lineage_model_reaction(
1816
+ location['location'],
1817
+ location_context,
1818
+ model_reaction_locations
1819
+ )
1795
1820
 
1796
1821
  # Get full model reaction info with IUPAC names
1797
1822
  if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
@@ -1799,7 +1824,6 @@ TEXT FROM MANUSCRIPT:
1799
1824
  else:
1800
1825
  # Fall back to general model reaction extraction
1801
1826
  # Pass the enzyme variants from this location
1802
- location_enzymes = df_location['enzyme'].unique().tolist()
1803
1827
  model_info = self.gather_model_reaction_info(location_enzymes)
1804
1828
 
1805
1829
  # Add model reaction info to all enzymes from this location
@@ -1891,7 +1915,16 @@ TEXT FROM MANUSCRIPT:
1891
1915
  if group.get('caption'):
1892
1916
  location_context += f"\nCaption: {group['caption']}"
1893
1917
 
1894
- location_model_reaction = self.find_lineage_model_reaction(group_location, location_context)
1918
+ # First find model reaction locations for this enzyme group
1919
+ location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
1920
+ model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
1921
+
1922
+ # Try to find model reaction for this specific lineage, passing the locations
1923
+ location_model_reaction = self.find_lineage_model_reaction(
1924
+ group_location,
1925
+ location_context,
1926
+ model_reaction_locations
1927
+ )
1895
1928
 
1896
1929
  # Get full model reaction info with IUPAC names
1897
1930
  if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
@@ -1899,7 +1932,6 @@ TEXT FROM MANUSCRIPT:
1899
1932
  else:
1900
1933
  # Try to extract model reaction from this specific location
1901
1934
  # Pass the enzyme variants that have data in this location
1902
- location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
1903
1935
  model_info = self.gather_model_reaction_info(location_enzymes)
1904
1936
 
1905
1937
  # Add model reaction info to all enzymes from this location
debase/wrapper.py CHANGED
@@ -35,9 +35,7 @@ def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir:
35
35
  """
36
36
  logger.info(f"Extracting enzyme lineage from {manuscript.name}")
37
37
 
38
- import sys
39
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
40
- from src.debase.enzyme_lineage_extractor import run_pipeline
38
+ from .enzyme_lineage_extractor import run_pipeline
41
39
  run_pipeline(manuscript=manuscript, si=si, output_csv=output, debug_dir=debug_dir)
42
40
 
43
41
  logger.info(f"Lineage extraction complete: {output}")
@@ -51,7 +49,7 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
51
49
  """
52
50
  logger.info(f"Cleaning sequences from {input_csv.name}")
53
51
 
54
- from src.debase.cleanup_sequence import main as cleanup_sequences
52
+ from .cleanup_sequence import main as cleanup_sequences
55
53
  cleanup_sequences([str(input_csv), str(output_csv)])
56
54
 
57
55
  logger.info(f"Sequence cleanup complete: {output_csv}")
@@ -65,7 +63,7 @@ def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, outpu
65
63
  """
66
64
  logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
67
65
 
68
- from src.debase.reaction_info_extractor import ReactionExtractor, Config
66
+ from .reaction_info_extractor import ReactionExtractor, Config
69
67
  import pandas as pd
70
68
 
71
69
  # Load enzyme data
@@ -89,7 +87,7 @@ def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path
89
87
  """
90
88
  logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
91
89
 
92
- from src.debase.substrate_scope_extractor import run_pipeline
90
+ from .substrate_scope_extractor import run_pipeline
93
91
 
94
92
  # Run substrate scope extraction
95
93
  run_pipeline(
@@ -111,7 +109,7 @@ def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_cs
111
109
  """
112
110
  logger.info(f"Formatting and merging data into final output")
113
111
 
114
- from src.debase.lineage_format import run_pipeline
112
+ from .lineage_format import run_pipeline
115
113
  import pandas as pd
116
114
 
117
115
  # First, we need to merge the protein sequences into the reaction data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -64,13 +64,6 @@ Enzyme lineage analysis and sequence extraction package with advanced parallel p
64
64
  ```bash
65
65
  pip install debase
66
66
  ```
67
-
68
- For full functionality with chemical SMILES support:
69
-
70
- ```bash
71
- pip install debase[rdkit]
72
- ```
73
-
74
67
  ## Requirements
75
68
 
76
69
  - Python 3.8 or higher
@@ -139,13 +132,6 @@ debase --manuscript paper.pdf --si si.pdf --use-optimized-reaction --reaction-ba
139
132
  debase --manuscript paper.pdf --si si.pdf # Default method
140
133
  ```
141
134
 
142
- ## Performance Comparison
143
-
144
- | Method | Total Time | API Calls | Accuracy | Best For |
145
- |--------|------------|-----------|----------|----------|
146
- | Sequential | ~45 min | 44 calls | Highest | Small datasets |
147
- | **Parallel Individual** | **~12 min** | **44 calls** | **High** | **Recommended** |
148
- | Batch Processing | ~8 min | ~8 calls | Good | Speed-critical |
149
135
 
150
136
  ## Advanced Usage
151
137
 
@@ -169,31 +155,6 @@ python -m debase.substrate_scope_extractor_parallel \
169
155
  --manuscript paper.pdf --si si.pdf --lineage-csv lineage.csv \
170
156
  --max-workers 5 --output substrate_scope.csv
171
157
  ```
172
-
173
- ## Python API
174
-
175
- ```python
176
- from debase.wrapper import run_pipeline
177
-
178
- # Run full pipeline with parallel processing
179
- run_pipeline(
180
- manuscript_path="paper.pdf",
181
- si_path="si.pdf",
182
- output="output.csv",
183
- use_parallel_individual=True,
184
- max_workers=5
185
- )
186
-
187
- # For individual steps
188
- from debase.reaction_info_extractor_parallel import extract_reaction_info_parallel
189
- from debase.enzyme_lineage_extractor import setup_gemini_api
190
-
191
- model = setup_gemini_api()
192
- reaction_data = extract_reaction_info_parallel(
193
- model, manuscript_path, si_path, enzyme_csv_path, max_workers=5
194
- )
195
- ```
196
-
197
158
  ## Pipeline Architecture
198
159
 
199
160
  The DEBase pipeline consists of 5 main steps:
@@ -222,9 +183,6 @@ The DEBase pipeline consists of 5 main steps:
222
183
  - **External database integration:** Automatic sequence fetching from PDB and UniProt
223
184
  - **AI-powered matching:** Uses Gemini to intelligently match database entries to enzyme variants
224
185
  - **Smart filtering:** Automatically excludes non-enzyme entries (buffers, controls, etc.)
225
- - **Progress tracking:** Real-time status updates
226
- - **Flexible output:** CSV format with comprehensive chemical and performance data
227
- - **Caching:** PDF encoding cache for improved performance
228
186
  - **Vision capabilities:** Extracts data from both text and images in PDFs
229
187
 
230
188
  ## Complete Command Reference
@@ -234,7 +192,6 @@ The DEBase pipeline consists of 5 main steps:
234
192
  --manuscript PATH # Required: Path to manuscript PDF
235
193
  --si PATH # Optional: Path to supplementary information PDF
236
194
  --output PATH # Output file path (default: manuscript_name_debase.csv)
237
- --queries N # Number of consensus queries (default: 2)
238
195
  ```
239
196
 
240
197
  ### Performance Options
@@ -279,21 +236,5 @@ The DEBase pipeline consists of 5 main steps:
279
236
  3. **Use batch processing** only when speed is critical and some accuracy loss is acceptable
280
237
  4. **Skip validation** (`--skip-validation`) for faster processing in production
281
238
  5. **Keep intermediates** (`--keep-intermediates`) for debugging and incremental runs
282
- 6. **Check external databases** - Many sequences can be automatically fetched from PDB/UniProt
283
- 7. **Verify enzyme entries** - The system automatically filters out buffers and controls
284
-
285
- ## Troubleshooting
286
-
287
- ### No sequences found
288
- - The extractor will automatically search PDB and UniProt databases
289
- - Check the logs for which database IDs were found and attempted
290
- - Sequences with PDB structures will be fetched with high confidence
291
-
292
- ### Incorrect enzyme extraction
293
- - Non-enzyme entries (buffers, controls, media) are automatically filtered
294
- - Check the log for entries marked as "Filtering out non-enzyme entry"
239
+ 6.
295
240
 
296
- ### PDB matching issues
297
- - The system uses AI to match PDB IDs to specific enzyme variants
298
- - Increased context extraction ensures better matching accuracy
299
- - Check logs for "Gemini PDB matching" entries to see the matching process
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=HnfC_TWAA2mfjIbkXT0ipZEqElS5wLaMzSj1DkE1F88,49
4
+ debase/_version.py,sha256=rPOdIIhUKYST-L457GFA8SWkOdMGZQAiiaWLSYHnVwc,49
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
7
  debase/enzyme_lineage_extractor.py,sha256=1GcgHA-lQPRf9-bNDlvQIP8p-KsP3D2WhIuOtCVJ_ME,87276
8
8
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
- debase/reaction_info_extractor.py,sha256=euw-4NHFuOPxpF99PJxTMLYYG0WryBDUCpoANB-SPPM,109655
9
+ debase/reaction_info_extractor.py,sha256=vgXE4eFSmRUU_RPsW7E0vbP5mU0tjrhwk7UVqz_98yM,111469
10
10
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
- debase/wrapper.py,sha256=UlUBxxIXBnVtSIT9lZXkQeImlCABiUuof1CVZNKv9N4,10482
12
- debase-0.1.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.0.dist-info/METADATA,sha256=3s1NGPGYOb2bbP5PD5OoWBcJ7UeZ2OTQiOQ-SE5uqoM,11509
14
- debase-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.0.dist-info/RECORD,,
11
+ debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
+ debase-0.1.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.2.dist-info/METADATA,sha256=t5JrPGNEtLsF3qXrSpHvn02_rlNGlkYv-NDubaXZa2w,9382
14
+ debase-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.2.dist-info/RECORD,,
File without changes