debase 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {debase-0.1.2 → debase-0.1.3}/PKG-INFO +1 -1
  2. {debase-0.1.2 → debase-0.1.3}/src/debase/_version.py +1 -1
  3. {debase-0.1.2 → debase-0.1.3}/src/debase/enzyme_lineage_extractor.py +10 -1
  4. {debase-0.1.2 → debase-0.1.3}/src/debase/reaction_info_extractor.py +14 -1
  5. {debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/PKG-INFO +1 -1
  6. {debase-0.1.2 → debase-0.1.3}/.gitignore +0 -0
  7. {debase-0.1.2 → debase-0.1.3}/CONTRIBUTING.md +0 -0
  8. {debase-0.1.2 → debase-0.1.3}/LICENSE +0 -0
  9. {debase-0.1.2 → debase-0.1.3}/MANIFEST.in +0 -0
  10. {debase-0.1.2 → debase-0.1.3}/README.md +0 -0
  11. {debase-0.1.2 → debase-0.1.3}/docs/README.md +0 -0
  12. {debase-0.1.2 → debase-0.1.3}/docs/examples/README.md +0 -0
  13. {debase-0.1.2 → debase-0.1.3}/environment.yml +0 -0
  14. {debase-0.1.2 → debase-0.1.3}/pyproject.toml +0 -0
  15. {debase-0.1.2 → debase-0.1.3}/setup.cfg +0 -0
  16. {debase-0.1.2 → debase-0.1.3}/setup.py +0 -0
  17. {debase-0.1.2 → debase-0.1.3}/src/__init__.py +0 -0
  18. {debase-0.1.2 → debase-0.1.3}/src/debase/PIPELINE_FLOW.md +0 -0
  19. {debase-0.1.2 → debase-0.1.3}/src/debase/__init__.py +0 -0
  20. {debase-0.1.2 → debase-0.1.3}/src/debase/__main__.py +0 -0
  21. {debase-0.1.2 → debase-0.1.3}/src/debase/build_db.py +0 -0
  22. {debase-0.1.2 → debase-0.1.3}/src/debase/cleanup_sequence.py +0 -0
  23. {debase-0.1.2 → debase-0.1.3}/src/debase/lineage_format.py +0 -0
  24. {debase-0.1.2 → debase-0.1.3}/src/debase/substrate_scope_extractor.py +0 -0
  25. {debase-0.1.2 → debase-0.1.3}/src/debase/wrapper.py +0 -0
  26. {debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/SOURCES.txt +0 -0
  27. {debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/dependency_links.txt +0 -0
  28. {debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/entry_points.txt +0 -0
  29. {debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/requires.txt +0 -0
  30. {debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.2"
3
+ __version__ = "0.1.3"
@@ -1297,6 +1297,8 @@ _SEQUENCE_SCHEMA_HINT = """
1297
1297
  _SEQ_LOC_PROMPT = """
1298
1298
  Find where FULL-LENGTH protein or DNA sequences are located in this document.
1299
1299
 
1300
+ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
1301
+
1300
1302
  Look for table of contents entries or section listings that mention sequences.
1301
1303
  Return a JSON array where each element has:
1302
1304
  - "section": the section heading or description
@@ -1305,6 +1307,7 @@ Return a JSON array where each element has:
1305
1307
  Focus on:
1306
1308
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
1307
1309
  - Return the EXACT notation as shown.
1310
+ - Prioritize sections that mention "protein" or "amino acid" sequences
1308
1311
 
1309
1312
  Return [] if no sequence sections are found.
1310
1313
  Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -1465,10 +1468,16 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
1465
1468
  # --- 7.3 Main extraction prompt ---------------------------------------------
1466
1469
  _SEQ_EXTRACTION_PROMPT = """
1467
1470
  Extract EVERY distinct enzyme-variant sequence you can find in the text.
1471
+
1472
+ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
1473
+ - If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
1474
+ - Only extract dna_seq if NO amino acid sequence is available for that variant
1475
+ - This reduces redundancy since protein sequences are usually more relevant
1476
+
1468
1477
  For each variant return:
1469
1478
  * variant_id - the label used in the paper (e.g. "R4-10")
1470
1479
  * aa_seq - amino-acid sequence (uppercase), or null
1471
- * dna_seq - DNA sequence (A/C/G/T), or null
1480
+ * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
1472
1481
 
1473
1482
  Respond ONLY with **minified JSON** that matches the schema below.
1474
1483
  NO markdown, no code fences, no commentary.
@@ -1055,7 +1055,20 @@ Different campaigns may use different model reactions.
1055
1055
  """Extract text around a given location identifier."""
1056
1056
  location_lower = location.lower()
1057
1057
 
1058
- # Search in all pages
1058
+ # Handle compound locations like "Figure 2 caption and Section I"
1059
+ # Extract the first figure/table/scheme reference
1060
+ figure_match = re.search(r"(figure|scheme|table)\s*\d+", location_lower)
1061
+ if figure_match:
1062
+ primary_location = figure_match.group(0)
1063
+ # Try to find this primary location first
1064
+ for page_text in self.all_pages:
1065
+ if primary_location in page_text.lower():
1066
+ idx = page_text.lower().index(primary_location)
1067
+ start = max(0, idx - 500)
1068
+ end = min(len(page_text), idx + 3000)
1069
+ return page_text[start:end]
1070
+
1071
+ # Search in all pages for exact match
1059
1072
  for page_text in self.all_pages:
1060
1073
  if location_lower in page_text.lower():
1061
1074
  # Find the location and extract context around it
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes