debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/campaign_utils.py +146 -0
- debase/caption_pattern.py +44 -0
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +481 -106
- debase/lineage_format.py +44 -1
- debase/reaction_info_extractor.py +479 -135
- debase/substrate_scope_extractor.py +207 -80
- debase/wrapper.py +3 -3
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
- debase-0.6.2.dist-info/RECORD +18 -0
- debase-0.6.0.dist-info/RECORD +0 -16
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0
debase/substrate_scope_extractor.py CHANGED
@@ -28,6 +28,13 @@ import re
 import json
 import time
 import logging
+
+# Import universal caption pattern
+try:
+    from .caption_pattern import get_universal_caption_pattern
+except ImportError:
+    # Fallback if running as standalone script
+    from caption_pattern import get_universal_caption_pattern
 import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
@@ -183,13 +190,8 @@ except ImportError as exc: # pragma: no cover
 
 from base64 import b64encode
 
-#
-
-# This catches all variations including "Table S 2", "Figure.", etc.
-_CAPTION_PREFIX_RE = re.compile(
-    r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|Supplementary\s+(?:Table|Figure)).*",
-    re.I | re.M
-)
+# Use universal caption pattern
+_CAPTION_PREFIX_RE = get_universal_caption_pattern()
 
 def _open_doc(pdf_path: str | Path | bytes):
     if isinstance(pdf_path, (str, Path)):
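The new `debase/caption_pattern.py` module (+44 lines in the summary above) is not shown in this section. A minimal sketch of what `get_universal_caption_pattern()` could look like, assuming it simply centralizes the inline regex deleted in the hunk above:

```python
# Hypothetical sketch only -- the real debase/caption_pattern.py is not part of this diff section.
import re

def get_universal_caption_pattern() -> re.Pattern:
    """Compile one shared pattern for Table/Figure/Scheme caption prefixes."""
    return re.compile(
        r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|"
        r"Supplementary\s+(?:Table|Figure)).*",
        re.I | re.M,
    )
```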
@@ -294,12 +296,14 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
 
     return "\n".join(chunks)
 
-def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
+def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: Optional[str] = None, document_hint: Optional[str] = None) -> Optional[str]:
     """Extract figure as a page region when embedded images aren't available.
 
     Args:
         pdf_paths: List of PDF paths to search
         figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")
+        caption_hint: Optional caption text to help identify the exact figure
+        document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
 
     Returns:
         Base64-encoded PNG string or None if not found
@@ -316,8 +320,20 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
         log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
     else:
         base_figure_ref = figure_ref
+
+    # Determine search order based on document hint
+    search_paths = list(pdf_paths)  # Create a copy
+    if document_hint and len(pdf_paths) > 1:
+        if document_hint.lower() == "manuscript":
+            # Prioritize manuscript (first PDF)
+            search_paths = [pdf_paths[0]] + pdf_paths[1:]
+            log.info("Prioritizing manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
+        elif document_hint.lower() == "supplementary":
+            # Prioritize SI (second PDF if available)
+            search_paths = [pdf_paths[1], pdf_paths[0]] if len(pdf_paths) > 1 else pdf_paths
+            log.info("Prioritizing supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
 
-    for pdf_path in
+    for pdf_path in search_paths:
         doc = _open_doc(pdf_path)
         try:
             for page_num in range(doc.page_count):
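For context, a hypothetical call showing how the two new optional arguments are meant to be used (file names and hint text are invented; existing callers that pass only `pdf_paths` and `figure_ref` are unaffected):

```python
from pathlib import Path

# Invented paths and hints for illustration; the manuscript PDF is expected first, the SI second.
pdfs = [Path("manuscript.pdf"), Path("supporting_info.pdf")]
img_b64 = extract_figure_image(
    pdfs,
    "Figure 2",
    caption_hint="Figure 2. Substrate scope of the evolved variant.",
    document_hint="manuscript",  # search the manuscript PDF first
)
if img_b64 is None:
    print("Figure 2 not found in either PDF")
```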
@@ -331,26 +347,38 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
                 # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
                 figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
 
-                #
-
-
-
-
-
-                    rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
-                ]
-
-                for pattern in caption_patterns:
-                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
-                    if matches:
-                        # Found actual figure caption, get its position
-                        caption_text = matches.group(0)
-                        caption_instances = page.search_for(caption_text, quads=False)
+                # First try to find using caption hint if provided
+                if caption_hint and len(caption_hint) > 10:
+                    # Try to find the exact caption text
+                    caption_snippet = caption_hint[:100]  # Use first 100 chars
+                    if caption_snippet in page_text:
+                        caption_instances = page.search_for(caption_snippet, quads=False)
                         if caption_instances:
                             caption_rect = caption_instances[0]
                             found = True
-                            log.info("Found
-
+                            log.info("Found figure using caption hint on page %d", page_num + 1)
+
+                # If not found with hint, look for actual figure captions using regex patterns
+                if not found:
+                    caption_patterns = [
+                        rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
+                        rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
+                        rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
+                        rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
+                        rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
+                    ]
+
+                    for pattern in caption_patterns:
+                        matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                        if matches:
+                            # Found actual figure caption, get its position
+                            caption_text = matches.group(0)
+                            caption_instances = page.search_for(caption_text, quads=False)
+                            if caption_instances:
+                                caption_rect = caption_instances[0]
+                                found = True
+                                log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
+                                break
 
                 if not found:
                     continue
@@ -485,61 +513,134 @@ def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str
     return None
 
 
-def
-    """
-
-
-
-    # Try to extract base figure/table reference
-    base_ref_match = re.match(r'((?:Figure|Fig|Table)\s*\d+)', ref, re.IGNORECASE)
-    base_ref = base_ref_match.group(1) if base_ref_match else ref
+def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
+    """Build an index of all captions for quick lookup."""
+    cap_pattern = get_universal_caption_pattern()
+    caption_index = {}
 
-    for pdf_path in pdf_paths:
+    for pdf_idx, pdf_path in enumerate(pdf_paths):
         doc = _open_doc(pdf_path)
+        source = "manuscript" if pdf_idx == 0 else "supplementary"
+
         try:
             for page_num in range(doc.page_count):
                 page = doc.load_page(page_num)
                 page_text = page.get_text()
 
-
-
-
-                    base_ref, # Base reference (e.g., "Figure 3")
-                    ref.replace("(", " ").replace(")", ""), # "Figure 3 a"
-                    ref.replace("(", "").replace(")", ""), # "Figure 3a"
-                    ref.replace("Figure", "Fig"), # "Fig 3(a)"
-                    base_ref.replace("Figure", "Fig"), # "Fig 3"
-                ]
-
-                # Find the reference in the page
-                found = False
-                pos = -1
-                used_ref = ref
-
-                for ref_var in ref_variations:
-                    if ref_var.lower() in page_text.lower():
-                        pos = page_text.lower().find(ref_var.lower())
-                        used_ref = ref_var
-                        found = True
-                        break
-
-                if found and pos >= 0:
-                    # Extract context around it
-                    start = max(0, pos - context_chars)
-                    end = min(len(page_text), pos + len(used_ref) + context_chars)
+                for match in cap_pattern.finditer(page_text):
+                    caption_text = match.group(0).strip()
+                    caption_lower = caption_text.lower()
 
-
-
-
-
-
+                    # Store caption info
+                    caption_info = {
+                        'full_caption': caption_text,
+                        'page_text': page_text,
+                        'page_num': page_num + 1,
+                        'pdf_path': pdf_path,
+                        'source': source,
+                        'match_start': match.start(),
+                        'doc': doc  # Keep doc reference for page extraction
+                    }
+
+                    # Create multiple keys for flexible matching
+                    # Key 1: Full caption text (first 100 chars)
+                    key1 = caption_text[:100].lower().strip()
+                    caption_index[key1] = caption_info
+
+                    # Key 2: Simplified reference (e.g., "table 5", "figure s3")
+                    ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
+                    if ref_match:
+                        key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
+                        caption_index[key2] = caption_info
+
+                        # Also store with 's' prefix if in SI
+                        if source == "supplementary" and 's' not in key2:
+                            key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
+                            caption_index[key3] = caption_info
         finally:
             doc.close()
 
+    return caption_index
+
+def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
+    """Extract text around a specific reference using caption index."""
+    import re
+
+    # Build caption index if not already built
+    if not hasattr(_extract_text_around_reference, '_caption_index'):
+        _extract_text_around_reference._caption_index = _build_caption_index(pdf_paths)
+
+    caption_index = _extract_text_around_reference._caption_index
+    ref_lower = ref.lower().strip()
+
+    # Try multiple matching strategies
+    matches = []
+
+    # Strategy 1: Direct key lookup
+    if ref_lower in caption_index:
+        matches.append(caption_index[ref_lower])
+
+    # Strategy 2: Normalized reference lookup
+    ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
+    if ref_match:
+        ref_type, ref_num = ref_match.groups()
+        if ref_type == 'fig':
+            ref_type = 'figure'
+
+        # Try different key formats
+        keys_to_try = [
+            f"{ref_type} {ref_num}",
+            f"{ref_type} s{ref_num}",
+            f"table {ref_num}",
+            f"fig {ref_num}",
+            f"figure {ref_num}"
+        ]
+
+        for key in keys_to_try:
+            if key in caption_index and caption_index[key] not in matches:
+                matches.append(caption_index[key])
+
+    # Strategy 3: Fuzzy matching
+    if not matches and ref_match:
+        for key, info in caption_index.items():
+            if ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
+                if info not in matches:
+                    matches.append(info)
+
+    # Extract text from matches
+    extracted_sections = []
+    for match in matches:
+        page_text = match['page_text']
+        caption_start = match['match_start']
+
+        # Extract context around the caption
+        start = max(0, caption_start - context_chars // 2)
+        end = min(len(page_text), caption_start + context_chars)
+
+        section = page_text[start:end]
+        source_label = f"{match['source'].upper()} page {match['page_num']}"
+        extracted_sections.append(f"\n[From {source_label}]\n{section}")
+
     if not extracted_sections:
-        log.warning("
+        log.warning(f"No matches found for reference '{ref}'")
+        # Fallback to old approach
+        for pdf_path in pdf_paths:
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(doc.page_count):
+                    page = doc.load_page(page_num)
+                    page_text = page.get_text()
+
+                    if ref_lower in page_text.lower():
+                        pos = page_text.lower().find(ref_lower)
+                        start = max(0, pos - context_chars // 2)
+                        end = min(len(page_text), pos + context_chars)
+                        extracted_sections.append(page_text[start:end])
+                        break
+            finally:
+                doc.close()
 
-    return "\n".join(extracted_sections)
+    return "\n\n".join(extracted_sections)
 
 def _extract_sections_by_title(pdf_paths: List[Path], section_titles: List[str], max_chars_per_section: int = 10000) -> str:
     """Extract sections by their titles from PDFs."""
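A toy illustration (invented caption text) of the keys `_build_caption_index` stores and how `_extract_text_around_reference` normalizes a reference before the lookup:

```python
import re

# Invented caption; mirrors the key-building logic added in the hunk above.
caption = "Table S3. Kinetic parameters of selected variants"
m = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption.lower())
key1 = caption[:100].lower().strip()   # Key 1: full-caption prefix
key2 = f"{m.group(1)} {m.group(2)}"    # Key 2: "table 3"
key3 = f"{m.group(1)} s{m.group(2)}"   # Key 3: "table s3" (added for SI captions)

# A query such as "Table S3" or "Table 3" is normalized with the same regex,
# so it resolves to this caption through key2/key3.
print(key1, key2, key3, sep="\n")
```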
@@ -1060,17 +1161,24 @@ Your task is to:
 4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
 5. Determine which enzyme variants from this campaign were tested in substrate scope studies
 
+IMPORTANT FIGURE REFERENCE RULES:
+- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
+- Include the figure caption if available to help with identification
+- The extraction system will handle retrieving the entire figure including all sub-panels
+
 Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
 [
   {{
-    "location": "
+    "location": "Main figure/table reference (e.g., 'Figure 2', 'Table S1', NOT 'Figure 2a')",
     "type": "table|figure|text",
     "confidence": 0.0-1.0,
     "enzyme_variants": ["list of enzyme IDs found"],
     "substrates_tested": ["list of substrates if identifiable"],
     "campaign_match": true/false,
     "is_substrate_scope": true/false,
-    "model_reaction_excluded": "reason why this is not a model reaction"
+    "model_reaction_excluded": "reason why this is not a model reaction",
+    "caption": "Include the figure/table caption if available",
+    "document": "manuscript|supplementary - specify whether this location is in the main manuscript or supplementary information"
   }}
 ]
 
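For reference, a hypothetical location entry that satisfies the updated schema (all values invented), written here as a Python literal:

```python
# Invented example; shows the two new fields ("caption", "document") alongside the existing ones.
example_location = {
    "location": "Figure 2",            # main figure only, never "Figure 2a"
    "type": "figure",
    "confidence": 0.9,
    "enzyme_variants": ["variant_G4"],
    "substrates_tested": ["1a", "1b", "1c"],
    "campaign_match": True,
    "is_substrate_scope": True,
    "model_reaction_excluded": "model reaction with 1a is reported separately in Table 1",
    "caption": "Figure 2. Substrate scope of the final variant.",
    "document": "manuscript",
}
```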
@@ -1790,22 +1898,28 @@ def extract_substrate_scope_entries_for_campaign(
     all_refs = []
 
     if locations:
-        #
-
-
-            loc_str = loc.get('location', '')
-            location_strs.append(loc_str)
-            all_refs.append(loc_str)
+        # Sort locations by confidence and use only the PRIMARY (most confident) location
+        sorted_locations = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+        primary_location = sorted_locations[0] if sorted_locations else None
 
-
+        if primary_location:
+            primary_ref = primary_location.get('location', '')
+            all_refs = [primary_ref]  # Only extract from primary location
+
+            extraction_hints = f"\nPRIMARY substrate scope location for campaign {campaign_id}: {primary_ref}"
+            extraction_hints += f"\nLocation confidence: {primary_location.get('confidence', 0)}%"
+            extraction_hints += f"\nLocation type: {primary_location.get('type', 'unknown')}"
 
         # Focus on campaign-specific enzyme variants
         extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
 
-    # Extract text from
+    # Extract text from ONLY the primary location
     extraction_texts = []
     figure_images = {}
 
+    # Create a mapping of location strings to their full location data
+    location_map = {loc.get('location', ''): loc for loc in locations}
+
     for ref in all_refs:
         if ref and pdf_paths:
             ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
@@ -1814,7 +1928,10 @@ def extract_substrate_scope_entries_for_campaign(
 
             # Extract figure images for this reference (crop page around figure)
             try:
-
+                # Get caption and document hints if available
+                caption_hint = location_map.get(ref, {}).get('caption', '')
+                document_hint = location_map.get(ref, {}).get('document', '')
+                fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
                 if fig_base64:
                     figure_images[ref] = fig_base64
                     log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
@@ -1867,6 +1984,14 @@ IMPORTANT INSTRUCTIONS:
 4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
 5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
 
+CRITICAL DATA ACCURACY REQUIREMENTS:
+- BE EXTREMELY CAREFUL about which substrate ID maps to which yield, TTN, and selectivity values
+- Each substrate entry should have its OWN yield, ee, and TTN values - do not mix up values between substrates
+- If looking at a table or figure, carefully match each substrate with its corresponding row/bar/data point
+- Double-check that substrate 1a's data is not confused with substrate 1b's data, etc.
+- If values are unclear or ambiguous for a specific substrate, return null rather than guessing
+- Pay special attention when extracting from figures - ensure you're reading the correct bar/point for each substrate
+
 {extraction_hints}
 
 Return your analysis as JSON in this format:
@@ -2212,13 +2337,15 @@ def get_substrate_scope(
             if should_extract:
                 figure_ref = location_str
                 confidence = loc.get('confidence', 0)
+                caption_hint = loc.get('caption', '')
                 log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
 
                 # Use appropriate extraction function based on type
                 if 'scheme' in location_str.lower() or location_type == 'scheme':
                     figure_image = extract_scheme_image(pdf_paths, figure_ref)
                 else:
-
+                    document_hint = loc.get('document', '')
+                    figure_image = extract_figure_image(pdf_paths, figure_ref, caption_hint=caption_hint, document_hint=document_hint)
 
                 if figure_image:
                     log.info("Successfully extracted %s image for %s (%d bytes)",
debase/wrapper.py CHANGED
@@ -7,10 +7,10 @@ Pipeline flow:
 2. cleanup_sequence.py - Clean and validate protein sequences
 3. reaction_info_extractor.py - Extract reaction performance metrics
 4. substrate_scope_extractor.py - Extract substrate scope data (runs independently)
-5.
+5. lineage_format.py - Format and merge all data into final CSV
 
 The reaction_info and substrate_scope extractors run in parallel,
-then their outputs are combined in
+then their outputs are combined in lineage_format.
 """
 import os
 import sys
@@ -579,7 +579,7 @@ Pipeline steps:
 2. cleanup_sequence - Validate and clean protein sequences
 3. reaction_info_extractor - Extract reaction performance metrics
 4. substrate_scope_extractor - Extract substrate scope data
-5.
+5. lineage_format - Format and merge into final CSV
 
 The pipeline automatically handles all steps sequentially.
 """
debase-0.6.2.dist-info/RECORD ADDED
@@ -0,0 +1,18 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=t771GcmZTaJJGrIex6Ea6Q5pcMqVPIihCdRFRA1dMAM,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
+debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
+debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
+debase/enzyme_lineage_extractor.py,sha256=OXO2jUqAqF0pXrw17oIQERnek1uZ5gsFIuKRz4NMS1o,188556
+debase/lineage_format.py,sha256=YWAP9OhFN3MQWbqk5gguX0C2cCwGvKJAtMq9pG5TJp8,59515
+debase/reaction_info_extractor.py,sha256=kQBxPpzurjHXsHFWE_WM84ArSnc3E8f6xPMJpyTIGnU,188246
+debase/substrate_scope_extractor.py,sha256=hRlt8iWOURmgW4SJHB1Svoh3TTa4fa9YIE8qVUZPnY0,122621
+debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
+debase-0.6.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.6.2.dist-info/METADATA,sha256=gnPvTWvazrsdGrIKX8tA4Wwt8yKYph87POVKF25rkkg,4047
+debase-0.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.6.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.6.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.6.2.dist-info/RECORD,,
debase-0.6.0.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=C3pfAAiv8eypvKmkKXrxXIwx9sLe4TtShkzld5GJJ7k,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
-debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
-debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
-debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
-debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.6.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.6.0.dist-info/METADATA,sha256=o_Iq-jjk_ekBLLheIQZuLEeC7tWbduq70b5jqfiCmro,4047
-debase-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.6.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.6.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.6.0.dist-info/RECORD,,
Files without changes: {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, and top_level.txt.