debase 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +123 -0
- debase/enzyme_lineage_extractor.py +254 -315
- debase/lineage_format.py +22 -18
- debase/reaction_info_extractor.py +180 -62
- debase/substrate_scope_extractor.py +3 -2
- {debase-0.4.4.dist-info → debase-0.5.0.dist-info}/METADATA +1 -1
- debase-0.5.0.dist-info/RECORD +16 -0
- debase-0.4.4.dist-info/RECORD +0 -16
- {debase-0.4.4.dist-info → debase-0.5.0.dist-info}/WHEEL +0 -0
- {debase-0.4.4.dist-info → debase-0.5.0.dist-info}/entry_points.txt +0 -0
- {debase-0.4.4.dist-info → debase-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.4.dist-info → debase-0.5.0.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -1183,15 +1183,33 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
1183
1183
|
if generation != "0":
|
1184
1184
|
for cid, cmap in campaign_idmap.items():
|
1185
1185
|
if cid == campaign_id:
|
1186
|
+
# First try to find generation 0
|
1186
1187
|
for enzyme_id, enzyme_row in cmap.items():
|
1187
1188
|
enzyme_gen = str(enzyme_row.get("generation", "")).strip()
|
1188
1189
|
if enzyme_gen == "0" or enzyme_gen == "0.0":
|
1189
1190
|
reference_row = enzyme_row
|
1190
1191
|
log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
|
1191
1192
|
break
|
1193
|
+
|
1194
|
+
# If no generation 0 found, find the earliest generation
|
1195
|
+
if not reference_row:
|
1196
|
+
earliest_gen = float('inf')
|
1197
|
+
earliest_enzyme = None
|
1198
|
+
for enzyme_id, enzyme_row in cmap.items():
|
1199
|
+
try:
|
1200
|
+
enzyme_gen = float(str(enzyme_row.get("generation", "")).strip())
|
1201
|
+
if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
|
1202
|
+
earliest_gen = enzyme_gen
|
1203
|
+
earliest_enzyme = enzyme_id
|
1204
|
+
reference_row = enzyme_row
|
1205
|
+
except (ValueError, AttributeError):
|
1206
|
+
continue
|
1207
|
+
|
1208
|
+
if reference_row:
|
1209
|
+
log.info(f"No generation 0 found in campaign {campaign_id}, using generation {earliest_gen} enzyme {earliest_enzyme} as reference for {eid}")
|
1210
|
+
else:
|
1211
|
+
log.warning(f"No suitable reference enzyme found in campaign {campaign_id} for {eid}")
|
1192
1212
|
break
|
1193
|
-
if not reference_row:
|
1194
|
-
log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
|
1195
1213
|
|
1196
1214
|
reference_aa = ""
|
1197
1215
|
reference_nt = ""
|
@@ -1332,24 +1350,10 @@ def run_pipeline(reaction_csv: str | Path | None = None,
|
|
1332
1350
|
if not dfs:
|
1333
1351
|
raise ValueError("At least one input CSV must be provided")
|
1334
1352
|
|
1335
|
-
# Combine dataframes
|
1353
|
+
# Combine dataframes without deduplication
|
1336
1354
|
if len(dfs) > 1:
|
1337
1355
|
df_in = pd.concat(dfs, ignore_index=True)
|
1338
|
-
log.info("Combined data: %d total entries
|
1339
|
-
|
1340
|
-
# Deduplicate based on unique combination of campaign, variant, fitness, and product
|
1341
|
-
# Define the key columns that should be unique
|
1342
|
-
unique_cols = ['campaign_id', 'enzyme_id', 'product_list']
|
1343
|
-
|
1344
|
-
# Check if we have these columns
|
1345
|
-
available_cols = [col for col in unique_cols if col in df_in.columns]
|
1346
|
-
|
1347
|
-
if len(available_cols) >= 2: # Need at least campaign_id and enzyme_id
|
1348
|
-
# Keep the first occurrence of each unique combination
|
1349
|
-
df_in = df_in.drop_duplicates(subset=available_cols, keep='first')
|
1350
|
-
log.info("After deduplication on %s: %d entries", available_cols, len(df_in))
|
1351
|
-
else:
|
1352
|
-
log.warning("Could not deduplicate - missing required columns: %s", unique_cols)
|
1356
|
+
log.info("Combined data: %d total entries", len(df_in))
|
1353
1357
|
else:
|
1354
1358
|
df_in = dfs[0]
|
1355
1359
|
|
@@ -54,11 +54,11 @@ class Config:
|
|
54
54
|
"""Centralised tunables so tests can override them easily."""
|
55
55
|
|
56
56
|
model_name: str = "gemini-2.5-flash"
|
57
|
-
location_temperature: float = 0.
|
57
|
+
location_temperature: float = 0.0
|
58
58
|
extract_temperature: float = 0.0
|
59
59
|
model_reaction_temperature: float = 0.0
|
60
60
|
top_p: float = 1.0
|
61
|
-
max_tokens: int = 12288
|
61
|
+
max_tokens: int = 12288
|
62
62
|
pdf_cache_size: int = 8
|
63
63
|
retries: int = 2
|
64
64
|
|
@@ -730,8 +730,8 @@ Return as JSON:
|
|
730
730
|
###############################################################################
|
731
731
|
|
732
732
|
class ReactionExtractor:
|
733
|
-
_FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
|
734
|
-
_TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
733
|
+
_FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
|
734
|
+
_TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
735
735
|
|
736
736
|
def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
|
737
737
|
campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
|
@@ -778,40 +778,62 @@ class ReactionExtractor:
|
|
778
778
|
# ------------------------------------------------------------------
|
779
779
|
|
780
780
|
def _collect_captions_and_titles(self) -> str:
|
781
|
-
# Pattern to match Table or Figure with optional leading whitespace
|
781
|
+
# Pattern to match Table or Figure with optional leading whitespace and page numbers
|
782
782
|
# This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
|
783
|
-
# Also handles cases where there's whitespace before the caption
|
784
|
-
cap_pattern = re.compile(r"
|
783
|
+
# Also handles cases where there's whitespace or page numbers before the caption
|
784
|
+
cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
|
785
785
|
captions: List[str] = []
|
786
786
|
|
787
|
-
#
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
#
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
787
|
+
# Process each page individually to avoid TOC entries
|
788
|
+
for page_idx, page_text in enumerate(self.all_pages):
|
789
|
+
# Skip if this looks like a TOC page
|
790
|
+
if self._is_toc_page(page_text):
|
791
|
+
LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
|
792
|
+
continue
|
793
|
+
|
794
|
+
# Find all figure/table captions with more context
|
795
|
+
for match in cap_pattern.finditer(page_text):
|
796
|
+
caption_line = match.group(0).strip()
|
797
|
+
|
798
|
+
# Skip if this looks like a TOC entry (has page number at end or dots)
|
799
|
+
if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
|
800
|
+
LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
|
801
|
+
continue
|
802
|
+
|
803
|
+
caption_start = match.start()
|
804
|
+
|
805
|
+
# For tables, include much more content after the caption to show actual table data
|
806
|
+
# For figures, include substantial content to show what the figure contains
|
807
|
+
is_table = 'table' in match.group(1).lower()
|
808
|
+
# Increase context for figures to ensure we capture descriptive text
|
809
|
+
max_chars = 8000 if is_table else 3000
|
810
|
+
|
811
|
+
# Get context including text before and after the caption
|
812
|
+
# Include some text before to help identify the location
|
813
|
+
context_before = max(0, caption_start - 200)
|
814
|
+
context_after = min(len(page_text), caption_start + max_chars)
|
815
|
+
|
816
|
+
# Extract the full context
|
817
|
+
full_context = page_text[context_before:context_after].strip()
|
818
|
+
|
819
|
+
# Find the actual caption text (not just the "Figure X" part)
|
820
|
+
# Look for text after the figure/table identifier that forms the caption
|
821
|
+
caption_text = page_text[caption_start:context_after]
|
822
|
+
|
823
|
+
# Try to find the end of the caption (usually ends with a period before next paragraph)
|
824
|
+
caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
|
825
|
+
if caption_end_match:
|
826
|
+
actual_caption = caption_text[:caption_end_match.end()].strip()
|
827
|
+
else:
|
828
|
+
# Fallback: take first few lines
|
829
|
+
lines = caption_text.split('\n')
|
830
|
+
actual_caption = '\n'.join(lines[:3]).strip()
|
831
|
+
|
832
|
+
# Ensure we have meaningful content, not just the figure number
|
833
|
+
if len(actual_caption) > 20: # More than just "Figure S23."
|
834
|
+
# For the prompt, include the full context to help identify what's in the figure
|
835
|
+
caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
|
836
|
+
captions.append(caption_with_context)
|
815
837
|
|
816
838
|
# Also look for SI section titles
|
817
839
|
si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
|
@@ -1048,6 +1070,39 @@ class ReactionExtractor:
|
|
1048
1070
|
# 6.2 Figure / Table context helpers
|
1049
1071
|
# ------------------------------------------------------------------
|
1050
1072
|
|
1073
|
+
def _is_toc_page(self, page_text: str) -> bool:
|
1074
|
+
"""Detect if a page is a Table of Contents page."""
|
1075
|
+
# Look for common TOC indicators
|
1076
|
+
toc_indicators = [
|
1077
|
+
"table of contents",
|
1078
|
+
"contents",
|
1079
|
+
r"\.{5,}", # Multiple dots (common in TOCs)
|
1080
|
+
r"\d+\s*\n\s*\d+\s*\n\s*\d+", # Multiple page numbers in sequence
|
1081
|
+
]
|
1082
|
+
|
1083
|
+
# Count how many TOC-like patterns we find
|
1084
|
+
toc_score = 0
|
1085
|
+
text_lower = page_text.lower()
|
1086
|
+
|
1087
|
+
# Check for explicit TOC title
|
1088
|
+
if "table of contents" in text_lower or (
|
1089
|
+
"contents" in text_lower and text_lower.index("contents") < 200
|
1090
|
+
):
|
1091
|
+
toc_score += 3
|
1092
|
+
|
1093
|
+
# Check for multiple figure/table references with page numbers
|
1094
|
+
figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
|
1095
|
+
table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
|
1096
|
+
|
1097
|
+
if len(figure_with_page) + len(table_with_page) > 5:
|
1098
|
+
toc_score += 2
|
1099
|
+
|
1100
|
+
# Check for many dotted lines
|
1101
|
+
if len(re.findall(r'\.{5,}', page_text)) > 3:
|
1102
|
+
toc_score += 1
|
1103
|
+
|
1104
|
+
return toc_score >= 2
|
1105
|
+
|
1051
1106
|
def _page_with_reference(self, ref_id: str) -> Optional[str]:
|
1052
1107
|
for page in self.all_pages:
|
1053
1108
|
if ref_id.lower() in page.lower():
|
@@ -1090,6 +1145,7 @@ class ReactionExtractor:
|
|
1090
1145
|
If extract_figure_only=True, extracts just the figure above the caption.
|
1091
1146
|
If False, extracts the entire page (useful for tables).
|
1092
1147
|
Returns a base64-encoded PNG or None."""
|
1148
|
+
LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
|
1093
1149
|
|
1094
1150
|
# Check cache first
|
1095
1151
|
cache_key = f"{ref}_{extract_figure_only}"
|
@@ -1107,14 +1163,27 @@ class ReactionExtractor:
|
|
1107
1163
|
return None
|
1108
1164
|
|
1109
1165
|
# For figure extraction, search both documents for actual figure captions
|
1110
|
-
|
1166
|
+
docs = list(filter(None, [self.ms_doc, self.si_doc]))
|
1167
|
+
LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
|
1168
|
+
|
1169
|
+
for doc_idx, doc in enumerate(docs):
|
1170
|
+
doc_name = "MS" if doc_idx == 0 else "SI"
|
1171
|
+
LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
|
1172
|
+
|
1111
1173
|
for page_number in range(doc.page_count):
|
1112
1174
|
page = doc.load_page(page_number)
|
1113
1175
|
page_text = page.get_text()
|
1176
|
+
LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
|
1177
|
+
page_number + 1, doc_name, len(page_text))
|
1178
|
+
|
1179
|
+
# Skip Table of Contents pages
|
1180
|
+
if self._is_toc_page(page_text):
|
1181
|
+
LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
|
1182
|
+
continue
|
1114
1183
|
|
1115
|
-
# Look for figure caption pattern
|
1116
|
-
#
|
1117
|
-
figure_num = ref.replace('Figure
|
1184
|
+
# Look for figure caption pattern more flexibly
|
1185
|
+
# Normalize the reference to handle variations
|
1186
|
+
figure_num = ref.replace('Figure', '').replace('figure', '').strip()
|
1118
1187
|
|
1119
1188
|
# Extract main figure number from subfigure (e.g., "1C" -> "1")
|
1120
1189
|
main_figure_num = re.match(r'^(\d+)', figure_num)
|
@@ -1123,33 +1192,62 @@ class ReactionExtractor:
|
|
1123
1192
|
else:
|
1124
1193
|
main_figure_num = figure_num
|
1125
1194
|
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
rf"^Figure\s+{re.escape(main_figure_num)}\s*$", # "Figure 1" at end of line
|
1131
|
-
rf"Figure\s+{re.escape(main_figure_num)}\s*\.", # "Figure 1." anywhere in line
|
1132
|
-
rf"Figure\s+{re.escape(main_figure_num)}\s*:", # "Figure 1:" anywhere in line
|
1133
|
-
]
|
1195
|
+
# Create a flexible pattern that handles various spacing and formatting
|
1196
|
+
# This pattern looks for "Figure" (case insensitive) followed by optional spaces
|
1197
|
+
# then the figure number, then any of: period, colon, space+capital letter, or end of line
|
1198
|
+
flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
|
1134
1199
|
|
1135
|
-
LOGGER.debug("Looking for
|
1136
|
-
main_figure_num,
|
1200
|
+
LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
|
1201
|
+
main_figure_num, flexible_pattern)
|
1137
1202
|
|
1138
1203
|
caption_found = False
|
1139
1204
|
cap_rect = None
|
1140
1205
|
|
1141
|
-
for
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1206
|
+
# Search for all matches of the flexible pattern
|
1207
|
+
for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
|
1208
|
+
LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
|
1209
|
+
# Check if this is likely an actual caption (not just a reference)
|
1210
|
+
match_start = match.start()
|
1211
|
+
match_end = match.end()
|
1212
|
+
|
1213
|
+
# Get surrounding context
|
1214
|
+
context_start = max(0, match_start - 50)
|
1215
|
+
context_end = min(len(page_text), match_end + 100)
|
1216
|
+
context = page_text[context_start:context_end]
|
1217
|
+
|
1218
|
+
# Check if this looks like a real caption (not just a reference)
|
1219
|
+
# Look for words that typically precede figure references
|
1220
|
+
preceding_text = page_text[max(0, match_start-20):match_start].lower()
|
1221
|
+
if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
|
1222
|
+
LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
|
1223
|
+
continue
|
1224
|
+
|
1225
|
+
# Check if there's descriptive text after the figure number
|
1226
|
+
remaining_text = page_text[match_end:match_end+100].strip()
|
1227
|
+
|
1228
|
+
# For actual captions, there should be substantial descriptive text
|
1229
|
+
if len(remaining_text) < 20:
|
1230
|
+
LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
|
1231
|
+
continue
|
1232
|
+
|
1233
|
+
# Check if the remaining text looks like a caption (contains descriptive words)
|
1234
|
+
first_words = remaining_text[:50].lower()
|
1235
|
+
if not any(word in first_words for word in ['detailed', 'representative', 'shows', 'comparison',
|
1236
|
+
'illustrates', 'demonstrates', 'results', 'data',
|
1237
|
+
'chromatogram', 'spectra', 'analysis', 'site-directed',
|
1238
|
+
'mutagenesis', 'mutants']):
|
1239
|
+
LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
|
1240
|
+
continue
|
1241
|
+
|
1242
|
+
# Found actual figure caption, get its position
|
1243
|
+
caption_text = match.group(0)
|
1244
|
+
text_instances = page.search_for(caption_text, quads=False)
|
1245
|
+
if text_instances:
|
1246
|
+
cap_rect = text_instances[0]
|
1247
|
+
caption_found = True
|
1248
|
+
LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
|
1249
|
+
ref, caption_text, remaining_text[:50])
|
1250
|
+
break
|
1153
1251
|
|
1154
1252
|
if not caption_found:
|
1155
1253
|
# Debug: show what figure-related text is actually on this page
|
@@ -1158,6 +1256,14 @@ class ReactionExtractor:
|
|
1158
1256
|
if figure_mentions:
|
1159
1257
|
LOGGER.debug("Page %d has figure mentions but no caption match: %s",
|
1160
1258
|
page_number, figure_mentions[:3])
|
1259
|
+
|
1260
|
+
# For supplementary figures, also check for "supplementary" mentions
|
1261
|
+
if 'supplementary' in ref.lower():
|
1262
|
+
supp_mentions = [line.strip() for line in page_text.split('\n')
|
1263
|
+
if 'supplementary' in line.lower() and 'figure' in line.lower()]
|
1264
|
+
if supp_mentions:
|
1265
|
+
LOGGER.warning("Found supplementary figure mentions on page %d but no caption match. First 3: %s",
|
1266
|
+
page_number + 1, supp_mentions[:3])
|
1161
1267
|
continue
|
1162
1268
|
|
1163
1269
|
if extract_figure_only:
|
@@ -1215,6 +1321,8 @@ class ReactionExtractor:
|
|
1215
1321
|
# Cache the result
|
1216
1322
|
self._figure_cache.put(cache_key, result)
|
1217
1323
|
return result
|
1324
|
+
|
1325
|
+
LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
|
1218
1326
|
return None
|
1219
1327
|
|
1220
1328
|
def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
|
@@ -1229,6 +1337,11 @@ class ReactionExtractor:
|
|
1229
1337
|
page = doc.load_page(page_number)
|
1230
1338
|
page_text = page.get_text()
|
1231
1339
|
|
1340
|
+
# Skip Table of Contents pages
|
1341
|
+
if self._is_toc_page(page_text):
|
1342
|
+
LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
|
1343
|
+
continue
|
1344
|
+
|
1232
1345
|
# Check for actual figure caption first
|
1233
1346
|
if ref.lower().startswith('figure'):
|
1234
1347
|
figure_num = ref.replace('Figure ', '').replace('figure ', '')
|
@@ -1445,6 +1558,7 @@ class ReactionExtractor:
|
|
1445
1558
|
|
1446
1559
|
def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
|
1447
1560
|
"""Extract performance metrics for multiple enzymes from the identified location in batch."""
|
1561
|
+
LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
|
1448
1562
|
ref_lc = ref.lower()
|
1449
1563
|
image_b64: Optional[str] = None
|
1450
1564
|
|
@@ -1466,11 +1580,15 @@ class ReactionExtractor:
|
|
1466
1580
|
snippet = self._extract_table_context(ref)
|
1467
1581
|
elif self._FIG_RE.search(ref_lc):
|
1468
1582
|
# For figures, extract just the figure image (same logic as compound mapping)
|
1583
|
+
LOGGER.debug("Attempting to extract figure image for '%s'", ref)
|
1469
1584
|
image_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
1470
1585
|
if not image_b64:
|
1471
|
-
LOGGER.
|
1586
|
+
LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
|
1472
1587
|
snippet = self._extract_figure_caption(ref)
|
1588
|
+
LOGGER.debug("Caption extraction result: %s",
|
1589
|
+
f"'{snippet[:100]}...'" if snippet else "empty")
|
1473
1590
|
else:
|
1591
|
+
LOGGER.info("Successfully extracted figure image for '%s'", ref)
|
1474
1592
|
# If figure is found, ignore text information - use image only
|
1475
1593
|
snippet = ""
|
1476
1594
|
else:
|
@@ -2543,7 +2543,8 @@ def merge_with_lineage(
|
|
2543
2543
|
data = lineage_map[matched_name]
|
2544
2544
|
entry.parent_id = data['parent_id']
|
2545
2545
|
entry.mutations = data['mutations']
|
2546
|
-
|
2546
|
+
# Skip generation - to be filled by lineage_format
|
2547
|
+
# entry.generation = data['generation']
|
2547
2548
|
entry.aa_seq = data['aa_seq']
|
2548
2549
|
entry.dna_seq = data['dna_seq']
|
2549
2550
|
entry.confidence = data['confidence']
|
@@ -2571,7 +2572,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
|
|
2571
2572
|
'enzyme_id': entry.enzyme_id,
|
2572
2573
|
'parent_enzyme_id': entry.parent_id or '',
|
2573
2574
|
'mutations': entry.mutations or '',
|
2574
|
-
'generation':
|
2575
|
+
'generation': '', # Empty generation - to be filled by lineage_format
|
2575
2576
|
'campaign_id': entry.campaign_id or '',
|
2576
2577
|
'protein_sequence': entry.aa_seq or '',
|
2577
2578
|
'nucleotide_sequence': entry.dna_seq or '',
|
@@ -0,0 +1,16 @@
|
|
1
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
+
debase/_version.py,sha256=sJMwhIVyUE0G4qRHUUpEgw2beNe5jCSb9uQVOTV6krw,49
|
4
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
+
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
6
|
+
debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
|
7
|
+
debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
|
8
|
+
debase/reaction_info_extractor.py,sha256=8ilu5o2FbXTV9R1Nhxd4m4TdgHOd6GsC3rxxHvqu9f4,165555
|
9
|
+
debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
|
10
|
+
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
+
debase-0.5.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
+
debase-0.5.0.dist-info/METADATA,sha256=2Csgtf4gF8egVAvq8CsY4jpad2yWw_6c1iuOj55L5n8,4047
|
13
|
+
debase-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
+
debase-0.5.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
+
debase-0.5.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
+
debase-0.5.0.dist-info/RECORD,,
|
debase-0.4.4.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=Vtl1u7rFItRnkcTvBiUypIltuuzta9Uy3PxMO2NgNgc,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
|
6
|
-
debase/enzyme_lineage_extractor.py,sha256=jWyDRfOY792zjY5SZCvhNfQxVcEOC1JjTGb9Wo2qZ4I,170543
|
7
|
-
debase/lineage_format.py,sha256=ch5kyoUqD_4Hj7K0hJrRbKrN_FysqFrFXgbyDIgp2oA,57515
|
8
|
-
debase/reaction_info_extractor.py,sha256=Gv1qgzInNWxdaEJdsWGlgyy5syL2qClVoKHFQpR_6q0,158498
|
9
|
-
debase/substrate_scope_extractor.py,sha256=7JyTE3CiIQVDDetwfENCoiq5bLnHElsY3Db1ThVLEBE,115884
|
10
|
-
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
-
debase-0.4.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
-
debase-0.4.4.dist-info/METADATA,sha256=Gwx754a5Zr_0yp-HXQuRRLylgEp0hD15MhhMjSOVMHo,4047
|
13
|
-
debase-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
-
debase-0.4.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
-
debase-0.4.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
-
debase-0.4.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|