debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/wrapper.py CHANGED
@@ -46,101 +46,333 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
46
46
  """
47
47
  Step 2: Clean and validate protein sequences
48
48
  Calls: cleanup_sequence.py
49
+ Returns output path even if cleanup fails (copies input file)
49
50
  """
50
51
  logger.info(f"Cleaning sequences from {input_csv.name}")
51
52
 
52
- from .cleanup_sequence import main as cleanup_sequences
53
- cleanup_sequences([str(input_csv), str(output_csv)])
54
-
55
- logger.info(f"Sequence cleanup complete: {output_csv}")
56
- return output_csv
53
+ try:
54
+ from .cleanup_sequence import main as cleanup_sequences
55
+ cleanup_sequences([str(input_csv), str(output_csv)])
56
+
57
+ logger.info(f"Sequence cleanup complete: {output_csv}")
58
+ return output_csv
59
+
60
+ except Exception as e:
61
+ logger.warning(f"Sequence cleanup failed: {e}")
62
+ logger.info("Copying original file to continue pipeline...")
63
+
64
+ # Copy the input file as-is to continue pipeline
65
+ import shutil
66
+ shutil.copy2(input_csv, output_csv)
67
+
68
+ logger.info(f"Original file copied: {output_csv}")
69
+ return output_csv
57
70
 
58
71
 
59
72
  def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
60
73
  """
61
74
  Step 3a: Extract reaction performance metrics
62
75
  Calls: reaction_info_extractor.py
76
+ Returns output path even if extraction fails (creates empty file)
63
77
  """
64
78
  logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
65
79
 
66
- from .reaction_info_extractor import ReactionExtractor, Config
67
- import pandas as pd
68
-
69
- # Load enzyme data
70
- enzyme_df = pd.read_csv(lineage_csv)
71
-
72
- # Initialize extractor and run
73
- cfg = Config()
74
- extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
75
- df_metrics = extractor.run(enzyme_df)
76
-
77
- # Save results
78
- df_metrics.to_csv(output, index=False)
79
- logger.info(f"Reaction extraction complete: {output}")
80
- return output
80
+ try:
81
+ from .reaction_info_extractor import ReactionExtractor, Config
82
+ import pandas as pd
83
+
84
+ # Load enzyme data
85
+ enzyme_df = pd.read_csv(lineage_csv)
86
+
87
+ # Initialize extractor and run
88
+ cfg = Config()
89
+ extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
90
+ df_metrics = extractor.run(enzyme_df)
91
+
92
+ # Save results
93
+ df_metrics.to_csv(output, index=False)
94
+ logger.info(f"Reaction extraction complete: {output}")
95
+ return output
96
+
97
+ except Exception as e:
98
+ logger.warning(f"Reaction extraction failed: {e}")
99
+ logger.info("Creating empty reaction info file to continue pipeline...")
100
+
101
+ # Create empty reaction CSV with basic columns
102
+ import pandas as pd
103
+ empty_df = pd.DataFrame(columns=[
104
+ 'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
105
+ 'conversion_percent', 'reaction_type', 'reaction_conditions', 'notes'
106
+ ])
107
+ empty_df.to_csv(output, index=False)
108
+
109
+ logger.info(f"Empty reaction file created: {output}")
110
+ return output
81
111
 
82
112
 
83
113
  def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
84
114
  """
85
115
  Step 3b: Extract substrate scope data (runs in parallel with reaction extraction)
86
116
  Calls: substrate_scope_extractor.py
117
+ Returns output path even if extraction fails (creates empty file)
87
118
  """
88
119
  logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
89
120
 
90
- from .substrate_scope_extractor import run_pipeline
121
+ try:
122
+ from .substrate_scope_extractor import run_pipeline
123
+
124
+ # Run substrate scope extraction
125
+ run_pipeline(
126
+ manuscript=manuscript,
127
+ si=si,
128
+ lineage_csv=lineage_csv,
129
+ output_csv=output,
130
+ debug_dir=debug_dir
131
+ )
132
+
133
+ logger.info(f"Substrate scope extraction complete: {output}")
134
+ return output
135
+
136
+ except Exception as e:
137
+ logger.warning(f"Substrate scope extraction failed: {e}")
138
+ logger.info("Creating empty substrate scope file to continue pipeline...")
139
+
140
+ # Create empty substrate scope CSV with proper headers
141
+ import pandas as pd
142
+ empty_df = pd.DataFrame(columns=[
143
+ 'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
144
+ 'conversion_percent', 'selectivity', 'reaction_conditions', 'notes'
145
+ ])
146
+ empty_df.to_csv(output, index=False)
147
+
148
+ logger.info(f"Empty substrate scope file created: {output}")
149
+ return output
150
+
151
+
152
+ def match_enzyme_variants_with_gemini(lineage_enzymes: list, data_enzymes: list, model=None) -> dict:
153
+ """
154
+ Use Gemini to match enzyme variant IDs between different datasets.
155
+ Returns a mapping of data_enzyme_id -> lineage_enzyme_id.
156
+ """
157
+ import json
91
158
 
92
- # Run substrate scope extraction
93
- run_pipeline(
94
- manuscript=manuscript,
95
- si=si,
96
- lineage_csv=lineage_csv,
97
- output_csv=output,
98
- debug_dir=debug_dir
99
- )
159
+ if not model:
160
+ try:
161
+ from .enzyme_lineage_extractor import get_model
162
+ model = get_model()
163
+ except:
164
+ logger.warning("Could not load Gemini model for variant matching")
165
+ return {}
100
166
 
101
- logger.info(f"Substrate scope extraction complete: {output}")
102
- return output
167
+ prompt = f"""Match enzyme variant IDs between two lists from the same scientific paper.
168
+
169
+ These lists come from different sections or analyses of the same study, but may use different naming conventions.
170
+
171
+ List 1 (from lineage/sequence data):
172
+ {json.dumps(lineage_enzymes)}
173
+
174
+ List 2 (from experimental data):
175
+ {json.dumps(data_enzymes)}
176
+
177
+ Analyze the patterns and match variants that refer to the same enzyme.
178
+ Return ONLY a JSON object mapping IDs from List 2 to their corresponding IDs in List 1.
179
+ Format: {{"list2_id": "list1_id", ...}}
180
+ Only include matches you are confident about based on the naming patterns.
181
+ """
182
+
183
+ try:
184
+ response = model.generate_content(prompt)
185
+ mapping_text = response.text.strip()
186
+
187
+ # Extract JSON from response
188
+ if '```json' in mapping_text:
189
+ mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
190
+ elif '```' in mapping_text:
191
+ mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
192
+
193
+ mapping = json.loads(mapping_text)
194
+ logger.info(f"Gemini matched {len(mapping)} enzyme variants")
195
+ for k, v in mapping.items():
196
+ logger.info(f" Matched '{k}' -> '{v}'")
197
+ return mapping
198
+ except Exception as e:
199
+ logger.warning(f"Failed to match variants with Gemini: {e}")
200
+ return {}
103
201
 
104
202
 
105
203
  def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
106
204
  """
107
205
  Step 4: Format and merge all data into final CSV
108
- Calls: lineage_format.py
206
+ Creates comprehensive format merging all available data, even if some extraction steps failed
109
207
  """
110
208
  logger.info(f"Formatting and merging data into final output")
111
209
 
112
- from .lineage_format import run_pipeline
113
- import pandas as pd
114
-
115
- # First, we need to merge the protein sequences into the reaction data
116
- df_reaction = pd.read_csv(reaction_csv)
117
- df_sequences = pd.read_csv(cleaned_csv)
118
-
119
- # Merge sequences into reaction data
120
- # Include generation and parent info for proper mutation calculation
121
- sequence_cols = ['protein_sequence', 'dna_seq', 'seq_confidence', 'truncated', 'flag',
122
- 'generation', 'parent_enzyme_id', 'mutations']
123
- sequence_data = df_sequences[['enzyme_id'] + [col for col in sequence_cols if col in df_sequences.columns]]
124
-
125
- # Merge on enzyme_id or variant_id
126
- if 'enzyme_id' in df_reaction.columns:
127
- df_reaction = df_reaction.merge(sequence_data, on='enzyme_id', how='left', suffixes=('', '_seq'))
128
- elif 'enzyme' in df_reaction.columns:
129
- sequence_data = sequence_data.rename(columns={'enzyme_id': 'enzyme'})
130
- df_reaction = df_reaction.merge(sequence_data, on='enzyme', how='left', suffixes=('', '_seq'))
131
-
132
- # Save the merged reaction data
133
- df_reaction.to_csv(reaction_csv, index=False)
134
-
135
- # Run the formatting pipeline
136
- df_final = run_pipeline(
137
- reaction_csv=reaction_csv,
138
- substrate_scope_csv=substrate_scope_csv,
139
- output_csv=output_csv
140
- )
141
-
142
- logger.info(f"Final formatting complete: {output_csv}")
143
- return output_csv
210
+ try:
211
+ import pandas as pd
212
+
213
+ # Read all available data files
214
+ logger.info("Reading enzyme lineage data...")
215
+ df_lineage = pd.read_csv(cleaned_csv)
216
+
217
+ logger.info("Reading reaction data...")
218
+ try:
219
+ df_reaction = pd.read_csv(reaction_csv)
220
+ has_reaction_data = len(df_reaction) > 0 and not df_reaction.empty
221
+ except:
222
+ df_reaction = pd.DataFrame()
223
+ has_reaction_data = False
224
+
225
+ logger.info("Reading substrate scope data...")
226
+ try:
227
+ df_scope = pd.read_csv(substrate_scope_csv)
228
+ has_scope_data = len(df_scope) > 0 and not df_scope.empty
229
+ except:
230
+ df_scope = pd.DataFrame()
231
+ has_scope_data = False
232
+
233
+ # Start with lineage data as base
234
+ df_final = df_lineage.copy()
235
+
236
+ # Ensure consistent enzyme ID column
237
+ if 'variant_id' in df_final.columns and 'enzyme_id' not in df_final.columns:
238
+ df_final = df_final.rename(columns={'variant_id': 'enzyme_id'})
239
+
240
+ # Merge reaction data if available
241
+ if has_reaction_data:
242
+ logger.info(f"Merging reaction data ({len(df_reaction)} records)")
243
+ # Match on enzyme_id or enzyme
244
+ merge_key = 'enzyme_id' if 'enzyme_id' in df_reaction.columns else 'enzyme'
245
+ if merge_key in df_reaction.columns:
246
+ df_final = df_final.merge(df_reaction, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_reaction'))
247
+ else:
248
+ logger.info("No reaction data available")
249
+
250
+ # Merge substrate scope data if available
251
+ if has_scope_data:
252
+ logger.info(f"Merging substrate scope data ({len(df_scope)} records)")
253
+ merge_key = 'enzyme_id' if 'enzyme_id' in df_scope.columns else 'enzyme'
254
+
255
+ if merge_key in df_scope.columns:
256
+ # First try direct merge
257
+ df_test_merge = df_final.merge(df_scope, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
258
+
259
+ # Check if any matches were found
260
+ matched_count = df_test_merge[merge_key + '_scope'].notna().sum() if merge_key + '_scope' in df_test_merge.columns else 0
261
+
262
+ if matched_count == 0:
263
+ logger.info("No direct matches found, using Gemini to match enzyme variants...")
264
+
265
+ # Get unique enzyme IDs from both datasets
266
+ lineage_enzymes = df_final['enzyme_id'].dropna().unique().tolist()
267
+ scope_enzymes = df_scope[merge_key].dropna().unique().tolist()
268
+
269
+ # Get mapping from Gemini
270
+ mapping = match_enzyme_variants_with_gemini(lineage_enzymes, scope_enzymes)
271
+
272
+ if mapping:
273
+ # Apply mapping to scope data
274
+ df_scope_mapped = df_scope.copy()
275
+ df_scope_mapped[merge_key] = df_scope_mapped[merge_key].map(lambda x: mapping.get(x, x))
276
+ df_final = df_final.merge(df_scope_mapped, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
277
+ else:
278
+ logger.warning("Could not match enzyme variants between datasets")
279
+ df_final = df_test_merge
280
+ else:
281
+ df_final = df_test_merge
282
+ logger.info(f"Direct merge matched {matched_count} records")
283
+ else:
284
+ logger.info("No substrate scope data available")
285
+
286
+ # Add comprehensive column structure for missing data
287
+ essential_columns = [
288
+ 'enzyme_id', 'parent_id', 'generation', 'mutations', 'campaign_id', 'notes',
289
+ 'aa_seq', 'dna_seq', 'seq_confidence', 'truncated', 'seq_source', 'doi',
290
+ 'substrate_list', 'substrate_iupac_list', 'product_list', 'product_iupac_list',
291
+ 'cofactor_list', 'cofactor_iupac_list', 'yield', 'ee', 'ttn',
292
+ 'reaction_temperature', 'reaction_ph', 'reaction_buffer', 'reaction_other_conditions',
293
+ 'data_location'
294
+ ]
295
+
296
+ # Add missing columns with NaN
297
+ for col in essential_columns:
298
+ if col not in df_final.columns:
299
+ df_final[col] = None
300
+
301
+ # Clean up duplicate columns from merging
302
+ columns_to_keep = []
303
+ seen_base_names = set()
304
+ for col in df_final.columns:
305
+ base_name = col.split('_reaction')[0].split('_scope')[0]
306
+ if base_name not in seen_base_names:
307
+ columns_to_keep.append(col)
308
+ seen_base_names.add(base_name)
309
+ elif col.endswith('_scope') or col.endswith('_reaction'):
310
+ # Prefer scope or reaction data over base lineage data for certain columns
311
+ if base_name in ['substrate_list', 'product_list', 'yield', 'ee', 'reaction_temperature']:
312
+ columns_to_keep.append(col)
313
+ # Remove the base column if it exists
314
+ if base_name in columns_to_keep:
315
+ columns_to_keep.remove(base_name)
316
+ seen_base_names.add(base_name)
317
+
318
+ df_final = df_final[columns_to_keep]
319
+
320
+ # Rename merged columns back to standard names
321
+ rename_map = {}
322
+ for col in df_final.columns:
323
+ if col.endswith('_scope') or col.endswith('_reaction'):
324
+ base_name = col.split('_scope')[0].split('_reaction')[0]
325
+ rename_map[col] = base_name
326
+ df_final = df_final.rename(columns=rename_map)
327
+
328
+ # Save the comprehensive final output
329
+ df_final.to_csv(output_csv, index=False)
330
+
331
+ logger.info(f"Final comprehensive format complete: {output_csv}")
332
+ logger.info(f"Final output contains {len(df_final)} variants with {len(df_final.columns)} data columns")
333
+
334
+ # Log what data was successfully merged
335
+ if has_reaction_data:
336
+ logger.info("✓ Reaction performance data merged")
337
+ if has_scope_data:
338
+ logger.info("✓ Substrate scope data merged")
339
+
340
+ # Now run the actual lineage format to produce plate-based format
341
+ logger.info("\nRunning lineage format to produce plate-based output...")
342
+ try:
343
+ from .lineage_format import flatten_dataframe
344
+
345
+ # Create the plate-based output filename
346
+ plate_output = output_csv.parent / (output_csv.stem + "_plate_format.csv")
347
+
348
+ # Flatten the dataframe to plate format
349
+ df_flattened = flatten_dataframe(df_final)
350
+
351
+ # Save the flattened output
352
+ df_flattened.to_csv(plate_output, index=False)
353
+
354
+ logger.info(f"✓ Plate-based format saved to: {plate_output}")
355
+ logger.info(f" Contains {len(df_flattened)} rows with plate/well assignments")
356
+
357
+ # Update the final output path to be the plate format
358
+ output_csv = plate_output
359
+
360
+ except Exception as e:
361
+ logger.warning(f"Could not generate plate-based format: {e}")
362
+ logger.info("Comprehensive format will be used as final output")
363
+
364
+ return output_csv
365
+
366
+ except Exception as e:
367
+ logger.warning(f"Final formatting failed: {e}")
368
+ logger.info("Using cleaned sequence data as final output...")
369
+
370
+ # Copy the cleaned CSV as the final output
371
+ import shutil
372
+ shutil.copy2(cleaned_csv, output_csv)
373
+
374
+ logger.info(f"Cleaned sequence file used as final output: {output_csv}")
375
+ return output_csv
144
376
 
145
377
 
146
378
  def run_pipeline(
@@ -206,7 +438,7 @@ def run_pipeline(
206
438
 
207
439
  # Step 4: Format and merge
208
440
  logger.info("\n[Step 4/5] Formatting and merging data...")
209
- run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
441
+ final_output = run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
210
442
 
211
443
  # Step 5: Finalize
212
444
  logger.info("\n[Step 5/5] Finalizing...")
@@ -219,11 +451,13 @@ def run_pipeline(
219
451
 
220
452
  logger.info("\n" + "="*60)
221
453
  logger.info("PIPELINE COMPLETED SUCCESSFULLY")
222
- logger.info(f"Output: {output_path}")
454
+ logger.info(f"Comprehensive output: {output_path}")
455
+ if final_output != output_path:
456
+ logger.info(f"Plate-based output: {final_output}")
223
457
  logger.info(f"Runtime: {elapsed:.1f} seconds")
224
458
  logger.info("="*60)
225
459
 
226
- return output_path
460
+ return final_output
227
461
 
228
462
  except Exception as e:
229
463
  logger.error(f"Pipeline failed: {str(e)}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.11
3
+ Version: 0.1.17
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,17 @@
1
+ debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
+ debase/_version.py,sha256=edeF0ciTSBytkIGNcNjx3UR4nAs3QzF_Lmmyr66k0Jc,50
5
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
+ debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
+ debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
8
+ debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
9
+ debase/reaction_info_extractor.py,sha256=NjOXZf22i3PvYpCgk9DCnswCbgmCQkj5V2-E21LEM6M,112876
10
+ debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
11
+ debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
12
+ debase-0.1.17.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.17.dist-info/METADATA,sha256=uCGXpNG7dIVZtpywd8V7kBcXuWHPyTjhJmH0mWKD7Ew,10790
14
+ debase-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.17.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.17.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.17.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
5
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
- debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
8
- debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
- debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
10
- debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
- debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
- debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
14
- debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.11.dist-info/RECORD,,