debase-0.4.3-py3-none-any.whl → debase-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +977 -97
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
debase/wrapper.py
CHANGED
@@ -75,6 +75,48 @@ def reset_token_usage():
         module_data['output'] = 0
         module_data['calls'] = 0

+def save_token_usage_to_csv(manuscript_path: Path, input_tokens: int, output_tokens: int, cost: float, runtime: float, output_dir: Path):
+    """Save token usage and cost to CSV with naming format: price_manuscriptname.csv"""
+    import pandas as pd
+
+    # Create filename: price_manuscriptname.csv
+    manuscript_name = manuscript_path.stem.replace(' ', '_').replace('-', '_')
+    csv_filename = f"price_{manuscript_name}.csv"
+    csv_path = output_dir / csv_filename
+
+    # Prepare the data
+    data = {
+        'manuscript_name': [manuscript_name],
+        'timestamp': [datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
+        'input_tokens': [input_tokens],
+        'output_tokens': [output_tokens],
+        'total_tokens': [input_tokens + output_tokens],
+        'estimated_cost_usd': [cost],
+        'runtime_seconds': [runtime]
+    }
+
+    # Add module breakdown
+    with _token_lock:
+        for module_name, usage in _token_usage['calls_by_module'].items():
+            if usage['calls'] > 0:
+                data[f'{module_name}_calls'] = [usage['calls']]
+                data[f'{module_name}_input_tokens'] = [usage['input']]
+                data[f'{module_name}_output_tokens'] = [usage['output']]
+                module_cost = (usage['input'] / 1_000_000) * 0.30 + (usage['output'] / 1_000_000) * 2.50
+                data[f'{module_name}_cost_usd'] = [module_cost]
+            else:
+                data[f'{module_name}_calls'] = [0]
+                data[f'{module_name}_input_tokens'] = [0]
+                data[f'{module_name}_output_tokens'] = [0]
+                data[f'{module_name}_cost_usd'] = [0.0]
+
+    # Create DataFrame and save
+    df = pd.DataFrame(data)
+    df.to_csv(csv_path, index=False)
+
+    logger.info(f"Token usage saved to: {csv_path}")
+    return csv_path
+

 def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
     """
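The new `save_token_usage_to_csv` helper prices each module's usage at fixed per-million-token rates ($0.30 input, $2.50 output, hardcoded above). A minimal worked example of that arithmetic, with made-up token counts:

```python
# Worked example of the module cost formula from save_token_usage_to_csv.
# Rates are the ones hardcoded in the diff; the token counts are invented.
input_tokens, output_tokens = 1_000_000, 100_000
cost = (input_tokens / 1_000_000) * 0.30 + (output_tokens / 1_000_000) * 2.50
print(f"estimated cost: ${cost:.2f}")  # estimated cost: $0.55
```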
@@ -277,175 +319,104 @@ Only include matches you are confident about based on the naming patterns.
 def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
     """
     Step 4: Format and merge all data into final CSV
-
+    Uses lineage_format module to normalize data, convert IUPAC to SMILES, fill missing sequences,
+    and create the plate format output
     """
-    logger.info(f"Formatting and merging data into final output")
+    logger.info(f"Formatting and merging data into final plate format output")

     try:
+        from . import lineage_format
         import pandas as pd

-        #
-
-
+        # Check which files have data
+        has_reaction_data = False
+        has_scope_data = False

-        logger.info("Reading reaction data...")
         try:
             df_reaction = pd.read_csv(reaction_csv)
-            has_reaction_data = len(df_reaction) > 0
-
-
-
-
-        logger.info("Reading substrate scope data...")
+            has_reaction_data = len(df_reaction) > 0
+            logger.info(f"Reaction data has {len(df_reaction)} entries")
+        except Exception as e:
+            logger.info(f"No reaction data available: {e}")
+
         try:
             df_scope = pd.read_csv(substrate_scope_csv)
-            has_scope_data = len(df_scope) > 0
-
-
-
-
-        #
-
-
-        #
-
-
-
-        #
-        if has_reaction_data:
-            logger.
-
-
-
-
-
-
+            has_scope_data = len(df_scope) > 0
+            logger.info(f"Substrate scope data has {len(df_scope)} entries")
+        except Exception as e:
+            logger.info(f"No substrate scope data available: {e}")
+
+        # Use lineage_format's run_pipeline to process the data
+        logger.info("Running lineage format pipeline to create plate format...")
+
+        # The lineage_format expects string paths
+        reaction_path = str(reaction_csv) if has_reaction_data else None
+        scope_path = str(substrate_scope_csv) if has_scope_data else None
+
+        # If neither file has data, just copy the cleaned file
+        if not has_reaction_data and not has_scope_data:
+            logger.warning("No data to process in either reaction or substrate scope files")
+            import shutil
+            shutil.copy2(cleaned_csv, output_csv)
+            return output_csv
+
+        # Call lineage_format's run_pipeline function
+        # This will handle all the processing including:
+        # - Merging reaction and substrate scope data
+        # - Filling missing sequences
+        # - Converting IUPAC names to SMILES
+        # - Creating the flattened plate format
+        logger.info("Calling lineage_format.run_pipeline...")
+
+        # Run the pipeline and get the formatted dataframe
+        df = lineage_format.run_pipeline(
+            reaction_csv=reaction_path,
+            substrate_scope_csv=scope_path,
+            output_csv=str(output_csv)
+        )

-
-
-
-        merge_key = 'enzyme_id' if 'enzyme_id' in df_scope.columns else 'enzyme'
-
-        if merge_key in df_scope.columns:
-            # First try direct merge
-            df_test_merge = df_final.merge(df_scope, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
-
-            # Check if any matches were found
-            matched_count = df_test_merge[merge_key + '_scope'].notna().sum() if merge_key + '_scope' in df_test_merge.columns else 0
-
-            if matched_count == 0:
-                logger.info("No direct matches found, using Gemini to match enzyme variants...")
-
-                # Get unique enzyme IDs from both datasets
-                lineage_enzymes = df_final['enzyme_id'].dropna().unique().tolist()
-                scope_enzymes = df_scope[merge_key].dropna().unique().tolist()
-
-                # Get mapping from Gemini
-                mapping = match_enzyme_variants_with_gemini(lineage_enzymes, scope_enzymes)
-
-                if mapping:
-                    # Apply mapping to scope data
-                    df_scope_mapped = df_scope.copy()
-                    df_scope_mapped[merge_key] = df_scope_mapped[merge_key].map(lambda x: mapping.get(x, x))
-                    df_final = df_final.merge(df_scope_mapped, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
-                else:
-                    logger.warning("Could not match enzyme variants between datasets")
-                    df_final = df_test_merge
-            else:
-                df_final = df_test_merge
-                logger.info(f"Direct merge matched {matched_count} records")
-        else:
-            logger.info("No substrate scope data available")
-
-        # Add comprehensive column structure for missing data
-        essential_columns = [
-            'enzyme_id', 'parent_id', 'generation', 'mutations', 'campaign_id', 'notes',
-            'aa_seq', 'dna_seq', 'seq_confidence', 'truncated', 'seq_source', 'doi',
-            'substrate_list', 'substrate_iupac_list', 'product_list', 'product_iupac_list',
-            'cofactor_list', 'cofactor_iupac_list', 'yield', 'ee', 'ttn',
-            'reaction_temperature', 'reaction_ph', 'reaction_buffer', 'reaction_other_conditions',
-            'data_location'
-        ]
+        logger.info(f"Lineage format pipeline completed successfully")
+        logger.info(f"Final output saved to: {output_csv}")
+        logger.info(f"Output contains {len(df)} rows in plate format (flattened)")

-        #
-
-
-
-
-        # Clean up duplicate columns from merging
-        columns_to_keep = []
-        seen_base_names = set()
-        for col in df_final.columns:
-            base_name = col.split('_reaction')[0].split('_scope')[0]
-            if base_name not in seen_base_names:
-                columns_to_keep.append(col)
-                seen_base_names.add(base_name)
-            elif col.endswith('_scope') or col.endswith('_reaction'):
-                # Prefer scope or reaction data over base lineage data for certain columns
-                if base_name in ['substrate_list', 'product_list', 'yield', 'ee', 'reaction_temperature']:
-                    columns_to_keep.append(col)
-                    # Remove the base column if it exists
-                    if base_name in columns_to_keep:
-                        columns_to_keep.remove(base_name)
-                    seen_base_names.add(base_name)
-
-        df_final = df_final[columns_to_keep]
-
-        # Rename merged columns back to standard names
-        rename_map = {}
-        for col in df_final.columns:
-            if col.endswith('_scope') or col.endswith('_reaction'):
-                base_name = col.split('_scope')[0].split('_reaction')[0]
-                rename_map[col] = base_name
-        df_final = df_final.rename(columns=rename_map)
-
-        # Save the comprehensive final output
-        df_final.to_csv(output_csv, index=False)
-
-        logger.info(f"Final comprehensive format complete: {output_csv}")
-        logger.info(f"Final output contains {len(df_final)} variants with {len(df_final.columns)} data columns")
-
-        # Log what data was successfully merged
-        if has_reaction_data:
-            logger.info("✓ Reaction performance data merged")
-        if has_scope_data:
-            logger.info("✓ Substrate scope data merged")
-
-        # Now run the actual lineage format to produce plate-based format
-        logger.info("\nRunning lineage format to produce plate-based output...")
-        try:
-            from .lineage_format import flatten_dataframe
-
-            # Create the plate-based output filename
-            plate_output = output_csv.parent / (output_csv.stem + "_plate_format.csv")
-
-            # Flatten the dataframe to plate format
-            df_flattened = flatten_dataframe(df_final)
-
-            # Save the flattened output
-            df_flattened.to_csv(plate_output, index=False)
-
-            logger.info(f"✓ Plate-based format saved to: {plate_output}")
-            logger.info(f"  Contains {len(df_flattened)} rows with plate/well assignments")
-
-            # Update the final output path to be the plate format
-            output_csv = plate_output
-
-        except Exception as e:
-            logger.warning(f"Could not generate plate-based format: {e}")
-            logger.info("Comprehensive format will be used as final output")
+        # Log column summary
+        key_columns = ['enzyme_id', 'substrate', 'product', 'yield', 'ee', 'ttn',
+                       'substrate_smiles', 'product_smiles', 'protein_sequence']
+        available_columns = [col for col in key_columns if col in df.columns]
+        logger.info(f"Key columns in output: {', '.join(available_columns)}")

         return output_csv

     except Exception as e:
-        logger.warning(f"
-        logger.info("
+        logger.warning(f"Lineage formatting failed: {e}")
+        logger.info("Falling back to simple concatenation...")

-        #
-        import
-
+        # Fallback to simple concatenation
+        import pandas as pd
+        dfs = []
+
+        try:
+            df_reaction = pd.read_csv(reaction_csv)
+            if len(df_reaction) > 0:
+                dfs.append(df_reaction)
+        except:
+            pass
+
+        try:
+            df_scope = pd.read_csv(substrate_scope_csv)
+            if len(df_scope) > 0:
+                dfs.append(df_scope)
+        except:
+            pass
+
+        if dfs:
+            df_final = pd.concat(dfs, ignore_index=True)
+            df_final.to_csv(output_csv, index=False)
+        else:
+            import shutil
+            shutil.copy2(cleaned_csv, output_csv)

-        logger.info(f"
+        logger.info(f"Fallback output saved to: {output_csv}")
         return output_csv

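The rewritten `run_lineage_format` drops roughly 120 lines of ad-hoc merging, Gemini-based variant matching, and column cleanup in favor of a single delegation to `lineage_format.run_pipeline`, which now owns the merging, sequence filling, IUPAC-to-SMILES conversion, and plate-format flattening. A minimal sketch of calling it directly, mirroring the keyword arguments shown above (the file names are placeholders):

```python
# Sketch: invoking lineage_format.run_pipeline standalone, with the same
# keyword arguments the wrapper now passes. File names are placeholders.
from debase import lineage_format

df = lineage_format.run_pipeline(
    reaction_csv="3a_reaction_info.csv",           # or None when there is no reaction data
    substrate_scope_csv="3b_substrate_scope.csv",  # or None when there is no scope data
    output_csv="final_output.csv",
)
print(f"{len(df)} rows in plate format")
```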
@@ -478,6 +449,26 @@ def run_pipeline(
     reaction_csv = output_dir / "3a_reaction_info.csv"
     substrate_csv = output_dir / "3b_substrate_scope.csv"

+    # Setup file logging
+    log_file = output_dir / f"debase_pipeline_{time.strftime('%Y%m%d_%H%M%S')}.log"
+
+    # Configure logging to both file and console
+    file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
+    file_handler.setLevel(logging.DEBUG)
+    file_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')
+    file_handler.setFormatter(file_formatter)
+
+    # Add file handler to root logger and all module loggers
+    root_logger = logging.getLogger()
+    root_logger.addHandler(file_handler)
+
+    # Also add to module-specific loggers
+    for module_name in ['debase.enzyme_lineage_extractor', 'debase.cleanup_sequence',
+                        'debase.reaction_info_extractor', 'debase.substrate_scope_extractor',
+                        'debase.lineage_format', 'debase.wrapper']:
+        module_logger = logging.getLogger(module_name)
+        module_logger.addHandler(file_handler)
+
     try:
         # Reset token usage tracking for this pipeline run
         reset_token_usage()
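With the handler wiring above, each pipeline run also writes a timestamped `debase_pipeline_*.log` into the output directory. A self-contained sketch of the same formatter, pointed at stderr instead of a file so it can be run on its own:

```python
# Sketch: the formatter from the diff above, attached to a StreamHandler
# instead of a FileHandler so the example is self-contained.
import logging

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s'))

log = logging.getLogger('debase.wrapper')
log.addHandler(handler)
log.setLevel(logging.INFO)
log.info('Pipeline started')
# -> 2025-07-01 12:00:00,000 [INFO] debase.wrapper: Pipeline started
```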
@@ -487,6 +478,7 @@ def run_pipeline(
         logger.info(f"Manuscript: {manuscript_path}")
         logger.info(f"SI: {si_path if si_path else 'None'}")
         logger.info(f"Output: {output_path}")
+        logger.info(f"Log file: {log_file}")
         logger.info("="*60)

         start_time = time.time()
@@ -529,6 +521,9 @@ def run_pipeline(
         # Calculate token usage and estimated costs
         total_input_tokens, total_output_tokens, estimated_cost = calculate_token_usage_and_cost()

+        # Save token usage to CSV file
+        save_token_usage_to_csv(manuscript_path, total_input_tokens, total_output_tokens, estimated_cost, elapsed, output_dir)
+
         logger.info("\n" + "="*60)
         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
         logger.info(f"Comprehensive output: {output_path}")
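Each run therefore leaves a `price_<manuscript>.csv` beside the other outputs. A sketch of inspecting it (the file name is hypothetical; the columns are the ones defined in `save_token_usage_to_csv`):

```python
# Sketch: reading the price CSV written by save_token_usage_to_csv.
# The file name is hypothetical; the columns come from the diff above.
import pandas as pd

df = pd.read_csv("price_my_manuscript.csv")
print(df[["input_tokens", "output_tokens", "total_tokens",
          "estimated_cost_usd", "runtime_seconds"]].iloc[0])
```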
@@ -563,6 +558,15 @@ def run_pipeline(
     except Exception as e:
         logger.error(f"Pipeline failed: {str(e)}")
         raise
+    finally:
+        # Clean up file handler
+        file_handler.close()
+        root_logger.removeHandler(file_handler)
+        for module_name in ['debase.enzyme_lineage_extractor', 'debase.cleanup_sequence',
+                            'debase.reaction_info_extractor', 'debase.substrate_scope_extractor',
+                            'debase.lineage_format', 'debase.wrapper']:
+            module_logger = logging.getLogger(module_name)
+            module_logger.removeHandler(file_handler)


 def main():
debase-0.4.4.dist-info/METADATA
ADDED
@@ -0,0 +1,121 @@
+Metadata-Version: 2.4
+Name: debase
+Version: 0.4.4
+Summary: Enzyme lineage analysis and sequence extraction package
+Home-page: https://github.com/YuemingLong/DEBase
+Author: DEBase Team
+Author-email: DEBase Team <ylong@caltech.edu>
+License: MIT
+Project-URL: Homepage, https://github.com/YuemingLong/DEBase
+Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
+Project-URL: Repository, https://github.com/YuemingLong/DEBase
+Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Chemistry
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=1.0.0
+Requires-Dist: PyMuPDF>=1.18.0
+Requires-Dist: numpy>=1.19.0
+Requires-Dist: google-generativeai>=0.3.0
+Requires-Dist: biopython>=1.78
+Requires-Dist: requests>=2.25.0
+Requires-Dist: httpx>=0.24.0
+Requires-Dist: tqdm>=4.60.0
+Requires-Dist: openpyxl>=3.0.0
+Requires-Dist: PyPDF2>=2.0.0
+Requires-Dist: Pillow>=8.0.0
+Requires-Dist: networkx>=2.5
+Provides-Extra: rdkit
+Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: sphinx>=4.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+
+# DEBase
+
+DEBase is a Python package for extracting and analyzing enzyme lineage data from scientific papers using AI-powered parsing.
+
+## Features
+
+- Extract enzyme variant lineages from PDF documents
+- Parse protein and DNA sequences with mutation annotations
+- Extract reaction performance metrics (yield, TTN, ee)
+- Extract and organize substrate scope data
+- Match enzyme variants across different data sources using AI
+- Generate structured CSV outputs for downstream analysis
+
+## Installation
+
+```bash
+pip install debase
+```
+
+## Quick Start
+
+```bash
+# Run the complete pipeline
+debase --manuscript paper.pdf --si supplementary.pdf --output results.csv
+
+# Enable debug mode to save Gemini prompts and responses
+debase --manuscript paper.pdf --si supplementary.pdf --output results.csv --debug-dir ./debug_output
+
+# Individual components with debugging
+python -m debase.enzyme_lineage_extractor --manuscript paper.pdf --output lineage.csv --debug-dir ./debug_output
+python -m debase.reaction_info_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output reactions.csv --debug-dir ./debug_output
+python -m debase.substrate_scope_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output substrate_scope.csv --debug-dir ./debug_output
+python -m debase.lineage_format -r reactions.csv -s substrate_scope.csv -o final.csv -v
+```
+
+## Debugging
+
+Use the `--debug-dir` flag to save all Gemini API prompts and responses for debugging:
+- Location extraction prompts
+- Sequence extraction prompts (can be very large, up to 150K characters)
+- Enzyme matching prompts
+- All API responses with timestamps
+- Note: lineage_format.py uses `-v` for verbose output instead of `--debug-dir`
+
+## Requirements
+
+- Python 3.8+
+- Google Gemini API key (set as GEMINI_API_KEY environment variable)
+
+## Version
+
+0.4.4
+
+## License
+
+MIT License
+
+## Authors
+
+DEBase Team - Caltech
+
+## Contact
+
+ylong@caltech.edu
debase-0.4.4.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=Vtl1u7rFItRnkcTvBiUypIltuuzta9Uy3PxMO2NgNgc,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
+debase/enzyme_lineage_extractor.py,sha256=jWyDRfOY792zjY5SZCvhNfQxVcEOC1JjTGb9Wo2qZ4I,170543
+debase/lineage_format.py,sha256=ch5kyoUqD_4Hj7K0hJrRbKrN_FysqFrFXgbyDIgp2oA,57515
+debase/reaction_info_extractor.py,sha256=Gv1qgzInNWxdaEJdsWGlgyy5syL2qClVoKHFQpR_6q0,158498
+debase/substrate_scope_extractor.py,sha256=7JyTE3CiIQVDDetwfENCoiq5bLnHElsY3Db1ThVLEBE,115884
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.4.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.4.4.dist-info/METADATA,sha256=Gwx754a5Zr_0yp-HXQuRRLylgEp0hD15MhhMjSOVMHo,4047
+debase-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.4.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.4.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.4.4.dist-info/RECORD,,