debase 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.0"
3
+ __version__ = "0.1.1"
debase/wrapper.py CHANGED
@@ -35,9 +35,7 @@ def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir:
35
35
  """
36
36
  logger.info(f"Extracting enzyme lineage from {manuscript.name}")
37
37
 
38
- import sys
39
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
40
- from src.debase.enzyme_lineage_extractor import run_pipeline
38
+ from .enzyme_lineage_extractor import run_pipeline
41
39
  run_pipeline(manuscript=manuscript, si=si, output_csv=output, debug_dir=debug_dir)
42
40
 
43
41
  logger.info(f"Lineage extraction complete: {output}")
@@ -51,7 +49,7 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
51
49
  """
52
50
  logger.info(f"Cleaning sequences from {input_csv.name}")
53
51
 
54
- from src.debase.cleanup_sequence import main as cleanup_sequences
52
+ from .cleanup_sequence import main as cleanup_sequences
55
53
  cleanup_sequences([str(input_csv), str(output_csv)])
56
54
 
57
55
  logger.info(f"Sequence cleanup complete: {output_csv}")
@@ -65,7 +63,7 @@ def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, outpu
65
63
  """
66
64
  logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
67
65
 
68
- from src.debase.reaction_info_extractor import ReactionExtractor, Config
66
+ from .reaction_info_extractor import ReactionExtractor, Config
69
67
  import pandas as pd
70
68
 
71
69
  # Load enzyme data
@@ -89,7 +87,7 @@ def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path
89
87
  """
90
88
  logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
91
89
 
92
- from src.debase.substrate_scope_extractor import run_pipeline
90
+ from .substrate_scope_extractor import run_pipeline
93
91
 
94
92
  # Run substrate scope extraction
95
93
  run_pipeline(
@@ -111,7 +109,7 @@ def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_cs
111
109
  """
112
110
  logger.info(f"Formatting and merging data into final output")
113
111
 
114
- from src.debase.lineage_format import run_pipeline
112
+ from .lineage_format import run_pipeline
115
113
  import pandas as pd
116
114
 
117
115
  # First, we need to merge the protein sequences into the reaction data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,16 @@
1
- debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
1
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
2
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=HnfC_TWAA2mfjIbkXT0ipZEqElS5wLaMzSj1DkE1F88,49
3
+ debase/_version.py,sha256=f_aADPF4S4TQJIdnkbAgxIqnOWgZS6TJ3X9EDBZt_OM,49
5
4
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
5
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
6
  debase/enzyme_lineage_extractor.py,sha256=1GcgHA-lQPRf9-bNDlvQIP8p-KsP3D2WhIuOtCVJ_ME,87276
8
7
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
8
  debase/reaction_info_extractor.py,sha256=euw-4NHFuOPxpF99PJxTMLYYG0WryBDUCpoANB-SPPM,109655
10
9
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
- debase/wrapper.py,sha256=UlUBxxIXBnVtSIT9lZXkQeImlCABiUuof1CVZNKv9N4,10482
12
- debase-0.1.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.0.dist-info/METADATA,sha256=3s1NGPGYOb2bbP5PD5OoWBcJ7UeZ2OTQiOQ-SE5uqoM,11509
14
- debase-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.0.dist-info/RECORD,,
10
+ debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
11
+ debase-0.1.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
12
+ debase-0.1.1.dist-info/METADATA,sha256=GI8WvSNVIllw_ZKLqlhy-rqtVHBun3ZG1hahEvO_BMo,11509
13
+ debase-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ debase-0.1.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
15
+ debase-0.1.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
16
+ debase-0.1.1.dist-info/RECORD,,
debase/PIPELINE_FLOW.md DELETED
@@ -1,100 +0,0 @@
1
- # DEBase Pipeline Flow
2
-
3
- ## Overview
4
- The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
5
-
6
- ## Pipeline Architecture
7
-
8
- ```
9
- ┌─────────────────────┐ ┌─────────────────────┐
10
- │ Manuscript PDF │ │ SI PDF │
11
- └──────────┬──────────┘ └──────────┬──────────┘
12
- │ │
13
- └───────────┬───────────────┘
14
-
15
-
16
- ┌─────────────────────────────┐
17
- │ 1. enzyme_lineage_extractor │
18
- │ - Extract enzyme variants │
19
- │ - Parse mutations │
20
- │ - Get basic metadata │
21
- └─────────────┬───────────────┘
22
-
23
-
24
- ┌─────────────────────────────┐
25
- │ 2. cleanup_sequence │
26
- │ - Validate sequences │
27
- │ - Fix formatting issues │
28
- │ - Generate full sequences │
29
- └─────────────┬───────────────┘
30
-
31
- ┌───────────┴───────────────┐
32
- │ │
33
- ▼ ▼
34
- ┌─────────────────────────┐ ┌─────────────────────────┐
35
- │ 3a. reaction_info │ │ 3b. substrate_scope │
36
- │ _extractor │ │ _extractor │
37
- │ - Performance metrics │ │ - Substrate variations │
38
- │ - Model reaction │ │ - Additional variants │
39
- │ - Conditions │ │ - Scope data │
40
- └───────────┬─────────────┘ └───────────┬─────────────┘
41
- │ │
42
- └───────────┬───────────────┘
43
-
44
-
45
- ┌─────────────────────────────┐
46
- │ 4. lineage_format_o3 │
47
- │ - Merge all data │
48
- │ - Fill missing sequences │
49
- │ - Format final output │
50
- └─────────────┬───────────────┘
51
-
52
-
53
- ┌─────────────┐
54
- │ Final CSV │
55
- └─────────────┘
56
- ```
57
-
58
- ## Module Details
59
-
60
- ### 1. enzyme_lineage_extractor.py
61
- - **Input**: Manuscript PDF, SI PDF
62
- - **Output**: CSV with enzyme variants and mutations
63
- - **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
64
-
65
- ### 2. cleanup_sequence.py
66
- - **Input**: Enzyme lineage CSV
67
- - **Output**: CSV with validated sequences
68
- - **Function**: Validates protein sequences, generates full sequences from mutations
69
-
70
- ### 3a. reaction_info_extractor.py
71
- - **Input**: PDFs + cleaned enzyme CSV
72
- - **Output**: CSV with reaction performance data
73
- - **Function**: Extracts yield, TTN, selectivity, reaction conditions
74
-
75
- ### 3b. substrate_scope_extractor.py
76
- - **Input**: PDFs + cleaned enzyme CSV
77
- - **Output**: CSV with substrate scope entries
78
- - **Function**: Extracts substrate variations tested with different enzymes
79
-
80
- ### 4. lineage_format_o3.py
81
- - **Input**: Reaction CSV + Substrate scope CSV
82
- - **Output**: Final formatted CSV
83
- - **Function**: Merges data, fills missing sequences, applies consistent formatting
84
-
85
- ## Key Features
86
-
87
- 1. **Modular Design**: Each step can be run independently
88
- 2. **Parallel Extraction**: Steps 3a and 3b run independently
89
- 3. **Error Recovery**: Pipeline can resume from any step
90
- 4. **Clean Interfaces**: Each module has well-defined inputs/outputs
91
-
92
- ## Usage
93
-
94
- ```bash
95
- # Full pipeline
96
- python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
97
-
98
- # With intermediate files kept for debugging
99
- python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
100
- ```
File without changes