boltznet 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boltznet-0.3.0/LICENSE +16 -0
- boltznet-0.3.0/PKG-INFO +115 -0
- boltznet-0.3.0/README.md +92 -0
- boltznet-0.3.0/pyproject.toml +45 -0
- boltznet-0.3.0/setup.cfg +4 -0
- boltznet-0.3.0/src/boltznet/__init__.py +3 -0
- boltznet-0.3.0/src/boltznet/_internal/CONFIG.py +51 -0
- boltznet-0.3.0/src/boltznet/_internal/aln.py +1323 -0
- boltznet-0.3.0/src/boltznet/_internal/base_data_type.py +107 -0
- boltznet-0.3.0/src/boltznet/_internal/browser.py +538 -0
- boltznet-0.3.0/src/boltznet/_internal/browser_util.py +344 -0
- boltznet-0.3.0/src/boltznet/_internal/coverage.py +96 -0
- boltznet-0.3.0/src/boltznet/_internal/factory.py +63 -0
- boltznet-0.3.0/src/boltznet/_internal/fileDataType.py +332 -0
- boltznet-0.3.0/src/boltznet/_internal/genomes.py +304 -0
- boltznet-0.3.0/src/boltznet/_internal/model_create.py +1711 -0
- boltznet-0.3.0/src/boltznet/_internal/model_disect.py +1190 -0
- boltznet-0.3.0/src/boltznet/_internal/model_predict.py +2254 -0
- boltznet-0.3.0/src/boltznet/_internal/model_predict_create.py +263 -0
- boltznet-0.3.0/src/boltznet/_internal/model_predict_models.py +132 -0
- boltznet-0.3.0/src/boltznet/_internal/sequence.py +467 -0
- boltznet-0.3.0/src/boltznet/_internal/tf_util.py +2160 -0
- boltznet-0.3.0/src/boltznet/_internal/util.py +732 -0
- boltznet-0.3.0/src/boltznet/boltznet_tf.py +328 -0
- boltznet-0.3.0/src/boltznet/selftest.py +32 -0
- boltznet-0.3.0/src/boltznet/test_module.py +14 -0
- boltznet-0.3.0/src/boltznet/testdata/__init__.py +0 -0
- boltznet-0.3.0/src/boltznet/testdata/ecoli.gff +6447 -0
- boltznet-0.3.0/src/boltznet/testdata/promoters.fa +1185 -0
- boltznet-0.3.0/src/boltznet.egg-info/PKG-INFO +115 -0
- boltznet-0.3.0/src/boltznet.egg-info/SOURCES.txt +33 -0
- boltznet-0.3.0/src/boltznet.egg-info/dependency_links.txt +1 -0
- boltznet-0.3.0/src/boltznet.egg-info/entry_points.txt +3 -0
- boltznet-0.3.0/src/boltznet.egg-info/requires.txt +14 -0
- boltznet-0.3.0/src/boltznet.egg-info/top_level.txt +1 -0
boltznet-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 James Galagan
|
|
4
|
+
|
|
5
|
+
You are free to:
|
|
6
|
+
- Share — copy and redistribute the material in any medium or format
|
|
7
|
+
- Adapt — remix, transform, and build upon the material
|
|
8
|
+
- Use — for research and academic purposes
|
|
9
|
+
|
|
10
|
+
Under the following terms:
|
|
11
|
+
- Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made.
|
|
12
|
+
- NonCommercial — You may not use the material for commercial purposes without prior written permission.
|
|
13
|
+
|
|
14
|
+
Full license text: https://creativecommons.org/licenses/by-nc/4.0/
|
|
15
|
+
|
|
16
|
+
For commercial licensing inquiries, please contact: you@example.com
|
boltznet-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: boltznet
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: BoltzNet
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: numpy<2,>=1.26
|
|
9
|
+
Requires-Dist: pandas<3,>=1.5
|
|
10
|
+
Requires-Dist: tensorflow<3,>=2.15
|
|
11
|
+
Requires-Dist: logomaker
|
|
12
|
+
Requires-Dist: biopython
|
|
13
|
+
Requires-Dist: dna-features-viewer
|
|
14
|
+
Requires-Dist: seaborn
|
|
15
|
+
Requires-Dist: scipy
|
|
16
|
+
Requires-Dist: scikit-learn
|
|
17
|
+
Requires-Dist: setproctitle
|
|
18
|
+
Requires-Dist: bokeh
|
|
19
|
+
Requires-Dist: bcbio-gff
|
|
20
|
+
Requires-Dist: keras<3,>2.12.0
|
|
21
|
+
Requires-Dist: platformdirs
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# boltznet
|
|
25
|
+
|
|
26
|
+
BoltzNet is a biophysically designed neural network that learns a quantitative model of TF-DNA binding energy from ChIP-Seq data. BoltzNet mirrors a quantitative biophysical model and provides directly interpretable predictions genome-wide at nucleotide resolution. We have performed ChIP-Seq mapping of genome-wide DNA binding for 139 E. coli TFs. From these data we have generated BoltzNet models for 124 TFs.
|
|
27
|
+
|
|
28
|
+
The BoltzNet models are described in our [publication](https://rdcu.be/ek2Sq) and through the companion [website](https://boltznet.bu.edu):
|
|
29
|
+
|
|
30
|
+
https://boltznet.bu.edu
|
|
31
|
+
|
|
32
|
+
This python package provides a high-level object interface for downloading pretrained models, running predictions on DNA sequences, and visualizing results.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Create a conda environment and activate it. Then run:
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
pip install boltznet
|
|
40
|
+
boltznet-init
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
This installs the package and downloads available models to the package cache dir. To perform selftests, run
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
boltznet-selftest
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
This builds a model on all TFs, performs predictions on a set of E. coli promoter sequences, and then generates and saves a plot as selftest_pdhR,pdhR-aceE-aceF-lpd.png. It basically runs the example code in Usage below.
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
## USAGE
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
from boltznet import boltznet_tf
|
|
56
|
+
|
|
57
|
+
####################################
|
|
58
|
+
# create a tfmodel on all TFs that have been loaded into the package cache
|
|
59
|
+
####################################
|
|
60
|
+
tfmodel=boltznet_tf.create()
|
|
61
|
+
|
|
62
|
+
####################################################
|
|
63
|
+
# load sequences from fasta file and run predictions
|
|
64
|
+
# Returns a np.array of predictions at each position on both
|
|
65
|
+
# strands of each sequence for all TFs
|
|
66
|
+
#
|
|
67
|
+
# The numpy array has shape:
|
|
68
|
+
# (nseqs,2,seqlen,numtfs)
|
|
69
|
+
# - nseqs: number of sequences
|
|
70
|
+
# - 2: forward and reverse strands
|
|
71
|
+
# - seqlen: length of each sequence
|
|
72
|
+
# - numtfs: number of models
|
|
73
|
+
####################################################
|
|
74
|
+
fa_name='teset.fa'
|
|
75
|
+
y=tfmodel(fastafile=fa_name)
|
|
76
|
+
|
|
77
|
+
####################################################
|
|
78
|
+
# load annotations for the sequences for plotting
|
|
79
|
+
####################################################
|
|
80
|
+
gff_name='ecoli.gff'
|
|
81
|
+
tfmodel.loadGff(gff_name)
|
|
82
|
+
|
|
83
|
+
####################################################
|
|
84
|
+
# Plot the predictions for sequences by sequence index or sequence name patterns
|
|
85
|
+
# Below will plot sequence number 76 as well as any sequences that
|
|
86
|
+
# contain chaC or pdhR in the name. But will not plot the same sequence twice
|
|
87
|
+
# If savefilename is None, generate plots in a window
|
|
88
|
+
# If savefilename is given, generate plots named savefilename_<seqid>.png
|
|
89
|
+
####################################################
|
|
90
|
+
tfmodel.plotPrediction(inds=[76],seqnames=['chaC','pdhR'],model_names=None,seqlogo=False,baseseq=False, maxN=3, savefilename='test')
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Test data
|
|
95
|
+
|
|
96
|
+
The package comes bundled with two datafiles that can be used for testing:
|
|
97
|
+
- promoters.fa: a fasta file with a small subset of promoters (see https://boltznet.bu.edu/ecoli/promoters)
|
|
98
|
+
- ecoli.gff: a gff file with annotations of genes and known binding sites
|
|
99
|
+
|
|
100
|
+
You can retrieve and use these data files with code like the following:
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
from importlib import resources
|
|
104
|
+
import boltznet.testdata as testdata_pkg
|
|
105
|
+
|
|
106
|
+
fa_name=resources.files(testdata_pkg).joinpath('promoters.fa')
|
|
107
|
+
|
|
108
|
+
gff_name=resources.files(testdata_pkg).joinpath('ecoli.gff')
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Citation
|
|
112
|
+
|
|
113
|
+
The code for BoltzNet is freely available for academic use. BoltzNet can be used by molecular biologists seeking to quantitatively predict TF binding, by synthetic biologists seeking to predictively engineer new regulatory interactions, and by computational biologists seeking to develop biophysically motivated bioinformatic tools.
|
|
114
|
+
|
|
115
|
+
- Lally, Patrick, Gómez-Romero, Laura, Tierrafría, Víctor H., Aquino, Patricia, Rioualen, Claire, Zhang, Xiaoman, Kim, Sunyoung, Baniulyte, Gabriele, Plitnick, Jonathan, Smith, Carol, Babu, Mohan, Collado-Vides, Julio, Wade, Joseph, Galagan, James E. (2025) Predictive Biophysical Neural Network Modeling of a Compendium of in vivo Transcription Factor DNA Binding Profiles for Escherichia coli. Nature Communications
|
boltznet-0.3.0/README.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# boltznet
|
|
2
|
+
|
|
3
|
+
BoltzNet is a biophysically designed neural network that learns a quantitative model of TF-DNA binding energy from ChIP-Seq data. BoltzNet mirrors a quantitative biophysical model and provides directly interpretable predictions genome-wide at nucleotide resolution. We have performed ChIP-Seq mapping of genome-wide DNA binding for 139 E. coli TFs. From these data we have generated BoltzNet models for 124 TFs.
|
|
4
|
+
|
|
5
|
+
The BoltzNet models are described in our [publication](https://rdcu.be/ek2Sq) and through the companion [website](https://boltznet.bu.edu):
|
|
6
|
+
|
|
7
|
+
https://boltznet.bu.edu
|
|
8
|
+
|
|
9
|
+
This python package provides a high-level object interface for downloading pretrained models, running predictions on DNA sequences, and visualizing results.
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
Create a conda environment and activate it. Then run:
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
pip install boltznet
|
|
17
|
+
boltznet-init
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
This installs the package and downloads available models to the package cache dir. To perform selftests, run
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
boltznet-selftest
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
This builds a model on all TFs, performs predictions on a set of E. coli promoter sequences, and then generates and saves a plot as selftest_pdhR,pdhR-aceE-aceF-lpd.png. It basically runs the example code in Usage below.
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
## USAGE
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
from boltznet import boltznet_tf
|
|
33
|
+
|
|
34
|
+
####################################
|
|
35
|
+
# create a tfmodel on all TFs that have been loaded into the package cache
|
|
36
|
+
####################################
|
|
37
|
+
tfmodel=boltznet_tf.create()
|
|
38
|
+
|
|
39
|
+
####################################################
|
|
40
|
+
# load sequences from fasta file and run predictions
|
|
41
|
+
# Returns a np.array of predictions at each position on both
|
|
42
|
+
# strands of each sequence for all TFs
|
|
43
|
+
#
|
|
44
|
+
# The numpy array has shape:
|
|
45
|
+
# (nseqs,2,seqlen,numtfs)
|
|
46
|
+
# - nseqs: number of sequences
|
|
47
|
+
# - 2: forward and reverse strands
|
|
48
|
+
# - seqlen: length of each sequence
|
|
49
|
+
# - numtfs: number of models
|
|
50
|
+
####################################################
|
|
51
|
+
fa_name='teset.fa'
|
|
52
|
+
y=tfmodel(fastafile=fa_name)
|
|
53
|
+
|
|
54
|
+
####################################################
|
|
55
|
+
# load annotations for the sequences for plotting
|
|
56
|
+
####################################################
|
|
57
|
+
gff_name='ecoli.gff'
|
|
58
|
+
tfmodel.loadGff(gff_name)
|
|
59
|
+
|
|
60
|
+
####################################################
|
|
61
|
+
# Plot the predictions for sequences by sequence index or sequence name patterns
|
|
62
|
+
# Below will plot sequence number 76 as well as any sequences that
|
|
63
|
+
# contain chaC or pdhR in the name. But will not plot the same sequence twice
|
|
64
|
+
# If savefilename is None, generate plots in a window
|
|
65
|
+
# If savefilename is given, generate plots named savefilename_<seqid>.png
|
|
66
|
+
####################################################
|
|
67
|
+
tfmodel.plotPrediction(inds=[76],seqnames=['chaC','pdhR'],model_names=None,seqlogo=False,baseseq=False, maxN=3, savefilename='test')
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Test data
|
|
72
|
+
|
|
73
|
+
The package comes bundled with two datafiles that can be used for testing:
|
|
74
|
+
- promoters.fa: a fasta file with a small subset of promoters (see https://boltznet.bu.edu/ecoli/promoters)
|
|
75
|
+
- ecoli.gff: a gff file with annotations of genes and known binding sites
|
|
76
|
+
|
|
77
|
+
You can retrieve and use these data files with code like the following:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
from importlib import resources
|
|
81
|
+
import boltznet.testdata as testdata_pkg
|
|
82
|
+
|
|
83
|
+
fa_name=resources.files(testdata_pkg).joinpath('promoters.fa')
|
|
84
|
+
|
|
85
|
+
gff_name=resources.files(testdata_pkg).joinpath('ecoli.gff')
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Citation
|
|
89
|
+
|
|
90
|
+
The code for BoltzNet is freely available for academic use. BoltzNet can be used by molecular biologists seeking to quantitatively predict TF binding, by synthetic biologists seeking to predictively engineer new regulatory interactions, and by computational biologists seeking to develop biophysically motivated bioinformatic tools.
|
|
91
|
+
|
|
92
|
+
- Lally, Patrick, Gómez-Romero, Laura, Tierrafría, Víctor H., Aquino, Patricia, Rioualen, Claire, Zhang, Xiaoman, Kim, Sunyoung, Baniulyte, Gabriele, Plitnick, Jonathan, Smith, Carol, Babu, Mohan, Collado-Vides, Julio, Wade, Joseph, Galagan, James E. (2025) Predictive Biophysical Neural Network Modeling of a Compendium of in vivo Transcription Factor DNA Binding Profiles for Escherichia coli. Nature Communications
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
[build-system]
|
|
3
|
+
requires = ["setuptools>=68", "wheel"]
|
|
4
|
+
build-backend = "setuptools.build_meta"
|
|
5
|
+
|
|
6
|
+
[project]
|
|
7
|
+
name = "boltznet"
|
|
8
|
+
version = "0.3.0"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
description = "BoltzNet"
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy>=1.26,<2", # lib-style
|
|
14
|
+
"pandas>=1.5,<3",
|
|
15
|
+
"tensorflow>=2.15,<3",
|
|
16
|
+
"logomaker",
|
|
17
|
+
"biopython",
|
|
18
|
+
'dna-features-viewer',
|
|
19
|
+
'seaborn',
|
|
20
|
+
'scipy',
|
|
21
|
+
'scikit-learn',
|
|
22
|
+
'setproctitle',
|
|
23
|
+
'bokeh',
|
|
24
|
+
'bcbio-gff',
|
|
25
|
+
'keras>2.12.0,<3',
|
|
26
|
+
'platformdirs'
|
|
27
|
+
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[tool.setuptools]
|
|
31
|
+
package-dir = {"" = "src"}
|
|
32
|
+
include-package-data = true # if you ship any data files
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.packages.find]
|
|
35
|
+
where = ["src"]
|
|
36
|
+
include = ["boltznet", "boltznet.*"]
|
|
37
|
+
|
|
38
|
+
# Tell setuptools to include your test data files in the wheel
|
|
39
|
+
[tool.setuptools.package-data]
|
|
40
|
+
boltznet = ["testdata/*"]
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
boltznet-init = "boltznet.boltznet_tf:__download_all_models__" # downloads index.json
|
|
44
|
+
boltznet-selftest = "boltznet.selftest:main" # runs a selftest
|
|
45
|
+
|
boltznet-0.3.0/setup.cfg
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Configuration file that can be modified for each user
|
|
5
|
+
Primarily to indicate locations of files etc
|
|
6
|
+
|
|
7
|
+
@author: jgalag
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# SERVER LOCATIONS
|
|
11
|
+
|
|
12
|
+
server_data_root = '/Volumes/eng_research_galagan/seqdata/projects/ecoli'
|
|
13
|
+
'''Mounted server location of seqdata/projects/ecoli'''
|
|
14
|
+
server_projects_root = '/Volumes/eng_research_galagan/seqdata/projects/'
|
|
15
|
+
'''Mounted server location of seqdata/projects/'''
|
|
16
|
+
|
|
17
|
+
server_meme_root = '%s/downstream_files/analysis/meme' % server_data_root
|
|
18
|
+
'''Mounted server location of meme analysis directory'''
|
|
19
|
+
|
|
20
|
+
boltznet_comparison_server='/Volumes/eng_research_galagan/seqdata/projects/boltzNetComparisons/'
|
|
21
|
+
|
|
22
|
+
new_regulon_dir='/Volumes/eng_research_galagan/seqdata/projects/ecoli/downstream_files/analysis/known_motifs/Regulon-v12-confirmed'
|
|
23
|
+
'''Location of regulonDB meme file'''
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# LOCAL PLACES
|
|
27
|
+
|
|
28
|
+
model_dir = '/Users/jgalag/Dropbox/Python/CNNModels/models'
|
|
29
|
+
'''Location for models'''
|
|
30
|
+
|
|
31
|
+
model_json_dir = '%s/json'%model_dir
|
|
32
|
+
'''Location for model json files'''
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
bokeh_temp_dir = '/Users/jgalag/Dropbox/Python/CNNModels/bokeh_temp'
|
|
36
|
+
'''Temp dir for browsers'''
|
|
37
|
+
|
|
38
|
+
browser_dir = '/Users/jgalag/Dropbox/Python/CNNModels/browsers'
|
|
39
|
+
'''Standard location for TF genome browsers and model browsers'''
|
|
40
|
+
|
|
41
|
+
data_dir = '/Users/jgalag/Dropbox/Python/CNNModels/source/datafiles'
|
|
42
|
+
'''Location of datafiles including genome files and geneview files'''
|
|
43
|
+
|
|
44
|
+
# figpath='/Users/jgalag/Dropbox/Projects/Papers/Ecoli Papers/Model Paper/resub'
|
|
45
|
+
figpath='/Users/jgalag/Dropbox/Projects/Papers/Ecoli Papers/Boltznet/NatureFormat/Nat Comm Files/Final Submission/figures'
|
|
46
|
+
|
|
47
|
+
file_dir = '/Users/jgalag/Dropbox/Python/CNNModels/input_files'
|
|
48
|
+
'''Location of Files for import and export'''
|
|
49
|
+
|
|
50
|
+
source_dir ='/Users/jgalag/Dropbox/Python/CNNModels/source'
|
|
51
|
+
'''Location of source folder'''
|