yamcot 1.0.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yamcot/__init__.py +46 -0
- yamcot/_core/__init__.py +17 -0
- yamcot/_core/_core.cpython-311-darwin.so +0 -0
- yamcot/_core/bindings.cpp +28 -0
- yamcot/_core/core_functions.h +29 -0
- yamcot/_core/fasta_to_plain.h +182 -0
- yamcot/_core/mco_prc.cpp +1476 -0
- yamcot/_core/pfm_to_pwm.h +130 -0
- yamcot/cli.py +621 -0
- yamcot/comparison.py +1066 -0
- yamcot/execute.py +97 -0
- yamcot/functions.py +787 -0
- yamcot/io.py +522 -0
- yamcot/models.py +1161 -0
- yamcot/pipeline.py +402 -0
- yamcot/ragged.py +126 -0
- yamcot-1.0.0.dist-info/METADATA +433 -0
- yamcot-1.0.0.dist-info/RECORD +21 -0
- yamcot-1.0.0.dist-info/WHEEL +6 -0
- yamcot-1.0.0.dist-info/entry_points.txt +3 -0
- yamcot-1.0.0.dist-info/licenses/LICENSE +21 -0
yamcot/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""
|
|
2
|
+
unimotifcomparator
|
|
3
|
+
==================
|
|
4
|
+
|
|
5
|
+
This package provides a modular and extensible framework for de‑novo motif
|
|
6
|
+
discovery, motif comparison and performance evaluation in ChIP‑seq‑like
|
|
7
|
+
applications. The design emphasises object‑oriented abstractions and clear
|
|
8
|
+
interfaces so that new motif discovery tools, motif comparison strategies
|
|
9
|
+
and evaluation metrics can be added with minimal effort. All code is
|
|
10
|
+
intended for educational and fundamental research use on open, anonymised
|
|
11
|
+
datasets only and must not be applied to sensitive or identifiable data.
|
|
12
|
+
|
|
13
|
+
The top level modules expose the following key components:
|
|
14
|
+
|
|
15
|
+
``io``
|
|
16
|
+
Functions for reading and writing biological sequences in FASTA format
|
|
17
|
+
and motifs in MEME format.
|
|
18
|
+
|
|
19
|
+
``models``
|
|
20
|
+
Motif model classes. A generic :class:`MotifModel` describes the common
|
|
21
|
+
behaviour of sequence motifs, while concrete subclasses such as
|
|
22
|
+
:class:`PWMMotif` implement specific scoring logic.
|
|
23
|
+
|
|
24
|
+
``discovery``
|
|
25
|
+
Base classes and concrete implementations of motif discovery tools.
|
|
26
|
+
|
|
27
|
+
``comparison``
|
|
28
|
+
Base classes and concrete implementations of motif comparison metrics.
|
|
29
|
+
|
|
30
|
+
``evaluation``
|
|
31
|
+
Classes for computing ROC/PR curves and their associated summary
|
|
32
|
+
statistics.
|
|
33
|
+
|
|
34
|
+
``pipeline``
|
|
35
|
+
High level orchestrators that perform bootstrapping, odd/even
|
|
36
|
+
cross‑validation, motif comparison and final motif selection.
|
|
37
|
+
|
|
38
|
+
``cli``
|
|
39
|
+
An example command line interface exposing the pipeline to end users.
|
|
40
|
+
|
|
41
|
+
This modular structure makes it straightforward to integrate alternative
|
|
42
|
+
motif models (e.g. BaMM or SiteGA), comparison methods (e.g. Jaccard
|
|
43
|
+
index, overlap coefficient or recognition‑function correlation) and
|
|
44
|
+
performance metrics. See individual modules for detailed API
|
|
45
|
+
documentation.
|
|
46
|
+
"""
|
yamcot/_core/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Bind the compiled C++ backend.
#
# ``run_motali_cpp`` is always defined after this module is imported: it is the
# extension's function when the import succeeds, otherwise a stub that raises
# ImportError at call time (so importing the package itself never fails here).
try:
    # The compiled extension is a submodule of this package.
    from . import _core

    run_motali_cpp = _core.run_motali_cpp
except (ImportError, AttributeError) as e:
    # Keep the failure for the stub below; ``e`` itself is unbound once the
    # except clause ends.
    _import_error = e
    _core = None

    def run_motali_cpp(*args, **kwargs):
        """Stand-in used when the compiled extension is unavailable.

        Accepts and ignores any arguments.

        Raises:
            ImportError: always; the exception that prevented loading the
                extension is attached as ``__cause__`` and quoted in the
                message.
        """
        raise ImportError(
            "The C++ extension '_core' is not installed or could not be loaded. "
            "Please ensure the package was built correctly. "
            f"Original error: {_import_error}"
        ) from _import_error
|
|
Binary file
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#include <nanobind/nanobind.h>
|
|
2
|
+
#include <nanobind/stl/string.h>
|
|
3
|
+
|
|
4
|
+
#include "core_functions.h"
|
|
5
|
+
|
|
6
|
+
namespace nb = nanobind;
|
|
7
|
+
|
|
8
|
+
// Python extension module `_core`: publishes run_motali_cpp (declared in
// core_functions.h) under the same name, with every parameter addressable by
// keyword.
NB_MODULE(_core, m) {
    using namespace nb::literals;  // "name"_a is shorthand for nb::arg("name")

    m.doc() = "C++ backend for yamcot";

    // Keyword names follow the positional order of the C prototype.
    // NOTE(review): the ninth keyword is "threshold" while core_functions.h
    // names that parameter `pvalue` - confirm which meaning is intended.
    m.def("run_motali_cpp", &run_motali_cpp,
          "file_fasta"_a,
          "type_model_1"_a, "type_model_2"_a,
          "file_model_1"_a, "file_model_2"_a,
          "file_table_1"_a, "file_table_2"_a,
          "shift"_a,
          "threshold"_a,
          "file_hist"_a, "yes_out_hist"_a,
          "file_prc"_a, "yes_out_prc"_a,
          "file_short_over"_a, "file_short_all"_a,
          "file_sta_long"_a);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#ifdef __cplusplus
|
|
4
|
+
extern "C" {
|
|
5
|
+
#endif
|
|
6
|
+
|
|
7
|
+
/**
 * Main computational entry point of the compiled backend.
 *
 * Only the prototype is visible in this header; the notes below are derived
 * from the parameter names and types and should be confirmed against the
 * implementation.
 *
 * @param file_fasta       C string; presumably the path of the input FASTA file.
 * @param type_model_1     C string; presumably the kind of the first motif model.
 * @param type_model_2     C string; presumably the kind of the second motif model.
 * @param file_model_1     C string; presumably the path of the first model.
 * @param file_model_2     C string; presumably the path of the second model.
 * @param file_table_1     C string; presumably a table file for the first model.
 * @param file_table_2     C string; presumably a table file for the second model.
 * @param shift            Integer shift setting.
 * @param pvalue           Floating-point setting (the Python binding exposes
 *                         this position under the keyword "threshold").
 * @param file_hist        C string; presumably the histogram output path.
 * @param yes_out_hist     Integer flag; presumably enables histogram output.
 * @param file_prc         C string; presumably the PR-curve output path.
 * @param yes_out_prc      Integer flag; presumably enables PR-curve output.
 * @param file_short_over  C string; presumably an output path.
 * @param file_short_all   C string; presumably an output path.
 * @param file_sta_long    C string; presumably an output path.
 * @return Integer status; the meaning of individual values is not specified
 *         in this header.
 */
int run_motali_cpp(const char *file_fasta,
                   const char *type_model_1,
                   const char *type_model_2,
                   const char *file_model_1,
                   const char *file_model_2,
                   const char *file_table_1,
                   const char *file_table_2,
                   int shift,
                   double pvalue,
                   const char *file_hist,
                   int yes_out_hist,
                   const char *file_prc,
                   int yes_out_prc,
                   const char *file_short_over,
                   const char *file_short_all,
                   const char *file_sta_long);
|
|
26
|
+
|
|
27
|
+
#ifdef __cplusplus
|
|
28
|
+
}
|
|
29
|
+
#endif
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/*
 * Convert one ASCII upper-case letter to lower case.
 *
 * Characters outside 'A'..'Z' are returned unchanged. The previous test
 * (code < 97) shifted every character below 'a' by 32, so digits, punctuation
 * and control characters were rewritten as well (e.g. '1' became 'Q').
 *
 * x: character to convert.
 * Returns the lower-case letter for 'A'..'Z', otherwise x itself.
 */
char TransStr(char x)
{
    if (x >= 'A' && x <= 'Z')
    {
        x = (char)(x + ('a' - 'A'));
    }
    return x;
}
|
|
7
|
+
int fasta_to_plain_genome(char* file_in_fasta, int motif_len_min, int& all_pos, int &nseq, int &len)
|
|
8
|
+
{
|
|
9
|
+
char l[SEQLEN], d[SEQLEN], head[400];
|
|
10
|
+
int fl = 0;
|
|
11
|
+
FILE* in;
|
|
12
|
+
|
|
13
|
+
if ((in = fopen(file_in_fasta, "rt")) == NULL)
|
|
14
|
+
{
|
|
15
|
+
printf("Input file %s can't be opened\n", file_in_fasta);
|
|
16
|
+
return -1;
|
|
17
|
+
}
|
|
18
|
+
char symbol = fgetc(in);
|
|
19
|
+
rewind(in);
|
|
20
|
+
all_pos = nseq = len = 0;
|
|
21
|
+
while (nseq >= 0)
|
|
22
|
+
{
|
|
23
|
+
if (fgets(l, sizeof(l), in) == NULL) fl = -1;
|
|
24
|
+
if (*l == '\n' && fl != -1)continue;
|
|
25
|
+
if (((*l == symbol) || (fl == -1)) && (fl != 0))
|
|
26
|
+
{
|
|
27
|
+
int lenx = (int)strlen(d);
|
|
28
|
+
if (lenx > len)len = lenx;
|
|
29
|
+
all_pos += (lenx-motif_len_min);
|
|
30
|
+
nseq++;
|
|
31
|
+
// int check = CheckStr(file, d, n, 1, outlog);
|
|
32
|
+
//if (check != -1)
|
|
33
|
+
if (fl == -1)
|
|
34
|
+
{
|
|
35
|
+
fclose(in);
|
|
36
|
+
break;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
if (*l == symbol)
|
|
40
|
+
{
|
|
41
|
+
memset(head, 0, sizeof(head));
|
|
42
|
+
DelChar(l, '\n');
|
|
43
|
+
strcpy(head, l);
|
|
44
|
+
fl = 0; continue;
|
|
45
|
+
}
|
|
46
|
+
if (fl == 0)
|
|
47
|
+
{
|
|
48
|
+
memset(d, 0, sizeof(d));
|
|
49
|
+
DelChar(l, '\n');
|
|
50
|
+
strcpy(d, l);
|
|
51
|
+
fl = 1; continue;
|
|
52
|
+
}
|
|
53
|
+
if (strlen(d) + strlen(l) > sizeof(d))
|
|
54
|
+
{
|
|
55
|
+
printf("Size is large...");
|
|
56
|
+
printf("l:%s\nstrlen(l):%zu\n", l, strlen(l));
|
|
57
|
+
printf("d:%s\nstrlen(d):%zu\n", d, strlen(d));
|
|
58
|
+
exit(1);
|
|
59
|
+
}
|
|
60
|
+
DelChar(l, '\n');
|
|
61
|
+
strcat(d, l);
|
|
62
|
+
}
|
|
63
|
+
return 1;
|
|
64
|
+
}
|
|
65
|
+
int fasta_to_plain0(char *file_in_fasta, int &length_fasta_max, int &nseq_fasta)
|
|
66
|
+
{
|
|
67
|
+
char head[200];
|
|
68
|
+
FILE *in;
|
|
69
|
+
|
|
70
|
+
if((in=fopen(file_in_fasta,"rt"))==NULL)
|
|
71
|
+
{
|
|
72
|
+
printf("Input file %s can't be opened\n",file_in_fasta);
|
|
73
|
+
return -1;
|
|
74
|
+
}
|
|
75
|
+
char c, symbol = '>';
|
|
76
|
+
nseq_fasta=0;
|
|
77
|
+
int len=0;
|
|
78
|
+
//double sum_len=0;
|
|
79
|
+
int fl=1;
|
|
80
|
+
length_fasta_max=0;
|
|
81
|
+
do
|
|
82
|
+
{
|
|
83
|
+
c=getc(in);
|
|
84
|
+
if(c==EOF)fl=-1;
|
|
85
|
+
if(c==symbol || fl==-1)
|
|
86
|
+
{
|
|
87
|
+
if(nseq_fasta>0)
|
|
88
|
+
{
|
|
89
|
+
if(len>length_fasta_max)length_fasta_max=len;
|
|
90
|
+
if(len>SEQLEN)
|
|
91
|
+
{
|
|
92
|
+
printf("Sequence N %d too long... %d nt\n",nseq_fasta,len);
|
|
93
|
+
return -1;
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
if(fl!=-1)
|
|
97
|
+
{
|
|
98
|
+
nseq_fasta++;
|
|
99
|
+
len=0;
|
|
100
|
+
}
|
|
101
|
+
if(fl==1)
|
|
102
|
+
{
|
|
103
|
+
fgets(head,sizeof(head),in);
|
|
104
|
+
continue;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
if(c=='\n')continue;
|
|
108
|
+
if(c=='\t')continue;
|
|
109
|
+
if(c==' ')continue;
|
|
110
|
+
if(c=='\r')continue;
|
|
111
|
+
len++;
|
|
112
|
+
}
|
|
113
|
+
while(fl==1);
|
|
114
|
+
fclose(in);
|
|
115
|
+
return 1;
|
|
116
|
+
}
|
|
117
|
+
int fasta_to_plain1(char *file_in_fasta, int length_fasta_max, int nseq_fasta, char ***seq, int *peak_len)
|
|
118
|
+
{
|
|
119
|
+
int fl=1, n=0, len=0;
|
|
120
|
+
char head[200];
|
|
121
|
+
char c, symbol = '>';
|
|
122
|
+
char alfavit4[]="acgtnACGTN";
|
|
123
|
+
FILE *in;
|
|
124
|
+
if((in=fopen(file_in_fasta,"rt"))==NULL)
|
|
125
|
+
{
|
|
126
|
+
printf("Input file %s can't be opened\n",file_in_fasta);
|
|
127
|
+
return -1;
|
|
128
|
+
}
|
|
129
|
+
do
|
|
130
|
+
{
|
|
131
|
+
c=getc(in);
|
|
132
|
+
if(c==EOF)
|
|
133
|
+
{
|
|
134
|
+
fl=-1;
|
|
135
|
+
}
|
|
136
|
+
if(c==symbol || fl==-1)
|
|
137
|
+
{
|
|
138
|
+
if(len>0)
|
|
139
|
+
{
|
|
140
|
+
peak_len[n]=len;
|
|
141
|
+
seq[0][n][len]='\0';
|
|
142
|
+
strncpy(seq[1][n],seq[0][n],len);
|
|
143
|
+
seq[1][n][len]='\0';
|
|
144
|
+
ComplStr(seq[1][n]);
|
|
145
|
+
{
|
|
146
|
+
if(fl!=-1)
|
|
147
|
+
{
|
|
148
|
+
n++;
|
|
149
|
+
len=0;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
else
|
|
154
|
+
{
|
|
155
|
+
if(n>0 && fl!=-1)
|
|
156
|
+
{
|
|
157
|
+
printf("Peak length error! peak %d\n",n+1);
|
|
158
|
+
return -1;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
if(fl==-1)break;
|
|
162
|
+
if(fl==1)
|
|
163
|
+
{
|
|
164
|
+
fgets(head,sizeof(head),in);
|
|
165
|
+
continue;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
if(c=='\n')continue;
|
|
169
|
+
if(c=='\t')continue;
|
|
170
|
+
if(c==' ')continue;
|
|
171
|
+
if(c=='\r')continue;
|
|
172
|
+
if(strchr(alfavit4,c)!=NULL)
|
|
173
|
+
{
|
|
174
|
+
c=TransStr(c);
|
|
175
|
+
seq[0][n][len++]=c;
|
|
176
|
+
}
|
|
177
|
+
else seq[0][n][len++]='n';
|
|
178
|
+
}
|
|
179
|
+
while(fl==1);
|
|
180
|
+
fclose(in);
|
|
181
|
+
return 1;
|
|
182
|
+
}
|