bscampp 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +68 -0
- bscampp/configs.py +169 -0
- bscampp/default.config +5 -0
- bscampp/functions.py +409 -0
- bscampp/init_configs.py +93 -0
- bscampp/jobs.py +198 -0
- bscampp/pipeline.py +249 -0
- bscampp/tools/epa-ng +0 -0
- bscampp/tools/hamming_distance/CMakeLists.txt +13 -0
- bscampp/tools/hamming_distance/fragment_hamming +0 -0
- bscampp/tools/hamming_distance/hamming +0 -0
- bscampp/tools/hamming_distance/homology +0 -0
- bscampp/tools/hamming_distance/src/fragment_hamming.cpp +180 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +183 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +214 -0
- bscampp/tools/hamming_distance/src/homology.cpp +179 -0
- bscampp/tools/hamming_distance/src/new_hamming.cpp +161 -0
- bscampp/tools/pplacer +0 -0
- bscampp/utils.py +914 -0
- bscampp-1.0.1.dist-info/LICENSE +21 -0
- bscampp-1.0.1.dist-info/METADATA +234 -0
- bscampp-1.0.1.dist-info/RECORD +25 -0
- bscampp-1.0.1.dist-info/WHEEL +5 -0
- bscampp-1.0.1.dist-info/entry_points.txt +3 -0
- bscampp-1.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,179 @@
|
|
1
|
+
#include <iostream>
|
2
|
+
#include <fstream>
|
3
|
+
#include <vector>
|
4
|
+
#include <string>
|
5
|
+
#include <omp.h>
|
6
|
+
|
7
|
+
|
8
|
+
// Parses a command-line integer with std::stoi.
// Returns false (leaving `out` untouched) when the text is not a usable int,
// so the caller can print an error instead of dying on an uncaught exception.
static bool parse_int_arg( const char *text, int &out ){
    try {
        out = std::stoi(text);
        return true;
    } catch( ... ){ // std::stoi only signals failure by throwing (invalid / out of range)
        return false;
    }
}

// Reads a FASTA-like file into parallel `names` / `seqs` vectors.
//   path             file to read
//   expected_records capacity hint only; the vectors grow as needed, so a
//                    wrong count can no longer cause an out-of-bounds write
// Returns false if the file could not be opened.
// A line that is empty or starts with '>' ends the current record; a sequence
// line containing a space invalidates (drops) the record being read.
static bool read_fasta( const char *path, int expected_records,
                        std::vector<std::string> &names,
                        std::vector<std::string> &seqs ){
    std::ifstream in(path);
    if( !in.good() ){
        return false;
    }
    if( expected_records > 0 ){
        names.reserve(expected_records);
        seqs.reserve(expected_records);
    }

    std::string line, name, content;
    // Test the stream itself rather than .good(): getline sets eofbit when the
    // last line has no trailing newline, and that line must still be processed.
    while( std::getline(in, line) ){
        if( line.empty() || line[0] == '>' ){ // Identifier marker
            if( !name.empty() ){ // Store the record that just ended
                names.push_back(name);
                seqs.push_back(content);
                name.clear();
            }
            if( !line.empty() ){
                name = line.substr(1);
            }
            content.clear();
        } else if( !name.empty() ){
            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
                name.clear();
                content.clear();
            } else {
                content += line;
            }
        }
    }

    if( !name.empty() ){ // Store the final record
        names.push_back(name);
        seqs.push_back(content);
    }
    return true;
}

// For every query sequence, finds the [nbr of leaves returned] reference
// sequences with the fewest mismatches, comparing only the columns where the
// query is not a gap ('-'). Writes one line per query to the outfile:
//   query_name:non_gap_columns,ref_name:mismatches,ref_name:mismatches,...
// Lines are written in whatever order the OpenMP threads finish.
// Returns 0 on success, -1 on a usage, parse or I/O error.
int main( int argc, char **argv ){
    if( argc <= 6 ){
        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of leaves returned]" << std::endl;
        return -1;
    }

    // Parse every numeric argument once, up front. Doing this inside the
    // parallel loop would repeat the work per query and an exception thrown
    // inside an OpenMP region terminates the program.
    int ref_hint = 0;
    int query_hint = 0;
    int size = 0;
    if( !parse_int_arg(argv[2], ref_hint) || !parse_int_arg(argv[4], query_hint)
            || !parse_int_arg(argv[6], size) || size < 1 ){
        std::cerr << "Error: record counts must be integers and the number of leaves returned must be at least 1. Bailing out." << std::endl;
        return -1;
    }

    // read in the reference sequences first
    std::vector<std::string> name_arr;
    std::vector<std::string> seq_arr;
    if( !read_fasta(argv[1], ref_hint, name_arr, seq_arr) ){
        std::cerr << "Error opening '"<<argv[1]<<"'. Bailing out." << std::endl;
        return -1;
    }

    // read in query sequences second
    std::vector<std::string> q_name_arr;
    std::vector<std::string> q_seq_arr;
    if( !read_fasta(argv[3], query_hint, q_name_arr, q_seq_arr) ){
        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
        return -1;
    }

    const int count1 = static_cast<int>(name_arr.size());
    const int count2 = static_cast<int>(q_name_arr.size());
    if( count1 == 0 ){
        // Unfilled result slots refer to reference 0, so at least one must exist.
        std::cerr << "Error: no reference sequences read from '"<<argv[1]<<"'. Bailing out." << std::endl;
        return -1;
    }

    std::ofstream outFile(argv[5]);
    if( !outFile ){
        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }

    #pragma omp parallel for
    for( int c2 = 0; c2 < count2; c2++ ){ // query seq array
        const std::string &query = q_seq_arr[c2];

        // Current best `size` references. Slots that are never filled keep
        // index 0 and the sentinel distance, matching the historical output
        // when fewer than `size` references exist.
        std::vector<int> best_homolog(size, 999999999);
        std::vector<int> best_homolog_index(size, 0);
        int furthest_homolog_index = 0; // slot holding the largest distance

        // Columns where the query has a residue rather than a gap.
        std::vector<std::size_t> q_hom_idx_arr;
        q_hom_idx_arr.reserve(query.size());
        for( std::size_t i = 0; i < query.size(); i++ ){
            if( query[i] != '-' ){
                q_hom_idx_arr.push_back(i);
            }
        }
        const int hom_nbr = static_cast<int>(q_hom_idx_arr.size());

        for( int c1 = 0; c1 < count1; c1++ ){ // ref seq array
            const std::string &ref = seq_arr[c1];
            int non_hom_count = 0;
            for( std::size_t col : q_hom_idx_arr ){
                // A column past the end of a shorter reference counts as a
                // mismatch instead of being read out of bounds.
                if( col >= ref.size() || ref[col] != query[col] ){
                    non_hom_count++;
                    if( non_hom_count > best_homolog[furthest_homolog_index] ){
                        break; // already worse than the worst kept candidate
                    }
                }
            }

            if( non_hom_count <= best_homolog[furthest_homolog_index] ){
                best_homolog[furthest_homolog_index] = non_hom_count;
                best_homolog_index[furthest_homolog_index] = c1;
                // Re-locate the worst kept candidate (first slot holding the maximum).
                furthest_homolog_index = 0;
                for( int i = 1; i < size; i++ ){
                    if( best_homolog[i] > best_homolog[furthest_homolog_index] ){
                        furthest_homolog_index = i;
                    }
                }
            }
        }

        // Format outside the critical section so threads only serialize on the write.
        std::string out_line = q_name_arr[c2];
        out_line += ':';
        out_line += std::to_string(hom_nbr);
        for( int i = 0; i < size; i++ ){
            out_line += ',';
            out_line += name_arr[best_homolog_index[i]];
            out_line += ':';
            out_line += std::to_string(best_homolog[i]);
        }
        out_line += '\n';

        #pragma omp critical
        {
            outFile << out_line;
        }
    }

    outFile.close();
    if( !outFile ){
        std::cerr << "Error writing '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }
    return 0;
}
|
@@ -0,0 +1,161 @@
|
|
1
|
+
#include <iostream>
|
2
|
+
#include <fstream>
|
3
|
+
#include <vector>
|
4
|
+
#include <string>
|
5
|
+
#include <omp.h>
|
6
|
+
|
7
|
+
|
8
|
+
// Parses a command-line integer with std::stoi.
// Returns false (leaving `out` untouched) when the text is not a usable int,
// so the caller can print an error instead of dying on an uncaught exception.
static bool parse_int_arg( const char *text, int &out ){
    try {
        out = std::stoi(text);
        return true;
    } catch( ... ){ // std::stoi only signals failure by throwing (invalid / out of range)
        return false;
    }
}

// Reads a FASTA-like file into parallel `names` / `seqs` vectors.
//   path             file to read
//   expected_records capacity hint only; the vectors grow as needed, so a
//                    wrong count can no longer cause an out-of-bounds write
// Returns false if the file could not be opened.
// A line that is empty or starts with '>' ends the current record; a sequence
// line containing a space invalidates (drops) the record being read.
static bool read_fasta( const char *path, int expected_records,
                        std::vector<std::string> &names,
                        std::vector<std::string> &seqs ){
    std::ifstream in(path);
    if( !in.good() ){
        return false;
    }
    if( expected_records > 0 ){
        names.reserve(expected_records);
        seqs.reserve(expected_records);
    }

    std::string line, name, content;
    // Test the stream itself rather than .good(): getline sets eofbit when the
    // last line has no trailing newline, and that line must still be processed.
    while( std::getline(in, line) ){
        if( line.empty() || line[0] == '>' ){ // Identifier marker
            if( !name.empty() ){ // Store the record that just ended
                names.push_back(name);
                seqs.push_back(content);
                name.clear();
            }
            if( !line.empty() ){
                name = line.substr(1);
            }
            content.clear();
        } else if( !name.empty() ){
            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
                name.clear();
                content.clear();
            } else {
                content += line;
            }
        }
    }

    if( !name.empty() ){ // Store the final record
        names.push_back(name);
        seqs.push_back(content);
    }
    return true;
}

// For every query sequence, finds the [nbr of leaves returned] reference
// sequences with the smallest Hamming distance, compared column by column
// over the length of each reference. Writes one line per query to the outfile:
//   query_name,ref_name:distance,ref_name:distance,...
// Lines are written in whatever order the OpenMP threads finish.
// Returns 0 on success, -1 on a usage, parse or I/O error.
int main( int argc, char **argv ){
    if( argc <= 6 ){
        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of leaves returned]" << std::endl;
        return -1;
    }

    // Parse every numeric argument once, up front. Doing this inside the
    // parallel loop would repeat the work per query and an exception thrown
    // inside an OpenMP region terminates the program.
    int ref_hint = 0;
    int query_hint = 0;
    int size = 0;
    if( !parse_int_arg(argv[2], ref_hint) || !parse_int_arg(argv[4], query_hint)
            || !parse_int_arg(argv[6], size) || size < 1 ){
        std::cerr << "Error: record counts must be integers and the number of leaves returned must be at least 1. Bailing out." << std::endl;
        return -1;
    }

    // read in the reference sequences first
    std::vector<std::string> name_arr;
    std::vector<std::string> seq_arr;
    if( !read_fasta(argv[1], ref_hint, name_arr, seq_arr) ){
        std::cerr << "Error opening '"<<argv[1]<<"'. Bailing out." << std::endl;
        return -1;
    }

    // read in query sequences second
    std::vector<std::string> q_name_arr;
    std::vector<std::string> q_seq_arr;
    if( !read_fasta(argv[3], query_hint, q_name_arr, q_seq_arr) ){
        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
        return -1;
    }

    const int count1 = static_cast<int>(name_arr.size());
    const int count2 = static_cast<int>(q_name_arr.size());
    if( count1 == 0 ){
        // Unfilled result slots refer to reference 0, so at least one must exist.
        std::cerr << "Error: no reference sequences read from '"<<argv[1]<<"'. Bailing out." << std::endl;
        return -1;
    }

    std::ofstream outFile(argv[5]);
    if( !outFile ){
        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }

    #pragma omp parallel for
    for( int c2 = 0; c2 < count2; c2++ ){ // query seq array
        const std::string &query = q_seq_arr[c2];

        // Current best `size` references. Slots that are never filled keep
        // index 0 and the sentinel distance, matching the historical output
        // when fewer than `size` references exist.
        std::vector<int> best_hamming(size, 999999999);
        std::vector<int> best_index(size, 0);
        int furthest_index = 0; // slot holding the largest distance

        for( int c1 = 0; c1 < count1; c1++ ){ // ref seq array
            const std::string &ref = seq_arr[c1];
            int count = 0;
            for( std::size_t i = 0; i < ref.size(); i++ ){
                // A column past the end of a shorter query counts as a
                // mismatch instead of being read out of bounds.
                if( i >= query.size() || ref[i] != query[i] ){
                    count++;
                    if( count > best_hamming[furthest_index] ){
                        break; // already worse than the worst kept candidate
                    }
                }
            }

            if( count <= best_hamming[furthest_index] ){
                best_hamming[furthest_index] = count;
                best_index[furthest_index] = c1;
                // Re-locate the worst kept candidate (first slot holding the maximum).
                furthest_index = 0;
                for( int i = 1; i < size; i++ ){
                    if( best_hamming[i] > best_hamming[furthest_index] ){
                        furthest_index = i;
                    }
                }
            }
        }

        // Format outside the critical section so threads only serialize on the write.
        std::string out_line = q_name_arr[c2];
        for( int i = 0; i < size; i++ ){
            out_line += ',';
            out_line += name_arr[best_index[i]];
            out_line += ':';
            out_line += std::to_string(best_hamming[i]);
        }
        out_line += '\n';

        #pragma omp critical
        {
            outFile << out_line;
        }
    }

    outFile.close();
    if( !outFile ){
        std::cerr << "Error writing '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }
    return 0;
}
|
bscampp/tools/pplacer
ADDED
Binary file
|