bscampp 1.0.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,180 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <vector>
4
+ #include <string>
5
+ #include <omp.h>
6
+
7
+
8
// Distance stored in a "closest reference" slot that has not been filled yet.
static const int kUnsetHamming = 999999999;

// Reads FASTA records from `in` and appends each record's name (header line
// without the leading '>') to `names` and its concatenated sequence to `seqs`.
// A blank line ends the current record; a record whose sequence contains a
// space is discarded; lines seen before any header are ignored.
static void read_fasta( std::ifstream &in, std::vector<std::string> &names, std::vector<std::string> &seqs ){
    std::string line, name, content;
    // Test the stream itself instead of .good(): .good() is already false when
    // the final line has no trailing newline, which silently dropped that line.
    while( std::getline( in, line ) ){
        if( !line.empty() && line.back() == '\r' ){ // tolerate CRLF line endings
            line.pop_back();
        }
        if( line.empty() || line[0] == '>' ){ // Identifier marker
            if( !name.empty() ){ // store the record that just ended
                names.push_back(name);
                seqs.push_back(content);
                name.clear();
            }
            if( !line.empty() ){
                name = line.substr(1);
            }
            content.clear();
        } else if( !name.empty() ){
            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
                name.clear();
                content.clear();
            } else {
                content += line;
            }
        }
    }
    if( !name.empty() ){ // last record has no following header to flush it
        names.push_back(name);
        seqs.push_back(content);
    }
}

// Sets [start_idx, end_idx] to the span between the first and last non-gap
// ('-') character of `query`. When there is no such character the window is
// empty (end_idx < start_idx).
static void ungapped_window( const std::string &query, int &start_idx, int &end_idx ){
    const int q_len = static_cast<int>(query.length());
    start_idx = 0;
    while( start_idx < q_len && query[start_idx] == '-' ){
        start_idx++;
    }
    end_idx = q_len - 1;
    while( end_idx >= start_idx && query[end_idx] == '-' ){
        end_idx--;
    }
}

// Counts mismatches between `ref` and `query` over [start_idx, end_idx],
// giving up as soon as the count exceeds `limit`. Positions past the end of
// `ref` count as mismatches instead of being read out of bounds.
static int bounded_hamming( const std::string &ref, const std::string &query, int start_idx, int end_idx, int limit ){
    const int ref_len = static_cast<int>(ref.length());
    int count = 0;
    for( int i = start_idx; i <= end_idx; i++ ){
        if( i >= ref_len || ref[i] != query[i] ){
            count++;
            if( count > limit ){
                break;
            }
        }
    }
    return count;
}

// For every query sequence, finds the n closest reference sequences by Hamming
// distance (measured over the query's ungapped window) and writes one line per
// query to the outfile: the query name followed by n ",<ref name>:<distance>"
// entries. Slots that could not be filled keep the distance 999999999.
//
// argv: [ref infile] [nbr ref records] [query infile] [nbr query records]
//       [outfile] [nbr of leaves returned]
// The record counts are only used as capacity hints.
int main( int argc, char **argv ){
    if( argc <= 6 ){
        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of leaves returned]" << std::endl;
        return -1;
    }

    // read in the reference sequences first
    std::ifstream input_q(argv[1]);
    if(!input_q.good()){
        std::cerr << "Error opening '"<<argv[1]<<"'. Bailing out." << std::endl;
        return -1;
    }
    std::vector<std::string> name_arr;
    std::vector<std::string> seq_arr;
    const int ref_hint = std::stoi(argv[2]);
    if( ref_hint > 0 ){
        name_arr.reserve(ref_hint);
        seq_arr.reserve(ref_hint);
    }
    read_fasta( input_q, name_arr, seq_arr );
    const int count1 = static_cast<int>(name_arr.size());

    // read in query sequences second
    std::ifstream input(argv[3]);
    if(!input.good()){
        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
        return -1;
    }
    std::vector<std::string> q_name_arr;
    std::vector<std::string> q_seq_arr;
    const int query_hint = std::stoi(argv[4]);
    if( query_hint > 0 ){
        q_name_arr.reserve(query_hint);
        q_seq_arr.reserve(query_hint);
    }
    read_fasta( input, q_name_arr, q_seq_arr );
    const int count2 = static_cast<int>(q_name_arr.size());

    // Parsed once up front; a non-positive value would index empty arrays.
    const int size = std::stoi(argv[6]);
    if( size <= 0 ){
        std::cerr << "Number of leaves returned must be positive. Bailing out." << std::endl;
        return -1;
    }

    std::ofstream outFile(argv[5]);
    if(!outFile.good()){
        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }

    // Results for all queries live in two flat arrays (`size` slots per query)
    // so threads never share a slot and the file can be written afterwards in
    // input order, without a critical section.
    const std::vector<int>::size_type slots = size;
    std::vector<int> all_hamming( count2 * slots, kUnsetHamming );
    std::vector<int> all_index( count2 * slots, 0 );

    #pragma omp parallel for
    for( int c2 = 0; c2 < count2; c2++ ){ //query seq array
        int *best_hamming = all_hamming.data() + c2 * slots;
        int *best_index = all_index.data() + c2 * slots;
        int furthest_index = 0; // slot currently holding the largest distance

        int start_idx = 0;
        int end_idx = -1;
        ungapped_window( q_seq_arr[c2], start_idx, end_idx );

        for( int c1 = 0; c1 < count1; c1++ ){ //ref seq array
            const int count = bounded_hamming( seq_arr[c1], q_seq_arr[c2], start_idx, end_idx, best_hamming[furthest_index] );
            if( count <= best_hamming[furthest_index] ){
                // Replace the current worst entry, then locate the new worst
                // one (first slot holding the maximum distance).
                best_hamming[furthest_index] = count;
                best_index[furthest_index] = c1;
                furthest_index = 0;
                for( int i = 1; i < size; i++ ){
                    if( best_hamming[i] > best_hamming[furthest_index] ){
                        furthest_index = i;
                    }
                }
            }
        }
    }

    for( int c2 = 0; c2 < count2; c2++ ){
        const int *best_hamming = all_hamming.data() + c2 * slots;
        const int *best_index = all_index.data() + c2 * slots;
        outFile << q_name_arr[c2];
        for( int i = 0; i < size; i++ ){
            outFile << ",";
            if( count1 > 0 ){ // with no references there is no name to print
                outFile << name_arr[best_index[i]];
            }
            outFile << ":" << best_hamming[i];
        }
        outFile << '\n';
    }
    outFile.close();
    return 0;
}
@@ -0,0 +1,183 @@
1
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <omp.h>
4
+
5
+
6
// Reads FASTA records from `in` and appends each record's name (header line
// without the leading '>') to `names` and its concatenated sequence to `seqs`.
// A blank line ends the current record; a record whose sequence contains a
// space is discarded; lines seen before any header are ignored.
// All parser state is local, so nothing carries over from one file to the next.
static void read_fasta( std::ifstream &in, std::vector<std::string> &names, std::vector<std::string> &seqs ){
    std::string line, name, content;
    // Test the stream itself instead of .good(): .good() is already false when
    // the final line has no trailing newline, which silently dropped that line.
    while( std::getline( in, line ) ){
        if( !line.empty() && line.back() == '\r' ){ // tolerate CRLF line endings
            line.pop_back();
        }
        if( line.empty() || line[0] == '>' ){ // Identifier marker
            if( !name.empty() ){ // store the record that just ended
                names.push_back(name);
                seqs.push_back(content);
                name.clear();
            }
            if( !line.empty() ){
                name = line.substr(1);
            }
            content.clear();
        } else if( !name.empty() ){
            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
                name.clear();
                content.clear();
            } else {
                content += line;
            }
        }
    }
    if( !name.empty() ){ // last record has no following header to flush it
        names.push_back(name);
        seqs.push_back(content);
    }
}

// Sets [start_idx, end_idx] to the span between the first and last non-gap
// ('-') character of `query`. When there is no such character the window is
// empty (end_idx < start_idx).
static void ungapped_window( const std::string &query, int &start_idx, int &end_idx ){
    const int q_len = static_cast<int>(query.length());
    start_idx = 0;
    while( start_idx < q_len && query[start_idx] == '-' ){
        start_idx++;
    }
    end_idx = q_len - 1;
    while( end_idx >= start_idx && query[end_idx] == '-' ){
        end_idx--;
    }
}

// Counts mismatches between `ref` and `query` over [start_idx, end_idx].
// Positions past the end of `ref` count as mismatches instead of being read
// out of bounds.
static long long window_hamming( const std::string &ref, const std::string &query, int start_idx, int end_idx ){
    const int ref_len = static_cast<int>(ref.length());
    long long count = 0;
    for( int i = start_idx; i <= end_idx; i++ ){
        if( i >= ref_len || ref[i] != query[i] ){
            count++;
        }
    }
    return count;
}

// For every query sequence, picks the subtree whose reference sequences have
// the smallest total Hamming distance to the query (measured over the query's
// ungapped window; on a tie the later subtree wins). Subtree i is read from
// the file "<ref infile><i>". Writes one line per query to the outfile:
// "<query name>,<subtree index>:<total distance>".
//
// argv: [ref infile] [nbr ref records] [query infile] [nbr query records]
//       [outfile] [nbr of subtrees]
// The record counts are only used as capacity hints.
int main( int argc, char **argv ){
    if( argc <= 6 ){
        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of subtrees]" << std::endl;
        return -1;
    }

    // read in query sequences first
    std::ifstream input(argv[3]);
    if(!input.good()){
        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
        return -1;
    }
    std::vector<std::string> q_name_arr;
    std::vector<std::string> q_seq_arr;
    const int query_hint = std::stoi(argv[4]);
    if( query_hint > 0 ){
        q_name_arr.reserve(query_hint);
        q_seq_arr.reserve(query_hint);
    }
    read_fasta( input, q_name_arr, q_seq_arr );
    const int count2 = static_cast<int>(q_name_arr.size());

    // Totals are kept as exact integers; the initial value is larger than any
    // total this tool is expected to produce, so the first subtree always wins.
    const long long unset_total = 999999999999999LL;
    std::vector<long long> smallest_total_hamming( count2, unset_total );
    std::vector<int> best_tree_index( count2, 0 );

    const int subtree_size = std::stoi(argv[2]);
    const int nbr_subtrees = std::stoi(argv[6]);
    const std::string subtree_path = argv[1];

    // read in the reference sequences second, one subtree file at a time
    std::vector<std::string> name_arr;
    std::vector<std::string> seq_arr;
    if( subtree_size > 0 ){
        name_arr.reserve(subtree_size);
        seq_arr.reserve(subtree_size);
    }

    for( int subtree_idx = 0; subtree_idx < nbr_subtrees; subtree_idx++ ){
        const std::string subtree_full_path = subtree_path + std::to_string(subtree_idx);

        std::ifstream input_q(subtree_full_path);
        if(!input_q.good()){
            std::cerr << "Error opening '"<< subtree_full_path <<"'. Bailing out." << std::endl;
            return -1;
        }

        // Start from empty lists so this subtree holds only its own records.
        name_arr.clear();
        seq_arr.clear();
        read_fasta( input_q, name_arr, seq_arr );
        const int count1 = static_cast<int>(seq_arr.size());

        // Each iteration touches only its own query's entries, so the loop
        // needs no synchronization.
        #pragma omp parallel for
        for( int c2 = 0; c2 < count2; c2++ ){ //query seq array
            int start_idx = 0;
            int end_idx = -1;
            ungapped_window( q_seq_arr[c2], start_idx, end_idx );

            long long count = 0; // wide enough for the sum over a whole subtree
            for( int c1 = 0; c1 < count1; c1++ ){ //ref seq array
                count += window_hamming( seq_arr[c1], q_seq_arr[c2], start_idx, end_idx );
            }
            if( count <= smallest_total_hamming[c2] ){
                smallest_total_hamming[c2] = count;
                best_tree_index[c2] = subtree_idx;
            }
        }
    }

    std::ofstream outFile(argv[5]);
    if(!outFile.good()){
        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }

    for( int c2 = 0; c2 < count2; c2++ ){ //query seq array
        outFile << q_name_arr[c2] << "," << best_tree_index[c2] << ":" << smallest_total_hamming[c2] << '\n';
    }

    outFile.close();

    return 0;
}
@@ -0,0 +1,214 @@
1
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <omp.h>
4
+
5
+
6
// Distance stored in a "closest reference" slot that has not been filled yet.
static const int kUnsetHamming = 999999999;

// Reads FASTA records from `in` and appends each record's name (header line
// without the leading '>') to `names` and its concatenated sequence to `seqs`.
// A blank line ends the current record; a record whose sequence contains a
// space is discarded; lines seen before any header are ignored.
// All parser state is local, so nothing carries over from one file to the next.
static void read_fasta( std::ifstream &in, std::vector<std::string> &names, std::vector<std::string> &seqs ){
    std::string line, name, content;
    // Test the stream itself instead of .good(): .good() is already false when
    // the final line has no trailing newline, which silently dropped that line.
    while( std::getline( in, line ) ){
        if( !line.empty() && line.back() == '\r' ){ // tolerate CRLF line endings
            line.pop_back();
        }
        if( line.empty() || line[0] == '>' ){ // Identifier marker
            if( !name.empty() ){ // store the record that just ended
                names.push_back(name);
                seqs.push_back(content);
                name.clear();
            }
            if( !line.empty() ){
                name = line.substr(1);
            }
            content.clear();
        } else if( !name.empty() ){
            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
                name.clear();
                content.clear();
            } else {
                content += line;
            }
        }
    }
    if( !name.empty() ){ // last record has no following header to flush it
        names.push_back(name);
        seqs.push_back(content);
    }
}

// Sets [start_idx, end_idx] to the span between the first and last non-gap
// ('-') character of `query`. When there is no such character the window is
// empty (end_idx < start_idx).
static void ungapped_window( const std::string &query, int &start_idx, int &end_idx ){
    const int q_len = static_cast<int>(query.length());
    start_idx = 0;
    while( start_idx < q_len && query[start_idx] == '-' ){
        start_idx++;
    }
    end_idx = q_len - 1;
    while( end_idx >= start_idx && query[end_idx] == '-' ){
        end_idx--;
    }
}

// Counts mismatches between `ref` and `query` over [start_idx, end_idx],
// giving up as soon as the count exceeds `limit`. Positions past the end of
// `ref` count as mismatches instead of being read out of bounds.
static int bounded_hamming( const std::string &ref, const std::string &query, int start_idx, int end_idx, int limit ){
    const int ref_len = static_cast<int>(ref.length());
    int count = 0;
    for( int i = start_idx; i <= end_idx; i++ ){
        if( i >= ref_len || ref[i] != query[i] ){
            count++;
            if( count > limit ){
                break;
            }
        }
    }
    return count;
}

// For every query sequence, picks the subtree in which the sum of the query's
// n smallest Hamming distances to that subtree's reference sequences is lowest
// (distances are measured over the query's ungapped window; on a tie the later
// subtree wins). Subtree i is read from the file "<ref infile><i>". A subtree
// with fewer than n references contributes 999999999 for each missing entry.
// Writes one line per query to the outfile:
// "<query name>,<subtree index>:<summed distance>".
//
// argv: [ref infile] [nbr ref records] [query infile] [nbr query records]
//       [outfile] [nbr of subtrees] [nbr of votes]
// The record counts are only used as capacity hints.
int main( int argc, char **argv ){
    if( argc <= 7 ){
        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of subtrees] [nbr of votes]" << std::endl;
        return -1;
    }

    // Parsed once up front; a non-positive value would index empty arrays.
    const int size = std::stoi(argv[7]);
    if( size <= 0 ){
        std::cerr << "Number of votes must be positive. Bailing out." << std::endl;
        return -1;
    }

    // read in query sequences first
    std::ifstream input(argv[3]);
    if(!input.good()){
        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
        return -1;
    }
    std::vector<std::string> q_name_arr;
    std::vector<std::string> q_seq_arr;
    const int query_hint = std::stoi(argv[4]);
    if( query_hint > 0 ){
        q_name_arr.reserve(query_hint);
        q_seq_arr.reserve(query_hint);
    }
    read_fasta( input, q_name_arr, q_seq_arr );
    const int count2 = static_cast<int>(q_name_arr.size());

    // Sums are kept as exact integers; the initial value is larger than any
    // sum this tool is expected to produce, so the first subtree always wins.
    const long long unset_total = 999999999999999LL;
    std::vector<long long> smallest_total_hamming( count2, unset_total );
    std::vector<int> best_tree_index( count2, 0 );

    std::cout << argv[2] << std::endl;
    const int subtree_size = std::stoi(argv[2]);
    const int nbr_subtrees = std::stoi(argv[6]);

    std::cout << nbr_subtrees << std::endl;

    const std::string subtree_path = argv[1];

    // read in the reference sequences second, one subtree file at a time
    std::vector<std::string> name_arr;
    std::vector<std::string> seq_arr;
    if( subtree_size > 0 ){
        name_arr.reserve(subtree_size);
        seq_arr.reserve(subtree_size);
    }

    for( int subtree_idx = 0; subtree_idx < nbr_subtrees; subtree_idx++ ){
        const std::string subtree_full_path = subtree_path + std::to_string(subtree_idx);

        std::cout << subtree_full_path << std::endl;

        std::ifstream input_q(subtree_full_path);
        if(!input_q.good()){
            std::cerr << "Error opening '"<< subtree_full_path <<"'. Bailing out." << std::endl;
            return -1;
        }

        // Start from empty lists so this subtree holds only its own records.
        name_arr.clear();
        seq_arr.clear();
        read_fasta( input_q, name_arr, seq_arr );
        const int count1 = static_cast<int>(seq_arr.size());

        std::cout << "ref count: "<< count1 <<" query count2: " <<count2 << std::endl;

        #pragma omp parallel
        {
            // One scratch list per thread, reused for every query it handles.
            std::vector<int> best_hamming;

            #pragma omp for
            for( int c2 = 0; c2 < count2; c2++ ){ //query seq array
                best_hamming.assign( size, kUnsetHamming );
                int furthest_index = 0; // slot currently holding the largest distance

                int start_idx = 0;
                int end_idx = -1;
                ungapped_window( q_seq_arr[c2], start_idx, end_idx );

                for( int c1 = 0; c1 < count1; c1++ ){ //ref seq array
                    const int count = bounded_hamming( seq_arr[c1], q_seq_arr[c2], start_idx, end_idx, best_hamming[furthest_index] );
                    if( count <= best_hamming[furthest_index] ){
                        // Replace the current worst entry, then locate the new
                        // worst one (first slot holding the maximum distance).
                        best_hamming[furthest_index] = count;
                        furthest_index = 0;
                        for( int i = 1; i < size; i++ ){
                            if( best_hamming[i] > best_hamming[furthest_index] ){
                                furthest_index = i;
                            }
                        }
                    }
                }

                // Summed in 64 bits: three or more unfilled slots already
                // exceed the range of a 32-bit int.
                long long total = 0;
                for( int i = 0; i < size; i++ ){
                    total += best_hamming[i];
                }
                if( total <= smallest_total_hamming[c2] ){
                    smallest_total_hamming[c2] = total;
                    best_tree_index[c2] = subtree_idx;
                }
            }
        }
        std::cout << "finished hamming distance " << std::endl;
    }

    std::ofstream outFile(argv[5]);
    if(!outFile.good()){
        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
        return -1;
    }

    for( int c2 = 0; c2 < count2; c2++ ){ //query seq array
        outFile << q_name_arr[c2] << "," << best_tree_index[c2] << ":" << smallest_total_hamming[c2] << '\n';
    }

    outFile.close();

    return 0;
}