PyPI - bscampp - Versions diffs - 1.0.1a0__py3-none-any.whl - Mend

bscampp 1.0.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

bscampp/__init__.py +68 -0
bscampp/configs.py +169 -0
bscampp/default.config +5 -0
bscampp/functions.py +394 -0
bscampp/init_configs.py +93 -0
bscampp/jobs.py +198 -0
bscampp/pipeline.py +224 -0
bscampp/tools/epa-ng +0 -0
bscampp/tools/hamming_distance/CMakeLists.txt +13 -0
bscampp/tools/hamming_distance/fragment_hamming +0 -0
bscampp/tools/hamming_distance/hamming +0 -0
bscampp/tools/hamming_distance/homology +0 -0
bscampp/tools/hamming_distance/src/fragment_hamming.cpp +180 -0
bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +183 -0
bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +214 -0
bscampp/tools/hamming_distance/src/homology.cpp +179 -0
bscampp/tools/hamming_distance/src/new_hamming.cpp +161 -0
bscampp/tools/pplacer +0 -0
bscampp/utils.py +913 -0
bscampp-1.0.1a0.dist-info/LICENSE +21 -0
bscampp-1.0.1a0.dist-info/METADATA +229 -0
bscampp-1.0.1a0.dist-info/RECORD +25 -0
bscampp-1.0.1a0.dist-info/WHEEL +5 -0
bscampp-1.0.1a0.dist-info/entry_points.txt +3 -0
bscampp-1.0.1a0.dist-info/top_level.txt +1 -0

bscampp/tools/hamming_distance/src/fragment_hamming.cpp ADDED Viewed

@@ -0,0 +1,180 @@
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <omp.h>
+namespace {
+// Distance kept in a neighbour slot that no reference sequence has claimed yet.
+const int kUnsetDistance = 999999999;
+// Converts a command-line argument to int. Returns false when std::stoi
+// rejects the text, instead of letting its exception terminate the program.
+bool parse_int( const char *text, int &value ){
+    try {
+        value = std::stoi(text);
+    } catch( const std::exception & ){
+        return false;
+    }
+    return true;
+}
+// Reads FASTA records from `path`, appending each header (without the leading
+// '>') to `names` and its concatenated sequence lines to `seqs`.
+// A blank line ends the current record, a record whose header is empty is
+// ignored, and a record containing a sequence line with a space is dropped.
+// Returns false when the file cannot be opened.
+bool read_fasta( const std::string &path, std::vector<std::string> &names, std::vector<std::string> &seqs ){
+    std::ifstream in(path.c_str());
+    if( !in.good() ){
+        return false;
+    }
+    std::string line, name, content;
+    // Testing the stream itself rather than .good() keeps a final line that
+    // is not terminated by a newline (getline sets eofbit but still succeeds).
+    while( std::getline( in, line ) ){
+        if( !line.empty() && line[line.size() - 1] == '\r' ){
+            line.erase(line.size() - 1); // tolerate CRLF line endings
+        }
+        if( line.empty() || line[0] == '>' ){ // Identifier marker
+            if( !name.empty() ){ // store the record that just ended
+                names.push_back(name);
+                seqs.push_back(content);
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = line.substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // store the last record of the file
+        names.push_back(name);
+        seqs.push_back(content);
+    }
+    return true;
+}
+// Counts the columns in [first, last] where `ref` and `query` differ, and
+// stops as soon as the count exceeds `limit`. Columns beyond the end of `ref`
+// count as mismatches instead of being read out of bounds. When `first` is
+// npos (the query has no non-gap column) the distance is 0.
+int bounded_hamming( const std::string &ref, const std::string &query, std::size_t first, std::size_t last, int limit ){
+    if( first == std::string::npos ){
+        return 0;
+    }
+    int mismatches = 0;
+    for( std::size_t i = first; i <= last; ++i ){
+        if( i >= ref.size() || ref[i] != query[i] ){
+            if( ++mismatches > limit ){
+                break;
+            }
+        }
+    }
+    return mismatches;
+}
+} // namespace
+// For every query sequence, finds the `n` reference sequences with the
+// smallest Hamming distance over the query's non-gap span and writes one line
+// per query: the query name followed by `n` comma-separated `name:distance`
+// entries.
+//
+// Arguments: [ref infile] [nbr ref records] [query infile] [nbr query records]
+//            [outfile] [nbr of leaves returned]
+// The two record counts are only used to pre-reserve storage; the number of
+// records actually present in each file is what gets processed.
+// Returns 0 on success and -1 on a usage, parsing or file error.
+int main( int argc, char **argv ){
+    if( argc <= 6 ){
+        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of leaves returned]" << std::endl;
+        return -1;
+    }
+    int ref_hint = 0;
+    int query_hint = 0;
+    int size = 0;
+    if( !parse_int(argv[2], ref_hint) || !parse_int(argv[4], query_hint) || !parse_int(argv[6], size) || size <= 0 ){
+        std::cerr << "Error: record counts must be integers and the number of leaves returned must be a positive integer." << std::endl;
+        return -1;
+    }
+    // read in the reference sequences first
+    std::vector<std::string> name_arr;
+    std::vector<std::string> seq_arr;
+    if( ref_hint > 0 ){
+        name_arr.reserve(ref_hint);
+        seq_arr.reserve(ref_hint);
+    }
+    if( !read_fasta(argv[1], name_arr, seq_arr) ){
+        std::cerr << "Error opening '"<<argv[1]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    // read in query sequences second
+    std::vector<std::string> q_name_arr;
+    std::vector<std::string> q_seq_arr;
+    if( query_hint > 0 ){
+        q_name_arr.reserve(query_hint);
+        q_seq_arr.reserve(query_hint);
+    }
+    if( !read_fasta(argv[3], q_name_arr, q_seq_arr) ){
+        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    std::ofstream outFile(argv[5]);
+    if( !outFile.good() ){
+        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    const int count1 = static_cast<int>(seq_arr.size());
+    const int count2 = static_cast<int>(q_seq_arr.size());
+    const std::string no_name; // printed for unfilled slots when there are no references at all
+    // Each iteration only reads the shared sequence vectors and owns its own
+    // neighbour lists; the output stream is the only shared state written.
+    #pragma omp parallel for
+    for (int c2=0; c2<count2; c2++){ //query seq array
+        const std::string &query = q_seq_arr[c2];
+        std::vector<int> best_hamming(static_cast<std::size_t>(size), kUnsetDistance);
+        std::vector<int> best_index(static_cast<std::size_t>(size), 0);
+        int furthest_index = 0; // slot currently holding the largest kept distance
+        // Only the span between the first and last non-gap query columns is compared.
+        const std::size_t first = query.find_first_not_of('-');
+        const std::size_t last = query.find_last_not_of('-');
+        for (int c1=0; c1<count1; c1++) { //ref seq array
+            const int dist = bounded_hamming(seq_arr[c1], query, first, last, best_hamming[furthest_index]);
+            if (dist <= best_hamming[furthest_index]) {
+                // Replace the worst kept neighbour, then locate the new worst one.
+                best_hamming[furthest_index] = dist;
+                best_index[furthest_index] = c1;
+                furthest_index = static_cast<int>(std::max_element(best_hamming.begin(), best_hamming.end()) - best_hamming.begin());
+            }
+        }
+        #pragma omp critical
+        {
+            outFile << q_name_arr[c2];
+            // Slots never filled (fewer references than requested) keep index 0
+            // and the sentinel distance, so every line has exactly `size` entries.
+            for (int i=0; i<size; i++){
+                const std::string &ref_name = name_arr.empty() ? no_name : name_arr[best_index[i]];
+                outFile << "," << ref_name << ":" << best_hamming[i];
+            }
+            outFile << '\n';
+        }
+    }
+    outFile.close();
+    if( outFile.fail() ){
+        std::cerr << "Error writing '"<<argv[5]<<"'." << std::endl;
+        return -1;
+    }
+    return 0;
+}

bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp ADDED Viewed

@@ -0,0 +1,183 @@
+#include <cstddef>
+#include <exception>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <omp.h>
+namespace {
+// Converts a command-line argument to int. Returns false when std::stoi
+// rejects the text, instead of letting its exception terminate the program.
+bool parse_int( const char *text, int &value ){
+    try {
+        value = std::stoi(text);
+    } catch( const std::exception & ){
+        return false;
+    }
+    return true;
+}
+// Reads FASTA records from `path`, appending each header (without the leading
+// '>') to `names` and its concatenated sequence lines to `seqs`.
+// A blank line ends the current record, a record whose header is empty is
+// ignored, and a record containing a sequence line with a space is dropped.
+// All parser state is local, so nothing carries over from one file to the next.
+// Returns false when the file cannot be opened.
+bool read_fasta( const std::string &path, std::vector<std::string> &names, std::vector<std::string> &seqs ){
+    std::ifstream in(path.c_str());
+    if( !in.good() ){
+        return false;
+    }
+    std::string line, name, content;
+    // Testing the stream itself rather than .good() keeps a final line that
+    // is not terminated by a newline (getline sets eofbit but still succeeds).
+    while( std::getline( in, line ) ){
+        if( !line.empty() && line[line.size() - 1] == '\r' ){
+            line.erase(line.size() - 1); // tolerate CRLF line endings
+        }
+        if( line.empty() || line[0] == '>' ){ // Identifier marker
+            if( !name.empty() ){ // store the record that just ended
+                names.push_back(name);
+                seqs.push_back(content);
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = line.substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // store the last record of the file
+        names.push_back(name);
+        seqs.push_back(content);
+    }
+    return true;
+}
+// Counts the columns in [first, last] where `ref` and `query` differ.
+// Columns beyond the end of `ref` count as mismatches instead of being read
+// out of bounds. When `first` is npos (the query has no non-gap column) the
+// distance is 0.
+long long hamming_in_span( const std::string &ref, const std::string &query, std::size_t first, std::size_t last ){
+    if( first == std::string::npos ){
+        return 0;
+    }
+    long long mismatches = 0;
+    for( std::size_t i = first; i <= last; ++i ){
+        if( i >= ref.size() || ref[i] != query[i] ){
+            ++mismatches;
+        }
+    }
+    return mismatches;
+}
+} // namespace
+// For every query sequence, picks the subtree whose reference sequences have
+// the smallest total Hamming distance to the query (summed over all sequences
+// of the subtree, compared over the query's non-gap span) and writes one
+// `query,subtree_index:total` line per query.
+//
+// Arguments: [ref infile] [nbr ref records] [query infile] [nbr query records]
+//            [outfile] [nbr of subtrees]
+// The first argument is used as a path prefix: subtree k is read from the
+// file named by that prefix followed by k. The two record counts are only
+// used to pre-reserve storage.
+// Returns 0 on success and -1 on a usage, parsing or file error.
+int main( int argc, char **argv ){
+    if( argc <= 6 ){
+        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of subtrees]" << std::endl;
+        return -1;
+    }
+    int subtree_size = 0;
+    int query_hint = 0;
+    int nbr_subtrees = 0;
+    if( !parse_int(argv[2], subtree_size) || !parse_int(argv[4], query_hint) || !parse_int(argv[6], nbr_subtrees) ){
+        std::cerr << "Error: record counts and the number of subtrees must be integers." << std::endl;
+        return -1;
+    }
+    // read in query sequences first
+    std::vector<std::string> q_name_arr;
+    std::vector<std::string> q_seq_arr;
+    if( query_hint > 0 ){
+        q_name_arr.reserve(query_hint);
+        q_seq_arr.reserve(query_hint);
+    }
+    if( !read_fasta(argv[3], q_name_arr, q_seq_arr) ){
+        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    const int count2 = static_cast<int>(q_seq_arr.size());
+    // Best total so far per query. Kept as double so the values are printed
+    // exactly as before; totals are accumulated in long long first.
+    std::vector<double> smallest_total_hamming(static_cast<std::size_t>(count2), 999999999999999.0);
+    std::vector<int> best_tree_index(static_cast<std::size_t>(count2), 0);
+    // read in the reference sequences second, one subtree file at a time
+    const std::string subtree_path = argv[1];
+    std::vector<std::string> name_arr;
+    std::vector<std::string> seq_arr;
+    if( subtree_size > 0 ){
+        name_arr.reserve(subtree_size);
+        seq_arr.reserve(subtree_size);
+    }
+    for (int subtree_idx = 0; subtree_idx < nbr_subtrees; subtree_idx++){
+        const std::string subtree_full_path = subtree_path + std::to_string(subtree_idx);
+        // Start from empty lists so no record of a previous file is kept.
+        name_arr.clear();
+        seq_arr.clear();
+        if( !read_fasta(subtree_full_path, name_arr, seq_arr) ){
+            std::cerr << "Error opening '"<< subtree_full_path <<"'. Bailing out." << std::endl;
+            return -1;
+        }
+        const int count1 = static_cast<int>(seq_arr.size());
+        // Each iteration reads the shared sequence vectors and writes only
+        // the result slots that belong to its own query index.
+        #pragma omp parallel for
+        for (int c2=0; c2<count2; c2++){ //query seq array
+            const std::string &query = q_seq_arr[c2];
+            // Only the span between the first and last non-gap query columns is compared.
+            const std::size_t first = query.find_first_not_of('-');
+            const std::size_t last = query.find_last_not_of('-');
+            long long total = 0; // wide accumulator: the sum runs over every reference of the subtree
+            for (int c1=0; c1<count1; c1++) { //ref seq array
+                total += hamming_in_span(seq_arr[c1], query, first, last);
+            }
+            // '<=' means a later subtree wins a tie, as before.
+            if (total <= smallest_total_hamming[c2]) {
+                smallest_total_hamming[c2] = static_cast<double>(total);
+                best_tree_index[c2] = subtree_idx;
+            }
+        }
+    }
+    std::ofstream outFile(argv[5]);
+    if( !outFile.good() ){
+        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    for (int c2=0; c2<count2; c2++){ //query seq array
+        outFile << q_name_arr[c2] << "," << best_tree_index[c2] << ":" << smallest_total_hamming[c2] << '\n';
+    }
+    outFile.close();
+    if( outFile.fail() ){
+        std::cerr << "Error writing '"<<argv[5]<<"'." << std::endl;
+        return -1;
+    }
+    return 0;
+}

bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp ADDED Viewed

@@ -0,0 +1,214 @@
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <omp.h>
+namespace {
+// Distance kept in a vote slot that no reference sequence has claimed yet.
+const int kUnsetDistance = 999999999;
+// Converts a command-line argument to int. Returns false when std::stoi
+// rejects the text, instead of letting its exception terminate the program.
+bool parse_int( const char *text, int &value ){
+    try {
+        value = std::stoi(text);
+    } catch( const std::exception & ){
+        return false;
+    }
+    return true;
+}
+// Reads FASTA records from `path`, appending each header (without the leading
+// '>') to `names` and its concatenated sequence lines to `seqs`.
+// A blank line ends the current record, a record whose header is empty is
+// ignored, and a record containing a sequence line with a space is dropped.
+// All parser state is local, so nothing carries over from one file to the next.
+// Returns false when the file cannot be opened.
+bool read_fasta( const std::string &path, std::vector<std::string> &names, std::vector<std::string> &seqs ){
+    std::ifstream in(path.c_str());
+    if( !in.good() ){
+        return false;
+    }
+    std::string line, name, content;
+    // Testing the stream itself rather than .good() keeps a final line that
+    // is not terminated by a newline (getline sets eofbit but still succeeds).
+    while( std::getline( in, line ) ){
+        if( !line.empty() && line[line.size() - 1] == '\r' ){
+            line.erase(line.size() - 1); // tolerate CRLF line endings
+        }
+        if( line.empty() || line[0] == '>' ){ // Identifier marker
+            if( !name.empty() ){ // store the record that just ended
+                names.push_back(name);
+                seqs.push_back(content);
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = line.substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // store the last record of the file
+        names.push_back(name);
+        seqs.push_back(content);
+    }
+    return true;
+}
+// Counts the columns in [first, last] where `ref` and `query` differ, and
+// stops as soon as the count exceeds `limit`. Columns beyond the end of `ref`
+// count as mismatches instead of being read out of bounds. When `first` is
+// npos (the query has no non-gap column) the distance is 0.
+int bounded_hamming( const std::string &ref, const std::string &query, std::size_t first, std::size_t last, int limit ){
+    if( first == std::string::npos ){
+        return 0;
+    }
+    int mismatches = 0;
+    for( std::size_t i = first; i <= last; ++i ){
+        if( i >= ref.size() || ref[i] != query[i] ){
+            if( ++mismatches > limit ){
+                break;
+            }
+        }
+    }
+    return mismatches;
+}
+} // namespace
+// For every query sequence, picks the subtree whose `votes` closest reference
+// sequences have the smallest summed Hamming distance to the query (compared
+// over the query's non-gap span) and writes one `query,subtree_index:total`
+// line per query. Progress information is printed to standard output.
+//
+// Arguments: [ref infile] [nbr ref records] [query infile] [nbr query records]
+//            [outfile] [nbr of subtrees] [nbr of votes]
+// The first argument is used as a path prefix: subtree k is read from the
+// file named by that prefix followed by k. The two record counts are only
+// used to pre-reserve storage.
+// Returns 0 on success and -1 on a usage, parsing or file error.
+int main( int argc, char **argv ){
+    if( argc <= 7 ){
+        std::cerr << "Usage: "<<argv[0]<<" [ref infile] [nbr ref records] [query infile] [nbr query records] [outfile] [nbr of subtrees] [nbr of votes]" << std::endl;
+        return -1;
+    }
+    int subtree_size = 0;
+    int query_hint = 0;
+    int nbr_subtrees = 0;
+    int size = 0;
+    if( !parse_int(argv[2], subtree_size) || !parse_int(argv[4], query_hint) || !parse_int(argv[6], nbr_subtrees) || !parse_int(argv[7], size) || size <= 0 ){
+        std::cerr << "Error: record counts and the number of subtrees must be integers and the number of votes must be a positive integer." << std::endl;
+        return -1;
+    }
+    // read in query sequences first
+    std::vector<std::string> q_name_arr;
+    std::vector<std::string> q_seq_arr;
+    if( query_hint > 0 ){
+        q_name_arr.reserve(query_hint);
+        q_seq_arr.reserve(query_hint);
+    }
+    if( !read_fasta(argv[3], q_name_arr, q_seq_arr) ){
+        std::cerr << "Error opening '"<<argv[3]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    const int count2 = static_cast<int>(q_seq_arr.size());
+    // Best total so far per query. Kept as double so the values are printed
+    // exactly as before; totals are accumulated in long long first.
+    std::vector<double> smallest_total_hamming(static_cast<std::size_t>(count2), 999999999999999.0);
+    std::vector<int> best_tree_index(static_cast<std::size_t>(count2), 0);
+    std::cout << argv[2] << std::endl;
+    std::cout << nbr_subtrees << std::endl;
+    // read in the reference sequences second, one subtree file at a time
+    const std::string subtree_path = argv[1];
+    std::vector<std::string> name_arr;
+    std::vector<std::string> seq_arr;
+    if( subtree_size > 0 ){
+        name_arr.reserve(subtree_size);
+        seq_arr.reserve(subtree_size);
+    }
+    for (int subtree_idx = 0; subtree_idx < nbr_subtrees; subtree_idx++){
+        const std::string subtree_full_path = subtree_path + std::to_string(subtree_idx);
+        std::cout << subtree_full_path << std::endl;
+        // Start from empty lists so no record of a previous file is kept.
+        name_arr.clear();
+        seq_arr.clear();
+        if( !read_fasta(subtree_full_path, name_arr, seq_arr) ){
+            std::cerr << "Error opening '"<< subtree_full_path <<"'. Bailing out." << std::endl;
+            return -1;
+        }
+        const int count1 = static_cast<int>(seq_arr.size());
+        std::cout << "ref count: "<< count1 <<" query count2: " <<count2 << std::endl;
+        // Each iteration reads the shared sequence vectors and writes only
+        // the result slots that belong to its own query index.
+        #pragma omp parallel for
+        for (int c2=0; c2<count2; c2++){ //query seq array
+            const std::string &query = q_seq_arr[c2];
+            std::vector<int> best_hamming(static_cast<std::size_t>(size), kUnsetDistance);
+            int furthest_index = 0; // slot currently holding the largest kept distance
+            // Only the span between the first and last non-gap query columns is compared.
+            const std::size_t first = query.find_first_not_of('-');
+            const std::size_t last = query.find_last_not_of('-');
+            for (int c1=0; c1<count1; c1++) { //ref seq array
+                const int dist = bounded_hamming(seq_arr[c1], query, first, last, best_hamming[furthest_index]);
+                if (dist <= best_hamming[furthest_index]) {
+                    // Replace the worst kept distance, then locate the new worst one.
+                    best_hamming[furthest_index] = dist;
+                    furthest_index = static_cast<int>(std::max_element(best_hamming.begin(), best_hamming.end()) - best_hamming.begin());
+                }
+            }
+            // Summed in long long: slots still holding the sentinel would
+            // overflow an int once three or more of them are added together.
+            long long total = 0;
+            for (int i=0; i<size; i++){
+                total += best_hamming[i];
+            }
+            // '<=' means a later subtree wins a tie, as before.
+            if (total <= smallest_total_hamming[c2]) {
+                smallest_total_hamming[c2] = static_cast<double>(total);
+                best_tree_index[c2] = subtree_idx;
+            }
+        }
+        std::cout << "finished hamming distance " << std::endl;
+    }
+    std::ofstream outFile(argv[5]);
+    if( !outFile.good() ){
+        std::cerr << "Error opening '"<<argv[5]<<"'. Bailing out." << std::endl;
+        return -1;
+    }
+    for (int c2=0; c2<count2; c2++){ //query seq array
+        outFile << q_name_arr[c2] << "," << best_tree_index[c2] << ":" << smallest_total_hamming[c2] << '\n';
+    }
+    outFile.close();
+    if( outFile.fail() ){
+        std::cerr << "Error writing '"<<argv[5]<<"'." << std::endl;
+        return -1;
+    }
+    return 0;
+}