bioroebe 0.10.80 → 0.12.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3946 -2817
- data/bin/bioroebe +13 -2
- data/bin/bioroebe_hash +7 -0
- data/bin/codon_to_aminoacid +6 -4
- data/bin/compacter +7 -0
- data/bin/plain_palindrome +7 -0
- data/bioroebe.gemspec +3 -3
- data/doc/README.gen +3918 -2793
- data/doc/quality_control/commandline_applications.md +3 -3
- data/doc/statistics/statistics.md +7 -7
- data/doc/todo/bioroebe_GUI_todo.md +19 -14
- data/doc/todo/bioroebe_java_todo.md +22 -0
- data/doc/todo/bioroebe_todo.md +2075 -2620
- data/lib/bioroebe/C++/DNA.cpp +69 -0
- data/lib/bioroebe/C++/RNA.cpp +58 -0
- data/lib/bioroebe/C++/sequence.cpp +35 -0
- data/lib/bioroebe/abstract/README.md +1 -0
- data/lib/bioroebe/abstract/features.rb +29 -0
- data/lib/bioroebe/aminoacids/aminoacid_substitution.rb +1 -9
- data/lib/bioroebe/aminoacids/codon_percentage.rb +1 -9
- data/lib/bioroebe/aminoacids/deduce_aminoacid_sequence.rb +1 -9
- data/lib/bioroebe/aminoacids/display_aminoacid_table.rb +1 -0
- data/lib/bioroebe/aminoacids/show_hydrophobicity.rb +1 -6
- data/lib/bioroebe/base/base_module/base_module.rb +36 -0
- data/lib/bioroebe/base/colours_for_base/colours_for_base.rb +18 -8
- data/lib/bioroebe/base/commandline_application/commandline_application.rb +13 -9
- data/lib/bioroebe/base/commandline_application/commandline_arguments.rb +24 -19
- data/lib/bioroebe/base/commandline_application/misc.rb +66 -49
- data/lib/bioroebe/base/commandline_application/opn.rb +8 -8
- data/lib/bioroebe/base/commandline_application/reset.rb +5 -3
- data/lib/bioroebe/base/internal_hash_module/internal_hash_module.rb +42 -0
- data/lib/bioroebe/base/misc.rb +35 -0
- data/lib/bioroebe/base/prototype/misc.rb +15 -9
- data/lib/bioroebe/base/prototype/reset.rb +10 -0
- data/lib/bioroebe/cleave_and_digest/digestion.rb +10 -2
- data/lib/bioroebe/cleave_and_digest/trypsin.rb +104 -50
- data/lib/bioroebe/codon_tables/frequencies/parse_frequency_table.rb +2 -10
- data/lib/bioroebe/codons/codons.rb +1 -1
- data/lib/bioroebe/codons/convert_this_codon_to_that_aminoacid.rb +208 -59
- data/lib/bioroebe/codons/possible_codons_for_this_aminoacid.rb +1 -9
- data/lib/bioroebe/codons/show_codon_tables.rb +8 -3
- data/lib/bioroebe/codons/show_codon_usage.rb +15 -4
- data/lib/bioroebe/colours/rev.rb +4 -1
- data/lib/bioroebe/constants/aminoacids_and_proteins.rb +1 -0
- data/lib/bioroebe/constants/database_constants.rb +1 -1
- data/lib/bioroebe/constants/files_and_directories.rb +31 -4
- data/lib/bioroebe/constants/misc.rb +20 -0
- data/lib/bioroebe/constants/nucleotides.rb +7 -0
- data/lib/bioroebe/conversions/dna_to_aminoacid_sequence.rb +109 -39
- data/lib/bioroebe/count/count_amount_of_aminoacids.rb +3 -2
- data/lib/bioroebe/count/count_amount_of_nucleotides.rb +3 -0
- data/lib/bioroebe/cpp +1 -0
- data/lib/bioroebe/crystal/README.md +2 -0
- data/lib/bioroebe/crystal/to_rna.cr +19 -0
- data/lib/bioroebe/data/README.md +11 -8
- data/lib/bioroebe/data/electron_microscopy/pos_example.pos +396 -0
- data/lib/bioroebe/data/electron_microscopy/test_particles.star +36 -0
- data/lib/bioroebe/data/fasta/human/Homo_sapiens_hemoglobin_subunit_alpha_HBB_mRNA.fasta +9 -0
- data/lib/bioroebe/data/fasta/human/Homo_sapiens_hemoglobin_subunit_beta_HBB_mRNA.fasta +8 -0
- data/lib/bioroebe/data/fasta/human/README.md +2 -0
- data/lib/bioroebe/dotplots/advanced_dotplot.rb +1 -1
- data/lib/bioroebe/electron_microscopy/coordinate_analyzer.rb +15 -18
- data/lib/bioroebe/{fasta_and_fastq/parse_fasta/run.rb → electron_microscopy/electron_microscopy_module.rb} +16 -8
- data/lib/bioroebe/electron_microscopy/fix_pos_file.rb +1 -9
- data/lib/bioroebe/electron_microscopy/flipy.rb +83 -0
- data/lib/bioroebe/electron_microscopy/parse_coordinates.rb +2 -10
- data/lib/bioroebe/electron_microscopy/read_file_xmd.rb +1 -9
- data/lib/bioroebe/electron_microscopy/simple_star_file_generator.rb +4 -9
- data/lib/bioroebe/enzymes/has_this_restriction_enzyme.rb +10 -3
- data/lib/bioroebe/enzymes/restriction_enzyme.rb +23 -1
- data/lib/bioroebe/enzymes/restriction_enzymes/statistics.rb +65 -0
- data/lib/bioroebe/fasta_and_fastq/autocorrect_the_name_of_this_fasta_file.rb +1 -9
- data/lib/bioroebe/fasta_and_fastq/compact_fasta_file/compact_fasta_file.rb +7 -9
- data/lib/bioroebe/fasta_and_fastq/fasta_defline/fasta_defline.rb +1 -5
- data/lib/bioroebe/fasta_and_fastq/fasta_to_yaml/fasta_to_yaml.rb +81 -0
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb +1518 -7
- data/lib/bioroebe/fasta_and_fastq/return_fasta_subsection_of_this_file.rb +11 -2
- data/lib/bioroebe/fasta_and_fastq/show_fasta_headers.rb +27 -12
- data/lib/bioroebe/fasta_and_fastq/simplify_fasta_header/simplify_fasta_header.rb +1 -5
- data/lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/constants.rb +0 -5
- data/lib/bioroebe/genome/README.md +4 -0
- data/lib/bioroebe/genome/genome.rb +130 -0
- data/lib/bioroebe/genomes/genome_pattern.rb +3 -9
- data/lib/bioroebe/gui/gtk +1 -0
- data/lib/bioroebe/gui/gtk3/alignment/alignment.rb +106 -137
- data/lib/bioroebe/gui/gtk3/aminoacid_composition/aminoacid_composition.rb +27 -61
- data/lib/bioroebe/gui/gtk3/aminoacid_composition/customized_dialog.rb +1 -1
- data/lib/bioroebe/gui/gtk3/blosum_matrix_viewer/blosum_matrix_viewer.rb +1 -2
- data/lib/bioroebe/gui/gtk3/calculate_cell_numbers_of_bacteria/calculate_cell_numbers_of_bacteria.rb +1 -2
- data/lib/bioroebe/gui/gtk3/controller/controller.rb +46 -29
- data/lib/bioroebe/gui/gtk3/dna_to_aminoacid_widget/dna_to_aminoacid_widget.rb +77 -52
- data/lib/bioroebe/gui/gtk3/dna_to_reverse_complement_widget/dna_to_reverse_complement_widget.rb +1 -2
- data/lib/bioroebe/gui/gtk3/fasta_table_widget/fasta_table_widget.rb +100 -23
- data/lib/bioroebe/gui/gtk3/format_converter/format_converter.rb +1 -2
- data/lib/bioroebe/gui/gtk3/gene/gene.rb +1 -2
- data/lib/bioroebe/gui/gtk3/hamming_distance/hamming_distance.rb +43 -30
- data/lib/bioroebe/gui/gtk3/levensthein_distance/levensthein_distance.rb +1 -2
- data/lib/bioroebe/gui/gtk3/nucleotide_analyser/nucleotide_analyser.rb +120 -73
- data/lib/bioroebe/gui/gtk3/primer_design_widget/primer_design_widget.rb +1 -2
- data/lib/bioroebe/gui/gtk3/protein_to_DNA/protein_to_DNA.rb +19 -20
- data/lib/bioroebe/gui/gtk3/random_sequence/random_sequence.rb +20 -13
- data/lib/bioroebe/gui/gtk3/restriction_enzymes/restriction_enzymes.rb +1 -2
- data/lib/bioroebe/gui/gtk3/show_codon_table/misc.rb +97 -22
- data/lib/bioroebe/gui/gtk3/show_codon_table/show_codon_table.rb +3 -73
- data/lib/bioroebe/gui/gtk3/show_codon_usage/show_codon_usage.rb +1 -2
- data/lib/bioroebe/gui/gtk3/sizeseq/sizeseq.rb +1 -2
- data/lib/bioroebe/gui/gtk3/three_to_one/three_to_one.rb +1 -2
- data/lib/bioroebe/gui/gtk3/www_finder/www_finder.rb +1 -2
- data/lib/bioroebe/gui/javafx/bioroebe/Bioroebe.class +0 -0
- data/lib/bioroebe/gui/javafx/bioroebe/Bioroebe.java +104 -0
- data/lib/bioroebe/gui/javafx/bioroebe.jar +0 -0
- data/lib/bioroebe/gui/javafx/bioroebe.mf +1 -0
- data/lib/bioroebe/gui/javafx/module-info.class +0 -0
- data/lib/bioroebe/gui/javafx/module-info.java +5 -0
- data/lib/bioroebe/gui/jruby/alignment/alignment.rb +165 -0
- data/lib/bioroebe/gui/jruby/aminoacid_composition/aminoacid_composition.rb +166 -0
- data/lib/bioroebe/gui/libui/alignment/alignment.rb +3 -1
- data/lib/bioroebe/gui/libui/controller/controller.rb +116 -0
- data/lib/bioroebe/gui/libui/random_sequence/random_sequence.rb +18 -2
- data/lib/bioroebe/gui/libui/show_codon_table/show_codon_table.rb +2 -0
- data/lib/bioroebe/gui/libui/three_to_one/three_to_one.rb +8 -6
- data/lib/bioroebe/gui/shared_code/alignment/alignment_module.rb +102 -0
- data/lib/bioroebe/gui/shared_code/aminoacid_composition/aminoacid_composition_module.rb +94 -0
- data/lib/bioroebe/gui/shared_code/levensthein_distance/levensthein_distance_module.rb +18 -16
- data/lib/bioroebe/gui/shared_code/protein_to_DNA/protein_to_DNA_module.rb +14 -14
- data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne$1.class +0 -0
- data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne$CloseListener.class +0 -0
- data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne.class +0 -0
- data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne.java +141 -0
- data/lib/bioroebe/images/FORWARD_PRIMER.png +0 -0
- data/lib/bioroebe/images/REVERSE_PRIMER.png +0 -0
- data/lib/bioroebe/images/images.html +29845 -0
- data/lib/bioroebe/java/README.md +5 -0
- data/lib/bioroebe/java/bioroebe/AllInOne.java +1 -0
- data/lib/bioroebe/java/bioroebe/Base.class +0 -0
- data/lib/bioroebe/java/bioroebe/Base.java +39 -5
- data/lib/bioroebe/java/bioroebe/IsPalindrome.java +23 -5
- data/lib/bioroebe/java/bioroebe/SanitizeNucleotideSequence.java +0 -0
- data/lib/bioroebe/java/bioroebe/Sequence.java +28 -3
- data/lib/bioroebe/java/bioroebe/ToCamelcase.class +0 -0
- data/lib/bioroebe/java/bioroebe/ToCamelcase.java +16 -4
- data/lib/bioroebe/java/bioroebe/ToRNA.java +43 -0
- data/lib/bioroebe/java/bioroebe/ToplevelMethods.java +6 -0
- data/lib/bioroebe/java/bioroebe/{BisulfiteTreatment.class → src/BisulfiteTreatment.class} +0 -0
- data/lib/bioroebe/java/bioroebe/{Codons.class → src/Codons.class} +0 -0
- data/lib/bioroebe/java/bioroebe/src/Codons.java +35 -0
- data/lib/bioroebe/java/bioroebe/src/Commandline.class +0 -0
- data/lib/bioroebe/java/bioroebe/src/Commandline.java +101 -0
- data/lib/bioroebe/java/bioroebe/{Esystem.class → src/Esystem.class} +0 -0
- data/lib/bioroebe/java/bioroebe/{Esystem.java → src/Esystem.java} +6 -1
- data/lib/bioroebe/java/bioroebe/{GenerateRandomDnaSequence.class → src/GenerateRandomDnaSequence.class} +0 -0
- data/lib/bioroebe/java/bioroebe/{GenerateRandomDnaSequence.java → src/GenerateRandomDnaSequence.java} +8 -2
- data/lib/bioroebe/java/bioroebe/src/PartnerNucleotide.class +0 -0
- data/lib/bioroebe/java/bioroebe/src/PartnerNucleotide.java +56 -0
- data/lib/bioroebe/java/bioroebe/{RemoveFile.java → src/RemoveFile.java} +10 -4
- data/lib/bioroebe/java/bioroebe/{RemoveNumbers.class → src/RemoveNumbers.class} +0 -0
- data/lib/bioroebe/java/bioroebe/{RemoveNumbers.java → src/RemoveNumbers.java} +1 -0
- data/lib/bioroebe/java/bioroebe/src/toplevel_methods/BaseComposition.class +0 -0
- data/lib/bioroebe/java/bioroebe/src/toplevel_methods/BaseComposition.java +75 -0
- data/lib/bioroebe/misc/ruler.rb +11 -2
- data/lib/bioroebe/nucleotides/most_likely_nucleotide_sequence_for_this_aminoacid_sequence.rb +1 -9
- data/lib/bioroebe/nucleotides/sanitize_nucleotide_sequence.rb +59 -18
- data/lib/bioroebe/nucleotides/show_nucleotide_sequence.rb +7 -7
- data/lib/bioroebe/parsers/genbank_parser.rb +347 -26
- data/lib/bioroebe/parsers/gff.rb +1 -9
- data/lib/bioroebe/patterns/scan_for_repeat.rb +1 -5
- data/lib/bioroebe/pdb/fetch_fasta_sequence_from_pdb.rb +1 -9
- data/lib/bioroebe/pdb/parse_mmCIF_file.rb +1 -9
- data/lib/bioroebe/pdb/parse_pdb_file.rb +4 -10
- data/lib/bioroebe/project/project.rb +1 -1
- data/lib/bioroebe/python/README.md +1 -0
- data/lib/bioroebe/python/__pycache__/mymodule.cpython-39.pyc +0 -0
- data/lib/bioroebe/python/gui/gtk3/all_in_one.css +4 -0
- data/lib/bioroebe/python/gui/gtk3/all_in_one.py +59 -0
- data/lib/bioroebe/python/gui/gtk3/widget1.py +20 -0
- data/lib/bioroebe/python/gui/tkinter/all_in_one.py +91 -0
- data/lib/bioroebe/python/mymodule.py +8 -0
- data/lib/bioroebe/python/protein_to_dna.py +33 -0
- data/lib/bioroebe/python/shell/shell.py +19 -0
- data/lib/bioroebe/python/to_rna.py +14 -0
- data/lib/bioroebe/python/toplevel_methods/convert_dna_to_aminoacid_sequence.py +137 -0
- data/lib/bioroebe/python/toplevel_methods/esystem.py +12 -0
- data/lib/bioroebe/python/toplevel_methods/open_in_browser.py +20 -0
- data/lib/bioroebe/python/toplevel_methods/palindromes.py +52 -0
- data/lib/bioroebe/python/toplevel_methods/rds.py +13 -0
- data/lib/bioroebe/python/toplevel_methods/shuffleseq.py +23 -0
- data/lib/bioroebe/python/toplevel_methods/three_delimiter.py +37 -0
- data/lib/bioroebe/python/toplevel_methods/time_and_date.py +43 -0
- data/lib/bioroebe/python/toplevel_methods/to_camelcase.py +21 -0
- data/lib/bioroebe/requires/require_cleave_and_digest.rb +3 -1
- data/lib/bioroebe/requires/require_the_bioroebe_project.rb +3 -1
- data/lib/bioroebe/sequence/alignment.rb +14 -4
- data/lib/bioroebe/sequence/dna.rb +1 -0
- data/lib/bioroebe/sequence/nucleotide_module/nucleotide_module.rb +28 -25
- data/lib/bioroebe/sequence/protein.rb +105 -3
- data/lib/bioroebe/sequence/rna.rb +220 -0
- data/lib/bioroebe/sequence/sequence.rb +128 -40
- data/lib/bioroebe/shell/menu.rb +3815 -3696
- data/lib/bioroebe/shell/misc.rb +9019 -3133
- data/lib/bioroebe/shell/readline/readline.rb +1 -1
- data/lib/bioroebe/shell/shell.rb +1137 -28
- data/lib/bioroebe/siRNA/siRNA.rb +81 -1
- data/lib/bioroebe/string_matching/find_longest_substring.rb +3 -2
- data/lib/bioroebe/string_matching/hamming_distance.rb +1 -9
- data/lib/bioroebe/taxonomy/class_methods.rb +3 -8
- data/lib/bioroebe/taxonomy/constants.rb +4 -3
- data/lib/bioroebe/taxonomy/edit.rb +2 -1
- data/lib/bioroebe/taxonomy/help/help.rb +10 -10
- data/lib/bioroebe/taxonomy/help/helpline.rb +2 -2
- data/lib/bioroebe/taxonomy/info/check_available.rb +15 -9
- data/lib/bioroebe/taxonomy/info/info.rb +18 -11
- data/lib/bioroebe/taxonomy/info/is_dna.rb +46 -36
- data/lib/bioroebe/taxonomy/interactive.rb +140 -104
- data/lib/bioroebe/taxonomy/menu.rb +27 -18
- data/lib/bioroebe/taxonomy/parse_fasta.rb +3 -1
- data/lib/bioroebe/taxonomy/shared.rb +1 -0
- data/lib/bioroebe/taxonomy/taxonomy.rb +1 -0
- data/lib/bioroebe/toplevel_methods/aminoacids_and_proteins.rb +31 -24
- data/lib/bioroebe/toplevel_methods/colourize_related_methods.rb +164 -0
- data/lib/bioroebe/toplevel_methods/databases.rb +1 -1
- data/lib/bioroebe/toplevel_methods/digest.rb +18 -8
- data/lib/bioroebe/toplevel_methods/fasta_and_fastq.rb +107 -63
- data/lib/bioroebe/toplevel_methods/file_and_directory_related_actions.rb +14 -2
- data/lib/bioroebe/toplevel_methods/frequencies.rb +8 -1
- data/lib/bioroebe/toplevel_methods/misc.rb +175 -11
- data/lib/bioroebe/toplevel_methods/nucleotides.rb +118 -46
- data/lib/bioroebe/toplevel_methods/open_in_browser.rb +2 -0
- data/lib/bioroebe/toplevel_methods/palindromes.rb +75 -47
- data/lib/bioroebe/toplevel_methods/taxonomy.rb +3 -3
- data/lib/bioroebe/toplevel_methods/to_camelcase.rb +5 -0
- data/lib/bioroebe/utility_scripts/align_open_reading_frames.rb +1 -9
- data/lib/bioroebe/utility_scripts/check_for_mismatches/check_for_mismatches.rb +1 -9
- data/lib/bioroebe/utility_scripts/compacter/compacter.rb +251 -0
- data/lib/bioroebe/utility_scripts/compseq/compseq.rb +1 -9
- data/lib/bioroebe/utility_scripts/consensus_sequence.rb +6 -6
- data/lib/bioroebe/utility_scripts/create_batch_entrez_file.rb +1 -9
- data/lib/bioroebe/utility_scripts/dot_alignment.rb +1 -9
- data/lib/bioroebe/utility_scripts/move_file_to_its_correct_location.rb +1 -4
- data/lib/bioroebe/utility_scripts/parse_taxonomy.rb +2 -2
- data/lib/bioroebe/utility_scripts/permutations.rb +36 -9
- data/lib/bioroebe/utility_scripts/showorf/constants.rb +0 -5
- data/lib/bioroebe/utility_scripts/showorf/reset.rb +1 -4
- data/lib/bioroebe/version/version.rb +2 -2
- data/lib/bioroebe/www/embeddable_interface.rb +121 -58
- data/lib/bioroebe/www/sinatra/sinatra.rb +186 -71
- data/lib/bioroebe/yaml/aminoacids/amino_acids_long_name_to_one_letter.yml +2 -2
- data/lib/bioroebe/yaml/aminoacids/weight_of_common_proteins.yml +17 -17
- data/lib/bioroebe/yaml/configuration/browser.yml +1 -1
- data/lib/bioroebe/yaml/configuration/temp_dir.yml +1 -1
- data/lib/bioroebe/yaml/consensus_sequences/consensus_sequences.yml +1 -0
- data/lib/bioroebe/yaml/genomes/README.md +3 -4
- data/lib/bioroebe/yaml/nucleotides/nucleotides.yml +5 -0
- data/lib/bioroebe/yaml/restriction_enzymes/restriction_enzymes.yml +57 -57
- data/spec/README.md +6 -0
- data/spec/project_wide_specification/classes.md +5 -0
- metadata +107 -70
- data/doc/setup.rb +0 -1655
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/constants.rb +0 -50
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/initialize.rb +0 -86
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/menu.rb +0 -117
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/misc.rb +0 -981
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/report.rb +0 -156
- data/lib/bioroebe/fasta_and_fastq/parse_fasta/reset.rb +0 -128
- data/lib/bioroebe/genbank/genbank_parser.rb +0 -291
- data/lib/bioroebe/java/bioroebe/AllInOne.class +0 -0
- data/lib/bioroebe/java/bioroebe/Cat.class +0 -0
- data/lib/bioroebe/java/bioroebe/Codons.java +0 -22
- data/lib/bioroebe/java/bioroebe/IsPalindrome.class +0 -0
- data/lib/bioroebe/java/bioroebe/PartnerNucleotide.class +0 -0
- data/lib/bioroebe/java/bioroebe/PartnerNucleotide.java +0 -19
- data/lib/bioroebe/java/bioroebe/SanitizeNucleotideSequence.class +0 -0
- data/lib/bioroebe/java/bioroebe/ToplevelMethods.class +0 -0
- data/lib/bioroebe/java/bioroebe.jar +0 -0
- data/lib/bioroebe/shell/add.rb +0 -108
- data/lib/bioroebe/shell/assign.rb +0 -360
- data/lib/bioroebe/shell/chop_and_cut.rb +0 -281
- data/lib/bioroebe/shell/constants.rb +0 -166
- data/lib/bioroebe/shell/download.rb +0 -335
- data/lib/bioroebe/shell/enable_and_disable.rb +0 -158
- data/lib/bioroebe/shell/enzymes.rb +0 -310
- data/lib/bioroebe/shell/fasta.rb +0 -345
- data/lib/bioroebe/shell/gtk.rb +0 -76
- data/lib/bioroebe/shell/history.rb +0 -132
- data/lib/bioroebe/shell/initialize.rb +0 -217
- data/lib/bioroebe/shell/loop.rb +0 -74
- data/lib/bioroebe/shell/prompt.rb +0 -107
- data/lib/bioroebe/shell/random.rb +0 -289
- data/lib/bioroebe/shell/reset.rb +0 -335
- data/lib/bioroebe/shell/scan_and_parse.rb +0 -135
- data/lib/bioroebe/shell/search.rb +0 -337
- data/lib/bioroebe/shell/sequences.rb +0 -200
- data/lib/bioroebe/shell/show_report_and_display.rb +0 -2901
- data/lib/bioroebe/shell/startup.rb +0 -127
- data/lib/bioroebe/shell/taxonomy.rb +0 -14
- data/lib/bioroebe/shell/tk.rb +0 -23
- data/lib/bioroebe/shell/user_input.rb +0 -88
- data/lib/bioroebe/shell/xorg.rb +0 -45
- data/lib/bioroebe/utility_scripts/compacter.rb +0 -131
- /data/lib/bioroebe/java/bioroebe/{BisulfiteTreatment.java → src/BisulfiteTreatment.java} +0 -0
- /data/lib/bioroebe/java/bioroebe/{RemoveFile.class → src/RemoveFile.class} +0 -0
@@ -2,15 +2,1526 @@
|
|
2
2
|
# Encoding: UTF-8
|
3
3
|
# frozen_string_literal: true
|
4
4
|
# =========================================================================== #
|
5
|
+
# === Bioroebe::ParseFasta
|
6
|
+
#
|
7
|
+
# This class will parse through a local FASTA file and find the
|
8
|
+
# proper entries.
|
9
|
+
#
|
10
|
+
# A FASTA file may have nucleotides or an aminoacid-sequence, so
|
11
|
+
# we have to keep this in mind when parsing it.
|
12
|
+
#
|
13
|
+
# Usage examples:
|
14
|
+
#
|
15
|
+
# Bioroebe::ParseFasta.new(ARGV)
|
16
|
+
# Bioroebe.parse_fasta(ARGV)
|
17
|
+
#
|
18
|
+
# =========================================================================== #
|
5
19
|
# require 'bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb'
|
6
|
-
# Bioroebe
|
20
|
+
# Bioroebe.parse_fasta
|
21
|
+
# Bioroebe.sizeseq
|
7
22
|
# =========================================================================== #
|
8
23
|
require 'bioroebe/base/commandline_application/commandline_application.rb'
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
24
|
+
|
25
|
+
module Bioroebe
|
26
|
+
|
27
|
+
class ParseFasta < ::Bioroebe::CommandlineApplication # === Bioroebe::ParseFasta
|
28
|
+
|
29
|
+
require 'bioroebe/sequence/dna.rb'
|
30
|
+
require 'bioroebe/calculate/calculate_gc_content.rb'
|
31
|
+
|
32
|
+
# ========================================================================= #
# === REGEX_NON_NUCLEOTIDES
#
# Matches any single uppercase letter that is not treated as a
# nucleotide.
#
# The pattern is a character class containing every letter of the
# alphabet except A, C, G, T, U and N. N is excluded because it may
# stand for "any" nucleotide.
#
# NOTE(review): other ambiguity letters, such as R or Y, are part of
# the class and are thus reported as non-nucleotides - confirm that
# this is the intended behaviour.
# ========================================================================= #
REGEX_NON_NUCLEOTIDES =
  /[BDEFHIJKLMOPQRSVWXYZ]/
|
42
|
+
|
43
|
+
# ========================================================================= #
# === DEFAULT_FASTA
#
# A small FASTA dataset holding three records (Rosalind_6404,
# Rosalind_5959 and Rosalind_0808). It can be used to quickly test
# code that depends on FASTA entries. The individual lines are joined
# via a newline; there is no trailing newline.
# ========================================================================= #
DEFAULT_FASTA = [
  '>Rosalind_6404',
  'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC',
  'TCCCACTAATAATTCTGAGG',
  '>Rosalind_5959',
  'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT',
  'ATATCCATTTGTCAGCAGACACGC',
  '>Rosalind_0808',
  'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC',
  'TGGGAACCTGCGGGCAGTAGGTGGAAT'
].join("\n").freeze

# ========================================================================= #
# === DEFAULT_ROUND_TO
#
# Integer constant with the value 2.
#
# NOTE(review): presumably the number of decimal places used when
# rounding values - confirm against set_round_to().
# ========================================================================= #
DEFAULT_ROUND_TO = 2
|
63
|
+
|
64
|
+
# ========================================================================= #
|
65
|
+
# === initialize
|
66
|
+
# ========================================================================= #
|
67
|
+
# Prepares a new instance: calls reset(), applies the options returned
# by the optional block, registers the input via
# set_commandline_arguments() and finally calls run(), unless this
# was disabled.
#
# Arguments:
#
#   i           - the input that is passed to set_commandline_arguments();
#                 defaults to DEFAULT_FASTA
#   run_already - true by default; a falsy value, :dont_run_yet or
#                 :do_not_run_yet will skip the call to run()
#   block       - optional; the value returned by the block may be one
#                 of the Symbols handled below, or a Hash that may
#                 carry the keys :be_verbose, :use_colours and :sizeseq
def initialize(
    i           = DEFAULT_FASTA,
    run_already = true,
    &block
  )
  reset
  if block_given?
    options = yield
    if options.is_a?(Hash)
      # ===================================================================== #
      # === Hashes
      #
      # Note that :be_verbose and :use_colours are removed from the
      # Hash that was yielded.
      # ===================================================================== #
      if options.key?(:be_verbose)
        set_be_verbose(options.delete(:be_verbose))
        @internal_hash[:report_the_sequence] = true
      end
      set_use_colours(options.delete(:use_colours)) if options.key?(:use_colours)
      # The mere presence of :sizeseq enables sorting; its value is ignored.
      @sort_by_size = true if options.key?(:sizeseq)
    else
      # ===================================================================== #
      # === Symbols
      # ===================================================================== #
      case options
      when :be_verbose, :verbose
        set_be_verbose_and_report_the_sequence
      when :be_quiet, :be_silent
        be_quiet
      when :sizeseq
        @sort_by_size = true
      end
    end
  end
  set_commandline_arguments(i)
  # ======================================================================= #
  # Both Symbols below request that run() shall not be called yet.
  # ======================================================================= #
  postpone_the_run = %i[dont_run_yet do_not_run_yet].include?(run_already)
  run if run_already && !postpone_the_run
end
|
138
|
+
|
139
|
+
# ========================================================================= #
|
140
|
+
# === reset (reset tag)
|
141
|
+
# ========================================================================= #
|
142
|
+
# Restores the default state of the instance. The reset() of the
# parent class is invoked first, then the namespace is inferred, and
# afterwards all instance variables used by this class are set to
# their initial values.
def reset
  super()
  infer_the_namespace
  # ======================================================================= #
  # === @is_a_genbank_file
  # ======================================================================= #
  @is_a_genbank_file = false
  # ======================================================================= #
  # === @input_file
  #
  # Denotes which input file is used to read data from. It is nil
  # initially because reading from an existing file may be skipped,
  # e. g. when the data comes from a String or another non-file entity.
  #
  # === @current_key
  # ======================================================================= #
  @input_file = @current_key = nil
  # ======================================================================= #
  # === @hash
  #
  # The main variable of the class. It will keep entries such as:
  #
  #   {
  #     "ENSMUSG00000020122|ENSMUST08" => "CCCTCC"
  #   }
  #
  # ======================================================================= #
  @hash = {}
  # ======================================================================= #
  # === @internal_hash
  #
  # This Hash holds the internal configuration of the class.
  # ======================================================================= #
  @internal_hash = {
    report_the_sequence:                      false,
    overwrite_the_original_file:              false,
    save_the_file:                            false,
    remove_numbers_from_input:                false,
    # If true then the .fasta file at hand will be modified.
    sanitize_the_file:                        false,
    # If set to true via the commandline then report() will show the
    # translated protein sequence as well.
    show_the_translated_protein_sequence:     false,
    # By default the output includes newlines for the sequence. The
    # flag --one-line enables a condensed output without newlines.
    condense_the_sequence_onto_a_single_line: false,
    # If this is a number rather than nil then only that many
    # nucleotides will be displayed, e. g. 1000.
    limit_the_display_to_n_nucleotides:       nil
  }
  # ======================================================================= #
  # === @may_we_exit
  # ======================================================================= #
  @may_we_exit = false
  # ======================================================================= #
  # === @use_opn
  # ======================================================================= #
  @use_opn = ::Bioroebe.use_opn?
  # ======================================================================= #
  # === @colourize_sequence
  # ======================================================================= #
  @colourize_sequence = false
  # ======================================================================= #
  # === @sort_by_size
  #
  # If true then a sizeseq-comparison is run: all sequences are compared
  # and output in a size-sorted manner, similar to the EMBOSS sizeseq
  # action.
  # ======================================================================= #
  @sort_by_size = false
  # ======================================================================= #
  # === @show_the_header
  #
  # If this variable is true then the header will be shown.
  # ======================================================================= #
  @show_the_header = false
  set_round_to(:default)
  set_be_verbose
end
|
259
|
+
|
260
|
+
# ========================================================================= #
|
261
|
+
# === menu (menu tag)
|
262
|
+
# ========================================================================= #
|
263
|
+
def menu(
    i = return_commandline_arguments_that_are_not_files
  )
  # Commandline dispatcher. An Array is handled entry by entry (and the
  # Array itself is returned); any other value is compared against the
  # known flags below. Values that match no flag are ignored.
  return i.each { |flag| menu(flag) } if i.is_a? Array

  case i # case tag
  # === --sanitize-the-file
  #
  # Only records the request in @internal_hash.
  when /^-?-?sanitize(-|_)?the(-|_)?file$/i
    @internal_hash[:sanitize_the_file] = true
  # === --to-protein
  #
  # Aliases: --convert and --translate.
  #
  #   pfasta *.fasta --toprotein
  when /^-?-?to(-|_)?protein$/i,
       /^-?-?convert$/i,
       /^-?-?translate$/i
    @internal_hash[:show_the_translated_protein_sequence] = true
  # === --one-line
  #
  #   pfasta rpoS_NC_000913.3.fasta --one-line
  when /^-?-?one(-|_)?liner?/i
    @internal_hash[:condense_the_sequence_onto_a_single_line] = true
  # === --limit=1000
  #
  #   pfasta --limit=1000
  when /^-?-?limit=(\d+)$/i
    @internal_hash[:limit_the_display_to_n_nucleotides] =
      Regexp.last_match(1).to_i
  # === --overwrite
  when /^-?-?overwrite/i
    @internal_hash[:overwrite_the_original_file] = true
  # === --help
  #
  #   parse_fasta --help
  when /^-?-?help/i
    show_help
    exit
  # === --save-file
  when /^-?-?save(-|_)?file/i
    @internal_hash[:save_the_file] = true
  # === --also-show-the-sequence
  #
  #   parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --show
  when /^-?-?also(-|_)?show(-|_)?the(-|_)?sequence$/i,
       /^-?-?report$/i,
       /^-?-?show$/i
    @internal_hash[:report_the_sequence] = true
  # === --header
  when /^-?-?header/i
    do_show_the_header
  # === --short
  #
  # Restricts the display to 300 nucleotides.
  when /^-?-?short/i
    @internal_hash[:limit_the_display_to_n_nucleotides] = 300
  # === --size
  #
  # Processes the files quietly, reports the size and then exits.
  #
  #   parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --size
  when /^-?-?size$/i
    set_be_quiet
    do_process_the_commandline_arguments_that_are_files
    erev size? # Report the size here.
    exit
  end
end
|
380
|
+
|
381
|
+
# ========================================================================= #
|
382
|
+
# === show_the_translated_protein_sequence?
|
383
|
+
# ========================================================================= #
|
384
|
+
def show_the_translated_protein_sequence?
  # Reader for the flag that menu() sets for --to-protein and its aliases.
  @internal_hash[:show_the_translated_protein_sequence]
end
|
387
|
+
|
388
|
+
# ========================================================================= #
|
389
|
+
# === set_round_to
|
390
|
+
#
|
391
|
+
# This sets the number of decimal places that values are rounded to. This is
|
392
|
+
# mostly done for display-purposes, hence why the default is a fairly
|
393
|
+
# low value.
|
394
|
+
# ========================================================================= #
|
395
|
+
def set_round_to(
    i = :default
  )
  # The symbol :default is a placeholder for the DEFAULT_ROUND_TO
  # constant. Whatever value results is stored as an Integer.
  i = DEFAULT_ROUND_TO if :default == i
  @internal_hash[:round_to] = i.to_i
end
|
409
|
+
|
410
|
+
# ========================================================================= #
|
411
|
+
# === do_process_the_commandline_arguments_that_are_files
|
412
|
+
# ========================================================================= #
|
413
|
+
def do_process_the_commandline_arguments_that_are_files(
    these_files = commandline_arguments_that_are_files?
  )
  # Runs the load / split / report pipeline once per given file. A
  # non-Array argument is wrapped (and nil is dropped) first.
  these_files = [these_files].flatten.compact unless these_files.is_a? Array
  these_files.each do |this_file|
    set_input_file(this_file)
    set_data # This will use the default file.
    split_into_proper_sections
    report_the_FASTA_header if @show_the_header
    if @sort_by_size
      # Size-sorted output replaces every other report for this file.
      run_sizeseq_comparison
      next
    end
    if is_the_sequence_a_polypeptide?
      # === Handle cases where the input is a protein
      if be_verbose?
        erev "This sequence is assumed to be a #{royalblue('protein')}#{rev}."
        report_how_many_elements_we_have_found
      end
    else
      # === Else it must be RNA or DNA
      if be_verbose?
        erev "This sequence is assumed to "\
             "be #{royalblue('DNA')}#{rev} or #{royalblue('RNA')}#{rev}."
      end
      calculate_gc_content # GC content makes only sense for nucleotides.
      report_how_many_elements_we_have_found if be_verbose?
    end
    next unless be_verbose?
    report_the_nucleotide_composition
    report_on_how_many_entries_we_did_work
    do_report_the_sequence if report_the_sequence?
  end
end
|
456
|
+
|
457
|
+
# ========================================================================= #
|
458
|
+
# === sanitize_the_description
|
459
|
+
#
|
460
|
+
# This method will iterate over the description entry and sanitize
|
461
|
+
# it. In this context sanitizing means to add the "length" entry,
|
462
|
+
# and the "type" entry, such as in:
|
463
|
+
#
|
464
|
+
# " # length=231; type=dna"
|
465
|
+
#
|
466
|
+
# ========================================================================= #
|
467
|
+
def sanitize_the_description
  # Appends " # length=N; type=dna" to every header line (one starting
  # with '>') that does not carry a "length=" entry yet. @data is
  # modified in place and returned.
  #
  # The @hash lookup uses the header without the leading '>' and
  # without the line ending, and the annotation is inserted before
  # the line ending rather than after it.
  @data.map! { |line|
    if line.start_with?('>') and !line.include?('length=')
      header      = line.chomp
      line_ending = line[header.size .. -1].to_s
      key         = header[1 .. -1]
      length      = @hash.has_key?(key) ? @hash[key].size : 0
      # Currently hardcoded to DNA.
      "#{header} # length=#{length}; type=dna#{line_ending}"
    else
      line
    end
  }
end
|
479
|
+
|
480
|
+
# ========================================================================= #
|
481
|
+
# === entries?
|
482
|
+
# ========================================================================= #
|
483
|
+
def entries?
  # Plain reader for @data.
  @data
end
|
486
|
+
|
487
|
+
# ========================================================================= #
|
488
|
+
# === we_may_exit
|
489
|
+
# ========================================================================= #
|
490
|
+
def we_may_exit
  # Only raises the @may_we_exit flag; nothing is terminated here.
  @may_we_exit = true
end
|
493
|
+
|
494
|
+
# ========================================================================= #
|
495
|
+
# === output_results
|
496
|
+
# ========================================================================= #
|
497
|
+
def output_results
  # Pretty-prints the header => sequence Hash as-is.
  pp(@hash)
end
|
500
|
+
|
501
|
+
# ========================================================================= #
|
502
|
+
# === do_report_the_sequence (report tag)
|
503
|
+
#
|
504
|
+
# This method is used to display the main sequence at hand.
|
505
|
+
# ========================================================================= #
|
506
|
+
def do_report_the_sequence
  # Displays the main sequence and, when requested, its translation.
  #
  # All work happens on a copy, so neither the --limit truncation nor
  # the colourizing alters the sequence returned by main_sequence?.
  sequence = main_sequence?.to_s.dup
  # ======================================================================= #
  # Honour the --limit commandline flag next. A (start, length) slice
  # is used so that a limit of 0 yields nothing instead of the range
  # 0..-1, which would select the whole sequence.
  # ======================================================================= #
  limit = @internal_hash[:limit_the_display_to_n_nucleotides]
  sequence = sequence[0, [limit.to_i, 0].max] if limit
  sequence = sequence.delete("\n") if condense_the_sequence_onto_a_single_line?
  # ======================================================================= #
  # Colour codes go onto a separate copy: the translation below must
  # see the plain residues only.
  # ======================================================================= #
  displayed = sequence.dup
  if @colourize_sequence && is_polynucleotide?
    displayed.gsub!(/A/, teal('A')+rev)
    displayed.gsub!(/C/, slateblue('C')+rev)
    displayed.gsub!(/G/, royalblue('G')+rev)
    displayed.gsub!(/T/, steelblue('T')+rev)
    displayed.gsub!(/U/, steelblue('U')+rev)
  end
  erev colourize_this_nucleotide_sequence(displayed)
  e if condense_the_sequence_onto_a_single_line?
  if show_the_translated_protein_sequence?
    # ===================================================================== #
    # Do show the translated protein sequence next:
    # ===================================================================== #
    translated_into_aa = Bioroebe.to_aa(sequence)
    translated_into_aa_and_colourized = translated_into_aa.dup
    if translated_into_aa.include? '*'
      translated_into_aa_and_colourized = translated_into_aa.gsub(/\*/,tomato('*'))
    end
    erev 'The translated aminoacid sequence of '+
         sfancy(translated_into_aa.size.to_s)+rev+
         ' aminoacids is:'
    e
    erev steelblue(" #{translated_into_aa_and_colourized}")
    e
  end
end; alias display do_report_the_sequence # === display
     alias report  do_report_the_sequence # === report
|
550
|
+
|
551
|
+
# ========================================================================= #
|
552
|
+
# === report_the_nucleotide_composition
|
553
|
+
# ========================================================================= #
|
554
|
+
def report_the_nucleotide_composition
  # Reports the composition of the first sequence in @hash: counts and
  # percentages of A, T, C and G for a polynucleotide, or the plain
  # residue string for a protein.
  if is_this_sequence_a_polynucleotide_sequence?
    # Line breaks are removed before measuring so that they do not
    # count towards the total.
    first = @hash.values.first.to_s.delete("\n").upcase
    total_size = first.size
    # An empty sequence is reported as 0.0% rather than NaN.
    percentage = lambda { |count|
      total_size.zero? ? 0.0 : (count * 100.0 / total_size).round(2)
    }
    parts = %w( A T C G ).map { |nucleotide|
      count = first.count(nucleotide)
      "#{steelblue(count)}#{rev}x #{nucleotide} (#{percentage.call(count)}%)"
    }
    erev "The nucleotide composition is as follows:"
    e " #{parts.join(', ')}"
  elsif is_a_protein?
    # ===================================================================== #
    # Report the composition of the protein:
    # ===================================================================== #
    sequence = @hash.values.first.to_s.delete("\n")
    erev "The protein composition (aminoacids) is as follows:"
    e orchid(" #{sequence}")
  end
end; alias report_the_protein_composition report_the_nucleotide_composition # === report_the_protein_composition
|
578
|
+
|
579
|
+
# ========================================================================= #
|
580
|
+
# === report_how_many_elements_we_have_found
|
581
|
+
# ========================================================================= #
|
582
|
+
def report_how_many_elements_we_have_found
  # Reports the length of the first sequence in @hash and, for
  # nucleotide input, how many ATG codons occur on both strands.
  # Nothing is reported unless @hash is set and verbose mode is on.
  return unless @hash
  first = @hash.values.first.to_s.delete("\n")
  size = first.size
  return unless be_verbose?
  # ======================================================================= #
  # Upcase first, as some FASTA files contain lowercased sequences.
  # ======================================================================= #
  forward_strand = first.upcase
  # ======================================================================= #
  # The opposite strand, read 5'->3', is the reverse complement.
  # ======================================================================= #
  reverse_strand = forward_strand.reverse.tr('ACGT', 'TGCA')
  # ======================================================================= #
  # String#scan counts occurrences of the codon itself; String#count
  # would count the individual characters A, T and G instead.
  # ======================================================================= #
  n_start_codons = forward_strand.scan('ATG').size +
                   reverse_strand.scan('ATG').size
  result = "This sequence contains #{simp(size.to_s)}#{rev}"\
           " #{nucleotides_or_aminoacids?}".dup
  if is_a_nucleotide?
    result << " and #{n_start_codons} "\
              "ATG codons (on both strands) in total"
  end
  result << '.'
  if size > 1_000_000
    # ===================================================================== #
    # Format the number with '_' characters.
    # ===================================================================== #
    formatted = size.to_s.reverse.scan(/\d{1,3}/).join('_').reverse
    result << ' ('+simp(formatted+' bp')+rev+')'
  end
  erev result
end
|
612
|
+
|
613
|
+
# ========================================================================= #
|
614
|
+
# === report_on_how_many_entries_we_did_work
|
615
|
+
# ========================================================================= #
|
616
|
+
def report_on_how_many_entries_we_did_work
  # Verbose-only summary of how many headers @hash holds, followed by
  # an empty line.
  return unless be_verbose?
  n_entries = @hash.keys.size
  entry_or_entries = n_entries > 1 ? 'entries' : 'entry'
  erev "We have identified a total of #{orange(n_entries)}"\
       "#{rev} #{entry_or_entries} in this fasta dataset."
  e
end
|
627
|
+
|
628
|
+
# ========================================================================= #
|
629
|
+
# === report_the_FASTA_header
|
630
|
+
# ========================================================================= #
|
631
|
+
def report_the_FASTA_header
  # Prints the first header (see header?) in steelblue.
  e [rev, 'The header is: ', steelblue(header?)].join
end
|
634
|
+
|
635
|
+
# ========================================================================= #
|
636
|
+
# === report_the_sequence?
|
637
|
+
# ========================================================================= #
|
638
|
+
def report_the_sequence?
  # Reader for the :report_the_sequence flag in @internal_hash.
  @internal_hash[:report_the_sequence]
end
|
641
|
+
|
642
|
+
# ========================================================================= #
|
643
|
+
# === sanitize_data
|
644
|
+
# ========================================================================= #
|
645
|
+
def sanitize_data(i)
  # Cleans raw FASTA input. An Array is modified in place: it is
  # flattened, then entries starting with '#' and blank entries are
  # dropped. Any other input skips this step.
  if i.is_a? Array
    i.flatten!
    i.reject! { |entry| entry.start_with?('#') || entry.strip.empty? }
    # ===================================================================== #
    # Some FASTA files include "\r" line endings. Only the first entry
    # is inspected; if it has a "\r", it is removed from all entries.
    # ===================================================================== #
    leading = i.first
    i.map! { |entry| entry.delete("\r") } if leading && leading.include?("\r")
  end
  # ======================================================================= #
  # === Run through SanitizeNucleotideSequence
  # ======================================================================= #
  return i unless @internal_hash[:remove_numbers_from_input]
  Bioroebe::SanitizeNucleotideSequence[i]
end
|
667
|
+
|
668
|
+
# ========================================================================= #
|
669
|
+
# === current_key?
|
670
|
+
# ========================================================================= #
|
671
|
+
def current_key?
  # Reader for @current_key; also reachable under the aliases below.
  @current_key
end; alias id?          current_key? # === id?
     alias sequence_id? current_key? # === sequence_id?
     alias title        current_key? # === title
     alias title?       current_key? # === title?
|
677
|
+
|
678
|
+
# ========================================================================= #
|
679
|
+
# === round_to?
|
680
|
+
# ========================================================================= #
|
681
|
+
def round_to?
  # Reader for the value stored by set_round_to.
  @internal_hash[:round_to]
end
|
684
|
+
|
685
|
+
# ========================================================================= #
|
686
|
+
# === opnn
|
687
|
+
# ========================================================================= #
|
688
|
+
def opnn
  # Delegates to the parent opnn with this namespace, but only when
  # use_opn? is truthy; otherwise nothing happens.
  return unless use_opn?
  super(namespace?)
end
|
691
|
+
|
692
|
+
# ========================================================================= #
|
693
|
+
# === use_opn?
|
694
|
+
# ========================================================================= #
|
695
|
+
def use_opn?
  # Reader for @use_opn.
  @use_opn
end
|
698
|
+
|
699
|
+
# ========================================================================= #
|
700
|
+
# === calculate_gc_content
|
701
|
+
#
|
702
|
+
# Calculate the gc content through this method, which is called from
|
703
|
+
# within the method run().
|
704
|
+
# ========================================================================= #
|
705
|
+
def calculate_gc_content
  # Determines the GC content of every entry in @hash and, in verbose
  # mode, reports it together with a shortened form of the header.
  combined = @hash.values.join.delete(N)
  if is_polynucleotide?(combined)
    @hash.each_pair do |key, content|
      # =================================================================== #
      # Delegate towards Bioroebe.gc_content, rounding to round_to?
      # positions. If an Array comes back, its first element is used.
      # =================================================================== #
      gc_content = ::Bioroebe.gc_content(content.upcase, round_to?)
      gc_content = gc_content.first if gc_content.is_a? Array
      gc_content = gc_content.to_s
      # Of a '|'-separated header only the last field is kept.
      minimal_key = key.to_s
      minimal_key = minimal_key.split('|').last.strip if minimal_key.include? '|'
      next unless be_verbose?
      label = minimal_key.strip
      # Shorten the content a bit if it is too long.
      label = label[0 .. 40]+' [...]' if label.size > 40
      erev "GC content of \"#{simp(label)}#{rev}\" is: "\
           "#{sfancy(gc_content)}#{rev} %"
    end
  else
    erev '`'+simp(combined)+rev+'` is not a polynucleotide.' if be_verbose?
  end
end
|
733
|
+
|
734
|
+
# ========================================================================= #
|
735
|
+
# === first_value
|
736
|
+
#
|
737
|
+
# This will return the first entry of the Fasta files.
|
738
|
+
# ========================================================================= #
|
739
|
+
def first_value
  # First element of sequences?.
  all_sequences = sequences?
  all_sequences.first
end
|
742
|
+
|
743
|
+
# ========================================================================= #
|
744
|
+
# === nucleotides_or_aminoacids?
|
745
|
+
# ========================================================================= #
|
746
|
+
def nucleotides_or_aminoacids?
  # Word to use in reports, chosen via is_polynucleotide?.
  is_polynucleotide? ? 'nucleotides' : 'aminoacids'
end
|
753
|
+
|
754
|
+
# ========================================================================= #
|
755
|
+
# === is_polynucleotide?
|
756
|
+
# ========================================================================= #
|
757
|
+
def is_polynucleotide?(i = main_sequence?)
  # Defined purely as the negation of is_protein? for the same input.
  is_protein?(i) ? false : true
end; alias is_a_nucleotide? is_polynucleotide? # === is_a_nucleotide?
|
760
|
+
|
761
|
+
# ========================================================================= #
|
762
|
+
# === is_this_sequence_a_polynucleotide_sequence?
|
763
|
+
# ========================================================================= #
|
764
|
+
def is_this_sequence_a_polynucleotide_sequence?
  # Negation of is_protein? called without an argument.
  is_protein? ? false : true
end
|
767
|
+
|
768
|
+
# ========================================================================= #
|
769
|
+
# === data?
|
770
|
+
#
|
771
|
+
# This will contain the full content of the (whole) .fasta file, including
|
772
|
+
# the header.
|
773
|
+
# ========================================================================= #
|
774
|
+
def data?
  # Reader for @data, the dataset as stored by set_data.
  @data
end; alias input?   data? # === input?
     alias dataset? data? # === dataset?
|
778
|
+
|
779
|
+
# ========================================================================= #
|
780
|
+
# === hash?
|
781
|
+
# ========================================================================= #
|
782
|
+
def hash?
  # Reader for @hash (header => sequence).
  @hash
end
|
785
|
+
|
786
|
+
# ========================================================================= #
|
787
|
+
# === sequences?
|
788
|
+
#
|
789
|
+
# This method will obtain all found sequences.
|
790
|
+
# ========================================================================= #
|
791
|
+
def sequences?
  # All sequence bodies, i.e. the values of @hash.
  @hash.values
end; alias sequences sequences? # === sequences
     alias values    sequences? # === values
|
795
|
+
|
796
|
+
# ========================================================================= #
|
797
|
+
# === short_headers?
|
798
|
+
#
|
799
|
+
# The short-headers are like the headers, but if a ' ' token is found
|
800
|
+
# then the line will be truncated towards that first ' '.
|
801
|
+
#
|
802
|
+
# An example is:
|
803
|
+
#
|
804
|
+
# sp|Q91FT8|234R_IIV6 Uncharacterized protein 234R OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-234R PE=4 SV=1
|
805
|
+
#
|
806
|
+
# This will be truncated towards
|
807
|
+
#
|
808
|
+
# sp|Q91FT8|234R_IIV6
|
809
|
+
#
|
810
|
+
# This could then be used to automatically rename FASTA files, for
|
811
|
+
# instance.
|
812
|
+
# ========================================================================= #
|
813
|
+
def short_headers?
  # Every header cut down to its first whitespace-separated token;
  # headers without a ' ' are passed through unchanged.
  headers?.map do |entry|
    entry.include?(' ') ? entry.split(' ').first : entry
  end
end
|
821
|
+
|
822
|
+
# ========================================================================= #
|
823
|
+
# === set_data
|
824
|
+
#
|
825
|
+
# This is the setter-method towards @data. It is no longer allowed to
|
826
|
+
# invoke set_input_file() since as of 12.06.2020. This means that
|
827
|
+
# you have to invoke that method prior to calling this method.
|
828
|
+
# ========================================================================= #
|
829
|
+
def set_data(i = @input_file)
  # Setter for @data. The argument is either the path of an existing
  # file, which is read line by line, or the raw content itself.
  # ======================================================================= #
  # Only the first non-nil element counts, so an Array works as input too.
  # ======================================================================= #
  source = [i].flatten.compact.first.to_s.dup
  if File.exist?(source) # First try to read in from a file.
    if be_verbose?
      opnn; erev "Will read from the file `#{sfile(source)}#{rev}`."
    end
    source = File.readlines(source)
    if @is_a_genbank_file
      # Keeps lines such as " 61 atggggcctg caatggggcc\n" and reduces
      # them to the upcased sequence without digits and spaces.
      sequence_lines = source.select { |line|
        line.start_with?(' ') && line.strip.match?(/\d+/)
      }
      source = ["> genbank file"] + sequence_lines.map { |line|
        line.strip.delete(' 0123456789').strip.upcase
      }
    end
  end
  if source.empty?
    source = DEFAULT_FASTA
    opnn; erev 'No input was provided. Thus a default FASTA '\
               'sequence will be used instead.'
  end
  source = sanitize_data(source)
  source = source.split(N) if source.is_a? String
  @data = source
end; alias set_sequence set_data # === set_sequence
|
859
|
+
|
860
|
+
# ========================================================================= #
|
861
|
+
# === set_be_verbose_and_report_the_sequence
|
862
|
+
# ========================================================================= #
|
863
|
+
def set_be_verbose_and_report_the_sequence
  # Calls set_be_verbose, then raises the :report_the_sequence flag.
  set_be_verbose
  @internal_hash.store(:report_the_sequence, true)
end
|
867
|
+
|
868
|
+
# ========================================================================= #
|
869
|
+
# === condense_the_sequence_onto_a_single_line?
|
870
|
+
# ========================================================================= #
|
871
|
+
def condense_the_sequence_onto_a_single_line?
  # Reader for the flag that menu() sets for --one-line.
  @internal_hash[:condense_the_sequence_onto_a_single_line]
end
|
874
|
+
|
875
|
+
# ========================================================================= #
|
876
|
+
# === return_size_sorted_hash
|
877
|
+
# ========================================================================= #
|
878
|
+
def return_size_sorted_hash(i = @hash)
  # Builds a new Hash whose pairs are ordered by the size of their
  # value, smallest first. The given Hash is not modified.
  i.sort_by { |_key, value| value.size }.to_h
end
|
883
|
+
|
884
|
+
# ========================================================================= #
|
885
|
+
# === do_sort_by_size
|
886
|
+
#
|
887
|
+
# This method will sort the hash by size of the sequence. It has been
|
888
|
+
# inspired by the EMBOSS sizeseq functionality.
|
889
|
+
#
|
890
|
+
# The output that should be generated might look like this:
|
891
|
+
#
|
892
|
+
# https://www.bioinformatics.nl/cgi-bin/emboss/help/sizeseq#input.1
|
893
|
+
#
|
894
|
+
# Invocation example:
|
895
|
+
#
|
896
|
+
# x = Bioroebe::ParseFasta.new('/Depot/j/globins.fasta'); x.do_sort_by_size
|
897
|
+
#
|
898
|
+
# ========================================================================= #
|
899
|
+
def do_sort_by_size
  # Re-orders @hash by sequence size and prints every entry as an
  # ID/DE/SQ line followed by the sequence and an empty line.
  # ======================================================================= #
  # Sort it here first, by the size of the "value", aka the sequence body.
  # ======================================================================= #
  @hash = return_size_sorted_hash(@hash)
  report = @hash.each_with_object(''.dup) do |(key, sequence), buffer|
    n_residues = sequence.size.to_s
    buffer << "> ID #{n_residues} AA.; DE: #{key} SQ #{n_residues} AA#{N}"
    buffer << "#{sequence}#{N}#{N}"
  end
  e report
end; alias run_sizeseq_comparison do_sort_by_size # === run_sizeseq_comparison
|
913
|
+
|
914
|
+
# ========================================================================= #
|
915
|
+
# === n_nucleotides?
|
916
|
+
# ========================================================================= #
|
917
|
+
def n_nucleotides?
  # Length of the first sequence in @hash, newlines excluded.
  first_sequence = @hash.values.first
  first_sequence.delete("\n").size
end; alias return_n_aminoacids n_nucleotides? # === return_n_aminoacids
     alias size?               n_nucleotides? # === size?
     alias sequence_size?      n_nucleotides? # === sequence_size?
|
922
|
+
|
923
|
+
# ========================================================================= #
|
924
|
+
# === headers?
|
925
|
+
# ========================================================================= #
|
926
|
+
def headers?
  # All headers, i.e. the keys of @hash.
  @hash.keys
end
|
929
|
+
|
930
|
+
# ========================================================================= #
|
931
|
+
# === first_key?
|
932
|
+
#
|
933
|
+
# Obtain the very first entry.
|
934
|
+
# ========================================================================= #
|
935
|
+
def first_key?
  # First header as returned by headers? (nil if there is none).
  all_headers = headers?
  all_headers.first
end
|
938
|
+
|
939
|
+
# ========================================================================= #
|
940
|
+
# === header?
|
941
|
+
#
|
942
|
+
# This variant will always return the first entry.
|
943
|
+
# ========================================================================= #
|
944
|
+
def header?
  # Like first_key?, but always a String ('' if there is no header).
  first_header = headers?.first
  first_header.to_s
end
|
947
|
+
|
948
|
+
# ========================================================================= #
|
949
|
+
# === raw_body?
|
950
|
+
# ========================================================================= #
|
951
|
+
def raw_body?
  # The first sequence of @hash exactly as stored.
  stored_sequences = @hash.values
  stored_sequences.first
end
|
954
|
+
|
955
|
+
# ========================================================================= #
|
956
|
+
# === do_show_the_header
|
957
|
+
# ========================================================================= #
|
958
|
+
def do_show_the_header
  # Raises @show_the_header, which is checked while files are processed.
  @show_the_header = true
end
|
961
|
+
|
962
|
+
# ========================================================================= #
|
963
|
+
# === set_input_file
|
964
|
+
#
|
965
|
+
# This method will be used to keep track of the input-file, from
|
966
|
+
# which we will read the dataset.
|
967
|
+
# ========================================================================= #
|
968
|
+
def set_input_file(i = nil)
  # Remembers the file from which the dataset will be read. Without an
  # argument, the first .fa or .fasta file in the current directory is
  # used, if there is one. An existing file that starts with 'LOCUS'
  # additionally raises @is_a_genbank_file.
  if i.nil?
    # ===================================================================== #
    # Try to find a .fasta or .fa file in the current directory. The
    # lookup and the report happen exactly once.
    # ===================================================================== #
    file = Dir['*.{fa,fasta}'].first
    if file
      if be_verbose?
        result = 'A '.dup # .dup as the literal may be frozen.
        result << 'FASTA ' if file.end_with? '.fasta'
        result << "file was found in this directory (#{sfile(file)}#{rev})."
        opnn; erev result
        opnn; erev 'We will use it.'
      end
      i = file
    end
  end
  if i and File.exist?(i)
    # Only the leading bytes are needed to recognise the LOCUS marker.
    @is_a_genbank_file = true if File.read(i, 'LOCUS'.size) == 'LOCUS'
  end
  @input_file = i
end; alias set_input_files set_input_file # === set_input_files
|
1005
|
+
|
1006
|
+
# ========================================================================= #
|
1007
|
+
# === save_the_file?
|
1008
|
+
# ========================================================================= #
|
1009
|
+
def save_the_file?
  # Query the :save_the_file entry of @internal_hash.
  return @internal_hash[:save_the_file]
end
|
1012
|
+
|
1013
|
+
# ========================================================================= #
|
1014
|
+
# === overwrite_the_original_file?
|
1015
|
+
# ========================================================================= #
|
1016
|
+
def overwrite_the_original_file?
  # Query the :overwrite_the_original_file entry of @internal_hash.
  return @internal_hash[:overwrite_the_original_file]
end
|
1019
|
+
|
1020
|
+
# ========================================================================= #
|
1021
|
+
# === split_into_proper_sections
|
1022
|
+
#
|
1023
|
+
# Split up into the fasta identifier, and the content.
|
1024
|
+
# ========================================================================= #
|
1025
|
+
def split_into_proper_sections
  # Sort the lines of @data into @hash: every line starting with '>'
  # opens a new entry (keyed by the text after the '>', chomped), every
  # other line is appended - newline included - to the entry opened last.
  # Lines that come before any '>' line end up under the key 'standard'.
  if be_verbose? && !@data.to_s.include?('>')
    erev 'No ">" character was found in this dataset.'
    erev 'It is recommended to always have a > identifier for the'
    erev 'FASTA format (such as in a .fasta or a .fa file).'
  end
  @data.each do |entry|
    # ===================================================================== #
    # === Handle the leading > FASTA identifier first
    # ===================================================================== #
    if entry.start_with?('>')
      @current_key = entry[1..-1].chomp # Everything but the leading '>'.
      @hash[@current_key] = ''.dup
      next
    end
    entry.delete!('_') # Underscores are stripped from the line in place.
    unless @current_key
      @current_key = 'standard'
      @hash[@current_key] = ''.dup
    end
    # The newline is kept on purpose, so that the dataset can simply be
    # saved again later on.
    @hash[@current_key] << entry
  end
end
|
1057
|
+
|
1058
|
+
# ========================================================================= #
|
1059
|
+
# === save_into_a_fasta_file
|
1060
|
+
# ========================================================================= #
|
1061
|
+
def save_into_a_fasta_file(
    be_verbose = be_verbose?
  )
  # Store the lines of @data, joined via "\n", in the file
  # 'standard.fasta'.
  #
  # be_verbose - whether to announce the file that is written; the
  #              symbol :be_verbose counts as true.
  #
  # Returns the absolute path of the written file. Without @data only a
  # notice is shown and the result of that erev call is returned.
  be_verbose = true if be_verbose == :be_verbose
  unless @data
    opnn
    return erev('No @data variable exists.')
  end
  into = 'standard.fasta'
  what = @data.join("\n")
  erev 'Saving into '+sfile(into)+rev+'.' if be_verbose
  write_what_into(what, into)
  File.absolute_path(into) # The file we saved into.
end; alias do_save_the_file save_into_a_fasta_file # === do_save_the_file
|
1078
|
+
|
1079
|
+
# ========================================================================= #
|
1080
|
+
# === add_length_information_to_the_header
|
1081
|
+
# ========================================================================= #
|
1082
|
+
def add_length_information_to_the_header
  # Append " length=<sequence_size?>;" to the stripped header and store
  # the new header line, followed by raw_body?, in a file.
  #
  # The target is 'new_fasta_file.fasta', or @input_file when
  # overwrite_the_original_file? is set. Nothing is written when the
  # target is nil.
  extended_header = header?.strip
  extended_header << ' length='+sequence_size?.to_s+';'
  # ======================================================================= #
  # Next, designate where to store this file.
  # ======================================================================= #
  into = overwrite_the_original_file? ? @input_file : 'new_fasta_file.fasta'
  what = ''.dup
  what << "> "+extended_header+"\n"
  what << raw_body?
  return unless into
  erev 'Storing into `'+sfile(into)+rev+'`.'
  write_what_into(what, into)
end
|
1100
|
+
|
1101
|
+
# ========================================================================= #
|
1102
|
+
# === simplify_header
|
1103
|
+
#
|
1104
|
+
# This method can be called to simplify the header. It will save into
|
1105
|
+
# a .fasta file at once.
|
1106
|
+
# ========================================================================= #
|
1107
|
+
def simplify_header
  # Reduce the FASTA header to a short form and store it, followed by
  # raw_body?, in a .fasta file at once.
  #
  # Rules, checked in this order:
  #
  #   - text enclosed in [ ] (for example: [Pan troglodytes]) becomes
  #     the new header
  #   - otherwise everything from the first ',' onwards is cut off
  #
  # The result is stored when it starts with '>' or mentions 'Human'.
  # In every other case a notice is shown and nothing is written.
  #
  # The target is 'new_fasta_file.fasta', or @input_file when
  # overwrite_the_original_file? is set.
  #
  # Returns the result of write_what_into, or nil if nothing was stored.
  header = header?
  # ======================================================================= #
  # See rubular at:
  #
  #   https://rubular.com/r/aDjI0JwMOUlZzP
  #
  # The match is nil when the brackets enclose nothing (such as "[]"),
  # in which case the header is treated like one without brackets.
  # ======================================================================= #
  bracketed = header.strip[/\[(.+)\]/, 1]
  if bracketed
    header = "> #{bracketed.delete('[]')}\n"
  elsif header.include? ','
    # The exclusive range also copes with a ',' at position 0.
    header = header[0 ... header.index(',')].strip
  end
  # ======================================================================= #
  # Next, designate where to store this file.
  # ======================================================================= #
  into = overwrite_the_original_file? ? @input_file : 'new_fasta_file.fasta'
  what =
    if header.start_with? '>'
      # .dup, so that appending the body below never alters the string
      # that header? handed out.
      # NOTE(review): a header lacking a trailing newline ends up on the
      # same line as the body - confirm what header? returns.
      header.dup
    elsif header.include? 'Human'
      "> Human\n".dup
    else
      erev "Unsure what to do: #{steelblue(header)}"
      nil
    end
  return unless what and into
  what << raw_body?
  erev 'Storing into `'+sfile(into)+rev+'`.'
  write_what_into(what, into)
end
|
1149
|
+
|
1150
|
+
# ========================================================================= #
|
1151
|
+
# === sequence
|
1152
|
+
#
|
1153
|
+
# This method will return the sequence, without any newlines. It is also
|
1154
|
+
# called the "body" of a FASTA file.
|
1155
|
+
# ========================================================================= #
|
1156
|
+
def sequence
  # Return the first sequence stored in @hash, with its newlines removed
  # (this is delegated to no_newlines). It is also called the "body" of
  # a FASTA file.
  #
  # A trailing N is cut off a copy only: the string kept in @hash is
  # left untouched, so that reading the sequence never changes what is
  # stored there.
  body = @hash.values.first
  body = body.chomp if body and body.end_with?(N)
  return no_newlines(body)
end; alias fasta_sequence      sequence # === fasta_sequence
     alias sequence?           sequence # === sequence?
     alias body?               sequence # === body?
     alias body                sequence # === body
     alias naseq               sequence # === naseq
     alias nucleotide_sequence sequence # === nucleotide_sequence
     alias return_sequence     sequence # === return_sequence
     alias content?            sequence # === content?
|
1168
|
+
|
1169
|
+
# ========================================================================= #
|
1170
|
+
# === save
|
1171
|
+
#
|
1172
|
+
# This method will save our FASTA file.
|
1173
|
+
# ========================================================================= #
|
1174
|
+
def save
  # Write the lines of @data, joined via "\n", into @input_file. When no
  # input file is known yet, 'foobar.fasta' is registered (through
  # set_input_file) and used instead.
  #
  # Returns the path that was written to.
  if @input_file.nil?
    erev "The generic file #{sfile('foobar.fasta')}#{rev} will be used."
    set_input_file('foobar.fasta')
  end
  target = @input_file
  content = @data.join("\n")
  erev 'Storing into '+sfile(target)+rev+'.'
  write_what_into(content, target)
  target
end
|
1186
|
+
|
1187
|
+
# ========================================================================= #
|
1188
|
+
# === []
|
1189
|
+
#
|
1190
|
+
# This is a simpler query-interface for obtaining the DNA/RNA sequence
|
1191
|
+
# of the FASTA file (or aminoacid sequence, if we have a protein at
|
1192
|
+
# hand here).
|
1193
|
+
#
|
1194
|
+
# Using the method sequences? here, which in turn works on @hash, is
|
1195
|
+
# ok because Hashes have preserved their insertion order in ruby since some
|
1196
|
+
# time.
|
1197
|
+
# ========================================================================= #
|
1198
|
+
def [](i)
  # Look up entry i in whatever sequences? returns.
  all_sequences = sequences?
  all_sequences[i]
end
|
1201
|
+
|
1202
|
+
# ========================================================================= #
|
1203
|
+
# === Bioroebe::ParseFasta[]
|
1204
|
+
# ========================================================================= #
|
1205
|
+
def self.[](i)
  # Build a new instance from i and return what its sequences? method
  # reports.
  new(i).sequences?
end
|
1209
|
+
|
1210
|
+
# ========================================================================= #
|
1211
|
+
# === type?
|
1212
|
+
# ========================================================================= #
|
1213
|
+
def type?
  # Classify the sequence as :protein, :dna_or_rna or :unknown. The
  # polypeptide check is consulted first.
  return :protein    if is_the_sequence_a_polypeptide?
  return :dna_or_rna if is_this_sequence_a_polynucleotide_sequence?
  :unknown
end
|
1222
|
+
|
1223
|
+
# ========================================================================= #
|
1224
|
+
# === is_the_sequence_a_polypeptide?
|
1225
|
+
#
|
1226
|
+
# This method can be used to determine whether a given input sequence
|
1227
|
+
# is a polypeptide (aka a protein) or whether it is not.
|
1228
|
+
#
|
1229
|
+
# If this sequence is a polypeptide then this method will return true.
|
1230
|
+
# Otherwise false will be returned.
|
1231
|
+
# ========================================================================= #
|
1232
|
+
def is_the_sequence_a_polypeptide?(
    i = main_sequence?
  )
  # Decide whether i is a polypeptide (aka a protein).
  #
  # Only the first 120 positions are inspected. true is returned as soon
  # as one of them is an uppercase letter other than A, C, G, N, T or U;
  # otherwise false is returned.
  letters_beyond_acgntu = %w(
    B D E F H I J K L M O P Q R S V W X Y Z
  )
  # Index 119 is the 120th position, since counting starts at 0.
  i[0 .. 119].each_char.any? { |letter|
    letters_beyond_acgntu.include? letter
  }
end; alias is_protein?   is_the_sequence_a_polypeptide? # === is_protein?
     alias is_a_protein? is_the_sequence_a_polypeptide? # === is_a_protein?
|
1266
|
+
|
1267
|
+
# ========================================================================= #
|
1268
|
+
# === main_sequence?
|
1269
|
+
#
|
1270
|
+
# This will always return the first entry.
|
1271
|
+
# ========================================================================= #
|
1272
|
+
def main_sequence?
  # Always return the value of the first entry in @hash (nil when @hash
  # is empty).
  _first_key, first_value = @hash.first
  first_value
end
|
1275
|
+
|
1276
|
+
# ========================================================================= #
|
1277
|
+
# === gc_content?
|
1278
|
+
# ========================================================================= #
|
1279
|
+
def gc_content?
  # Ask ::Bioroebe.gc_content about the main sequence and convert the
  # answer via .to_f, so that a float is returned.
  raw_result = ::Bioroebe.gc_content(main_sequence?)
  raw_result.to_f
end; alias gc_content gc_content? # === gc_content
|
1282
|
+
|
1283
|
+
# ========================================================================= #
|
1284
|
+
# === sequence_object
|
1285
|
+
#
|
1286
|
+
# This method will return a Sequence object.
|
1287
|
+
#
|
1288
|
+
# Usage example:
|
1289
|
+
#
|
1290
|
+
# x = Bioroebe.parse_fasta 'ls_orchid.fasta'
|
1291
|
+
# y = x.sequence_object # y is now an instance of Bioroebe::Sequence
|
1292
|
+
#
|
1293
|
+
# ========================================================================= #
|
1294
|
+
def sequence_object
  # Wrap the main sequence in a new ::Bioroebe::Sequence instance.
  main = main_sequence?
  ::Bioroebe::Sequence.new(main)
end
|
1297
|
+
|
1298
|
+
# ========================================================================= #
|
1299
|
+
# === sanitize_the_file?
|
1300
|
+
# ========================================================================= #
|
1301
|
+
def sanitize_the_file?
  # Query the :sanitize_the_file entry of @internal_hash.
  return @internal_hash[:sanitize_the_file]
end
|
1304
|
+
|
1305
|
+
# ========================================================================= #
|
1306
|
+
# === show_help (help tag)
|
1307
|
+
#
|
1308
|
+
# This method will inform the user how this class may be used from the
|
1309
|
+
# commandline.
|
1310
|
+
#
|
1311
|
+
# Invocation example:
|
1312
|
+
#
|
1313
|
+
# pfasta --help
|
1314
|
+
#
|
1315
|
+
# ========================================================================= #
|
1316
|
+
def show_help
  # Show the commandline options of this class: one eparse call per
  # line of text, framed by a leading and a trailing e call.
  help_lines = [
    ' --size',
    ' --also-show-the-sequence',
    ' --header # show the header as well (normally the '\
    'header is not shown)',
    ' --limit=1000 # limit to show only the first 1000 '\
    'nucleotides; use',
    ' # any number that you need here',
    ' --one-line # show the sequence on one line only, '\
    'e. g. all newlines',
    ' # were removed',
    ' --toprotein # show the protein sequence as well '\
    '(assumes DNA or RNA',
    ' # .fasta file)',
    ' --convert # alias to the above ^^^',
    ' --translate # alias to the above ^^^',
    ' --sanitize-the-file # delete all " " characters '\
    'and upcase the content, of a',
    ' # .fasta file'
  ]
  e
  help_lines.each { |help_line| eparse help_line }
  e
end
|
1338
|
+
|
1339
|
+
# ========================================================================= #
|
1340
|
+
# === do_sanitize_the_file_then_exit
|
1341
|
+
# ========================================================================= #
|
1342
|
+
def do_sanitize_the_file_then_exit
  # Sanitize the file given as first non-hyphened commandline argument:
  # every ' ' character is removed and the content is upcased, then the
  # result is written back into the very same file.
  #
  # The process exits afterwards in any case, including when no such
  # argument was given or when the file does not exist.
  first = Array(non_hyphened_commandline_arguments?).first
  # The nil check matters: File.exist?(nil) raises a TypeError.
  if first and File.exist?(first)
    # ===================================================================== #
    # Next, iterate over the dataset. New strings are built here, so
    # frozen entries need no special treatment.
    # ===================================================================== #
    dataset = default_readlines(first).map {|entry|
      entry.delete(' ').upcase
    }
    opne 'Saving the sanitized dataset into '\
         'the file '+sfile(first)+rev+'.'
    write_what_into(dataset.join, first)
  end
  exit
end
|
1362
|
+
|
1363
|
+
# ========================================================================= #
|
1364
|
+
# === run (run tag)
|
1365
|
+
# ========================================================================= #
|
1366
|
+
def run
  # Entry point: evaluate the menu first, then either sanitize the file
  # (a step that is named as exiting) or process the files given on the
  # commandline and, if requested, save the result.
  menu
  if sanitize_the_file?
    do_sanitize_the_file_then_exit
  end
  do_process_the_commandline_arguments_that_are_files
  return unless save_the_file?
  do_save_the_file
end
|
1372
|
+
|
1373
|
+
end
|
1374
|
+
|
1375
|
+
Fasta = ParseFasta # Add an "alias" constant to class ParseFasta.
|
1376
|
+
|
1377
|
+
# =========================================================================== #
|
1378
|
+
# === Bioroebe.parse_fasta_quietly
|
1379
|
+
#
|
1380
|
+
# Like Bioroebe.parse_fasta(), but will work quietly.
|
1381
|
+
# =========================================================================== #
|
1382
|
+
def self.parse_fasta_quietly(
    i, use_colours = true
  )
  # Forward both arguments to ::Bioroebe.parse_fasta, together with a
  # block that yields :be_quiet.
  quiet = :be_quiet
  ::Bioroebe.parse_fasta(i, use_colours) { quiet }
end
|
1387
|
+
|
1388
|
+
# =========================================================================== #
|
1389
|
+
# === Bioroebe.return_fasta_entry_with_the_highest_gc_content
|
1390
|
+
#
|
1391
|
+
# The first argument should be a locally existing FASTA file that
|
1392
|
+
# contains different sequences.
|
1393
|
+
#
|
1394
|
+
# Usage example:
|
1395
|
+
#
|
1396
|
+
# x = Bioroebe.return_fasta_entry_with_the_highest_gc_content('/rosalind_gc.txt')
|
1397
|
+
#
|
1398
|
+
# =========================================================================== #
|
1399
|
+
def self.return_fasta_entry_with_the_highest_gc_content(this_fasta_file)
  # Read this_fasta_file, parse it quietly and return the entry with the
  # highest GC content as a [key, gc_content] pair, where gc_content is
  # a float.
  #
  # When the file does not exist a notice is shown instead, and the
  # result of that erev call is returned.
  unless File.exist? this_fasta_file
    return erev("No file exists at #{sfile(this_fasta_file)}#{rev}.")
  end
  parsed = parse_fasta(File.read(this_fasta_file)) { :be_quiet }
  # The sequences in the parsed hash are replaced by their GC content.
  gc_per_entry = parsed.hash?.transform_values! {|this_sequence|
    ::Bioroebe.gc_content(this_sequence).to_f
  }
  gc_per_entry.max_by {|_key, gc_value| gc_value }
end
|
1412
|
+
|
1413
|
+
# =========================================================================== #
|
1414
|
+
# === Bioroebe.sizeseq
|
1415
|
+
#
|
1416
|
+
# This method will "size-sequence compare", typically on a .fasta file.
|
1417
|
+
# =========================================================================== #
|
1418
|
+
def self.sizeseq(i)
  # Parse i quietly (the first element, if an Array was given) and
  # invoke do_sort_by_size on the result.
  i = i.first if i.is_a? Array
  Bioroebe.parse_fasta(i) { :be_quiet }.do_sort_by_size
end
|
1425
|
+
|
1426
|
+
# =========================================================================== #
|
1427
|
+
# === Bioroebe.return_sizeseq
|
1428
|
+
#
|
1429
|
+
# This is as Bioroebe.sizeseq(), but it will just return the result,
|
1430
|
+
# rather than output it.
|
1431
|
+
# =========================================================================== #
|
1432
|
+
def self.return_sizeseq(i)
  # Parse i quietly (the first element, if an Array was given) and
  # build one String out of the size-sorted hash: per entry a header
  # line that carries the sequence size and the key, then the sequence
  # itself, followed by an empty line.
  i = i.first if i.is_a? Array
  sorted = Bioroebe.parse_fasta(i) { :be_quiet }.return_size_sorted_hash
  sorted.each_pair.each_with_object(''.dup) {|(key, sequence), result|
    size = sequence.size.to_s
    result << '> ID '+size+' AA.; DE: '+key.to_s+
              ' SQ '+size+' AA'+N
    result << sequence+N+N
  }
end
|
1446
|
+
|
1447
|
+
# =========================================================================== #
|
1448
|
+
# === Bioroebe.genbank_to_fasta
|
1449
|
+
#
|
1450
|
+
# This method will convert from a genbank file, to a .fasta file.
|
1451
|
+
#
|
1452
|
+
# Invocation example:
|
1453
|
+
#
|
1454
|
+
# Bioroebe.genbank_to_fasta('/home/x/DATA/PROGRAMMING_LANGUAGES/RUBY/src/bioroebe/lib/bioroebe/data/genbank/sample_file.genbank')
|
1455
|
+
#
|
1456
|
+
# =========================================================================== #
|
1457
|
+
def self.genbank_to_fasta(
    this_file,
    be_verbose = :be_verbose
  )
  # Convert from a genbank file to a .fasta file.
  #
  # this_file  - path of the genbank file; of an Array only the first
  #              element is used.
  # be_verbose - handed on to save_into_a_fasta_file; the symbol
  #              :be_quiet is turned into false beforehand.
  #
  # Returns what save_into_a_fasta_file returns (the file path).
  be_verbose = false if be_verbose == :be_quiet
  this_file = this_file.first if this_file.is_a? Array
  parser =
    if File.exist? this_file
      Bioroebe::ParseFasta.new(this_file) { :be_quiet }
    else
      fallback = Bioroebe::ParseFasta.new(:do_not_run_yet) { :be_quiet }
      fallback.set_data # This will use the default file.
      fallback.split_into_proper_sections
      fallback
    end
  parser.save_into_a_fasta_file(be_verbose)
end
|
1478
|
+
|
1479
|
+
# =========================================================================== #
|
1480
|
+
# === Bioroebe.parse_fasta_file
|
1481
|
+
# =========================================================================== #
|
1482
|
+
def self.parse_fasta_file(
    i           = ARGV,
    use_colours = true
  )
  # Create a new ParseFasta instance for i. The block handed to it
  # yields the options: use_colours as given, be_verbose always false.
  ParseFasta.new(i) {
    { use_colours: use_colours, be_verbose: false }
  }
end; class << self; alias fasta_file parse_fasta_file; end # === Bioroebe.fasta_file
|
1492
|
+
|
1493
|
+
# =========================================================================== #
|
1494
|
+
# === Bioroebe.parse_fasta
|
1495
|
+
#
|
1496
|
+
# Easier reader-method for .fasta files.
|
1497
|
+
#
|
1498
|
+
# The second argument determines whether we will use colours or whether
|
1499
|
+
# we will not. For now, the default (true) is to use colours when we use
|
1500
|
+
# this particular class method.
|
1501
|
+
#
|
1502
|
+
# Invocation examples:
|
1503
|
+
#
|
1504
|
+
# x = Bioroebe.parse_fasta('/rosalind_gc.txt')
|
1505
|
+
# hash = Bioroebe.parse_fasta('/rosalind_gc.txt').hash?
|
1506
|
+
#
|
1507
|
+
# =========================================================================== #
|
1508
|
+
def self.parse_fasta(
    i,
    use_colours = true
  )
  # Create a new ::Bioroebe::ParseFasta instance for i. The block handed
  # to it yields an options hash: :use_colours always, and :be_verbose
  # (set to the result of the caller's block) only if a block was given.
  options = { use_colours: use_colours }
  options[:be_verbose] = yield if block_given?
  ::Bioroebe::ParseFasta.new(i) { options }
end; class << self; alias fasta parse_fasta; end # === Bioroebe.fasta
|
1523
|
+
|
1524
|
+
end
|
14
1525
|
|
15
1526
|
if __FILE__ == $PROGRAM_NAME
|
16
1527
|
Bioroebe::ParseFasta.new(ARGV) { :sizeseq }
|
@@ -24,4 +1535,4 @@ end # corefasta globins.fasta
|
|
24
1535
|
# pfasta /GC.txt
|
25
1536
|
# pfasta 013521.3_289_aa.fasta --also-show-the-sequence
|
26
1537
|
# pfasta $RSRC/bioroebe/lib/bioroebe/data/GFP_mutant_3_coding_sequence.fasta --also-show-the-sequence
|
27
|
-
# corefasta $J/globins.fasta
|
1538
|
+
# corefasta $J/globins.fasta
|