bioroebe 0.10.80 → 0.12.24

Sign up to get free protection for your applications and to get access to all the features.
Files changed (301) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3946 -2817
  3. data/bin/bioroebe +13 -2
  4. data/bin/bioroebe_hash +7 -0
  5. data/bin/codon_to_aminoacid +6 -4
  6. data/bin/compacter +7 -0
  7. data/bin/plain_palindrome +7 -0
  8. data/bioroebe.gemspec +3 -3
  9. data/doc/README.gen +3918 -2793
  10. data/doc/quality_control/commandline_applications.md +3 -3
  11. data/doc/statistics/statistics.md +7 -7
  12. data/doc/todo/bioroebe_GUI_todo.md +19 -14
  13. data/doc/todo/bioroebe_java_todo.md +22 -0
  14. data/doc/todo/bioroebe_todo.md +2075 -2620
  15. data/lib/bioroebe/C++/DNA.cpp +69 -0
  16. data/lib/bioroebe/C++/RNA.cpp +58 -0
  17. data/lib/bioroebe/C++/sequence.cpp +35 -0
  18. data/lib/bioroebe/abstract/README.md +1 -0
  19. data/lib/bioroebe/abstract/features.rb +29 -0
  20. data/lib/bioroebe/aminoacids/aminoacid_substitution.rb +1 -9
  21. data/lib/bioroebe/aminoacids/codon_percentage.rb +1 -9
  22. data/lib/bioroebe/aminoacids/deduce_aminoacid_sequence.rb +1 -9
  23. data/lib/bioroebe/aminoacids/display_aminoacid_table.rb +1 -0
  24. data/lib/bioroebe/aminoacids/show_hydrophobicity.rb +1 -6
  25. data/lib/bioroebe/base/base_module/base_module.rb +36 -0
  26. data/lib/bioroebe/base/colours_for_base/colours_for_base.rb +18 -8
  27. data/lib/bioroebe/base/commandline_application/commandline_application.rb +13 -9
  28. data/lib/bioroebe/base/commandline_application/commandline_arguments.rb +24 -19
  29. data/lib/bioroebe/base/commandline_application/misc.rb +66 -49
  30. data/lib/bioroebe/base/commandline_application/opn.rb +8 -8
  31. data/lib/bioroebe/base/commandline_application/reset.rb +5 -3
  32. data/lib/bioroebe/base/internal_hash_module/internal_hash_module.rb +42 -0
  33. data/lib/bioroebe/base/misc.rb +35 -0
  34. data/lib/bioroebe/base/prototype/misc.rb +15 -9
  35. data/lib/bioroebe/base/prototype/reset.rb +10 -0
  36. data/lib/bioroebe/cleave_and_digest/digestion.rb +10 -2
  37. data/lib/bioroebe/cleave_and_digest/trypsin.rb +104 -50
  38. data/lib/bioroebe/codon_tables/frequencies/parse_frequency_table.rb +2 -10
  39. data/lib/bioroebe/codons/codons.rb +1 -1
  40. data/lib/bioroebe/codons/convert_this_codon_to_that_aminoacid.rb +208 -59
  41. data/lib/bioroebe/codons/possible_codons_for_this_aminoacid.rb +1 -9
  42. data/lib/bioroebe/codons/show_codon_tables.rb +8 -3
  43. data/lib/bioroebe/codons/show_codon_usage.rb +15 -4
  44. data/lib/bioroebe/colours/rev.rb +4 -1
  45. data/lib/bioroebe/constants/aminoacids_and_proteins.rb +1 -0
  46. data/lib/bioroebe/constants/database_constants.rb +1 -1
  47. data/lib/bioroebe/constants/files_and_directories.rb +31 -4
  48. data/lib/bioroebe/constants/misc.rb +20 -0
  49. data/lib/bioroebe/constants/nucleotides.rb +7 -0
  50. data/lib/bioroebe/conversions/dna_to_aminoacid_sequence.rb +109 -39
  51. data/lib/bioroebe/count/count_amount_of_aminoacids.rb +3 -2
  52. data/lib/bioroebe/count/count_amount_of_nucleotides.rb +3 -0
  53. data/lib/bioroebe/cpp +1 -0
  54. data/lib/bioroebe/crystal/README.md +2 -0
  55. data/lib/bioroebe/crystal/to_rna.cr +19 -0
  56. data/lib/bioroebe/data/README.md +11 -8
  57. data/lib/bioroebe/data/electron_microscopy/pos_example.pos +396 -0
  58. data/lib/bioroebe/data/electron_microscopy/test_particles.star +36 -0
  59. data/lib/bioroebe/data/fasta/human/Homo_sapiens_hemoglobin_subunit_alpha_HBB_mRNA.fasta +9 -0
  60. data/lib/bioroebe/data/fasta/human/Homo_sapiens_hemoglobin_subunit_beta_HBB_mRNA.fasta +8 -0
  61. data/lib/bioroebe/data/fasta/human/README.md +2 -0
  62. data/lib/bioroebe/dotplots/advanced_dotplot.rb +1 -1
  63. data/lib/bioroebe/electron_microscopy/coordinate_analyzer.rb +15 -18
  64. data/lib/bioroebe/{fasta_and_fastq/parse_fasta/run.rb → electron_microscopy/electron_microscopy_module.rb} +16 -8
  65. data/lib/bioroebe/electron_microscopy/fix_pos_file.rb +1 -9
  66. data/lib/bioroebe/electron_microscopy/flipy.rb +83 -0
  67. data/lib/bioroebe/electron_microscopy/parse_coordinates.rb +2 -10
  68. data/lib/bioroebe/electron_microscopy/read_file_xmd.rb +1 -9
  69. data/lib/bioroebe/electron_microscopy/simple_star_file_generator.rb +4 -9
  70. data/lib/bioroebe/enzymes/has_this_restriction_enzyme.rb +10 -3
  71. data/lib/bioroebe/enzymes/restriction_enzyme.rb +23 -1
  72. data/lib/bioroebe/enzymes/restriction_enzymes/statistics.rb +65 -0
  73. data/lib/bioroebe/fasta_and_fastq/autocorrect_the_name_of_this_fasta_file.rb +1 -9
  74. data/lib/bioroebe/fasta_and_fastq/compact_fasta_file/compact_fasta_file.rb +7 -9
  75. data/lib/bioroebe/fasta_and_fastq/fasta_defline/fasta_defline.rb +1 -5
  76. data/lib/bioroebe/fasta_and_fastq/fasta_to_yaml/fasta_to_yaml.rb +81 -0
  77. data/lib/bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb +1518 -7
  78. data/lib/bioroebe/fasta_and_fastq/return_fasta_subsection_of_this_file.rb +11 -2
  79. data/lib/bioroebe/fasta_and_fastq/show_fasta_headers.rb +27 -12
  80. data/lib/bioroebe/fasta_and_fastq/simplify_fasta_header/simplify_fasta_header.rb +1 -5
  81. data/lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/constants.rb +0 -5
  82. data/lib/bioroebe/genome/README.md +4 -0
  83. data/lib/bioroebe/genome/genome.rb +130 -0
  84. data/lib/bioroebe/genomes/genome_pattern.rb +3 -9
  85. data/lib/bioroebe/gui/gtk +1 -0
  86. data/lib/bioroebe/gui/gtk3/alignment/alignment.rb +106 -137
  87. data/lib/bioroebe/gui/gtk3/aminoacid_composition/aminoacid_composition.rb +27 -61
  88. data/lib/bioroebe/gui/gtk3/aminoacid_composition/customized_dialog.rb +1 -1
  89. data/lib/bioroebe/gui/gtk3/blosum_matrix_viewer/blosum_matrix_viewer.rb +1 -2
  90. data/lib/bioroebe/gui/gtk3/calculate_cell_numbers_of_bacteria/calculate_cell_numbers_of_bacteria.rb +1 -2
  91. data/lib/bioroebe/gui/gtk3/controller/controller.rb +46 -29
  92. data/lib/bioroebe/gui/gtk3/dna_to_aminoacid_widget/dna_to_aminoacid_widget.rb +77 -52
  93. data/lib/bioroebe/gui/gtk3/dna_to_reverse_complement_widget/dna_to_reverse_complement_widget.rb +1 -2
  94. data/lib/bioroebe/gui/gtk3/fasta_table_widget/fasta_table_widget.rb +100 -23
  95. data/lib/bioroebe/gui/gtk3/format_converter/format_converter.rb +1 -2
  96. data/lib/bioroebe/gui/gtk3/gene/gene.rb +1 -2
  97. data/lib/bioroebe/gui/gtk3/hamming_distance/hamming_distance.rb +43 -30
  98. data/lib/bioroebe/gui/gtk3/levensthein_distance/levensthein_distance.rb +1 -2
  99. data/lib/bioroebe/gui/gtk3/nucleotide_analyser/nucleotide_analyser.rb +120 -73
  100. data/lib/bioroebe/gui/gtk3/primer_design_widget/primer_design_widget.rb +1 -2
  101. data/lib/bioroebe/gui/gtk3/protein_to_DNA/protein_to_DNA.rb +19 -20
  102. data/lib/bioroebe/gui/gtk3/random_sequence/random_sequence.rb +20 -13
  103. data/lib/bioroebe/gui/gtk3/restriction_enzymes/restriction_enzymes.rb +1 -2
  104. data/lib/bioroebe/gui/gtk3/show_codon_table/misc.rb +97 -22
  105. data/lib/bioroebe/gui/gtk3/show_codon_table/show_codon_table.rb +3 -73
  106. data/lib/bioroebe/gui/gtk3/show_codon_usage/show_codon_usage.rb +1 -2
  107. data/lib/bioroebe/gui/gtk3/sizeseq/sizeseq.rb +1 -2
  108. data/lib/bioroebe/gui/gtk3/three_to_one/three_to_one.rb +1 -2
  109. data/lib/bioroebe/gui/gtk3/www_finder/www_finder.rb +1 -2
  110. data/lib/bioroebe/gui/javafx/bioroebe/Bioroebe.class +0 -0
  111. data/lib/bioroebe/gui/javafx/bioroebe/Bioroebe.java +104 -0
  112. data/lib/bioroebe/gui/javafx/bioroebe.jar +0 -0
  113. data/lib/bioroebe/gui/javafx/bioroebe.mf +1 -0
  114. data/lib/bioroebe/gui/javafx/module-info.class +0 -0
  115. data/lib/bioroebe/gui/javafx/module-info.java +5 -0
  116. data/lib/bioroebe/gui/jruby/alignment/alignment.rb +165 -0
  117. data/lib/bioroebe/gui/jruby/aminoacid_composition/aminoacid_composition.rb +166 -0
  118. data/lib/bioroebe/gui/libui/alignment/alignment.rb +3 -1
  119. data/lib/bioroebe/gui/libui/controller/controller.rb +116 -0
  120. data/lib/bioroebe/gui/libui/random_sequence/random_sequence.rb +18 -2
  121. data/lib/bioroebe/gui/libui/show_codon_table/show_codon_table.rb +2 -0
  122. data/lib/bioroebe/gui/libui/three_to_one/three_to_one.rb +8 -6
  123. data/lib/bioroebe/gui/shared_code/alignment/alignment_module.rb +102 -0
  124. data/lib/bioroebe/gui/shared_code/aminoacid_composition/aminoacid_composition_module.rb +94 -0
  125. data/lib/bioroebe/gui/shared_code/levensthein_distance/levensthein_distance_module.rb +18 -16
  126. data/lib/bioroebe/gui/shared_code/protein_to_DNA/protein_to_DNA_module.rb +14 -14
  127. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne$1.class +0 -0
  128. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne$CloseListener.class +0 -0
  129. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne.class +0 -0
  130. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne.java +141 -0
  131. data/lib/bioroebe/images/FORWARD_PRIMER.png +0 -0
  132. data/lib/bioroebe/images/REVERSE_PRIMER.png +0 -0
  133. data/lib/bioroebe/images/images.html +29845 -0
  134. data/lib/bioroebe/java/README.md +5 -0
  135. data/lib/bioroebe/java/bioroebe/AllInOne.java +1 -0
  136. data/lib/bioroebe/java/bioroebe/Base.class +0 -0
  137. data/lib/bioroebe/java/bioroebe/Base.java +39 -5
  138. data/lib/bioroebe/java/bioroebe/IsPalindrome.java +23 -5
  139. data/lib/bioroebe/java/bioroebe/SanitizeNucleotideSequence.java +0 -0
  140. data/lib/bioroebe/java/bioroebe/Sequence.java +28 -3
  141. data/lib/bioroebe/java/bioroebe/ToCamelcase.class +0 -0
  142. data/lib/bioroebe/java/bioroebe/ToCamelcase.java +16 -4
  143. data/lib/bioroebe/java/bioroebe/ToRNA.java +43 -0
  144. data/lib/bioroebe/java/bioroebe/ToplevelMethods.java +6 -0
  145. data/lib/bioroebe/java/bioroebe/{BisulfiteTreatment.class → src/BisulfiteTreatment.class} +0 -0
  146. data/lib/bioroebe/java/bioroebe/{Codons.class → src/Codons.class} +0 -0
  147. data/lib/bioroebe/java/bioroebe/src/Codons.java +35 -0
  148. data/lib/bioroebe/java/bioroebe/src/Commandline.class +0 -0
  149. data/lib/bioroebe/java/bioroebe/src/Commandline.java +101 -0
  150. data/lib/bioroebe/java/bioroebe/{Esystem.class → src/Esystem.class} +0 -0
  151. data/lib/bioroebe/java/bioroebe/{Esystem.java → src/Esystem.java} +6 -1
  152. data/lib/bioroebe/java/bioroebe/{GenerateRandomDnaSequence.class → src/GenerateRandomDnaSequence.class} +0 -0
  153. data/lib/bioroebe/java/bioroebe/{GenerateRandomDnaSequence.java → src/GenerateRandomDnaSequence.java} +8 -2
  154. data/lib/bioroebe/java/bioroebe/src/PartnerNucleotide.class +0 -0
  155. data/lib/bioroebe/java/bioroebe/src/PartnerNucleotide.java +56 -0
  156. data/lib/bioroebe/java/bioroebe/{RemoveFile.java → src/RemoveFile.java} +10 -4
  157. data/lib/bioroebe/java/bioroebe/{RemoveNumbers.class → src/RemoveNumbers.class} +0 -0
  158. data/lib/bioroebe/java/bioroebe/{RemoveNumbers.java → src/RemoveNumbers.java} +1 -0
  159. data/lib/bioroebe/java/bioroebe/src/toplevel_methods/BaseComposition.class +0 -0
  160. data/lib/bioroebe/java/bioroebe/src/toplevel_methods/BaseComposition.java +75 -0
  161. data/lib/bioroebe/misc/ruler.rb +11 -2
  162. data/lib/bioroebe/nucleotides/most_likely_nucleotide_sequence_for_this_aminoacid_sequence.rb +1 -9
  163. data/lib/bioroebe/nucleotides/sanitize_nucleotide_sequence.rb +59 -18
  164. data/lib/bioroebe/nucleotides/show_nucleotide_sequence.rb +7 -7
  165. data/lib/bioroebe/parsers/genbank_parser.rb +347 -26
  166. data/lib/bioroebe/parsers/gff.rb +1 -9
  167. data/lib/bioroebe/patterns/scan_for_repeat.rb +1 -5
  168. data/lib/bioroebe/pdb/fetch_fasta_sequence_from_pdb.rb +1 -9
  169. data/lib/bioroebe/pdb/parse_mmCIF_file.rb +1 -9
  170. data/lib/bioroebe/pdb/parse_pdb_file.rb +4 -10
  171. data/lib/bioroebe/project/project.rb +1 -1
  172. data/lib/bioroebe/python/README.md +1 -0
  173. data/lib/bioroebe/python/__pycache__/mymodule.cpython-39.pyc +0 -0
  174. data/lib/bioroebe/python/gui/gtk3/all_in_one.css +4 -0
  175. data/lib/bioroebe/python/gui/gtk3/all_in_one.py +59 -0
  176. data/lib/bioroebe/python/gui/gtk3/widget1.py +20 -0
  177. data/lib/bioroebe/python/gui/tkinter/all_in_one.py +91 -0
  178. data/lib/bioroebe/python/mymodule.py +8 -0
  179. data/lib/bioroebe/python/protein_to_dna.py +33 -0
  180. data/lib/bioroebe/python/shell/shell.py +19 -0
  181. data/lib/bioroebe/python/to_rna.py +14 -0
  182. data/lib/bioroebe/python/toplevel_methods/convert_dna_to_aminoacid_sequence.py +137 -0
  183. data/lib/bioroebe/python/toplevel_methods/esystem.py +12 -0
  184. data/lib/bioroebe/python/toplevel_methods/open_in_browser.py +20 -0
  185. data/lib/bioroebe/python/toplevel_methods/palindromes.py +52 -0
  186. data/lib/bioroebe/python/toplevel_methods/rds.py +13 -0
  187. data/lib/bioroebe/python/toplevel_methods/shuffleseq.py +23 -0
  188. data/lib/bioroebe/python/toplevel_methods/three_delimiter.py +37 -0
  189. data/lib/bioroebe/python/toplevel_methods/time_and_date.py +43 -0
  190. data/lib/bioroebe/python/toplevel_methods/to_camelcase.py +21 -0
  191. data/lib/bioroebe/requires/require_cleave_and_digest.rb +3 -1
  192. data/lib/bioroebe/requires/require_the_bioroebe_project.rb +3 -1
  193. data/lib/bioroebe/sequence/alignment.rb +14 -4
  194. data/lib/bioroebe/sequence/dna.rb +1 -0
  195. data/lib/bioroebe/sequence/nucleotide_module/nucleotide_module.rb +28 -25
  196. data/lib/bioroebe/sequence/protein.rb +105 -3
  197. data/lib/bioroebe/sequence/rna.rb +220 -0
  198. data/lib/bioroebe/sequence/sequence.rb +128 -40
  199. data/lib/bioroebe/shell/menu.rb +3815 -3696
  200. data/lib/bioroebe/shell/misc.rb +9019 -3133
  201. data/lib/bioroebe/shell/readline/readline.rb +1 -1
  202. data/lib/bioroebe/shell/shell.rb +1137 -28
  203. data/lib/bioroebe/siRNA/siRNA.rb +81 -1
  204. data/lib/bioroebe/string_matching/find_longest_substring.rb +3 -2
  205. data/lib/bioroebe/string_matching/hamming_distance.rb +1 -9
  206. data/lib/bioroebe/taxonomy/class_methods.rb +3 -8
  207. data/lib/bioroebe/taxonomy/constants.rb +4 -3
  208. data/lib/bioroebe/taxonomy/edit.rb +2 -1
  209. data/lib/bioroebe/taxonomy/help/help.rb +10 -10
  210. data/lib/bioroebe/taxonomy/help/helpline.rb +2 -2
  211. data/lib/bioroebe/taxonomy/info/check_available.rb +15 -9
  212. data/lib/bioroebe/taxonomy/info/info.rb +18 -11
  213. data/lib/bioroebe/taxonomy/info/is_dna.rb +46 -36
  214. data/lib/bioroebe/taxonomy/interactive.rb +140 -104
  215. data/lib/bioroebe/taxonomy/menu.rb +27 -18
  216. data/lib/bioroebe/taxonomy/parse_fasta.rb +3 -1
  217. data/lib/bioroebe/taxonomy/shared.rb +1 -0
  218. data/lib/bioroebe/taxonomy/taxonomy.rb +1 -0
  219. data/lib/bioroebe/toplevel_methods/aminoacids_and_proteins.rb +31 -24
  220. data/lib/bioroebe/toplevel_methods/colourize_related_methods.rb +164 -0
  221. data/lib/bioroebe/toplevel_methods/databases.rb +1 -1
  222. data/lib/bioroebe/toplevel_methods/digest.rb +18 -8
  223. data/lib/bioroebe/toplevel_methods/fasta_and_fastq.rb +107 -63
  224. data/lib/bioroebe/toplevel_methods/file_and_directory_related_actions.rb +14 -2
  225. data/lib/bioroebe/toplevel_methods/frequencies.rb +8 -1
  226. data/lib/bioroebe/toplevel_methods/misc.rb +175 -11
  227. data/lib/bioroebe/toplevel_methods/nucleotides.rb +118 -46
  228. data/lib/bioroebe/toplevel_methods/open_in_browser.rb +2 -0
  229. data/lib/bioroebe/toplevel_methods/palindromes.rb +75 -47
  230. data/lib/bioroebe/toplevel_methods/taxonomy.rb +3 -3
  231. data/lib/bioroebe/toplevel_methods/to_camelcase.rb +5 -0
  232. data/lib/bioroebe/utility_scripts/align_open_reading_frames.rb +1 -9
  233. data/lib/bioroebe/utility_scripts/check_for_mismatches/check_for_mismatches.rb +1 -9
  234. data/lib/bioroebe/utility_scripts/compacter/compacter.rb +251 -0
  235. data/lib/bioroebe/utility_scripts/compseq/compseq.rb +1 -9
  236. data/lib/bioroebe/utility_scripts/consensus_sequence.rb +6 -6
  237. data/lib/bioroebe/utility_scripts/create_batch_entrez_file.rb +1 -9
  238. data/lib/bioroebe/utility_scripts/dot_alignment.rb +1 -9
  239. data/lib/bioroebe/utility_scripts/move_file_to_its_correct_location.rb +1 -4
  240. data/lib/bioroebe/utility_scripts/parse_taxonomy.rb +2 -2
  241. data/lib/bioroebe/utility_scripts/permutations.rb +36 -9
  242. data/lib/bioroebe/utility_scripts/showorf/constants.rb +0 -5
  243. data/lib/bioroebe/utility_scripts/showorf/reset.rb +1 -4
  244. data/lib/bioroebe/version/version.rb +2 -2
  245. data/lib/bioroebe/www/embeddable_interface.rb +121 -58
  246. data/lib/bioroebe/www/sinatra/sinatra.rb +186 -71
  247. data/lib/bioroebe/yaml/aminoacids/amino_acids_long_name_to_one_letter.yml +2 -2
  248. data/lib/bioroebe/yaml/aminoacids/weight_of_common_proteins.yml +17 -17
  249. data/lib/bioroebe/yaml/configuration/browser.yml +1 -1
  250. data/lib/bioroebe/yaml/configuration/temp_dir.yml +1 -1
  251. data/lib/bioroebe/yaml/consensus_sequences/consensus_sequences.yml +1 -0
  252. data/lib/bioroebe/yaml/genomes/README.md +3 -4
  253. data/lib/bioroebe/yaml/nucleotides/nucleotides.yml +5 -0
  254. data/lib/bioroebe/yaml/restriction_enzymes/restriction_enzymes.yml +57 -57
  255. data/spec/README.md +6 -0
  256. data/spec/project_wide_specification/classes.md +5 -0
  257. metadata +107 -70
  258. data/doc/setup.rb +0 -1655
  259. data/lib/bioroebe/fasta_and_fastq/parse_fasta/constants.rb +0 -50
  260. data/lib/bioroebe/fasta_and_fastq/parse_fasta/initialize.rb +0 -86
  261. data/lib/bioroebe/fasta_and_fastq/parse_fasta/menu.rb +0 -117
  262. data/lib/bioroebe/fasta_and_fastq/parse_fasta/misc.rb +0 -981
  263. data/lib/bioroebe/fasta_and_fastq/parse_fasta/report.rb +0 -156
  264. data/lib/bioroebe/fasta_and_fastq/parse_fasta/reset.rb +0 -128
  265. data/lib/bioroebe/genbank/genbank_parser.rb +0 -291
  266. data/lib/bioroebe/java/bioroebe/AllInOne.class +0 -0
  267. data/lib/bioroebe/java/bioroebe/Cat.class +0 -0
  268. data/lib/bioroebe/java/bioroebe/Codons.java +0 -22
  269. data/lib/bioroebe/java/bioroebe/IsPalindrome.class +0 -0
  270. data/lib/bioroebe/java/bioroebe/PartnerNucleotide.class +0 -0
  271. data/lib/bioroebe/java/bioroebe/PartnerNucleotide.java +0 -19
  272. data/lib/bioroebe/java/bioroebe/SanitizeNucleotideSequence.class +0 -0
  273. data/lib/bioroebe/java/bioroebe/ToplevelMethods.class +0 -0
  274. data/lib/bioroebe/java/bioroebe.jar +0 -0
  275. data/lib/bioroebe/shell/add.rb +0 -108
  276. data/lib/bioroebe/shell/assign.rb +0 -360
  277. data/lib/bioroebe/shell/chop_and_cut.rb +0 -281
  278. data/lib/bioroebe/shell/constants.rb +0 -166
  279. data/lib/bioroebe/shell/download.rb +0 -335
  280. data/lib/bioroebe/shell/enable_and_disable.rb +0 -158
  281. data/lib/bioroebe/shell/enzymes.rb +0 -310
  282. data/lib/bioroebe/shell/fasta.rb +0 -345
  283. data/lib/bioroebe/shell/gtk.rb +0 -76
  284. data/lib/bioroebe/shell/history.rb +0 -132
  285. data/lib/bioroebe/shell/initialize.rb +0 -217
  286. data/lib/bioroebe/shell/loop.rb +0 -74
  287. data/lib/bioroebe/shell/prompt.rb +0 -107
  288. data/lib/bioroebe/shell/random.rb +0 -289
  289. data/lib/bioroebe/shell/reset.rb +0 -335
  290. data/lib/bioroebe/shell/scan_and_parse.rb +0 -135
  291. data/lib/bioroebe/shell/search.rb +0 -337
  292. data/lib/bioroebe/shell/sequences.rb +0 -200
  293. data/lib/bioroebe/shell/show_report_and_display.rb +0 -2901
  294. data/lib/bioroebe/shell/startup.rb +0 -127
  295. data/lib/bioroebe/shell/taxonomy.rb +0 -14
  296. data/lib/bioroebe/shell/tk.rb +0 -23
  297. data/lib/bioroebe/shell/user_input.rb +0 -88
  298. data/lib/bioroebe/shell/xorg.rb +0 -45
  299. data/lib/bioroebe/utility_scripts/compacter.rb +0 -131
  300. /data/lib/bioroebe/java/bioroebe/{BisulfiteTreatment.java → src/BisulfiteTreatment.java} +0 -0
  301. /data/lib/bioroebe/java/bioroebe/{RemoveFile.class → src/RemoveFile.class} +0 -0
@@ -2,15 +2,1526 @@
2
2
  # Encoding: UTF-8
3
3
  # frozen_string_literal: true
4
4
  # =========================================================================== #
5
+ # === Bioroebe::ParseFasta
6
+ #
7
+ # This class will parse through a local FASTA file and find the
8
+ # proper entries.
9
+ #
10
+ # A FASTA file may have nucleotides or an aminoacid-sequence, so
11
+ # we have to keep this in mind when parsing it.
12
+ #
13
+ # Usage examples:
14
+ #
15
+ # Bioroebe::ParseFasta.new(ARGV)
16
+ # Bioroebe.parse_fasta(ARGV)
17
+ #
18
+ # =========================================================================== #
5
19
  # require 'bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb'
6
- # Bioroebe::ParseFasta.new(ARGV)
20
+ # Bioroebe.parse_fasta
21
+ # Bioroebe.sizeseq
7
22
  # =========================================================================== #
8
23
  require 'bioroebe/base/commandline_application/commandline_application.rb'
9
- require 'bioroebe/fasta_and_fastq/parse_fasta/constants.rb'
10
- require 'bioroebe/fasta_and_fastq/parse_fasta/initialize.rb'
11
- require 'bioroebe/fasta_and_fastq/parse_fasta/misc.rb'
12
- require 'bioroebe/fasta_and_fastq/parse_fasta/reset.rb'
13
- require 'bioroebe/fasta_and_fastq/parse_fasta/run.rb'
24
+
25
+ module Bioroebe
26
+
27
+ class ParseFasta < ::Bioroebe::CommandlineApplication # === Bioroebe::ParseFasta
28
+
29
+ require 'bioroebe/sequence/dna.rb'
30
+ require 'bioroebe/calculate/calculate_gc_content.rb'
31
+
32
+ # ========================================================================= #
33
+ # === REGEX_NON_NUCLEOTIDES
34
+ #
35
+ # All non-nucleotide letters are meant to be matched by this regex.
36
+ #
37
+ # N is excluded because it may stand for "any" nucleotide (and thus
38
+ # also for a purine). NOTE(review): lacking [], only this exact run matches.
39
+ # ========================================================================= #
40
+ REGEX_NON_NUCLEOTIDES =
41
+ /BDEFHIJKLMOPQRSVWXYZ/
42
+
43
+ # ========================================================================= #
44
+ # === DEFAULT_FASTA
45
+ #
46
+ # This String can be used to quickly test code depending on FASTA
47
+ # entries.
48
+ # ========================================================================= #
49
+ DEFAULT_FASTA = '>Rosalind_6404
50
+ CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
51
+ TCCCACTAATAATTCTGAGG
52
+ >Rosalind_5959
53
+ CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
54
+ ATATCCATTTGTCAGCAGACACGC
55
+ >Rosalind_0808
56
+ CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
57
+ TGGGAACCTGCGGGCAGTAGGTGGAAT'
58
+
59
+ # ========================================================================= #
60
+ # === DEFAULT_ROUND_TO
61
+ # ========================================================================= #
62
+ DEFAULT_ROUND_TO = 2
63
+
64
+ # ========================================================================= #
65
+ # === initialize
66
+ # ========================================================================= #
67
+ def initialize(
68
+ i = DEFAULT_FASTA,
69
+ run_already = true,
70
+ &block
71
+ )
72
+ reset
73
+ # ======================================================================= #
74
+ # === Handle blocks next
75
+ # ======================================================================= #
76
+ if block_given?
77
+ yielded = yield
78
+ # ===================================================================== #
79
+ # First handle Symbols.
80
+ # ===================================================================== #
81
+ case yielded
82
+ # ===================================================================== #
83
+ # === :be_verbose
84
+ # ===================================================================== #
85
+ when :be_verbose,
86
+ :verbose
87
+ set_be_verbose_and_report_the_sequence
88
+ # ===================================================================== #
89
+ # === :be_quiet
90
+ # ===================================================================== #
91
+ when :be_quiet,
92
+ :be_silent
93
+ be_quiet
94
+ # ===================================================================== #
95
+ # === :sizeseq
96
+ # ===================================================================== #
97
+ when :sizeseq
98
+ @sort_by_size = true
99
+ end
100
+ # ===================================================================== #
101
+ # === Handle Hashes next
102
+ # ===================================================================== #
103
+ if yielded.is_a? Hash
104
+ # =================================================================== #
105
+ # === :be_verbose
106
+ # =================================================================== #
107
+ if yielded.has_key? :be_verbose
108
+ set_be_verbose(yielded.delete(:be_verbose))
109
+ @internal_hash[:report_the_sequence] = true
110
+ end
111
+ # =================================================================== #
112
+ # === :use_colours
113
+ # =================================================================== #
114
+ if yielded.has_key? :use_colours
115
+ set_use_colours(
116
+ yielded.delete(:use_colours)
117
+ )
118
+ end
119
+ # =================================================================== #
120
+ # === :sizeseq
121
+ # =================================================================== #
122
+ if yielded.has_key? :sizeseq
123
+ @sort_by_size = true
124
+ end
125
+ end
126
+ end
127
+ set_commandline_arguments(i)
128
+ case run_already
129
+ # ======================================================================= #
130
+ # === :dont_run_yet
131
+ # ======================================================================= #
132
+ when :dont_run_yet,
133
+ :do_not_run_yet
134
+ run_already = false
135
+ end
136
+ run if run_already
137
+ end
138
+
139
+ # ========================================================================= #
140
+ # === reset (reset tag)
141
+ # ========================================================================= #
142
+ def reset
143
+ super()
144
+ infer_the_namespace
145
+ # ======================================================================= #
146
+ # === @is_a_genbank_file
147
+ # ======================================================================= #
148
+ @is_a_genbank_file = false
149
+ # ======================================================================= #
150
+ # === @input_file
151
+ #
152
+ # This variable denotes which input file is used to read data from.
153
+ #
154
+ # It is nil initially because we may skip reading from an existing
155
+ # file and e. g. only read from a String or some other non-file
156
+ # entity.
157
+ # ======================================================================= #
158
+ @input_file = nil
159
+ # ======================================================================= #
160
+ # === @hash
161
+ #
162
+ # This is the main variable for the class. It will keep entries such
163
+ # as this one here:
164
+ #
165
+ # {
166
+ # "ENSMUSG00000020122|ENSMUST08" => "CCCTCC"
167
+ # }
168
+ #
169
+ # ======================================================================= #
170
+ @hash = {}
171
+ # ======================================================================= #
172
+ # === @internal_hash
173
+ #
174
+ # This Hash exists for internal configuration of the class.
175
+ # ======================================================================= #
176
+ @internal_hash = {}
177
+ # ======================================================================= #
178
+ # === :report_the_sequence
179
+ # ======================================================================= #
180
+ @internal_hash[:report_the_sequence] = false
181
+ # ======================================================================= #
182
+ # === :overwrite_the_original_file
183
+ # ======================================================================= #
184
+ @internal_hash[:overwrite_the_original_file] = false
185
+ # ======================================================================= #
186
+ # === :save_the_file
187
+ # ======================================================================= #
188
+ @internal_hash[:save_the_file] = false
189
+ # ======================================================================= #
190
+ # === :remove_numbers_from_input
191
+ # ======================================================================= #
192
+ @internal_hash[:remove_numbers_from_input] = false
193
+ # ======================================================================= #
194
+ # === :sanitize_the_file
195
+ #
196
+ # If the following variable is true then the .fasta file at hand will
197
+ # be modified.
198
+ # ======================================================================= #
199
+ @internal_hash[:sanitize_the_file] = false
200
+ # ======================================================================= #
201
+ # === :show_the_translated_protein_sequence
202
+ #
203
+ # This setting is false initially. If set to true via the commandline
204
+ # then report() will show the translated protein sequence as well.
205
+ # ======================================================================= #
206
+ @internal_hash[:show_the_translated_protein_sequence] = false
207
+ # ======================================================================= #
208
+ # === :condense_the_sequence_onto_a_single_line
209
+ #
210
+ # By default the output of this class will include newlines for the
211
+ # sequence. If this is not wanted by the user then the following
212
+ # variable keeps track of that behaviour. You can use the flag
213
+ # called --one-line to enable a condensed output, with newlines
214
+ # being removed.
215
+ # ======================================================================= #
216
+ @internal_hash[:condense_the_sequence_onto_a_single_line] = false
217
+ # ======================================================================= #
218
+ # === :limit_the_display_to_n_nucleotides
219
+ #
220
+ # If this variable is a number rather than nil, then it will be used
221
+ # to display only a limited number of nucleotides, e. g. "1000" if
222
+ # the user passes in 1000.
223
+ # ======================================================================= #
224
+ @internal_hash[:limit_the_display_to_n_nucleotides] = nil
225
+ # ======================================================================= #
226
+ # === @may_we_exit
227
+ # ======================================================================= #
228
+ @may_we_exit = false
229
+ # ======================================================================= #
230
+ # === @current_key
231
+ # ======================================================================= #
232
+ @current_key = nil
233
+ # ======================================================================= #
234
+ # === @use_opn
235
+ # ======================================================================= #
236
+ @use_opn = ::Bioroebe.use_opn?
237
+ # ======================================================================= #
238
+ # === @colourize_sequence
239
+ # ======================================================================= #
240
+ @colourize_sequence = false
241
+ # ======================================================================= #
242
+ # === @sort_by_size
243
+ #
244
+ # If the following variable is set to true, then this class will
245
+ # run a sizeseq-comparison, that is, it will compare all sequences
246
+ # and output them in a size-sorted manner, similar to the EMBOSS
247
+ # sizeseq action.
248
+ # ======================================================================= #
249
+ @sort_by_size = false
250
+ # ======================================================================= #
251
+ # === @show_the_header
252
+ #
253
+ # If this variable is true then the header will be shown.
254
+ # ======================================================================= #
255
+ @show_the_header = false
256
+ set_round_to :default
257
+ set_be_verbose
258
+ end
259
+
260
+ # ========================================================================= #
261
+ # === menu (menu tag)
262
+ # ========================================================================= #
263
# Interprets one commandline flag, or every flag of an Array in turn,
# and records the matching option in @internal_hash (or triggers the
# associated action). Unknown flags are silently ignored.
#
# i - a flag (String) or an Array of flags; defaults to the commandline
#     arguments that are not files.
def menu(
    i = return_commandline_arguments_that_are_not_files
  )
  # Arrays are dispatched element by element.
  return i.each { |flag| menu(flag) } if i.is_a? Array
  case i # case tag
  # === --sanitize-the-file
  #
  # Sanitizes a .fasta file: ' ' is removed from each line and the
  # content is upcased. No other modification is made.
  when /^-?-?sanitize(-|_)?the(-|_)?file$/i
    @internal_hash[:sanitize_the_file] = true
  # === --to-protein
  #
  # Aliases: --convert and --translate.
  #
  #   pfasta *.fasta --toprotein
  when /^-?-?to(-|_)?protein$/i,
       /^-?-?convert$/i,
       /^-?-?translate$/i
    @internal_hash[:show_the_translated_protein_sequence] = true
  # === --one-line
  #
  #   pfasta rpoS_NC_000913.3.fasta --one-line
  when /^-?-?one(-|_)?liner?/i
    @internal_hash[:condense_the_sequence_onto_a_single_line] = true
  # === --limit=1000
  #
  #   pfasta --limit=1000
  when /^-?-?limit=(\d+)$/i
    @internal_hash[:limit_the_display_to_n_nucleotides] =
      Regexp.last_match(1).to_i
  # === --overwrite
  when /^-?-?overwrite/i
    @internal_hash[:overwrite_the_original_file] = true
  # === --help
  #
  #   parse_fasta --help
  when /^-?-?help/i
    show_help
    exit
  # === --save-file
  when /^-?-?save(-|_)?file/i
    @internal_hash[:save_the_file] = true
  # === --also-show-the-sequence
  #
  #   parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --show
  when /^-?-?also(-|_)?show(-|_)?the(-|_)?sequence$/i,
       /^-?-?report$/i,
       /^-?-?show$/i
    @internal_hash[:report_the_sequence] = true
  # === --header
  when /^-?-?header/i
    do_show_the_header
  # === --short
  #
  # Shows at most 300 nucleotides.
  when /^-?-?short/i
    @internal_hash[:limit_the_display_to_n_nucleotides] = 300
  # === --size
  #
  # Reports how many nucleotides the given sequence has, then exits.
  #
  #   parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --size
  when /^-?-?size$/i
    set_be_quiet
    do_process_the_commandline_arguments_that_are_files
    erev size? # Report the size here.
    exit
  end
end
380
+
381
+ # ========================================================================= #
382
+ # === show_the_translated_protein_sequence?
383
+ # ========================================================================= #
384
# Query whether the translated protein sequence shall be shown as well;
# this reads the entry that the --to-protein flag sets in menu().
def show_the_translated_protein_sequence?
  @internal_hash[:show_the_translated_protein_sequence]
end
387
+
388
+ # ========================================================================= #
389
+ # === set_round_to
390
+ #
391
+ # This will set to how many decimal numbers we will round to. This is
392
+ # mostly done for display-purposes, hence why the default is a fairly
393
+ # low value.
394
+ # ========================================================================= #
395
# Stores how many decimal positions are used when rounding.
#
# i - the number of positions (anything responding to to_i), or the
#     Symbol :default, which resolves to DEFAULT_ROUND_TO.
def set_round_to(
    i = :default
  )
  i = DEFAULT_ROUND_TO if i == :default
  @internal_hash[:round_to] = i.to_i
end
409
+
410
+ # ========================================================================= #
411
+ # === do_process_the_commandline_arguments_that_are_files
412
+ # ========================================================================= #
413
# Runs the full parse-and-report pipeline once per input file.
#
# these_files - a file path or an Array of file paths; defaults to the
#               commandline arguments that are files.
def do_process_the_commandline_arguments_that_are_files(
    these_files = commandline_arguments_that_are_files?
  )
  these_files = [these_files].flatten.compact unless these_files.is_a? Array
  these_files.each do |this_file|
    set_input_file(this_file)
    set_data # This will use the default file.
    split_into_proper_sections
    report_the_FASTA_header if @show_the_header
    # The sizeseq-like report replaces the regular report entirely.
    if @sort_by_size
      run_sizeseq_comparison
      next
    end
    if is_the_sequence_a_polypeptide?
      # === The input is a protein
      if be_verbose?
        erev "This sequence is assumed to be a #{royalblue('protein')}#{rev}."
        report_how_many_elements_we_have_found
      end
    else
      # === Otherwise the input is treated as RNA or DNA
      if be_verbose?
        erev "This sequence is assumed to "\
             "be #{royalblue('DNA')}#{rev} or #{royalblue('RNA')}#{rev}."
      end
      calculate_gc_content # GC content makes only sense for nucleotides.
      report_how_many_elements_we_have_found if be_verbose?
    end
    next unless be_verbose?
    report_the_nucleotide_composition
    report_on_how_many_entries_we_did_work
    do_report_the_sequence if report_the_sequence?
  end
end
456
+
457
+ # ========================================================================= #
458
+ # === sanitize_the_description
459
+ #
460
+ # This method will iterate over the description entry and sanitize
461
+ # it. In this context sanitizing means to add the "length" entry,
462
+ # and the "type" entry, such as in:
463
+ #
464
+ # " # length=231; type=dna"
465
+ #
466
+ # ========================================================================= #
467
# Adds a " # length=N; type=dna" annotation to every FASTA header line
# in @data that does not carry a "length=" entry yet.
#
# The sequence is looked up in @hash under the same key that
# split_into_proper_sections() generates (the header without its leading
# '>' and without the line ending). The length excludes newlines, and
# the annotation is placed before the line ending, so that it stays on
# the header line. Unknown headers are annotated with length=0.
def sanitize_the_description
  @data.map! { |line|
    next line unless line.start_with?('>') && !line.include?('length=')
    header      = line.chomp
    line_ending = line[header.size..-1] # "" or the original "\n"/"\r\n"
    key         = header[1..-1]
    length      = @hash.key?(key) ? @hash[key].delete("\n").size : 0
    # Currently hardcoded to DNA.
    "#{header} # length=#{length}; type=dna#{line_ending}"
  }
end
479
+
480
+ # ========================================================================= #
481
+ # === entries?
482
+ # ========================================================================= #
483
# Returns @data, the dataset assigned via set_data().
def entries?
  @data
end
486
+
487
+ # ========================================================================= #
488
+ # === we_may_exit
489
+ # ========================================================================= #
490
# Flags that the class is allowed to exit (sets @may_we_exit to true).
def we_may_exit
  @may_we_exit = true
end
493
+
494
+ # ========================================================================= #
495
+ # === output_results
496
+ # ========================================================================= #
497
# Pretty-prints the header => sequence Hash (@hash) via pp.
def output_results
  pp @hash
end
500
+
501
+ # ========================================================================= #
502
+ # === do_report_the_sequence (report tag)
503
+ #
504
+ # This method is used to display the main sequence at hand.
505
+ # ========================================================================= #
506
# === do_report_the_sequence (report tag)
#
# Displays the main sequence, honouring the --limit, --one-line and
# --to-protein settings stored in @internal_hash.
#
# The method works on a private copy of the main sequence, so the
# colour codes are never written into the stored sequence, and the
# protein translation is derived from the uncoloured nucleotides.
def do_report_the_sequence
  sequence = main_sequence?.dup
  # Honour the --limit commandline flag next.
  limit = @internal_hash[:limit_the_display_to_n_nucleotides]
  sequence = sequence[0 .. (limit - 1)] if limit
  sequence = sequence.delete("\n") if condense_the_sequence_onto_a_single_line?
  # Keep the plain text around: it is the input for the translation.
  plain_sequence = sequence
  if @colourize_sequence && is_polynucleotide?
    colours = {
      'A' => teal('A')+rev,
      'C' => slateblue('C')+rev,
      'G' => royalblue('G')+rev,
      'T' => steelblue('T')+rev,
      'U' => steelblue('U')+rev
    }
    # A single pass guarantees that characters which are part of an
    # already inserted colour code are never substituted again.
    sequence = sequence.gsub(/[ACGTU]/) { |nucleotide| colours[nucleotide] }
  end
  erev colourize_this_nucleotide_sequence(sequence)
  e if condense_the_sequence_onto_a_single_line?
  if show_the_translated_protein_sequence?
    # Do show the translated protein sequence next:
    translated_into_aa = Bioroebe.to_aa(plain_sequence)
    translated_into_aa_and_colourized = translated_into_aa.dup
    if translated_into_aa.include? '*'
      translated_into_aa_and_colourized = translated_into_aa.gsub(/\*/,tomato('*'))
    end
    erev 'The translated aminoacid sequence of '+
         sfancy(translated_into_aa.size.to_s)+rev+
         ' aminoacids is:'
    e
    erev steelblue("  #{translated_into_aa_and_colourized}")
    e
  end
end
alias display do_report_the_sequence # === display
alias report  do_report_the_sequence # === report
550
+
551
+ # ========================================================================= #
552
+ # === report_the_nucleotide_composition
553
+ # ========================================================================= #
554
# Reports the composition of the first sequence stored in @hash.
#
# For a polynucleotide the absolute count and the percentage of A, T, C
# and G are shown. Newlines are removed before counting, so they do not
# distort the percentages, and an empty sequence yields 0.0 % rather
# than a division by zero. For a protein the sequence itself is shown.
def report_the_nucleotide_composition
  if is_this_sequence_a_polynucleotide_sequence?
    first        = @hash.values.first.delete("\n").upcase
    total_size   = first.size
    n_adenines   = first.count('A')
    n_thymidines = first.count('T')
    n_cytodines  = first.count('C')
    n_guanines   = first.count('G')
    percentage = lambda { |n|
      total_size.zero? ? 0.0 : (n * 100.0 / total_size).round(2)
    }
    erev "The nucleotide composition is as follows:"
    e "  "\
      "#{steelblue(n_adenines)}#{rev}x A (#{percentage.call(n_adenines)}%), "\
      "#{steelblue(n_thymidines)}#{rev}x T (#{percentage.call(n_thymidines)}%), "\
      "#{steelblue(n_cytodines)}#{rev}x C (#{percentage.call(n_cytodines)}%), "\
      "#{steelblue(n_guanines)}#{rev}x G (#{percentage.call(n_guanines)}%)"
  elsif is_a_protein?
    # Report the composition of the protein:
    sequence = @hash.values.first.delete("\n")
    erev "The protein composition (aminoacids) is as follows:"
    # e colourize_this_aminoacid_sequence_for_the_commandline("  #{sequence}")
    e orchid("  #{sequence}")
  end
end
alias report_the_protein_composition report_the_nucleotide_composition # === report_the_protein_composition
578
+
579
+ # ========================================================================= #
580
+ # === report_how_many_elements_we_have_found
581
+ # ========================================================================= #
582
# Reports (when verbose) how long the first sequence in @hash is and,
# for nucleotide sequences, how many ATG codons it contains on both
# strands.
#
# String#count treats its argument as a set of characters, so the codon
# occurrences are determined via scan() instead. An ATG on the opposite
# strand reads as CAT on the given strand, hence 'CAT' is counted for
# the reverse strand. The sequence is upcased first, as some FASTA files
# store the sequence in lowercased characters.
def report_how_many_elements_we_have_found
  return unless @hash && be_verbose?
  first    = @hash.values.first.delete("\n")
  size     = first.size
  upcased  = first.upcase
  n_start_codons = upcased.scan('ATG').size + # given strand
                   upcased.scan('CAT').size   # opposite strand
  result = "This sequence contains #{simp(size.to_s)}#{rev}"\
           " #{nucleotides_or_aminoacids?}".dup
  if is_a_nucleotide?
    result << " and #{n_start_codons} "\
              "ATG codons (on both strands) in total"
  end
  result << '.'
  if size > 1_000_000
    # Format the number with '_' characters, e.g. 1_234_567.
    formatted = size.to_s.reverse.scan(/\d{1,3}/).join('_').reverse
    result << ' ('+simp(formatted+' bp')+rev+')'
  end
  erev result
end
612
+
613
+ # ========================================================================= #
614
+ # === report_on_how_many_entries_we_did_work
615
+ # ========================================================================= #
616
# Reports (when verbose) how many FASTA entries are stored in @hash,
# followed by an empty line.
def report_on_how_many_entries_we_did_work
  return unless be_verbose?
  entry_or_entries = (@hash.keys.size > 1) ? 'entries' : 'entry'
  erev "We have identified a total of #{orange(@hash.keys.size)}"\
       "#{rev} #{entry_or_entries} in this fasta dataset."
  e
end
627
+
628
+ # ========================================================================= #
629
+ # === report_the_FASTA_header
630
+ # ========================================================================= #
631
# Prints the header of the first FASTA entry, as returned by header?().
def report_the_FASTA_header
  colourized_header = steelblue(header?)
  e "#{rev}The header is: #{colourized_header}"
end
634
+
635
+ # ========================================================================= #
636
+ # === report_the_sequence?
637
+ # ========================================================================= #
638
# Query whether the sequence itself shall be reported; this reads the
# entry that the --show / --report flags set in menu().
def report_the_sequence?
  @internal_hash[:report_the_sequence]
end
641
+
642
+ # ========================================================================= #
643
+ # === sanitize_data
644
+ # ========================================================================= #
645
# Cleans up the raw input before it is stored.
#
# An Array is modified in place: it is flattened, then comment lines
# (leading '#') and blank lines are dropped. If the first remaining
# entry contains "\r", all "\r" characters are removed from every entry,
# on the assumption that the whole file uses such line endings.
#
# If @internal_hash[:remove_numbers_from_input] is set, the input is
# additionally passed through Bioroebe::SanitizeNucleotideSequence.
#
# Returns the sanitized input.
def sanitize_data(i)
  if i.is_a? Array
    i.flatten!
    i.reject! { |entry| entry.start_with?('#') || entry.strip.empty? }
    first_entry = i.first
    if first_entry && first_entry.include?("\r")
      i.map! { |entry| entry.delete("\r") }
    end
  end
  # === Run through SanitizeNucleotideSequence
  return i unless @internal_hash[:remove_numbers_from_input]
  Bioroebe::SanitizeNucleotideSequence[i]
end
667
+
668
+ # ========================================================================= #
669
+ # === current_key?
670
+ # ========================================================================= #
671
# Returns @current_key, the FASTA header that
# split_into_proper_sections() saw most recently.
def current_key?
  @current_key
end
alias id?          current_key? # === id?
alias sequence_id? current_key? # === sequence_id?
alias title        current_key? # === title
alias title?       current_key? # === title?
677
+
678
+ # ========================================================================= #
679
+ # === round_to?
680
+ # ========================================================================= #
681
# Returns the number of decimal positions stored by set_round_to().
def round_to?
  @internal_hash[:round_to]
end
684
+
685
+ # ========================================================================= #
686
+ # === opnn
687
+ # ========================================================================= #
688
# Delegates to the parent's opnn() with this class' namespace, but only
# if the use of opn is enabled; otherwise nothing happens.
def opnn
  return unless use_opn?
  super(namespace?)
end
691
+
692
+ # ========================================================================= #
693
+ # === use_opn?
694
+ # ========================================================================= #
695
# Query whether opn is in use (the value of @use_opn).
def use_opn?
  @use_opn
end
698
+
699
+ # ========================================================================= #
700
+ # === calculate_gc_content
701
+ #
702
+ # Calculate the gc content through this method, which is called from
703
+ # within the method run().
704
+ # ========================================================================= #
705
# Calculates and (when verbose) reports the GC content of every entry
# in @hash, provided that the joined sequences are a polynucleotide.
# Otherwise a note is shown (when verbose) that the input is not a
# polynucleotide.
def calculate_gc_content
  joined_sequences = @hash.values.join.delete(N)
  if is_polynucleotide? joined_sequences
    @hash.each_pair { |key, content|
      # Delegate towards Bioroebe.gc_content, rounding to round_to?
      # positions. Only the first element of an Array result is used.
      gc_content = ::Bioroebe.gc_content(content.upcase, round_to?)
      gc_content = gc_content.first if gc_content.is_a? Array
      gc_content = gc_content.to_s
      # Only the part after the last '|' of the header is displayed.
      minimal_key = key.to_s
      minimal_key = minimal_key.split('|').last.strip if minimal_key.include? '|'
      next unless be_verbose?
      label = minimal_key.strip
      # Shorten the content a bit if it is too long.
      label = label[0 .. 40]+' [...]' if label.size > 40
      erev 'GC content of "'+simp(label)+rev+'" is: '+
           "#{sfancy(gc_content)}#{rev} %"
    }
  else
    erev '`'+simp(joined_sequences)+rev+'` is not a polynucleotide.' if be_verbose?
  end
end
733
+
734
+ # ========================================================================= #
735
+ # === first_value
736
+ #
737
+ # This will return the first entry of the Fasta files.
738
+ # ========================================================================= #
739
# Returns the first of the sequences reported by sequences?().
def first_value
  all_sequences = sequences?
  all_sequences.first
end
742
+
743
+ # ========================================================================= #
744
+ # === nucleotides_or_aminoacids?
745
+ # ========================================================================= #
746
# Returns the String 'nucleotides' if is_polynucleotide?() holds for
# the main sequence, and 'aminoacids' otherwise.
def nucleotides_or_aminoacids?
  is_polynucleotide? ? 'nucleotides' : 'aminoacids'
end
753
+
754
+ # ========================================================================= #
755
+ # === is_polynucleotide?
756
+ # ========================================================================= #
757
# Everything that is_protein?() does not consider to be a protein is
# treated as a polynucleotide.
#
# i - the sequence to check; defaults to the main sequence.
def is_polynucleotide?(i = main_sequence?)
  protein = is_protein?(i)
  !protein
end
alias is_a_nucleotide? is_polynucleotide? # === is_a_nucleotide?
760
+
761
+ # ========================================================================= #
762
+ # === is_this_sequence_a_polynucleotide_sequence?
763
+ # ========================================================================= #
764
# Returns true unless is_protein?(), called without an argument,
# reports a protein.
def is_this_sequence_a_polynucleotide_sequence?
  protein = is_protein?
  !protein
end
767
+
768
+ # ========================================================================= #
769
+ # === data?
770
+ #
771
+ # This will contain the full content of the (whole) .fasta file, including
772
+ # the header.
773
+ # ========================================================================= #
774
# Returns @data, the dataset assigned via set_data().
def data?
  @data
end
alias input?   data? # === input?
alias dataset? data? # === dataset?
778
+
779
+ # ========================================================================= #
780
+ # === hash?
781
+ # ========================================================================= #
782
# Returns @hash, which maps each FASTA header to its sequence body.
def hash?
  @hash
end
785
+
786
+ # ========================================================================= #
787
+ # === sequences?
788
+ #
789
+ # This method will obtain all found sequences.
790
+ # ========================================================================= #
791
# Returns all sequence bodies (the values of @hash) as an Array.
def sequences?
  @hash.values
end
alias sequences sequences? # === sequences
alias values    sequences? # === values
795
+
796
+ # ========================================================================= #
797
+ # === short_headers?
798
+ #
799
+ # The short-headers are like the headers, but if a ' ' token is found
800
+ # then the line will be truncated towards that first ' '.
801
+ #
802
+ # An example is:
803
+ #
804
+ # sp|Q91FT8|234R_IIV6 Uncharacterized protein 234R OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-234R PE=4 SV=1
805
+ #
806
+ # This will be truncated towards
807
+ #
808
+ # sp|Q91FT8|234R_IIV6
809
+ #
810
+ # This could then be used to automatically rename FASTA files, for
811
+ # instance.
812
+ # ========================================================================= #
813
# Returns the headers truncated at the first ' ' token; headers without
# a ' ' are returned as they are. For example
#
#   sp|Q91FT8|234R_IIV6 Uncharacterized protein 234R
#
# becomes
#
#   sp|Q91FT8|234R_IIV6
def short_headers?
  headers?.map { |entry|
    entry.include?(' ') ? entry.split(' ').first : entry
  }
end
821
+
822
+ # ========================================================================= #
823
+ # === set_data
824
+ #
825
+ # This is the setter-method towards @data. It is no longer allowed to
826
+ # invoke set_input_file() since as of 12.06.2020. This means that
827
+ # you have to invoke that method prior to calling this method.
828
+ # ========================================================================= #
829
# Setter for @data. The input file has to be designated via
# set_input_file() before this method is called.
#
# i - a file path, a raw FASTA String, or an Array of which the first
#     non-nil element is used. If it names an existing file, that file
#     is read; for a genbank file only the sequence lines are kept,
#     below a synthetic "> genbank file" header. Empty input falls back
#     to DEFAULT_FASTA.
def set_data(i = @input_file)
  # Ensure that even an Array can be used as input to this method.
  i = [i].flatten.compact.first.to_s.dup
  if File.exist? i.to_s # First try to read in from a file.
    if be_verbose?
      opnn; erev "Will read from the file `#{sfile(i)}#{rev}`."
    end
    i = File.readlines(i)
    if @is_a_genbank_file
      # Sequence lines look like:
      # "       61 atggggcctg caatggggcc tgcaatgggg cctgca\n"
      sequence_lines = i.select { |line|
        line.start_with?(' ') && (line.strip =~ /\d+/)
      }
      selected = sequence_lines.map { |inner_line|
        inner_line.strip.delete(' 0123456789').strip.upcase
      }
      i = ["> genbank file"]+selected
    end
  end
  if i.nil? || i.empty?
    i = DEFAULT_FASTA
    opnn; erev 'No input was provided. Thus a default FASTA '\
               'sequence will be used instead.'
  end
  i = sanitize_data(i)
  i = i.split(N) if i.is_a? String
  @data = i
end
alias set_sequence set_data # === set_sequence
859
+
860
+ # ========================================================================= #
861
+ # === set_be_verbose_and_report_the_sequence
862
+ # ========================================================================= #
863
# Enables the verbose mode and, in addition, requests that the sequence
# itself will be reported.
def set_be_verbose_and_report_the_sequence
  set_be_verbose
  @internal_hash[:report_the_sequence] = true
end
867
+
868
+ # ========================================================================= #
869
+ # === condense_the_sequence_onto_a_single_line?
870
+ # ========================================================================= #
871
# Query whether the sequence shall be shown on a single line; this
# reads the entry that the --one-line flag sets in menu().
def condense_the_sequence_onto_a_single_line?
  @internal_hash[:condense_the_sequence_onto_a_single_line]
end
874
+
875
+ # ========================================================================= #
876
+ # === return_size_sorted_hash
877
+ # ========================================================================= #
878
# Returns a new Hash whose entries are ordered by the size of their
# value, smallest first. The input Hash is not modified.
#
# i - the Hash to sort; defaults to @hash.
def return_size_sorted_hash(i = @hash)
  i.sort_by { |_key, value| value.size }.to_h
end
883
+
884
+ # ========================================================================= #
885
+ # === do_sort_by_size
886
+ #
887
+ # This method will sort the hash by size of the sequence. It has been
888
+ # inspired by the EMBOSS sizeq functionality.
889
+ #
890
+ # The output that should be generated might look like this:
891
+ #
892
+ # https://www.bioinformatics.nl/cgi-bin/emboss/help/sizeseq#input.1
893
+ #
894
+ # Invocation example:
895
+ #
896
+ # x = Bioroebe::ParseFasta.new('/Depot/j/globins.fasta'); x.do_sort_by_size
897
+ #
898
+ # ========================================================================= #
899
# Sorts @hash by the size of the sequences and prints all entries in
# that order, each below an "ID / DE / SQ" header line. The approach is
# modelled after the EMBOSS sizeseq functionality.
#
#   x = Bioroebe::ParseFasta.new('/Depot/j/globins.fasta'); x.do_sort_by_size
def do_sort_by_size
  # Sort first, by the size of the "value", aka the sequence body.
  @hash = return_size_sorted_hash(@hash)
  report = ''.dup
  @hash.each_pair { |key, sequence|
    n_residues = sequence.size.to_s
    report << '> ID '+n_residues+' AA.; DE: '+key.to_s+
              ' SQ '+n_residues+' AA'+N
    report << sequence+N+N
  }
  e report
end
alias run_sizeseq_comparison do_sort_by_size # === run_sizeseq_comparison
913
+
914
+ # ========================================================================= #
915
+ # === n_nucleotides?
916
+ # ========================================================================= #
917
# Returns the length of the first sequence in @hash; newlines are not
# counted.
def n_nucleotides?
  first_sequence = @hash.values.first
  first_sequence.delete("\n").size
end
alias return_n_aminoacids n_nucleotides? # === return_n_aminoacids
alias size?               n_nucleotides? # === size?
alias sequence_size?      n_nucleotides? # === sequence_size?
922
+
923
+ # ========================================================================= #
924
+ # === headers?
925
+ # ========================================================================= #
926
# Returns all FASTA headers (the keys of @hash) as an Array.
def headers?
  @hash.keys
end
929
+
930
+ # ========================================================================= #
931
+ # === first_key?
932
+ #
933
+ # Obtain the very first entry.
934
+ # ========================================================================= #
935
# Returns the very first header, or nil if there is none.
def first_key?
  all_headers = headers?
  all_headers.first
end
938
+
939
+ # ========================================================================= #
940
+ # === header?
941
+ #
942
+ # This variant will always return the first entry.
943
+ # ========================================================================= #
944
# Returns the first header, always as a String; an empty String is
# returned if there is no header.
def header?
  first_header = headers?.first
  first_header.to_s
end
947
+
948
+ # ========================================================================= #
949
+ # === raw_body?
950
+ # ========================================================================= #
951
+ def raw_body?
952
+ @hash.values.first
953
+ end
954
+
955
+ # ========================================================================= #
956
+ # === do_show_the_header
957
+ # ========================================================================= #
958
# Requests that the FASTA header will be shown (sets @show_the_header).
def do_show_the_header
  @show_the_header = true
end
961
+
962
+ # ========================================================================= #
963
+ # === set_input_file
964
+ #
965
+ # This method will be used to keep track of the input-file, from
966
+ # which we will read the dataset.
967
+ # ========================================================================= #
968
# Keeps track of the input file from which the dataset will be read.
#
# i - the path of the input file. If nil, the first .fa or .fasta file
#     of the current directory is used, if there is one; this is
#     reported once when running verbosely.
#
# @is_a_genbank_file is determined anew on every call (true if the file
# starts with "LOCUS"), so the state of a previously processed file
# does not leak into the next one. Only the leading bytes of the file
# are read for this check.
#
# Returns the input file that was stored in @input_file.
def set_input_file(i = nil)
  if i.nil?
    # Try to find a .fasta or .fa file in the current directory.
    file = Dir['*.{fa,fasta}'].first
    if file
      if be_verbose?
        result = 'A '.dup
        result << 'FASTA ' if file.end_with? '.fasta'
        result << 'file was found in this directory ('+sfile(file)+rev+').'
        opnn; erev result
        opnn; erev 'We will use it.'
      end
      i = file
    end
  end
  @is_a_genbank_file = false
  if i && File.file?(i)
    # File.read returns nil for an empty file, which compares as false.
    @is_a_genbank_file = (File.read(i, 'LOCUS'.size) == 'LOCUS')
  end
  @input_file = i
end
alias set_input_files set_input_file # === set_input_files
1005
+
1006
+ # ========================================================================= #
1007
+ # === save_the_file?
1008
+ # ========================================================================= #
1009
# Query whether the file shall be saved; this reads the entry that the
# --save-file flag sets in menu().
def save_the_file?
  @internal_hash[:save_the_file]
end
1012
+
1013
+ # ========================================================================= #
1014
+ # === overwrite_the_original_file?
1015
+ # ========================================================================= #
1016
# Query whether the original file may be overwritten; this reads the
# entry that the --overwrite flag sets in menu().
def overwrite_the_original_file?
  @internal_hash[:overwrite_the_original_file]
end
1019
+
1020
+ # ========================================================================= #
1021
+ # === split_into_proper_sections
1022
+ #
1023
+ # Split up into the fasta identifier, and the content.
1024
+ # ========================================================================= #
1025
# Splits @data into FASTA identifiers and their content, and stores the
# result in @hash (identifier => sequence body).
#
# When running verbosely, a hint is shown if the dataset contains no
# ">" character at all. Lines that precede the first identifier are
# collected under the key 'standard'. The '_' character is removed from
# every sequence line, in place.
def split_into_proper_sections
  if be_verbose? && !@data.to_s.include?('>')
    erev 'No ">" character was found in this dataset.'
    erev 'It is recommended to always have a > identifier '\
         'for the'
    erev 'FASTA format (such as in a .fasta or a .fa file).'
  end
  @data.each do |line|
    # === Handle the leading > FASTA identifier first
    if line.start_with? '>'
      @current_key = line[1..-1].chomp # Select all but the first character.
      @hash[@current_key] = ''.dup
      next
    end
    line.delete!('_')
    unless @current_key
      @current_key = 'standard'
      @hash[@current_key] = ''.dup
    end
    # === Retain the newlines
    #
    # The newlines are deliberately kept, so that the dataset can simply
    # be saved again.
    @hash[@current_key] << line
  end
end
1057
+
1058
# ========================================================================= #
# === save_into_a_fasta_file
#
# Joins @data with newlines and hands the result to write_what_into(),
# using the fixed file name 'standard.fasta'. The absolute path of that
# file is returned.
#
# The argument may be a boolean or the Symbol :be_verbose (treated as
# true); when it is truthy the target file is announced via erev.
#
# If @data is not set, a notice is emitted instead and nothing is
# written.
# ========================================================================= #
def save_into_a_fasta_file(
    be_verbose = be_verbose?
  )
  be_verbose = true if be_verbose == :be_verbose
  unless @data
    opnn
    return erev('No @data variable exists.')
  end
  into = 'standard.fasta'
  what = @data.join("\n")
  if be_verbose
    erev "Saving into #{sfile(into)}#{rev}."
  end
  write_what_into(what, into)
  File.absolute_path(into) # Return the file that we saved into.
end; alias do_save_the_file save_into_a_fasta_file # === do_save_the_file
1078
+
1079
# ========================================================================= #
# === add_length_information_to_the_header
#
# Appends " length=<sequence_size?>;" to the stripped header, puts
# "> " in front of it and stores this new header line, followed by
# raw_body?, into a file.
#
# The target is 'new_fasta_file.fasta', unless
# overwrite_the_original_file? is truthy, in which case @input_file is
# used. Nothing is written when no target is available.
# ========================================================================= #
def add_length_information_to_the_header
  extended_header = "#{header?.strip} length=#{sequence_size?};"
  # ======================================================================= #
  # Decide where the result shall be stored.
  # ======================================================================= #
  into = overwrite_the_original_file? ? @input_file : 'new_fasta_file.fasta'
  what = '> ' + extended_header + "\n"
  what << raw_body?
  return unless into
  erev "Storing into `#{sfile(into)}#{rev}`."
  write_what_into(what, into)
end
1100
+
1101
# ========================================================================= #
# === simplify_header
#
# This method can be called to simplify the header. It will save into
# a .fasta file at once.
#
# Simplification rules, in this order:
#
#   - If the header contains a "[...]" section, only the content of
#     that section is kept, e. g. "[Pan troglodytes]" becomes
#     "> Pan troglodytes".
#   - Otherwise, if the header contains a ",", only the part before
#     the first "," is kept.
#
# The file is only written if the resulting header starts with ">" or
# if it contains the word "Human" (in which case the header becomes
# "> Human"). In all other cases a notice is emitted and nothing is
# stored.
# NOTE(review): a comma-shortened header that does not start with ">"
# is therefore not written either - confirm that this is intended.
#
# The header line always ends with exactly one newline before the body
# (raw_body?) is appended, and the object returned by header? is never
# modified.
#
# The target is 'new_fasta_file.fasta', unless
# overwrite_the_original_file? is truthy, in which case @input_file is
# used.
# ========================================================================= #
def simplify_header
  header = header?.to_s
  # ======================================================================= #
  # Check for [] first, because if there is a [] section in the FASTA
  # header then we can simplify at once. The regex may fail to match
  # even when both characters are present (e. g. "] ... ["), hence the
  # result is tested rather than the mere presence of the characters.
  # ======================================================================= #
  bracketed = header.strip[/\[.+\]/]
  if bracketed
    header = '> ' + bracketed.delete('[]')
  elsif header.include?(',')
    # Exclusive range: a leading "," yields '' rather than the whole header.
    header = header[0...header.index(',')].strip
  end
  # ======================================================================= #
  # Next, designate where to store this file.
  # ======================================================================= #
  into = overwrite_the_original_file? ? @input_file : 'new_fasta_file.fasta'
  what = nil
  if header.start_with?('>')
    what = header.chomp + "\n" # A new String; header? stays untouched.
  elsif header.include?('Human')
    what = "> Human\n".dup
  else
    erev "Unsure what to do: #{steelblue(header)}"
  end
  return unless what && into
  what << raw_body?
  erev 'Storing into `'+sfile(into)+rev+'`.'
  write_what_into(what, into)
end
1149
+
1150
# ========================================================================= #
# === sequence
#
# This method will return the sequence, without any newlines. It is also
# called the "body" of a FASTA file.
#
# Only the first value of @hash is considered. Note that a trailing N
# (presumably the newline constant - confirm) is removed from that
# stored value in place, before the value is passed on to
# no_newlines().
# ========================================================================= #
def sequence
  first_entry = @hash.values.first
  if first_entry && first_entry.end_with?(N)
    first_entry.chomp!
  end
  no_newlines(first_entry)
end
alias fasta_sequence      sequence # === fasta_sequence
alias sequence?           sequence # === sequence?
alias body?               sequence # === body?
alias body                sequence # === body
alias naseq               sequence # === naseq
alias nucleotide_sequence sequence # === nucleotide_sequence
alias return_sequence     sequence # === return_sequence
alias content?            sequence # === content?
1168
+
1169
# ========================================================================= #
# === save
#
# This method will save our FASTA file: @data, joined via newlines, is
# written into @input_file. If @input_file is nil then the generic
# file name 'foobar.fasta' is registered through set_input_file()
# first.
#
# The file name that was written to is returned.
# ========================================================================= #
def save
  if @input_file.nil?
    erev "The generic file #{sfile('foobar.fasta')}#{rev} "\
         "will be used."
    set_input_file('foobar.fasta')
  end
  target  = @input_file
  content = @data.join("\n")
  erev "Storing into #{sfile(target)}#{rev}."
  write_what_into(content, target)
  target
end
1186
+
1187
# ========================================================================= #
# === []
#
# This is a simpler query-interface for obtaining the DNA/RNA sequence
# of the FASTA file (or aminoacid sequence, if we have a protein at
# hand here).
#
# The lookup is delegated to whatever sequences? returns. According to
# the original note sequences? works on @hash; this is fine because
# ruby Hashes retain their insertion order.
# ========================================================================= #
def [](i)
  all_sequences = sequences?
  all_sequences[i]
end
1201
+
1202
# ========================================================================= #
# === Bioroebe::ParseFasta[]
#
# Class-level shortcut: instantiate the class with the given input and
# return the result of calling sequences? on that new instance.
# ========================================================================= #
def self.[](i)
  new(i).sequences?
end
1209
+
1210
# ========================================================================= #
# === type?
#
# Returns one of three Symbols:
#
#   :protein    if is_the_sequence_a_polypeptide? is truthy
#   :dna_or_rna if is_this_sequence_a_polynucleotide_sequence? is truthy
#   :unknown    otherwise
#
# The polypeptide check is performed first.
# ========================================================================= #
def type?
  return :protein    if is_the_sequence_a_polypeptide?
  return :dna_or_rna if is_this_sequence_a_polynucleotide_sequence?
  :unknown
end
1222
+
1223
# ========================================================================= #
# === is_the_sequence_a_polypeptide?
#
# This method can be used to determine whether a given input sequence
# is a polypeptide (aka a protein) or whether it is not.
#
# If this sequence is a polypeptide then this method will return true.
# Otherwise false will be returned. A nil or empty input yields false.
#
# The check is a heuristic: only the first 120 characters are looked
# at, and the sequence counts as a polypeptide as soon as one of them
# is an upcased letter other than A, C, G, N, T or U. The comparison is
# case sensitive, so a purely downcased sequence yields false.
# ========================================================================= #
def is_the_sequence_a_polypeptide?(
    i = main_sequence?
  )
  return false if i.nil? # No sequence available, e. g. an empty @hash.
  # ======================================================================= #
  # Look at the first 120 positions to determine whether this is a protein
  # or a nucleotide sequence.
  # ======================================================================= #
  subsequence = i.to_s[0, 120]
  # ======================================================================= #
  # String#count tallies all characters that are part of the given set.
  # ======================================================================= #
  subsequence.count('BDEFHIJKLMOPQRSVWXYZ') > 0
end
alias is_protein?   is_the_sequence_a_polypeptide? # === is_protein?
alias is_a_protein? is_the_sequence_a_polypeptide? # === is_a_protein?
1266
+
1267
# ========================================================================= #
# === main_sequence?
#
# This will always return the first value stored in @hash, or nil if
# @hash is empty.
# ========================================================================= #
def main_sequence?
  @hash.each_value.first
end
1275
+
1276
# ========================================================================= #
# === gc_content?
#
# Passes main_sequence? on to Bioroebe.gc_content() and returns the
# result converted via .to_f, so the return value is always a Float.
# ========================================================================= #
def gc_content?
  result = ::Bioroebe.gc_content(main_sequence?)
  result.to_f # Must be a float.
end
alias gc_content gc_content? # === gc_content
1282
+
1283
# ========================================================================= #
# === sequence_object
#
# This method will return a new Bioroebe::Sequence instance that was
# created from main_sequence?.
#
# Usage example:
#
#   x = Bioroebe.parse_fasta 'ls_orchid.fasta'
#   y = x.sequence_object # y is now an instance of Bioroebe::Sequence
#
# ========================================================================= #
def sequence_object
  first_sequence = main_sequence?
  ::Bioroebe::Sequence.new(first_sequence)
end
1297
+
1298
# ========================================================================= #
# === sanitize_the_file?
#
# Query-method: returns whatever is stored in @internal_hash under the
# key :sanitize_the_file. run() consults it in order to decide whether
# the file shall be sanitized.
# ========================================================================= #
def sanitize_the_file?
  setting = @internal_hash[:sanitize_the_file]
  return setting
end
1304
+
1305
+ # ========================================================================= #
1306
+ # === show_help (help tag)
1307
+ #
1308
+ # Shows an overview of the commandline flags, together with a short
1309
+ # description each: one eparse() call per line, framed by two e() calls.
1310
+ #
1311
+ # Invocation example:
1312
+ #
1313
+ # pfasta --help
1314
+ #
1315
+ # ========================================================================= #
1316
+ def show_help
1317
+ e
1318
+ eparse ' --size'
1319
+ eparse ' --also-show-the-sequence'
1320
+ eparse ' --header # show the header as well (normally the '\
1321
+ 'header is not shown)'
1322
+ eparse ' --limit=1000 # limit to show only the first 1000 '\
1323
+ 'nucleotides; use'
1324
+ eparse ' # any number that you need here'
1325
+ eparse ' --one-line # show the sequence on one line only, '\
1326
+ 'e. g. all newlines'
1327
+ eparse ' # were removed'
1328
+ eparse ' --toprotein # show the protein sequence as well '\
1329
+ '(assumes DNA or RNA'
1330
+ eparse ' # .fasta file)'
1331
+ eparse ' --convert # alias to the above ^^^'
1332
+ eparse ' --translate # alias to the above ^^^'
1333
+ eparse ' --sanitize-the-file # delete all " " characters '\
1334
+ 'and upcase the content, of a'
1335
+ eparse ' # .fasta file'
1336
+ e
1337
+ end
1338
+
1339
# ========================================================================= #
# === do_sanitize_the_file_then_exit
#
# Sanitizes the file given as first non-hyphened commandline argument
# and then exits: all " " characters are removed from every line, and
# every line is upcased. The file is overwritten with the result.
#
# If no such argument was given, or if no file exists at that
# location, nothing is written. exit is called in every case.
# ========================================================================= #
def do_sanitize_the_file_then_exit
  first = non_hyphened_commandline_arguments?.first
  # ======================================================================= #
  # File.exist?(nil) would raise a TypeError, thus check for nil first.
  # ======================================================================= #
  if first && File.exist?(first)
    # ===================================================================== #
    # delete() and upcase() return new Strings, so frozen entries are
    # not a problem here.
    # ===================================================================== #
    dataset = default_readlines(first).map { |entry|
      entry.delete(' ').upcase
    }
    opne 'Saving the sanitized dataset into '\
         'the file '+sfile(first)+rev+'.'
    write_what_into(dataset.join, first)
  end
  exit
end
1362
+
1363
# ========================================================================= #
# === run (run tag)
#
# Entry point. The steps are, in this order:
#
#   1. menu
#   2. do_sanitize_the_file_then_exit, if sanitize_the_file? is truthy
#   3. do_process_the_commandline_arguments_that_are_files
#   4. do_save_the_file, if save_the_file? is truthy
# ========================================================================= #
def run
  menu
  if sanitize_the_file?
    do_sanitize_the_file_then_exit
  end
  do_process_the_commandline_arguments_that_are_files
  if save_the_file?
    do_save_the_file
  end
end
1372
+
1373
+ end
1374
+
1375
+ Fasta = ParseFasta # Fasta is a second constant that refers to the very same class as ParseFasta.
1376
+
1377
# =========================================================================== #
# === Bioroebe.parse_fasta_quietly
#
# Same as Bioroebe.parse_fasta(), but the block that is passed along
# always yields :be_quiet.
#
# Both arguments are forwarded unchanged; use_colours defaults to true.
# =========================================================================== #
def self.parse_fasta_quietly(i, use_colours = true)
  verbosity = :be_quiet
  ::Bioroebe.parse_fasta(i, use_colours) { verbosity }
end
1387
+
1388
# =========================================================================== #
# === Bioroebe.return_fasta_entry_with_the_highest_gc_content
#
# The first argument should be a locally existing FASTA file that
# contains different sequences.
#
# The content of that file is parsed quietly; afterwards every sequence
# is replaced by the Float value of Bioroebe.gc_content(). The
# [identifier, gc_content] pair with the highest value is returned.
#
# If the file does not exist then a notice is emitted instead.
#
# Usage example:
#
#   x = Bioroebe.return_fasta_entry_with_the_highest_gc_content('/rosalind_gc.txt')
#
# =========================================================================== #
def self.return_fasta_entry_with_the_highest_gc_content(this_fasta_file)
  unless File.exist?(this_fasta_file)
    return erev("No file exists at #{sfile(this_fasta_file)}#{rev}.")
  end
  parsed = parse_fasta(File.read(this_fasta_file)) { :be_quiet }
  gc_per_entry = parsed.hash?
  gc_per_entry.transform_values! { |sequence|
    ::Bioroebe.gc_content(sequence).to_f
  }
  gc_per_entry.max_by { |_identifier, gc_value| gc_value }
end
1412
+
1413
# =========================================================================== #
# === Bioroebe.sizeseq
#
# This method will "size-sequence compare", typically on a .fasta file.
#
# If an Array is given then only its first element is used. The input
# is parsed quietly, and do_sort_by_size is then called on the parser.
# =========================================================================== #
def self.sizeseq(i)
  i = i.first if i.is_a? Array
  parser = Bioroebe.parse_fasta(i) { :be_quiet }
  parser.do_sort_by_size
end
1425
+
1426
# =========================================================================== #
# === Bioroebe.return_sizeseq
#
# This is as Bioroebe.sizeseq(), but it will just return the result,
# rather than output it.
#
# If an Array is given then only its first element is used. For every
# entry of return_size_sorted_hash a header line that mentions the
# size of the sequence and its key is built, followed by the sequence
# itself and two N.
# =========================================================================== #
def self.return_sizeseq(i)
  i = i.first if i.is_a? Array
  parser = Bioroebe.parse_fasta(i) { :be_quiet }
  parser.return_size_sorted_hash.each_with_object(''.dup) { |(key, sequence), result|
    size = sequence.size.to_s
    result << '> ID ' + size + ' AA.; DE: ' + key.to_s + ' SQ ' + size + ' AA' + N
    result << sequence + N + N
  }
end
1446
+
1447
# =========================================================================== #
# === Bioroebe.genbank_to_fasta
#
# This method will convert from a genbank file, to a .fasta file.
#
# If an Array is given as first argument then only its first element
# is used. When no file exists at that location, a parser is created
# with :do_not_run_yet and then set_data() (which, per the original
# note, will use the default file) as well as
# split_into_proper_sections() are called on it.
#
# The second argument may be :be_quiet, which is turned into false
# before it is handed to save_into_a_fasta_file(). The return value of
# that method - the path of the stored file - is returned.
#
# Invocation example:
#
#   Bioroebe.genbank_to_fasta('/home/x/DATA/PROGRAMMING_LANGUAGES/RUBY/src/bioroebe/lib/bioroebe/data/genbank/sample_file.genbank')
#
# =========================================================================== #
def self.genbank_to_fasta(
    this_file,
    be_verbose = :be_verbose
  )
  be_verbose = false if be_verbose == :be_quiet
  this_file = this_file.first if this_file.is_a? Array
  if File.exist? this_file
    parser = Bioroebe::ParseFasta.new(this_file) { :be_quiet }
  else
    parser = Bioroebe::ParseFasta.new(:do_not_run_yet) { :be_quiet }
    parser.set_data # This will use the default file.
    parser.split_into_proper_sections
  end
  parser.save_into_a_fasta_file(be_verbose)
end
1478
+
1479
# =========================================================================== #
# === Bioroebe.parse_fasta_file
#
# Creates a new ParseFasta instance for the given input (ARGV by
# default). The block handed to ParseFasta.new yields a Hash with the
# given use_colours value and with be_verbose set to false.
# =========================================================================== #
def self.parse_fasta_file(
    i           = ARGV,
    use_colours = true
  )
  options = { use_colours: use_colours, be_verbose: false }
  ParseFasta.new(i) { options }
end; self.instance_eval { alias fasta_file parse_fasta_file } # === Bioroebe.fasta_file
1492
+
1493
# =========================================================================== #
# === Bioroebe.parse_fasta
#
# Easier reader-method for .fasta files.
#
# The second argument determines whether we will use colours or whether
# we will not; it defaults to true.
#
# If a block is given then the value it yields is passed along under
# the key :be_verbose, e. g. { :be_quiet }.
#
# Invocation examples:
#
#   x = Bioroebe.parse_fasta('/rosalind_gc.txt')
#   hash = Bioroebe.parse_fasta('/rosalind_gc.txt').hash?
#
# =========================================================================== #
def self.parse_fasta(
    i,
    use_colours = true
  )
  options = { use_colours: use_colours }
  options[:be_verbose] = yield if block_given?
  ::Bioroebe::ParseFasta.new(i) { options }
end; self.instance_eval { alias fasta parse_fasta } # === Bioroebe.fasta
1523
+
1524
+ end
14
1525
 
15
1526
  if __FILE__ == $PROGRAM_NAME
16
1527
  Bioroebe::ParseFasta.new(ARGV) { :sizeseq }
@@ -24,4 +1535,4 @@ end # corefasta globins.fasta
24
1535
  # pfasta /GC.txt
25
1536
  # pfasta 013521.3_289_aa.fasta --also-show-the-sequence
26
1537
  # pfasta $RSRC/bioroebe/lib/bioroebe/data/GFP_mutant_3_coding_sequence.fasta --also-show-the-sequence
27
- # corefasta $J/globins.fasta
1538
+ # corefasta $J/globins.fasta