bioroebe 0.10.80 → 0.12.12

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of bioroebe might be problematic. Click here for more details.

Files changed (242) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3612 -2781
  3. data/bin/bioroebe +7 -1
  4. data/bin/bioroebe_hash +7 -0
  5. data/bin/codon_to_aminoacid +1 -0
  6. data/bioroebe.gemspec +3 -3
  7. data/doc/README.gen +3612 -2742
  8. data/doc/quality_control/commandline_applications.md +3 -3
  9. data/doc/todo/bioroebe_java_todo.md +22 -0
  10. data/doc/todo/bioroebe_todo.md +2059 -2615
  11. data/lib/bioroebe/aminoacids/aminoacid_substitution.rb +1 -9
  12. data/lib/bioroebe/aminoacids/codon_percentage.rb +1 -9
  13. data/lib/bioroebe/aminoacids/deduce_aminoacid_sequence.rb +1 -9
  14. data/lib/bioroebe/aminoacids/display_aminoacid_table.rb +1 -0
  15. data/lib/bioroebe/aminoacids/show_hydrophobicity.rb +1 -6
  16. data/lib/bioroebe/base/colours_for_base/colours_for_base.rb +18 -8
  17. data/lib/bioroebe/base/commandline_application/commandline_arguments.rb +15 -11
  18. data/lib/bioroebe/base/commandline_application/misc.rb +66 -49
  19. data/lib/bioroebe/base/commandline_application/opn.rb +8 -8
  20. data/lib/bioroebe/base/commandline_application/reset.rb +3 -2
  21. data/lib/bioroebe/base/misc.rb +35 -0
  22. data/lib/bioroebe/base/prototype/misc.rb +11 -1
  23. data/lib/bioroebe/codon_tables/frequencies/parse_frequency_table.rb +2 -10
  24. data/lib/bioroebe/codons/codons.rb +1 -1
  25. data/lib/bioroebe/codons/convert_this_codon_to_that_aminoacid.rb +192 -58
  26. data/lib/bioroebe/codons/possible_codons_for_this_aminoacid.rb +1 -9
  27. data/lib/bioroebe/codons/show_codon_tables.rb +6 -2
  28. data/lib/bioroebe/codons/show_codon_usage.rb +15 -4
  29. data/lib/bioroebe/colours/rev.rb +4 -1
  30. data/lib/bioroebe/constants/aminoacids_and_proteins.rb +1 -0
  31. data/lib/bioroebe/constants/database_constants.rb +1 -1
  32. data/lib/bioroebe/constants/files_and_directories.rb +31 -4
  33. data/lib/bioroebe/constants/misc.rb +20 -0
  34. data/lib/bioroebe/conversions/dna_to_aminoacid_sequence.rb +58 -24
  35. data/lib/bioroebe/count/count_amount_of_aminoacids.rb +3 -2
  36. data/lib/bioroebe/count/count_amount_of_nucleotides.rb +3 -0
  37. data/lib/bioroebe/crystal/README.md +2 -0
  38. data/lib/bioroebe/crystal/to_rna.cr +19 -0
  39. data/lib/bioroebe/data/README.md +11 -8
  40. data/lib/bioroebe/data/electron_microscopy/pos_example.pos +396 -0
  41. data/lib/bioroebe/data/electron_microscopy/test_particles.star +36 -0
  42. data/lib/bioroebe/data/fasta/human/Homo_sapiens_hemoglobin_subunit_alpha_HBB_mRNA.fasta +9 -0
  43. data/lib/bioroebe/data/fasta/human/Homo_sapiens_hemoglobin_subunit_beta_HBB_mRNA.fasta +8 -0
  44. data/lib/bioroebe/data/fasta/human/README.md +2 -0
  45. data/lib/bioroebe/electron_microscopy/coordinate_analyzer.rb +15 -18
  46. data/lib/bioroebe/{fasta_and_fastq/parse_fasta/run.rb → electron_microscopy/electron_microscopy_module.rb} +16 -8
  47. data/lib/bioroebe/electron_microscopy/fix_pos_file.rb +1 -9
  48. data/lib/bioroebe/electron_microscopy/flipy.rb +83 -0
  49. data/lib/bioroebe/electron_microscopy/parse_coordinates.rb +2 -10
  50. data/lib/bioroebe/electron_microscopy/read_file_xmd.rb +1 -9
  51. data/lib/bioroebe/electron_microscopy/simple_star_file_generator.rb +4 -9
  52. data/lib/bioroebe/enzymes/has_this_restriction_enzyme.rb +10 -3
  53. data/lib/bioroebe/enzymes/restriction_enzyme.rb +23 -1
  54. data/lib/bioroebe/enzymes/restriction_enzymes/statistics.rb +65 -0
  55. data/lib/bioroebe/fasta_and_fastq/autocorrect_the_name_of_this_fasta_file.rb +1 -9
  56. data/lib/bioroebe/fasta_and_fastq/compact_fasta_file/compact_fasta_file.rb +7 -9
  57. data/lib/bioroebe/fasta_and_fastq/fasta_defline/fasta_defline.rb +1 -5
  58. data/lib/bioroebe/fasta_and_fastq/fasta_to_yaml/fasta_to_yaml.rb +81 -0
  59. data/lib/bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb +1465 -7
  60. data/lib/bioroebe/fasta_and_fastq/return_fasta_subsection_of_this_file.rb +11 -2
  61. data/lib/bioroebe/fasta_and_fastq/show_fasta_headers.rb +27 -12
  62. data/lib/bioroebe/fasta_and_fastq/simplify_fasta_header/simplify_fasta_header.rb +1 -5
  63. data/lib/bioroebe/fasta_and_fastq/split_this_fasta_file_into_chromosomes/constants.rb +0 -5
  64. data/lib/bioroebe/genome/README.md +4 -0
  65. data/lib/bioroebe/genome/genome.rb +67 -0
  66. data/lib/bioroebe/genomes/genome_pattern.rb +3 -9
  67. data/lib/bioroebe/gui/gtk +1 -0
  68. data/lib/bioroebe/gui/gtk3/alignment/alignment.rb +73 -128
  69. data/lib/bioroebe/gui/gtk3/controller/controller.rb +45 -27
  70. data/lib/bioroebe/gui/gtk3/dna_to_aminoacid_widget/dna_to_aminoacid_widget.rb +76 -50
  71. data/lib/bioroebe/gui/gtk3/fasta_table_widget/fasta_table_widget.rb +99 -21
  72. data/lib/bioroebe/gui/gtk3/hamming_distance/hamming_distance.rb +42 -28
  73. data/lib/bioroebe/gui/gtk3/nucleotide_analyser/nucleotide_analyser.rb +119 -71
  74. data/lib/bioroebe/gui/gtk3/protein_to_DNA/protein_to_DNA.rb +18 -18
  75. data/lib/bioroebe/gui/gtk3/random_sequence/random_sequence.rb +19 -11
  76. data/lib/bioroebe/gui/jruby/alignment/alignment.rb +165 -0
  77. data/lib/bioroebe/gui/libui/alignment/alignment.rb +3 -1
  78. data/lib/bioroebe/gui/libui/controller/controller.rb +116 -0
  79. data/lib/bioroebe/gui/libui/random_sequence/random_sequence.rb +18 -2
  80. data/lib/bioroebe/gui/libui/show_codon_table/show_codon_table.rb +2 -0
  81. data/lib/bioroebe/gui/libui/three_to_one/three_to_one.rb +8 -6
  82. data/lib/bioroebe/gui/shared_code/alignment/alignment_module.rb +102 -0
  83. data/lib/bioroebe/gui/shared_code/levensthein_distance/levensthein_distance_module.rb +18 -16
  84. data/lib/bioroebe/gui/shared_code/protein_to_DNA/protein_to_DNA_module.rb +14 -14
  85. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne$1.class +0 -0
  86. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne$CloseListener.class +0 -0
  87. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne.class +0 -0
  88. data/lib/bioroebe/gui/swing/three_to_one/ThreeToOne.java +141 -0
  89. data/lib/bioroebe/images/FORWARD_PRIMER.png +0 -0
  90. data/lib/bioroebe/images/REVERSE_PRIMER.png +0 -0
  91. data/lib/bioroebe/java/README.md +4 -0
  92. data/lib/bioroebe/java/bioroebe/Sequence.java +25 -1
  93. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/Base.class +0 -0
  94. data/lib/bioroebe/java/bioroebe/{Base.java → src/main/java/bioroebe/Base.java} +15 -2
  95. data/lib/bioroebe/java/bioroebe/{BisulfiteTreatment.class → src/main/java/bioroebe/BisulfiteTreatment.class} +0 -0
  96. data/lib/bioroebe/java/bioroebe/{Codons.class → src/main/java/bioroebe/Codons.class} +0 -0
  97. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/Codons.java +34 -0
  98. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/Commandline.class +0 -0
  99. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/Commandline.java +101 -0
  100. data/lib/bioroebe/java/bioroebe/{GenerateRandomDnaSequence.class → src/main/java/bioroebe/GenerateRandomDnaSequence.class} +0 -0
  101. data/lib/bioroebe/java/bioroebe/{GenerateRandomDnaSequence.java → src/main/java/bioroebe/GenerateRandomDnaSequence.java} +8 -2
  102. data/lib/bioroebe/java/bioroebe/{IsPalindrome.class → src/main/java/bioroebe/IsPalindrome.class} +0 -0
  103. data/lib/bioroebe/java/bioroebe/{IsPalindrome.java → src/main/java/bioroebe/IsPalindrome.java} +5 -1
  104. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/PartnerNucleotide.class +0 -0
  105. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/PartnerNucleotide.java +56 -0
  106. data/lib/bioroebe/java/bioroebe/{RemoveFile.java → src/main/java/bioroebe/RemoveFile.java} +10 -4
  107. data/lib/bioroebe/java/bioroebe/{RemoveNumbers.class → src/main/java/bioroebe/RemoveNumbers.class} +0 -0
  108. data/lib/bioroebe/java/bioroebe/{RemoveNumbers.java → src/main/java/bioroebe/RemoveNumbers.java} +1 -0
  109. data/lib/bioroebe/java/bioroebe/{ToCamelcase.class → src/main/java/bioroebe/ToCamelcase.class} +0 -0
  110. data/lib/bioroebe/java/bioroebe/{ToCamelcase.java → src/main/java/bioroebe/ToCamelcase.java} +3 -3
  111. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/ToRNA.class +0 -0
  112. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/ToRNA.java +42 -0
  113. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/toplevel_methods/BaseComposition.class +0 -0
  114. data/lib/bioroebe/java/bioroebe/src/main/java/bioroebe/toplevel_methods/BaseComposition.java +75 -0
  115. data/lib/bioroebe/misc/ruler.rb +11 -2
  116. data/lib/bioroebe/nucleotides/most_likely_nucleotide_sequence_for_this_aminoacid_sequence.rb +1 -9
  117. data/lib/bioroebe/nucleotides/show_nucleotide_sequence.rb +7 -7
  118. data/lib/bioroebe/parsers/genbank_parser.rb +347 -26
  119. data/lib/bioroebe/parsers/gff.rb +1 -9
  120. data/lib/bioroebe/patterns/scan_for_repeat.rb +1 -5
  121. data/lib/bioroebe/pdb/fetch_fasta_sequence_from_pdb.rb +1 -9
  122. data/lib/bioroebe/pdb/parse_mmCIF_file.rb +1 -9
  123. data/lib/bioroebe/pdb/parse_pdb_file.rb +4 -10
  124. data/lib/bioroebe/project/project.rb +1 -1
  125. data/lib/bioroebe/python/README.md +1 -0
  126. data/lib/bioroebe/python/__pycache__/mymodule.cpython-39.pyc +0 -0
  127. data/lib/bioroebe/python/gui/gtk3/all_in_one.css +4 -0
  128. data/lib/bioroebe/python/gui/gtk3/all_in_one.py +59 -0
  129. data/lib/bioroebe/python/gui/gtk3/widget1.py +20 -0
  130. data/lib/bioroebe/python/gui/tkinter/all_in_one.py +91 -0
  131. data/lib/bioroebe/python/mymodule.py +8 -0
  132. data/lib/bioroebe/python/protein_to_dna.py +33 -0
  133. data/lib/bioroebe/python/shell/shell.py +19 -0
  134. data/lib/bioroebe/python/to_rna.py +14 -0
  135. data/lib/bioroebe/python/toplevel_methods/esystem.py +12 -0
  136. data/lib/bioroebe/python/toplevel_methods/open_in_browser.py +20 -0
  137. data/lib/bioroebe/python/toplevel_methods/palindromes.py +42 -0
  138. data/lib/bioroebe/python/toplevel_methods/rds.py +13 -0
  139. data/lib/bioroebe/python/toplevel_methods/shuffleseq.py +23 -0
  140. data/lib/bioroebe/python/toplevel_methods/three_delimiter.py +37 -0
  141. data/lib/bioroebe/python/toplevel_methods/time_and_date.py +43 -0
  142. data/lib/bioroebe/python/toplevel_methods/to_camelcase.py +21 -0
  143. data/lib/bioroebe/requires/require_the_bioroebe_project.rb +3 -1
  144. data/lib/bioroebe/sequence/alignment.rb +14 -4
  145. data/lib/bioroebe/sequence/dna.rb +1 -0
  146. data/lib/bioroebe/sequence/nucleotide_module/nucleotide_module.rb +28 -25
  147. data/lib/bioroebe/sequence/protein.rb +105 -3
  148. data/lib/bioroebe/sequence/sequence.rb +87 -21
  149. data/lib/bioroebe/shell/menu.rb +3829 -3714
  150. data/lib/bioroebe/shell/misc.rb +59 -4307
  151. data/lib/bioroebe/shell/readline/readline.rb +1 -1
  152. data/lib/bioroebe/shell/shell.rb +11255 -28
  153. data/lib/bioroebe/siRNA/siRNA.rb +81 -1
  154. data/lib/bioroebe/string_matching/find_longest_substring.rb +3 -2
  155. data/lib/bioroebe/string_matching/hamming_distance.rb +1 -9
  156. data/lib/bioroebe/taxonomy/class_methods.rb +3 -8
  157. data/lib/bioroebe/taxonomy/constants.rb +4 -3
  158. data/lib/bioroebe/taxonomy/edit.rb +2 -1
  159. data/lib/bioroebe/taxonomy/help/help.rb +10 -10
  160. data/lib/bioroebe/taxonomy/help/helpline.rb +2 -2
  161. data/lib/bioroebe/taxonomy/info/check_available.rb +15 -9
  162. data/lib/bioroebe/taxonomy/info/info.rb +18 -11
  163. data/lib/bioroebe/taxonomy/info/is_dna.rb +46 -36
  164. data/lib/bioroebe/taxonomy/interactive.rb +140 -104
  165. data/lib/bioroebe/taxonomy/menu.rb +27 -18
  166. data/lib/bioroebe/taxonomy/parse_fasta.rb +3 -1
  167. data/lib/bioroebe/taxonomy/shared.rb +1 -0
  168. data/lib/bioroebe/taxonomy/taxonomy.rb +1 -0
  169. data/lib/bioroebe/toplevel_methods/aminoacids_and_proteins.rb +31 -24
  170. data/lib/bioroebe/toplevel_methods/colourize_related_methods.rb +164 -0
  171. data/lib/bioroebe/toplevel_methods/databases.rb +1 -1
  172. data/lib/bioroebe/toplevel_methods/digest.rb +18 -8
  173. data/lib/bioroebe/toplevel_methods/fasta_and_fastq.rb +107 -63
  174. data/lib/bioroebe/toplevel_methods/file_and_directory_related_actions.rb +14 -2
  175. data/lib/bioroebe/toplevel_methods/frequencies.rb +8 -1
  176. data/lib/bioroebe/toplevel_methods/misc.rb +142 -12
  177. data/lib/bioroebe/toplevel_methods/nucleotides.rb +118 -46
  178. data/lib/bioroebe/toplevel_methods/open_in_browser.rb +2 -0
  179. data/lib/bioroebe/toplevel_methods/palindromes.rb +1 -2
  180. data/lib/bioroebe/toplevel_methods/taxonomy.rb +2 -2
  181. data/lib/bioroebe/toplevel_methods/to_camelcase.rb +5 -0
  182. data/lib/bioroebe/utility_scripts/align_open_reading_frames.rb +1 -9
  183. data/lib/bioroebe/utility_scripts/check_for_mismatches/check_for_mismatches.rb +1 -9
  184. data/lib/bioroebe/utility_scripts/compacter.rb +1 -9
  185. data/lib/bioroebe/utility_scripts/compseq/compseq.rb +1 -9
  186. data/lib/bioroebe/utility_scripts/consensus_sequence.rb +6 -6
  187. data/lib/bioroebe/utility_scripts/create_batch_entrez_file.rb +1 -9
  188. data/lib/bioroebe/utility_scripts/dot_alignment.rb +1 -9
  189. data/lib/bioroebe/utility_scripts/move_file_to_its_correct_location.rb +1 -4
  190. data/lib/bioroebe/utility_scripts/parse_taxonomy.rb +2 -2
  191. data/lib/bioroebe/utility_scripts/showorf/constants.rb +0 -5
  192. data/lib/bioroebe/utility_scripts/showorf/reset.rb +1 -4
  193. data/lib/bioroebe/version/version.rb +2 -2
  194. data/lib/bioroebe/www/embeddable_interface.rb +103 -54
  195. data/lib/bioroebe/www/sinatra/sinatra.rb +186 -70
  196. data/lib/bioroebe/yaml/aminoacids/amino_acids_long_name_to_one_letter.yml +2 -2
  197. data/lib/bioroebe/yaml/configuration/browser.yml +1 -1
  198. data/lib/bioroebe/yaml/configuration/temp_dir.yml +1 -1
  199. data/lib/bioroebe/yaml/genomes/README.md +3 -4
  200. data/lib/bioroebe/yaml/restriction_enzymes/restriction_enzymes.yml +27 -27
  201. metadata +81 -64
  202. data/doc/setup.rb +0 -1655
  203. data/lib/bioroebe/fasta_and_fastq/parse_fasta/constants.rb +0 -50
  204. data/lib/bioroebe/fasta_and_fastq/parse_fasta/initialize.rb +0 -86
  205. data/lib/bioroebe/fasta_and_fastq/parse_fasta/menu.rb +0 -117
  206. data/lib/bioroebe/fasta_and_fastq/parse_fasta/misc.rb +0 -981
  207. data/lib/bioroebe/fasta_and_fastq/parse_fasta/report.rb +0 -156
  208. data/lib/bioroebe/fasta_and_fastq/parse_fasta/reset.rb +0 -128
  209. data/lib/bioroebe/genbank/genbank_parser.rb +0 -291
  210. data/lib/bioroebe/java/bioroebe/Base.class +0 -0
  211. data/lib/bioroebe/java/bioroebe/Codons.java +0 -22
  212. data/lib/bioroebe/java/bioroebe/PartnerNucleotide.class +0 -0
  213. data/lib/bioroebe/java/bioroebe/PartnerNucleotide.java +0 -19
  214. data/lib/bioroebe/java/bioroebe.jar +0 -0
  215. data/lib/bioroebe/shell/add.rb +0 -108
  216. data/lib/bioroebe/shell/assign.rb +0 -360
  217. data/lib/bioroebe/shell/chop_and_cut.rb +0 -281
  218. data/lib/bioroebe/shell/constants.rb +0 -166
  219. data/lib/bioroebe/shell/download.rb +0 -335
  220. data/lib/bioroebe/shell/enable_and_disable.rb +0 -158
  221. data/lib/bioroebe/shell/enzymes.rb +0 -310
  222. data/lib/bioroebe/shell/fasta.rb +0 -345
  223. data/lib/bioroebe/shell/gtk.rb +0 -76
  224. data/lib/bioroebe/shell/history.rb +0 -132
  225. data/lib/bioroebe/shell/initialize.rb +0 -217
  226. data/lib/bioroebe/shell/loop.rb +0 -74
  227. data/lib/bioroebe/shell/prompt.rb +0 -107
  228. data/lib/bioroebe/shell/random.rb +0 -289
  229. data/lib/bioroebe/shell/reset.rb +0 -335
  230. data/lib/bioroebe/shell/scan_and_parse.rb +0 -135
  231. data/lib/bioroebe/shell/search.rb +0 -337
  232. data/lib/bioroebe/shell/sequences.rb +0 -200
  233. data/lib/bioroebe/shell/show_report_and_display.rb +0 -2901
  234. data/lib/bioroebe/shell/startup.rb +0 -127
  235. data/lib/bioroebe/shell/taxonomy.rb +0 -14
  236. data/lib/bioroebe/shell/tk.rb +0 -23
  237. data/lib/bioroebe/shell/user_input.rb +0 -88
  238. data/lib/bioroebe/shell/xorg.rb +0 -45
  239. /data/lib/bioroebe/java/bioroebe/{BisulfiteTreatment.java → src/main/java/bioroebe/BisulfiteTreatment.java} +0 -0
  240. /data/lib/bioroebe/java/bioroebe/{Esystem.class → src/main/java/bioroebe/Esystem.class} +0 -0
  241. /data/lib/bioroebe/java/bioroebe/{Esystem.java → src/main/java/bioroebe/Esystem.java} +0 -0
  242. /data/lib/bioroebe/java/bioroebe/{RemoveFile.class → src/main/java/bioroebe/RemoveFile.class} +0 -0
@@ -2,15 +2,1473 @@
2
2
  # Encoding: UTF-8
3
3
  # frozen_string_literal: true
4
4
  # =========================================================================== #
5
+ # === Bioroebe::ParseFasta
6
+ #
7
+ # This class will parse through a local FASTA file and find the
8
+ # proper entries.
9
+ #
10
+ # A FASTA file may have nucleotides or an aminoacid-sequence, so
11
+ # we have to keep this in mind when parsing it.
12
+ #
13
+ # Usage examples:
14
+ #
15
+ # Bioroebe::ParseFasta.new(ARGV)
16
+ # Bioroebe.parse_fasta(ARGV)
17
+ #
18
+ # =========================================================================== #
5
19
  # require 'bioroebe/fasta_and_fastq/parse_fasta/parse_fasta.rb'
6
- # Bioroebe::ParseFasta.new(ARGV)
20
+ # Bioroebe.parse_fasta
21
+ # Bioroebe.sizeseq
7
22
  # =========================================================================== #
8
23
  require 'bioroebe/base/commandline_application/commandline_application.rb'
9
- require 'bioroebe/fasta_and_fastq/parse_fasta/constants.rb'
10
- require 'bioroebe/fasta_and_fastq/parse_fasta/initialize.rb'
11
- require 'bioroebe/fasta_and_fastq/parse_fasta/misc.rb'
12
- require 'bioroebe/fasta_and_fastq/parse_fasta/reset.rb'
13
- require 'bioroebe/fasta_and_fastq/parse_fasta/run.rb'
24
+
25
+ module Bioroebe
26
+
27
+ class ParseFasta < ::Bioroebe::CommandlineApplication # === Bioroebe::ParseFasta
28
+
29
+ require 'bioroebe/sequence/dna.rb'
30
+ require 'bioroebe/calculate/calculate_gc_content.rb'
31
+
32
+ # ========================================================================= #
33
+ # === REGEX_NON_NUCLEOTIDES
34
+ #
35
+ # All non-nucleotides will be handled here via this regex.
36
+ #
37
+ # N is excluded because it may stand for "any" nucleotide, or at
38
+ # the very least for a purine.
39
+ # ========================================================================= #
40
+ REGEX_NON_NUCLEOTIDES =
41
+ /BDEFHIJKLMOPQRSVWXYZ/
42
+
43
+ # ========================================================================= #
44
+ # === DEFAULT_FASTA
45
+ #
46
+ # This String can be used to quickly test code depending on FASTA
47
+ # entries.
48
+ # ========================================================================= #
49
+ DEFAULT_FASTA = '>Rosalind_6404
50
+ CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
51
+ TCCCACTAATAATTCTGAGG
52
+ >Rosalind_5959
53
+ CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
54
+ ATATCCATTTGTCAGCAGACACGC
55
+ >Rosalind_0808
56
+ CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
57
+ TGGGAACCTGCGGGCAGTAGGTGGAAT'
58
+
59
+ # ========================================================================= #
60
+ # === DEFAULT_ROUND_TO
61
+ # ========================================================================= #
62
+ DEFAULT_ROUND_TO = 2
63
+
64
+ # ========================================================================= #
65
+ # === initialize
66
+ # ========================================================================= #
67
+ def initialize(
68
+ i = DEFAULT_FASTA,
69
+ run_already = true,
70
+ &block
71
+ )
72
+ reset
73
+ # ======================================================================= #
74
+ # === Handle blocks next
75
+ # ======================================================================= #
76
+ if block_given?
77
+ yielded = yield
78
+ # ===================================================================== #
79
+ # First handle Symbols.
80
+ # ===================================================================== #
81
+ case yielded
82
+ # ===================================================================== #
83
+ # === :be_verbose
84
+ # ===================================================================== #
85
+ when :be_verbose,
86
+ :verbose
87
+ set_be_verbose_and_report_the_sequence
88
+ # ===================================================================== #
89
+ # === :be_quiet
90
+ # ===================================================================== #
91
+ when :be_quiet,
92
+ :be_silent
93
+ be_quiet
94
+ # ===================================================================== #
95
+ # === :sizeseq
96
+ # ===================================================================== #
97
+ when :sizeseq
98
+ @sort_by_size = true
99
+ end
100
+ # ===================================================================== #
101
+ # === Handle Hashes next
102
+ # ===================================================================== #
103
+ if yielded.is_a? Hash
104
+ # =================================================================== #
105
+ # === :be_verbose
106
+ # =================================================================== #
107
+ if yielded.has_key? :be_verbose
108
+ set_be_verbose(yielded.delete(:be_verbose))
109
+ @internal_hash[:report_the_sequence] = true
110
+ end
111
+ # =================================================================== #
112
+ # === :use_colours
113
+ # =================================================================== #
114
+ if yielded.has_key? :use_colours
115
+ set_use_colours(
116
+ yielded.delete(:use_colours)
117
+ )
118
+ end
119
+ # =================================================================== #
120
+ # === :sizeseq
121
+ # =================================================================== #
122
+ if yielded.has_key? :sizeseq
123
+ @sort_by_size = true
124
+ end
125
+ end
126
+ end
127
+ set_commandline_arguments(i)
128
+ case run_already
129
+ # ======================================================================= #
130
+ # === :dont_run_yet
131
+ # ======================================================================= #
132
+ when :dont_run_yet,
133
+ :do_not_run_yet
134
+ run_already = false
135
+ end
136
+ run if run_already
137
+ end
138
+
139
+ # ========================================================================= #
140
+ # === reset (reset tag)
141
+ # ========================================================================= #
142
+ def reset
143
+ super()
144
+ infer_the_namespace
145
+ # ======================================================================= #
146
+ # === @is_a_genbank_file
147
+ # ======================================================================= #
148
+ @is_a_genbank_file = false
149
+ # ======================================================================= #
150
+ # === @input_file
151
+ #
152
+ # This variable denotes which input file is used to read data from.
153
+ #
154
+ # It is nil initially because we may skip reading from an existing
155
+ # file and e. g. only read from a String or some other non-file
156
+ # entity.
157
+ # ======================================================================= #
158
+ @input_file = nil
159
+ # ======================================================================= #
160
+ # === @hash
161
+ #
162
+ # This is the main variable for the class. It will keep entries such
163
+ # as this one here:
164
+ #
165
+ # {
166
+ # "ENSMUSG00000020122|ENSMUST08" => "CCCTCC"
167
+ # }
168
+ #
169
+ # ======================================================================= #
170
+ @hash = {}
171
+ # ======================================================================= #
172
+ # === @internal_hash
173
+ #
174
+ # This Hash exists for internal configuration of the class.
175
+ # ======================================================================= #
176
+ @internal_hash = {}
177
+ # ======================================================================= #
178
+ # === :report_the_sequence
179
+ # ======================================================================= #
180
+ @internal_hash[:report_the_sequence] = false
181
+ # ======================================================================= #
182
+ # === :overwrite_the_original_file
183
+ # ======================================================================= #
184
+ @internal_hash[:overwrite_the_original_file] = false
185
+ # ======================================================================= #
186
+ # === :save_the_file
187
+ # ======================================================================= #
188
+ @internal_hash[:save_the_file] = false
189
+ # ======================================================================= #
190
+ # === :remove_numbers_from_input
191
+ # ======================================================================= #
192
+ @internal_hash[:remove_numbers_from_input] = false
193
+ # ======================================================================= #
194
+ # === :show_the_translated_protein_sequence
195
+ #
196
+ # This setting is false initially. If set to true via the commandline
197
+ # then report() will show the translated protein sequence as well.
198
+ # ======================================================================= #
199
+ @internal_hash[:show_the_translated_protein_sequence] = false
200
+ # ======================================================================= #
201
+ # === :condense_the_sequence_onto_a_single_line
202
+ #
203
+ # By default the output of this class will include newlines for the
204
+ # sequence. If this is not wanted by the user then the following
205
+ # variable keeps track of that behaviour. You can use the flag
206
+ # called --one-line to enable a condensed output, with newlines
207
+ # being removed.
208
+ # ======================================================================= #
209
+ @internal_hash[:condense_the_sequence_onto_a_single_line] = false
210
+ # ======================================================================= #
211
+ # === :limit_the_display_to_n_nucleotides
212
+ #
213
+ # If this variable is a number rather than nil, then it will be used
214
+ # to display only a limited number of nucleotides, e. g. "1000" if
215
+ # the user passes in 1000.
216
+ # ======================================================================= #
217
+ @internal_hash[:limit_the_display_to_n_nucleotides] = nil
218
+ # ======================================================================= #
219
+ # === @may_we_exit
220
+ # ======================================================================= #
221
+ @may_we_exit = false
222
+ # ======================================================================= #
223
+ # === @current_key
224
+ # ======================================================================= #
225
+ @current_key = nil
226
+ # ======================================================================= #
227
+ # === @use_opn
228
+ # ======================================================================= #
229
+ @use_opn = ::Bioroebe.use_opn?
230
+ # ======================================================================= #
231
+ # === @colourize_sequence
232
+ # ======================================================================= #
233
+ @colourize_sequence = false
234
+ # ======================================================================= #
235
+ # === @sort_by_size
236
+ #
237
+ # If the following variable is set to true, then this class will
238
+ # run a sizeseq-comparison, that is, it will compare all sequences
239
+ # and output them in a size-sorted manner, similar to the EMBOSS
240
+ # sizeseq action.
241
+ # ======================================================================= #
242
+ @sort_by_size = false
243
+ # ======================================================================= #
244
+ # === @show_the_header
245
+ #
246
+ # If this variable is true then the header will be shown.
247
+ # ======================================================================= #
248
+ @show_the_header = false
249
+ set_round_to :default
250
+ set_be_verbose
251
+ end
252
+
253
+ # ========================================================================= #
254
+ # === menu (menu tag)
255
+ # ========================================================================= #
256
+ def menu(
257
+ i = return_commandline_arguments_that_are_not_files
258
+ )
259
+ if i.is_a? Array
260
+ i.each {|entry| menu(entry) }
261
+ else
262
+ case i # case tag
263
+ # ===================================================================== #
264
+ # === --to-protein
265
+ #
266
+ # A few aliases exist for this, such as --convert and --translate.
267
+ #
268
+ # Invocation example:
269
+ #
270
+ # pfasta *.fasta --toprotein
271
+ #
272
+ # ===================================================================== #
273
+ when /^-?-?to(-|_)?protein$/i,
274
+ /^-?-?convert$/i,
275
+ /^-?-?translate$/i
276
+ @internal_hash[:show_the_translated_protein_sequence] = true
277
+ # ===================================================================== #
278
+ # === --one-line
279
+ #
280
+ # Invocation example:
281
+ #
282
+ # pfasta rpoS_NC_000913.3.fasta --one-line
283
+ #
284
+ # ===================================================================== #
285
+ when /^-?-?one(-|_)?liner?/i
286
+ @internal_hash[:condense_the_sequence_onto_a_single_line] = true
287
+ # ===================================================================== #
288
+ # === --limit=1000
289
+ #
290
+ # Invocation example:
291
+ #
292
+ # pfasta --limit=1000
293
+ #
294
+ # ===================================================================== #
295
+ when /^-?-?limit=(\d+)$/i
296
+ @internal_hash[:limit_the_display_to_n_nucleotides] = $1.to_s.dup.to_i
297
+ # ===================================================================== #
298
+ # === --overwrite
299
+ # ===================================================================== #
300
+ when /^-?-?overwrite/i
301
+ @internal_hash[:overwrite_the_original_file] = true
302
+ # ===================================================================== #
303
+ # === --help
304
+ #
305
+ # Usage example:
306
+ #
307
+ # parse_fasta --help
308
+ #
309
+ # ===================================================================== #
310
+ when /^-?-?help/i
311
+ show_help
312
+ exit
313
+ # ===================================================================== #
314
+ # === --save-file
315
+ # ===================================================================== #
316
+ when /^-?-?save(-|_)?file/i
317
+ @internal_hash[:save_the_file] = true
318
+ # ===================================================================== #
319
+ # === --also-show-the-sequence
320
+ #
321
+ # To invoke this method try:
322
+ #
323
+ # parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --show
324
+ #
325
+ # ===================================================================== #
326
+ when /^-?-?also(-|_)?show(-|_)?the(-|_)?sequence$/i,
327
+ /^-?-?report$/i,
328
+ /^-?-?show$/i
329
+ @internal_hash[:report_the_sequence] = true
330
+ # ===================================================================== #
331
+ # === --header
332
+ # ===================================================================== #
333
+ when /^-?-?header/i
334
+ do_show_the_header
335
+ # ===================================================================== #
336
+ # === --short
337
+ #
338
+ # This entry point can be used to show 300 nucleotides and not
339
+ # more, by simply using the --short commandline flag.
340
+ # ===================================================================== #
341
+ when /^-?-?short/i
342
+ @internal_hash[:limit_the_display_to_n_nucleotides] = 300
343
+ # ===================================================================== #
344
+ # === --size
345
+ #
346
+ # This will simply tell us how many nucleotides the given sequence
347
+ # has, then exit.
348
+ #
349
+ # To invoke this method try:
350
+ #
351
+ # parsefasta /Depot/Bioroebe/NP_013521.3_289_aa.fasta --size
352
+ #
353
+ # ===================================================================== #
354
+ when /^-?-?size$/i
355
+ set_be_quiet
356
+ do_process_the_commandline_arguments_that_are_files
357
+ erev size? # Report the size here.
358
+ exit
359
+ end
360
+ end
361
+ end
362
+
363
+ # ========================================================================= #
364
+ # === show_the_translated_protein_sequence?
365
+ # ========================================================================= #
366
def show_the_translated_protein_sequence?
  # Query the flag stored in @internal_hash; nil when it was never set.
  @internal_hash[:show_the_translated_protein_sequence]
end
369
+
370
+ # ========================================================================= #
371
+ # === set_round_to
372
+ #
373
+ # This will set to how many decimal numbers we will round to. This is
374
+ # mostly done for display-purposes, hence why the default is a fairly
375
+ # low value.
376
+ # ========================================================================= #
377
def set_round_to(i = :default)
  # The :default sentinel is resolved to the DEFAULT_ROUND_TO constant;
  # any other input is coerced via .to_i before being stored.
  i = DEFAULT_ROUND_TO if :default === i
  @internal_hash[:round_to] = i.to_i
end
391
+
392
+ # ========================================================================= #
393
+ # === do_process_the_commandline_arguments_that_are_files
394
+ # ========================================================================= #
395
def do_process_the_commandline_arguments_that_are_files(
    these_files = commandline_arguments_that_are_files?
  )
  # Anything that is not an Array is wrapped into a flat Array without nils.
  these_files = [these_files].flatten.compact unless these_files.is_a?(Array)
  these_files.each do |this_file|
    set_input_file(this_file)
    set_data # Uses the input file that was registered on the line above.
    split_into_proper_sections
    report_the_FASTA_header if @show_the_header
    if @sort_by_size
      run_sizeseq_comparison
      next
    end
    if is_the_sequence_a_polypeptide?
      # ===================================================================== #
      # === The input is a protein
      # ===================================================================== #
      if be_verbose?
        erev "This sequence is assumed to be a #{royalblue('protein')}#{rev}."
        report_how_many_elements_we_have_found
      end
    else
      # ===================================================================== #
      # === Otherwise the input is treated as DNA or RNA
      # ===================================================================== #
      if be_verbose?
        erev "This sequence is assumed to "\
             "be #{royalblue('DNA')}#{rev} or #{royalblue('RNA')}#{rev}."
      end
      calculate_gc_content # GC content makes only sense for nucleotides.
      report_how_many_elements_we_have_found if be_verbose?
    end
    # The remaining reports are only shown in verbose mode.
    next unless be_verbose?
    report_the_nucleotide_composition
    report_on_how_many_entries_we_did_work
    do_report_the_sequence if report_the_sequence?
  end
end
438
+
439
+ # ========================================================================= #
440
+ # === sanitize_the_description
441
+ #
442
+ # This method will iterate over the description entry and sanitize
443
+ # it. In this context sanitizing means to add the "length" entry,
444
+ # and the "type" entry, such as in:
445
+ #
446
+ # " # length=231; type=dna"
447
+ #
448
+ # ========================================================================= #
449
def sanitize_the_description
  # Appends " # length=N; type=dna" to every FASTA header line in @data
  # that does not carry a "length=" entry yet. The lookup key is built the
  # same way split_into_proper_sections builds it (everything after the
  # leading '>', without the line ending), the reported length ignores
  # line breaks inside the stored sequence, and the original line ending
  # of the header is kept at the very end of the line.
  @data.map! { |line|
    next line unless line.start_with?('>') && !line.include?('length=')
    header      = line.chomp
    line_ending = line[header.size..-1]
    stored      = @hash[header[1..-1]]
    length      = stored ? stored.delete("\r\n").size : 0
    # The type is currently hardcoded to DNA.
    "#{header} # length=#{length}; type=dna#{line_ending}"
  }
end
461
+
462
+ # ========================================================================= #
463
+ # === entries?
464
+ # ========================================================================= #
465
def entries?
  # Returns the @data instance variable as-is.
  @data
end
468
+
469
+ # ========================================================================= #
470
+ # === we_may_exit
471
+ # ========================================================================= #
472
def we_may_exit
  # Sets the @may_we_exit flag to true.
  @may_we_exit = true
end
475
+
476
+ # ========================================================================= #
477
+ # === output_results
478
+ # ========================================================================= #
479
def output_results
  # Pretty-print the header => sequence Hash.
  pp(@hash)
end
482
+
483
+ # ========================================================================= #
484
+ # === do_report_the_sequence (report tag)
485
+ #
486
+ # This method is used to display the main sequence at hand.
487
+ # ========================================================================= #
488
def do_report_the_sequence
  # Displays the main sequence and, if requested, its translation.
  #
  # The method works on a private copy of the sequence: colour escape codes
  # are only ever added to the string that is displayed, so the stored
  # sequence stays untouched and the translation step receives the plain
  # (uncoloured) letters.
  plain = main_sequence?.to_s.dup
  # ======================================================================= #
  # Honour the --short commandline flag next.
  # ======================================================================= #
  limit = @internal_hash[:limit_the_display_to_n_nucleotides]
  plain = plain[0 .. (limit - 1)] if limit
  condensed = condense_the_sequence_onto_a_single_line?
  plain = plain.delete("\n") if condensed
  coloured = plain
  if @colourize_sequence && is_polynucleotide?
    # ===================================================================== #
    # DNA/RNA input: colourize every nucleotide in one single pass, so
    # that already inserted colour codes are never scanned again.
    # ===================================================================== #
    palette = {
      'A' => teal('A')+rev,
      'C' => slateblue('C')+rev,
      'G' => royalblue('G')+rev,
      'T' => steelblue('T')+rev,
      'U' => steelblue('U')+rev
    }
    coloured = plain.gsub(/[ACGTU]/, palette)
  end
  erev colourize_this_nucleotide_sequence(coloured)
  e if condensed
  return unless show_the_translated_protein_sequence?
  # ======================================================================= #
  # Do show the translated protein sequence next:
  # ======================================================================= #
  translated_into_aa = Bioroebe.to_aa(plain)
  # Stop codons ('*') are highlighted in the displayed variant only.
  translated_into_aa_and_colourized = translated_into_aa.gsub('*') { tomato('*') }
  erev 'The translated aminoacid sequence of '+
       sfancy(translated_into_aa.size.to_s)+rev+
       ' aminoacids is:'
  e
  erev steelblue(" #{translated_into_aa_and_colourized}")
  e
end; alias display do_report_the_sequence # === display
     alias report  do_report_the_sequence # === report
532
+
533
+ # ========================================================================= #
534
+ # === report_the_nucleotide_composition
535
+ # ========================================================================= #
536
def report_the_nucleotide_composition
  # Reports how often A, T, C and G occur in the first sequence of @hash,
  # both as absolute count and as percentage, or - for proteins - shows
  # the aminoacid sequence itself.
  #
  # Line breaks are removed before counting: the stored sequences retain
  # their newlines, which must not contribute to the total size that the
  # percentages are based on.
  if is_this_sequence_a_polynucleotide_sequence?
    first = @hash.values.first.to_s.delete("\r\n").upcase
    total_size = first.size
    parts = %w( A T C G ).map { |nucleotide|
      n = first.count(nucleotide)
      # Guard against an empty sequence, which would otherwise yield NaN.
      percentage = total_size.zero? ? 0.0 : (n * 100.0 / total_size).round(2)
      "#{steelblue(n)}#{rev}x #{nucleotide} (#{percentage}%)"
    }
    erev "The nucleotide composition is as follows:"
    e " #{parts.join(', ')}"
  elsif is_a_protein?
    # ===================================================================== #
    # Report the composition of the protein:
    # ===================================================================== #
    sequence = @hash.values.first.delete("\n")
    erev "The protein composition (aminoacids) is as follows:"
    e orchid(" #{sequence}")
  end
end; alias report_the_protein_composition report_the_nucleotide_composition # === report_the_protein_composition
560
+
561
+ # ========================================================================= #
562
+ # === report_how_many_elements_we_have_found
563
+ # ========================================================================= #
564
def report_how_many_elements_we_have_found
  # Reports the size of the first sequence in @hash and, for nucleotide
  # sequences, how many ATG triplets are found on both strands.
  #
  # String#count('ATG') would count the single characters A, T and G, so
  # the triplets are collected via String#scan instead. An ATG on the
  # complementary strand shows up as CAT (its reverse complement) on the
  # given strand, thus CAT is what is searched for there.
  return unless @hash && be_verbose?
  first = @hash.values.first.to_s.delete("\n")
  size = first.size
  result = "This sequence contains #{simp(size.to_s)}#{rev}"\
           " #{nucleotides_or_aminoacids?}".dup
  if is_a_nucleotide?
    # =================================================================== #
    # We upcase the sequence since some FASTA files may include the
    # sequence in lowercased characters.
    # =================================================================== #
    upcased = first.upcase
    n_start_codons = upcased.scan('ATG').size + upcased.scan('CAT').size
    result << " and #{n_start_codons} "\
              "ATG codons (on both strands) in total"
  end
  result << '.'
  if size > 1_000_000
    # =================================================================== #
    # Format the number with '_' characters, such as 1_234_567.
    # =================================================================== #
    formatted = size.to_s.reverse.scan(/\d{1,3}/).join('_').reverse
    result << ' ('+simp(formatted+' bp')+rev+')'
  end
  erev result
end
594
+
595
+ # ========================================================================= #
596
+ # === report_on_how_many_entries_we_did_work
597
+ # ========================================================================= #
598
def report_on_how_many_entries_we_did_work
  # Only report in verbose mode.
  return unless be_verbose?
  n_entries = @hash.keys.size
  # Use the plural form as soon as there is more than one entry.
  entry_or_entries = n_entries > 1 ? 'entries' : 'entry'
  erev "We have identified a total of #{orange(n_entries)}"\
       "#{rev} #{entry_or_entries} in this fasta dataset."
  e
end
609
+
610
+ # ========================================================================= #
611
+ # === report_the_FASTA_header
612
+ # ========================================================================= #
613
def report_the_FASTA_header
  # Show the first header (see header?) on the commandline.
  coloured_header = steelblue(header?)
  e "#{rev}The header is: #{coloured_header}"
end
616
+
617
+ # ========================================================================= #
618
+ # === report_the_sequence?
619
+ # ========================================================================= #
620
def report_the_sequence?
  # Query the :report_the_sequence flag of @internal_hash.
  @internal_hash[:report_the_sequence]
end
623
+
624
+ # ========================================================================= #
625
+ # === sanitize_data
626
+ # ========================================================================= #
627
def sanitize_data(i)
  if i.is_a?(Array)
    i.flatten!
    # Get rid of comment lines and of lines that contain only whitespace.
    i.reject! { |entry| entry.start_with?('#') || entry.strip.empty? }
    # ===================================================================== #
    # Some FASTA files include "\r" line endings. Only the first entry is
    # probed; if it contains a \r then \r is removed from all entries.
    # ===================================================================== #
    head = i.first
    i.map! { |entry| entry.delete("\r") } if head && head.include?("\r")
  end
  # ======================================================================= #
  # === Run through SanitizeNucleotideSequence
  # ======================================================================= #
  i = Bioroebe::SanitizeNucleotideSequence[i] if @internal_hash[:remove_numbers_from_input]
  i
end
649
+
650
+ # ========================================================================= #
651
+ # === current_key?
652
+ # ========================================================================= #
653
def current_key?
  # Returns the @current_key instance variable.
  @current_key
end; alias id?          current_key? # === id?
     alias sequence_id? current_key? # === sequence_id?
     alias title        current_key? # === title
     alias title?       current_key? # === title?
659
+
660
+ # ========================================================================= #
661
+ # === round_to?
662
+ # ========================================================================= #
663
def round_to?
  # Returns the value stored under :round_to (see set_round_to).
  @internal_hash[:round_to]
end
666
+
667
+ # ========================================================================= #
668
+ # === opnn
669
+ # ========================================================================= #
670
def opnn
  # Delegate to the parent implementation, passing namespace?, but only
  # when use_opn? is truthy.
  return unless use_opn?
  super(namespace?)
end
673
+
674
+ # ========================================================================= #
675
+ # === use_opn?
676
+ # ========================================================================= #
677
def use_opn?
  # Returns the @use_opn instance variable.
  @use_opn
end
680
+
681
+ # ========================================================================= #
682
+ # === calculate_gc_content
683
+ #
684
+ # Calculate the gc content through this method, which is called from
685
+ # within the method run().
686
+ # ========================================================================= #
687
def calculate_gc_content
  # All sequences joined together, without newlines, decide whether the
  # dataset is treated as a polynucleotide at all.
  joined = @hash.values.join.delete(N)
  if is_polynucleotide?(joined)
    @hash.each_pair { |key, content|
      # =================================================================== #
      # Delegate towards the method Bioroebe.gc_content next; the number
      # of decimal positions is taken from round_to?.
      # =================================================================== #
      gc_content = ::Bioroebe.gc_content(content.upcase, round_to?)
      gc_content = gc_content.first if gc_content.is_a?(Array)
      gc_content = gc_content.to_s
      # Of a header such as "sp|Q91FT8|234R_IIV6" only the last part is shown.
      label = key.to_s
      label = label.split('|').last.strip if label.include?('|')
      next unless be_verbose?
      label = label.strip
      # Shorten the label a bit if it is too long.
      label = label[0 .. 40]+' [...]' if label.size > 40
      erev 'GC content of "'+simp(label)+rev+'" is: '+
           "#{sfancy(gc_content)}#{rev} %"
    }
  else
    erev '`'+simp(joined)+rev+'` is not a polynucleotide.' if be_verbose?
  end
end
715
+
716
+ # ========================================================================= #
717
+ # === first_value
718
+ #
719
+ # This will return the first entry of the Fasta files.
720
+ # ========================================================================= #
721
def first_value
  # The first element of whatever sequences? returns.
  all_sequences = sequences?
  all_sequences.first
end
724
+
725
+ # ========================================================================= #
726
+ # === nucleotides_or_aminoacids?
727
+ # ========================================================================= #
728
def nucleotides_or_aminoacids?
  # Returns the matching noun for the kind of sequence at hand.
  is_polynucleotide? ? 'nucleotides' : 'aminoacids'
end
735
+
736
+ # ========================================================================= #
737
+ # === is_polynucleotide?
738
+ # ========================================================================= #
739
def is_polynucleotide?(i = main_sequence?)
  # Everything that is_protein? does not accept counts as polynucleotide.
  not is_protein?(i)
end; alias is_a_nucleotide? is_polynucleotide? # === is_a_nucleotide?
742
+
743
+ # ========================================================================= #
744
+ # === is_this_sequence_a_polynucleotide_sequence?
745
+ # ========================================================================= #
746
def is_this_sequence_a_polynucleotide_sequence?
  # Negation of is_protein?, called without any argument.
  not is_protein?
end
749
+
750
+ # ========================================================================= #
751
+ # === data?
752
+ #
753
+ # This will contain the full content of the (whole) .fasta file, including
754
+ # the header.
755
+ # ========================================================================= #
756
def data?
  # Returns the @data instance variable.
  @data
end; alias input?   data? # === input?
     alias dataset? data? # === dataset?
760
+
761
+ # ========================================================================= #
762
+ # === hash?
763
+ # ========================================================================= #
764
def hash?
  # Returns the @hash instance variable.
  @hash
end
767
+
768
+ # ========================================================================= #
769
+ # === sequences?
770
+ #
771
+ # This method will obtain all found sequences.
772
+ # ========================================================================= #
773
def sequences?
  # All values of @hash, as an Array.
  @hash.values
end; alias sequences sequences? # === sequences
     alias values    sequences? # === values
777
+
778
+ # ========================================================================= #
779
+ # === short_headers?
780
+ #
781
+ # The short-headers are like the headers, but if a ' ' token is found
782
+ # then the line will be truncated towards that first ' '.
783
+ #
784
+ # An example is:
785
+ #
786
+ # sp|Q91FT8|234R_IIV6 Uncharacterized protein 234R OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-234R PE=4 SV=1
787
+ #
788
+ # This will be truncated towards
789
+ #
790
+ # sp|Q91FT8|234R_IIV6
791
+ #
792
+ # This could then be used to automatically rename FASTA files, for
793
+ # instance.
794
+ # ========================================================================= #
795
def short_headers?
  # Headers that contain a ' ' are cut down to their first
  # whitespace-separated token; all other headers are passed through.
  headers?.map { |entry|
    entry.include?(' ') ? entry.split(' ').first : entry
  }
end
803
+
804
+ # ========================================================================= #
805
+ # === set_data
806
+ #
807
+ # This is the setter-method towards @data. It is no longer allowed to
808
+ # invoke set_input_file() since as of 12.06.2020. This means that
809
+ # you have to invoke that method prior to calling this method.
810
+ # ========================================================================= #
811
def set_data(i = @input_file)
  # ======================================================================= #
  # Reduce the input to one (duplicated) String; this way an Array can be
  # passed into this method as well.
  # ======================================================================= #
  i = [i].flatten.compact.first.to_s.dup
  if File.exist?(i.to_s) # First try to read in from a file.
    if be_verbose?
      opnn; erev "Will read from the file `#{sfile(i)}#{rev}`."
    end
    i = File.readlines(i)
    if @is_a_genbank_file
      # Keep only the lines of the sequence section, such as:
      #   " 61 atggggcctg caatggggcc tgcaatgggg cctgca\n"
      sequence_lines = i.select { |line|
        line.start_with?(' ') and (line.strip =~ /\d+/)
      }
      # Remove the position numbers and the spaces, then upcase.
      cleaned = sequence_lines.map { |inner_line|
        inner_line.strip.delete(' 0123456789').strip.upcase
      }
      i = ["> genbank file"]+cleaned
    end
  end
  if i.nil? or i.empty?
    i = DEFAULT_FASTA
    opnn; erev 'No input was provided. Thus a default FASTA '\
               'sequence will be used instead.'
  end
  i = sanitize_data(i)
  i = i.split(N) if i.is_a?(String)
  @data = i
end; alias set_sequence set_data # === set_sequence
841
+
842
+ # ========================================================================= #
843
+ # === set_be_verbose_and_report_the_sequence
844
+ # ========================================================================= #
845
def set_be_verbose_and_report_the_sequence
  # Calls set_be_verbose and requests the sequence display in one go.
  set_be_verbose
  @internal_hash[:report_the_sequence] = true
end
849
+
850
+ # ========================================================================= #
851
+ # === condense_the_sequence_onto_a_single_line?
852
+ # ========================================================================= #
853
def condense_the_sequence_onto_a_single_line?
  # Query the :condense_the_sequence_onto_a_single_line flag.
  @internal_hash[:condense_the_sequence_onto_a_single_line]
end
856
+
857
+ # ========================================================================= #
858
+ # === return_size_sorted_hash
859
+ # ========================================================================= #
860
def return_size_sorted_hash(i = @hash)
  # Build a new Hash whose pairs are ordered by the size of the value,
  # smallest first. The input Hash is not modified.
  i.sort_by { |_key, value| value.size }.to_h
end
865
+
866
+ # ========================================================================= #
867
+ # === do_sort_by_size
868
+ #
869
+ # This method will sort the hash by size of the sequence. It has been
870
+ # inspired by the EMBOSS sizeseq functionality.
871
+ #
872
+ # The output that should be generated might look like this:
873
+ #
874
+ # https://www.bioinformatics.nl/cgi-bin/emboss/help/sizeseq#input.1
875
+ #
876
+ # Invocation example:
877
+ #
878
+ # x = Bioroebe::ParseFasta.new('/Depot/j/globins.fasta'); x.do_sort_by_size
879
+ #
880
+ # ========================================================================= #
881
def do_sort_by_size
  # ======================================================================= #
  # Sort it here first, by the size of the "value", aka the sequence body.
  # ======================================================================= #
  @hash = return_size_sorted_hash(@hash)
  # One record per entry: a summary line, the sequence and an empty line.
  records = @hash.map { |key, sequence|
    size = sequence.size.to_s
    '> ID '+size+' AA.; DE: '+key.to_s+' SQ '+size+' AA'+N+
    sequence+N+N
  }
  e records.join
end; alias run_sizeseq_comparison do_sort_by_size # === run_sizeseq_comparison
895
+
896
+ # ========================================================================= #
897
+ # === n_nucleotides?
898
+ # ========================================================================= #
899
def n_nucleotides?
  # Size of the first sequence in @hash, not counting newlines.
  first_sequence = @hash.values.first
  first_sequence.delete("\n").size
end; alias return_n_aminoacids n_nucleotides? # === return_n_aminoacids
     alias size?               n_nucleotides? # === size?
     alias sequence_size?      n_nucleotides? # === sequence_size?
904
+
905
+ # ========================================================================= #
906
+ # === headers?
907
+ # ========================================================================= #
908
def headers?
  # All keys of @hash, as an Array.
  @hash.keys
end
911
+
912
+ # ========================================================================= #
913
+ # === first_key?
914
+ #
915
+ # Obtain the very first entry.
916
+ # ========================================================================= #
917
def first_key?
  # The first element of headers?; nil when there is none.
  all_headers = headers?
  all_headers.first
end
920
+
921
+ # ========================================================================= #
922
+ # === header?
923
+ #
924
+ # This variant will always return the first entry.
925
+ # ========================================================================= #
926
def header?
  # The first header as a String; an empty String when there is none.
  first_header = headers?.first
  first_header.to_s
end
929
+
930
+ # ========================================================================= #
931
+ # === raw_body?
932
+ # ========================================================================= #
933
def raw_body?
  # The first value of @hash, exactly as stored.
  @hash.values.first
end
936
+
937
+ # ========================================================================= #
938
+ # === do_show_the_header
939
+ # ========================================================================= #
940
def do_show_the_header
  # Sets the @show_the_header flag to true.
  @show_the_header = true
end
943
+
944
+ # ========================================================================= #
945
+ # === set_input_file
946
+ #
947
+ # This method will be used to keep track of the input-file, from
948
+ # which we will read the dataset.
949
+ # ========================================================================= #
950
def set_input_file(i = nil)
  # Registers the file from which the dataset will be read in @input_file.
  #
  # Without an argument the first .fa or .fasta file of the current
  # directory is used, if there is any. If the designated file starts with
  # 'LOCUS' then @is_a_genbank_file is set to true.
  if i.nil?
    # ===================================================================== #
    # Try to find a .fasta or .fa file in the current directory. The
    # lookup happens exactly once, so the notification is shown only once.
    # ===================================================================== #
    file = Dir['*.{fa,fasta}'].first
    if file
      if be_verbose?
        result = 'A '.dup # .dup, as the String is appended to.
        result << 'FASTA ' if file.end_with?('.fasta')
        result << 'file was found in this directory ('+sfile(file)+rev+').'
        opnn; erev result
        opnn; erev 'We will use it.'
      end
      i = file
    end
  end
  # ======================================================================= #
  # Only regular files are probed (a directory can not be read), and only
  # the first five bytes are needed in order to detect the LOCUS keyword.
  # ======================================================================= #
  if i and File.file?(i)
    @is_a_genbank_file = true if File.read(i, 'LOCUS'.size) == 'LOCUS'
  end
  @input_file = i
end; alias set_input_files set_input_file # === set_input_files
987
+
988
+ # ========================================================================= #
989
+ # === save_the_file?
990
+ # ========================================================================= #
991
def save_the_file?
  # Query the :save_the_file flag of @internal_hash.
  @internal_hash[:save_the_file]
end
994
+
995
+ # ========================================================================= #
996
+ # === overwrite_the_original_file?
997
+ # ========================================================================= #
998
def overwrite_the_original_file?
  # Query the :overwrite_the_original_file flag of @internal_hash.
  @internal_hash[:overwrite_the_original_file]
end
1001
+
1002
+ # ========================================================================= #
1003
+ # === split_into_proper_sections
1004
+ #
1005
+ # Split up into the fasta identifier, and the content.
1006
+ # ========================================================================= #
1007
def split_into_proper_sections
  # In verbose mode, notify the user if the dataset has no '>' at all.
  if be_verbose? && !@data.to_s.include?('>')
    erev 'No ">" character was found in this dataset.'
    erev 'It is recommended to always have a > identifier '\
         'for the'
    erev 'FASTA format (such as in a .fasta or a .fa file).'
  end
  @data.each do |line|
    # ===================================================================== #
    # === Handle the leading > FASTA identifier first
    # ===================================================================== #
    if line.start_with?('>')
      # The key is everything after the '>', without the line ending.
      @current_key = line[1..-1].chomp
      @hash[@current_key] = ''.dup
      next
    end
    line.delete!('_')
    # Sequence data that has no preceding header is filed under 'standard'.
    unless @current_key
      @current_key = 'standard'
      @hash[@current_key] = ''.dup
    end
    # ===================================================================== #
    # === Retain the newlines
    #
    # The line is appended as it is, including its newline - that way
    # the dataset can simply be saved again.
    # ===================================================================== #
    @hash[@current_key] << line
  end
end
1039
+
1040
+ # ========================================================================= #
1041
+ # === save_into_a_fasta_file
1042
+ # ========================================================================= #
1043
def save_into_a_fasta_file(
    be_verbose = be_verbose?
  )
  # The Symbol :be_verbose is accepted as a synonym for true.
  be_verbose = true if :be_verbose === be_verbose
  unless @data
    opnn
    return erev('No @data variable exists.')
  end
  into = 'standard.fasta'
  what = @data.join("\n")
  erev 'Saving into '+sfile(into)+rev+'.' if be_verbose
  write_what_into(what, into)
  File.absolute_path(into) # Return the file we saved into.
end; alias do_save_the_file save_into_a_fasta_file # === do_save_the_file
1060
+
1061
+ # ========================================================================= #
1062
+ # === add_length_information_to_the_header
1063
+ # ========================================================================= #
1064
def add_length_information_to_the_header
  # The stripped header, followed by the size of the sequence.
  new_header = "#{header?.strip} length=#{sequence_size?.to_s};"
  # ======================================================================= #
  # Next, designate where to store this file.
  # ======================================================================= #
  into = overwrite_the_original_file? ? @input_file : 'new_fasta_file.fasta'
  what = "> "+new_header+"\n"+raw_body?
  if into
    erev 'Storing into `'+sfile(into)+rev+'`.'
    write_what_into(what, into)
  end
end
1082
+
1083
+ # ========================================================================= #
1084
+ # === simplify_header
1085
+ #
1086
+ # This method can be called to simplify the header. It will save into
1087
+ # a .fasta file at once.
1088
+ # ========================================================================= #
1089
def simplify_header
  # Simplifies the first FASTA header and saves "> simplified header",
  # followed by the raw sequence body, into a .fasta file.
  #
  # The rules are tried in this order:
  #
  #   1) "[Pan troglodytes]" - the content between the first '[' and
  #      the last ']' is used (see https://rubular.com/r/aDjI0JwMOUlZzP)
  #   2) the header is cut off at the first ','
  #   3) a header that still begins with '>' is used without that '>'
  #   4) a header that mentions 'Human' becomes 'Human'
  #   5) otherwise the part in front of the ',' is used
  #
  # If none of these rules yields a non-empty result then the user is
  # notified and no file is written. This includes headers with an
  # empty [] or with ']' in front of '['.
  header = header?.to_s
  label =
    if (match = header.match(/\[(.+)\]/))
      match[1].delete('[]')
    else
      had_a_comma = header.include?(',')
      candidate = had_a_comma ? header[0...header.index(',')].strip : header
      if candidate.start_with?('>')
        candidate.sub(/\A>\s*/, '')
      elsif candidate.include?('Human')
        'Human'
      elsif had_a_comma
        candidate
      end
    end
  if label.nil? || label.empty?
    erev "Unsure what to do: #{steelblue(header)}"
    return
  end
  # ======================================================================= #
  # Next, designate where to store this file.
  # ======================================================================= #
  into = overwrite_the_original_file? ? @input_file : 'new_fasta_file.fasta'
  return unless into
  # A fresh String is built here, so neither the header stored in the
  # dataset nor a String literal is modified.
  what = "> #{label}\n#{raw_body?}"
  erev 'Storing into `'+sfile(into)+rev+'`.'
  write_what_into(what, into)
end
1131
+
1132
+ # ========================================================================= #
1133
+ # === sequence
1134
+ #
1135
+ # This method will return the sequence, without any newlines. It is also
1136
+ # called the "body" of a FASTA file.
1137
+ # ========================================================================= #
1138
def sequence
  first_body = @hash.values.first
  # Note that chomp! operates on the String that is stored in @hash.
  first_body.chomp! if first_body and first_body.end_with?(N)
  no_newlines(first_body)
end; alias fasta_sequence      sequence # === fasta_sequence
     alias sequence?           sequence # === sequence?
     alias body?               sequence # === body?
     alias body                sequence # === body
     alias naseq               sequence # === naseq
     alias nucleotide_sequence sequence # === nucleotide_sequence
     alias return_sequence     sequence # === return_sequence
     alias content?            sequence # === content?
1150
+
1151
+ # ========================================================================= #
1152
+ # === save
1153
+ #
1154
+ # This method will save our FASTA file.
1155
+ # ========================================================================= #
1156
def save
  # ======================================================================= #
  # Without a designated input file we fall back to a generic file name.
  # ======================================================================= #
  if @input_file.nil?
    erev "The generic file #{sfile('foobar.fasta')}#{rev} will be used."
    set_input_file('foobar.fasta')
  end
  target_file = @input_file
  content     = @data.join("\n")
  erev "Storing into #{sfile(target_file)}#{rev}."
  write_what_into(content, target_file)
  target_file # Report back where the data was stored.
end
1168
+
1169
+ # ========================================================================= #
1170
+ # === []
1171
+ #
1172
+ # This is a simpler query-interface for obtaining the DNA/RNA sequence
1173
+ # of the FASTA file (or aminoacid sequence, if we have a protein at
1174
+ # hand here).
1175
+ #
1176
+ # Using the method sequences? here, which in turn works on @hash, is
1177
+ # ok because Hashes preserve their insertion order in ruby since
1178
+ # version 1.9.
1179
+ # ========================================================================= #
1180
def [](i)
  all_sequences = sequences?
  all_sequences[i] # Delegate the lookup to whatever sequences? returned.
end
1183
+
1184
+ # ========================================================================= #
1185
+ # === Bioroebe::ParseFasta[]
1186
+ # ========================================================================= #
1187
def self.[](i)
  # Instantiate a new object from the input and hand back its sequences.
  new(i).sequences?
end
1191
+
1192
+ # ========================================================================= #
1193
+ # === type?
1194
+ # ========================================================================= #
1195
def type?
  # The polypeptide check takes precedence over the polynucleotide check.
  return :protein    if is_the_sequence_a_polypeptide?
  return :dna_or_rna if is_this_sequence_a_polynucleotide_sequence?
  :unknown
end
1204
+
1205
+ # ========================================================================= #
1206
+ # === is_the_sequence_a_polypeptide?
1207
+ #
1208
+ # This method can be used to determine whether a given input sequence
1209
+ # is a polypeptide (aka a protein) or whether it is not.
1210
+ #
1211
+ # If this sequence is a polypeptide then this method will return true.
1212
+ # Otherwise false will be returned.
1213
+ # ========================================================================= #
1214
def is_the_sequence_a_polypeptide?(
    i = main_sequence?
  )
  # ======================================================================= #
  # Without any sequence at hand (nil, for instance when no entry has been
  # stored yet) we can not have a polypeptide. Return false rather than
  # fail with a NoMethodError.
  # ======================================================================= #
  return false if i.nil?
  # ======================================================================= #
  # Look at the first 120 positions to determine whether this is a protein
  # or a nucleotide sequence.
  # ======================================================================= #
  subsequence = i[0 .. 119] # Index 119 is the 120th position.
  # ======================================================================= #
  # Every uppercase letter other than A, C, G, N, T and U is treated as
  # evidence for a protein. The check is case-sensitive.
  #
  # NOTE(review): several of these letters (e. g. R, Y, K, M, S, W) are
  # also IUPAC ambiguity codes for nucleotides - confirm whether such
  # input should really be reported as a protein.
  # ======================================================================= #
  subsequence.count('BDEFHIJKLMOPQRSVWXYZ') > 0
end; alias is_protein?   is_the_sequence_a_polypeptide? # === is_protein?
     alias is_a_protein? is_the_sequence_a_polypeptide? # === is_a_protein?
1248
+
1249
+ # ========================================================================= #
1250
+ # === main_sequence?
1251
+ #
1252
+ # This will always return the first entry.
1253
+ # ========================================================================= #
1254
def main_sequence?
  # The value of the first entry in @hash; nil if @hash is empty.
  @hash.each_value.first
end
1257
+
1258
+ # ========================================================================= #
1259
+ # === gc_content?
1260
+ # ========================================================================= #
1261
def gc_content?
  raw_result = ::Bioroebe.gc_content(main_sequence?)
  raw_result.to_f # Callers always receive a Float.
end; alias gc_content gc_content? # === gc_content
1264
+
1265
+ # ========================================================================= #
1266
+ # === sequence_object
1267
+ #
1268
+ # This method will return a Sequence object.
1269
+ #
1270
+ # Usage example:
1271
+ #
1272
+ # x = Bioroebe.parse_fasta 'ls_orchid.fasta'
1273
+ # y = x.sequence_object # y is now an instance of Bioroebe::Sequence
1274
+ #
1275
+ # ========================================================================= #
1276
def sequence_object
  # Wrap the main (first) sequence into a Bioroebe::Sequence instance.
  raw_sequence = main_sequence?
  ::Bioroebe::Sequence.new(raw_sequence)
end
1279
+
1280
+ # ========================================================================= #
1281
+ # === show_help (help tag)
1282
+ #
1283
+ # This method will inform the user how this class may be used from the
1284
+ # commandline.
1285
+ #
1286
+ # Invocation example:
1287
+ #
1288
+ # pfasta --help
1289
+ #
1290
+ # ========================================================================= #
1291
def show_help
  # ======================================================================= #
  # All help lines, in the order in which they will be displayed.
  # ======================================================================= #
  help_lines = [
    ' --size',
    ' --also-show-the-sequence',
    ' --header # show the header as well (normally the header is not shown)',
    ' --limit=1000 # limit to show only the first 1000 nucleotides; use',
    ' # any number that you need here',
    ' --one-line # show the sequence on one line only, e. g. all newlines',
    ' # were removed',
    ' --toprotein # show the protein sequence as well (assumes DNA or RNA',
    ' # .fasta file)',
    ' --convert # alias to the above ^^^',
    ' --translate # alias to the above ^^^'
  ]
  e
  help_lines.each { |help_line| eparse help_line }
  e
end
1310
+
1311
+ # ========================================================================= #
1312
+ # === run (run tag)
1313
+ # ========================================================================= #
1314
def run
  menu
  do_process_the_commandline_arguments_that_are_files
  # Saving is optional and happens only when save_the_file? says so.
  if save_the_file?
    do_save_the_file
  end
end
1319
+
1320
+ end
1321
+
1322
+ Fasta = ParseFasta # Add an "alias" constant to class ParseFasta.
1323
+
1324
+ # =========================================================================== #
1325
+ # === Bioroebe.parse_fasta_quietly
1326
+ #
1327
+ # As Bioroebe.parse_fasta(), but will work quietly.
1328
+ # =========================================================================== #
1329
def self.parse_fasta_quietly(
    i, use_colours = true
  )
  # The block's return value, :be_quiet, is handed on by parse_fasta().
  ::Bioroebe.parse_fasta(i, use_colours) do
    :be_quiet
  end
end
1334
+
1335
+ # =========================================================================== #
1336
+ # === Bioroebe.return_fasta_entry_with_the_highest_gc_content
1337
+ #
1338
+ # The first argument should be a locally existing FASTA file that
1339
+ # contains different sequences.
1340
+ #
1341
+ # Usage example:
1342
+ #
1343
+ # x = Bioroebe.return_fasta_entry_with_the_highest_gc_content('/rosalind_gc.txt')
1344
+ #
1345
+ # =========================================================================== #
1346
def self.return_fasta_entry_with_the_highest_gc_content(this_fasta_file)
  # ========================================================================= #
  # Notify the user when the file is missing; the return value is then
  # whatever erev() returned.
  # ========================================================================= #
  unless File.exist?(this_fasta_file)
    return erev("No file exists at #{sfile(this_fasta_file)}#{rev}.")
  end
  parsed = parse_fasta(File.read(this_fasta_file)) { :be_quiet }
  gc_per_entry = parsed.hash?
  # ========================================================================= #
  # Replace every sequence with its GC content (as Float), in place.
  # ========================================================================= #
  gc_per_entry.transform_values! do |nucleotides|
    ::Bioroebe.gc_content(nucleotides).to_f
  end
  gc_per_entry.max_by { |_id, gc_value| gc_value }
end
1359
+
1360
+ # =========================================================================== #
1361
+ # === Bioroebe.sizeseq
1362
+ #
1363
+ # This method will "size-sequence compare", typically on a .fasta file.
1364
+ # =========================================================================== #
1365
def self.sizeseq(i)
  i = i.first if i.is_a?(Array) # Only the first element is used.
  parsed = Bioroebe.parse_fasta(i) { :be_quiet }
  parsed.do_sort_by_size
end
1372
+
1373
+ # =========================================================================== #
1374
+ # === Bioroebe.return_sizeseq
1375
+ #
1376
+ # This is as Bioroebe.sizeseq(), but it will just return the result,
1377
+ # rather than output it.
1378
+ # =========================================================================== #
1379
def self.return_sizeseq(i)
  i = i.first if i.is_a?(Array) # Only the first element is used.
  size_sorted = Bioroebe.parse_fasta(i) { :be_quiet }.return_size_sorted_hash
  report = ''.dup
  # ========================================================================= #
  # Each entry yields one header line and the sequence itself, followed
  # by an empty line.
  # ========================================================================= #
  size_sorted.each_pair do |id, residues|
    length = residues.size.to_s
    report << '> ID ' + length + ' AA.; DE: ' + id.to_s + ' SQ ' + length + ' AA' + N
    report << residues + N + N
  end
  report
end
1393
+
1394
+ # =========================================================================== #
1395
+ # === Bioroebe.genbank_to_fasta
1396
+ #
1397
+ # This method will convert from a genbank file, to a .fasta file.
1398
+ #
1399
+ # Invocation example:
1400
+ #
1401
+ # Bioroebe.genbank_to_fasta('/home/x/DATA/PROGRAMMING_LANGUAGES/RUBY/src/bioroebe/lib/bioroebe/data/genbank/sample_file.genbank')
1402
+ #
1403
+ # =========================================================================== #
1404
def self.genbank_to_fasta(
    this_file,
    be_verbose = :be_verbose
  )
  # ========================================================================= #
  # Only the Symbol :be_quiet is translated (into false); any other value
  # is passed on unchanged.
  # ========================================================================= #
  be_verbose = false if be_verbose == :be_quiet
  this_file = this_file.first if this_file.is_a?(Array)
  parser =
    if File.exist?(this_file)
      Bioroebe::ParseFasta.new(this_file) { :be_quiet }
    else
      fallback = Bioroebe::ParseFasta.new(:do_not_run_yet) { :be_quiet }
      fallback.set_data # This will use the default file.
      fallback.split_into_proper_sections
      fallback
    end
  parser.save_into_a_fasta_file(be_verbose) # The result is the file path.
end
1425
+
1426
+ # =========================================================================== #
1427
+ # === Bioroebe.parse_fasta_file
1428
+ # =========================================================================== #
1429
def self.parse_fasta_file(
    i           = ARGV,
    use_colours = true
  )
  # This variant always passes be_verbose: false to ParseFasta.
  options = { use_colours: use_colours, be_verbose: false }
  ParseFasta.new(i) { options }
end; self.instance_eval { alias fasta_file parse_fasta_file } # === Bioroebe.fasta_file
1439
+
1440
+ # =========================================================================== #
1441
+ # === Bioroebe.parse_fasta
1442
+ #
1443
+ # Easier reader-method for .fasta files.
1444
+ #
1445
+ # The second argument determines whether we will use colours or whether
1445
+ # we will not. For now, the default is to use colours when we use
1446
+ # this particular class method.
1448
+ #
1449
+ # Invocation examples:
1450
+ #
1451
+ # x = Bioroebe.parse_fasta('/rosalind_gc.txt')
1452
+ # hash = Bioroebe.parse_fasta('/rosalind_gc.txt').hash?
1453
+ #
1454
+ # =========================================================================== #
1455
def self.parse_fasta(
    i,
    use_colours = true
  )
  options = { use_colours: use_colours }
  # ========================================================================= #
  # A block, if given, supplies the value for the :be_verbose key.
  # ========================================================================= #
  options[:be_verbose] = yield if block_given?
  ::Bioroebe::ParseFasta.new(i) { options }
end; self.instance_eval { alias fasta parse_fasta } # === Bioroebe.fasta
1470
+
1471
+ end
14
1472
 
15
1473
  if __FILE__ == $PROGRAM_NAME
16
1474
  Bioroebe::ParseFasta.new(ARGV) { :sizeseq }
@@ -24,4 +1482,4 @@ end # corefasta globins.fasta
24
1482
  # pfasta /GC.txt
25
1483
  # pfasta 013521.3_289_aa.fasta --also-show-the-sequence
26
1484
  # pfasta $RSRC/bioroebe/lib/bioroebe/data/GFP_mutant_3_coding_sequence.fasta --also-show-the-sequence
27
- # corefasta $J/globins.fasta
1485
+ # corefasta $J/globins.fasta