seqtrimnext 2.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
# Prints every line of the file given as the single command-line argument.
# Usage: <script> FASTA
if ARGV.count != 1
  puts "#{$0} FASTA "
  exit
end

file = ARGV.shift

# File.foreach streams the file line by line and closes the handle when done
# (the previous File.open handle was never closed).
File.foreach(file) do |line|
  puts line
end
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_fastq'
4
+
5
# When true, the pairing decision taken for every read is printed.
VERBOSE = false

# Either three arguments (default tags) or five (custom tags) are accepted.
unless [3, 5].include?(ARGV.count)
  puts "Usage: #{$0} paired1 paired2 output_base_name [paired1_tag paired2_tag]"

  exit
end

p1_path = ARGV[0]
p2_path = ARGV[1]
output_base_name = ARGV[2]

# Suffixes appended to read names to tell both mates apart; the optional
# fourth and fifth arguments override them.
paired1_tag = '/1'
paired2_tag = '/2'

if ARGV.count == 5
  paired1_tag = ARGV[3]
  paired2_tag = ARGV[4]
end

# Patterns matching each tag only at the end of a read name.
PAIRED1_TAG_RE = /#{Regexp.quote(paired1_tag)}$/
PAIRED2_TAG_RE = /#{Regexp.quote(paired2_tag)}$/

# File.exists? was removed in Ruby 3.2; File.exist? is available everywhere.
[p1_path, p2_path].each do |path|
  next if File.exist?(path)

  puts "File #{path} doesn't exists"
  exit
end
39
+
40
# Loads a whole FASTQ file into a hash.
#
# Keys are the read names with the trailing PAIRED2_TAG_RE match removed;
# values are the three remaining fields yielded by FastqFile#each for that
# read (presumably sequence, qualities and comments -- confirm against
# scbi_fastq). A progress line is printed every 10000 reads.
#
# file - path of the FASTQ file to load.
#
# Returns the populated Hash.
def read_to_file(file)
  reader = FastqFile.new(file, 'r', :sanger, true)
  reads_by_name = {}

  reader.each do |name, fasta, qual, comments|
    key = name.gsub(PAIRED2_TAG_RE, '')
    reads_by_name[key] = [fasta, qual, comments]

    puts "Loading: #{reader.num_seqs}" if (reader.num_seqs % 10000).zero?
  end

  reader.close

  reads_by_name
end
59
+
60
+
61
+
62
# The first file is streamed; the second one is held in memory, indexed by
# read name, so the mate of each read can be looked up directly.
first_mates = FastqFile.new(p1_path, 'r', :sanger, true)
second_mates = read_to_file(p2_path)

puts "Sequences from #{p2_path} loaded. Total: #{second_mates.count}"

normal_out = FastqFile.new("#{output_base_name}_normal.fastq", 'w', :sanger, true)
paired_out = FastqFile.new("#{output_base_name}_all_paired.fastq", 'w', :sanger, true)
paired1_out = FastqFile.new("#{output_base_name}_paired1.fastq", 'w', :sanger, true)
paired2_out = FastqFile.new("#{output_base_name}_paired2.fastq", 'w', :sanger, true)

first_mates.each do |name, fasta, qual, comments|
  name.gsub!(PAIRED1_TAG_RE, '')
  puts "Find #{name}" if VERBOSE

  # Taking the mate out of the hash leaves only unmatched reads behind.
  mate = second_mates.delete(name)

  if mate
    mate_fasta, mate_qual, mate_comments = mate
    puts " ===> PAIRED #{name}" if VERBOSE

    first_name = name + paired1_tag
    second_name = name + paired2_tag

    paired_out.write_seq(first_name, fasta, qual, comments)
    paired1_out.write_seq(first_name, fasta, qual, comments)

    paired_out.write_seq(second_name, mate_fasta, mate_qual, mate_comments)
    paired2_out.write_seq(second_name, mate_fasta, mate_qual, mate_comments)
  else
    puts " ===> NOT PAIRED #{name}" if VERBOSE
    normal_out.write_seq(name + paired1_tag, fasta, qual, comments)
  end

  puts first_mates.num_seqs if (first_mates.num_seqs % 10000).zero?
end

# Reads of the second file that never found a mate go to the unpaired output.
second_mates.each do |name, (fasta, qual, comments)|
  normal_out.write_seq(name + paired2_tag, fasta, qual, comments)
end

first_mates.close

normal_out.close
paired_out.close
paired1_out.close
paired2_out.close
127
+
128
+
129
+
130
+
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+ require 'scbi_fastq'
5
+
6
# Two arguments are required: the input file and the cluster-size threshold.
# The original check had an empty body, so a wrong invocation was not reported.
if ARGV.count != 2
  puts "Usage: #{$0} input_file min_repetitions"
  exit
end
10
+
11
+
12
+ # >Cluster 0
13
+ # 0 216aa, >E9LAHD006DQKVK... *
14
+ # >Cluster 1
15
+ # 0 203aa, >E9LAHD006DODWR... *
16
+ # >Cluster 2
17
+ # 0 198aa, >E9LAHD006DQCDS... *
18
+ # >Cluster 3
19
+ # 0 195aa, >E9LAHD006DQURO... *
20
+ # 1 172aa, >E9LAHD006DOSHR... at 93.02%
21
+ # 2 172aa, >E9LAHD006DSV4P... at 93.02%
22
+ # 3 172aa, >E9LAHD006DI00Q... at 93.02%
23
+ # 4 172aa, >E9LAHD006DR7MR... at 93.02%
24
+ # 5 175aa, >E9LAHD006DTDA7... at 90.86%
25
+ # 6 172aa, >E9LAHD006DVCR3... at 93.02%
26
+ # 7 172aa, >E9LAHD006DHY3H... at 93.02%
27
+ # 8 177aa, >E9LAHD006DI52X... at 90.96%
28
+
29
+
30
+
31
# Collects the sequence names that belong to small clusters of a cluster
# listing file.
#
# A cluster starts at a line beginning with ">Cluster"; member names are taken
# from the text between ">" and "..." on the following lines.
#
# file_path       - path of the cluster listing.
# min_repetitions - clusters with at most this many members are selected.
#
# Returns an Array with the member names of every selected cluster, in file
# order. The Array is empty when the file does not exist.
def load_repeated_seqs(file_path,min_repetitions)
  clusters = []

  # File.exists? was removed in Ruby 3.2.
  return clusters unless File.exist?(file_path)

  selected = ->(members) { !members.empty? && members.count <= min_repetitions }
  current_cluster = []

  # File.foreach closes the file once the last line has been read.
  File.foreach(file_path) do |line|
    if line =~ /^>Cluster/
      clusters.concat(current_cluster) if selected.call(current_cluster)
      current_cluster = []
    elsif line =~ />([^\.]+)\.\.\.\s/
      current_cluster << Regexp.last_match(1)
    end
  end

  # The last cluster has no following header to trigger its evaluation.
  clusters.concat(current_cluster) if selected.call(current_cluster)

  clusters
end
67
+
68
+
69
# Writes a copy of a FASTQ file without the listed reads.
#
# The copy is created next to the input, with "_without_singletons" appended
# to its path.
#
# input_file_path - path of the FASTQ file to filter.
# singletons      - collection of read names to leave out.
def remove_singletons_from_file(input_file_path,singletons)
  # Index the names once: a hash lookup per read replaces a linear scan of
  # the whole list for every read.
  excluded = {}
  singletons.each { |name| excluded[name] = true }

  fqr = FastqFile.new(input_file_path)
  out = FastqFile.new(input_file_path+'_without_singletons','w+')

  fqr.each do |n,f,q,c|
    out.write_seq(n,f,q,c) unless excluded.key?(n)
  end

  out.close
  fqr.close
end
85
+
86
input_file_path = ARGV.shift
min_repetitions = ARGV.shift.to_i

# Run cd-hit with an argument list so the input path never goes through a
# shell (paths with spaces or metacharacters are passed verbatim). Its
# standard output is read and discarded, as before.
IO.popen(['cd-hit', '-i', input_file_path, '-o', 'clusters'], &:read)

# cd-hit stores the cluster listing in "<output>.clstr"; the previous
# 'clusters.clrs' name never existed, so no read was ever filtered out.
singletons = load_repeated_seqs('clusters.clstr',min_repetitions)

remove_singletons_from_file(input_file_path,singletons)
94
+
95
+ # puts singletons.to_json
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yajl'
4
+ require 'json'
5
+
6
# Counts, in a JSON results file, how many sequences carry the actions named
# on the command line.
# "#{$0}" is interpolated here; the previous text printed a literal "$0".
usage = "\nUsage: #{$0} results.json action1 [action] [action] [action] ...\n\n"

unless file = ARGV.shift
  puts usage
  exit(0)
end

actions = ARGV
if actions.empty?
  puts usage
  exit(0)
end

puts "Counting sequences with these actions: #{actions.join(",")}"
puts ""

total = 0
count = 0

# Per-action counters, one for each requested action.
separate_count = {}
actions.each { |a| separate_count[a] = 0 }

all_actions = []

# The block form closes the input file once parsing has finished.
File.open(file, 'r') do |json|
  Yajl::Parser.parse(json) do |seq|
    total += 1
    # A sequence without an 'actions' entry is treated as having none.
    action_names = (seq['actions'] || []).map { |a| a['type'] }

    # Every requested action must be present; unlike comparing counts, this
    # still works when an action is named twice on the command line.
    count += 1 if (actions - action_names).empty?

    action_names.each do |a|
      separate_count[a] += 1 if actions.include?(a)
    end

    all_actions |= action_names
  end
end

puts "="*20 + "Separate count" + "="*20
separate_count.each do |k,v|
  puts "#{k} = #{v}"
end
puts "="*20 + "Summarized" + "="*20

puts "Number of sequences with all actions: #{count}"
puts "Total sequences: #{total}"

puts "\n"
puts "="*20 + "Other used actions" + "="*20
puts((all_actions - actions).join(','))
65
+
66
+
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+
5
# Reads a file and decodes its content as JSON.
#
# file_path - path of the JSON file.
#
# Returns the decoded data. Errors from opening the file or from JSON.parse
# are not rescued here.
def get_json_data(file_path)
  # File.read releases the handle even when reading fails.
  JSON.parse(File.read(file_path))
end
20
+
21
+
22
+ # extract params loading to external file in ingebiol
23
+
24
# Maps each key looked up in the input JSON to the parameter name written to
# the seqtrim params file.
params = {
  'vector_db_field' => 'vectors_db',
  'contaminants_db_field' => 'contaminants_db',
  'species_field' => 'genus',
  'min_insert_size_field' => 'min_insert_size_trimmed',
  'min_paired_insert_size_field' => 'min_insert_size_paired',
  'min_quality_value_field' => 'min_quality'
}

if ARGV.count != 2
  puts "#{$0} ingebiol_params_file.json seqtrim_params_file"
  exit(-1)
end

input_file = ARGV[0]
params_file = ARGV[1]

# File.exists? was removed in Ruby 3.2; File.exist? is available everywhere.
[input_file, params_file].each do |path|
  next if File.exist?(path)

  puts "File #{path} doesn't exists"
  exit(-1)
end

data = get_json_data(input_file)

# Append one "name=value" line per recognised, non-blank entry. The params
# file is opened only once, for appending; the earlier read-only handle was
# never used nor closed.
File.open(params_file, 'a+') do |sq_params|
  data.each do |k, v|
    sq_name = params[k]
    next unless sq_name && v

    # Numbers and booleans have no #empty?; only skip empty strings/collections.
    next if v.respond_to?(:empty?) && v.empty?

    sq_params.puts "#{sq_name}=#{v}"
  end
end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+
5
# Lists the largest clusters of a cluster listing file.
# Usage: <script> cluster.fasta.clstr COUNT
if ARGV.count != 2
  puts "#{$0} cluster.fasta.clstr COUNT"
  exit
end

path = ARGV.shift
list_max = ARGV.shift.to_i

# Cluster size keyed by the name found on the last member line of the cluster.
h = {}

# The last member line of a cluster starts with the highest zero-based member
# index, so the cluster size is that index plus one. Lines that are not member
# lines (headers, the initial empty string) do not match and are ignored.
record_cluster = lambda do |member_line|
  if member_line =~ /^([\d]+)\s[^>]*>([^\s]*)\.\.\.\s/
    h[Regexp.last_match(2)] = Regexp.last_match(1).to_i + 1
  end
end

last_line = ''

File.foreach(path) do |line|
  # A new header closes the previous cluster.
  record_cluster.call(last_line) if line =~ />Cluster/
  last_line = line
end

# The final cluster has no following header; without this call it was lost.
record_cluster.call(last_line)

# Largest clusters first; equal sizes keep the order they had in the file.
ranked = h.each_with_index.sort_by { |(_name, size), position| [-size, position] }

ranked.first(list_max).each do |(name, size), _position|
  puts "#{name} => #{size}"
end
46
+
47
+
48
+ # puts h.sort.to_json
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+
3
# Counts how often each distinct rejection text appears in the given file.
# "$1" is quoted: unquoted, a missing argument turned the test into
# `[ ! -f ]`, which is false, so the usage was skipped and the pipeline
# blocked reading standard input. Quoting also supports paths with spaces.
if [ ! -f "$1" ]; then
    echo "You must specify a file with seqtrim's rejected sequences"
    echo "Usage $0 rejected_seqtrim_file"
    exit 1
fi

# Keep space-separated fields 2 to 20 of every line (presumably everything
# after the sequence name -- confirm against the rejected file format), then
# count identical results.
cut -d ' ' -f 2-20 "$1" | sort | uniq -c
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_fasta'
4
+
5
# Copies a FASTA + QUAL pair to "<output_base_name>.fasta" and
# "<output_base_name>.fasta.qual". Entries whose name contains "dir=F" are
# written reversed, with a/c/g/t (either case) swapped for the complementary
# base and the quality values in reverse order; all other entries are copied
# unchanged. The number of entries processed is printed at the end.
unless ARGV.count == 3
  puts "Usage: #{$0} fasta qual output_base_name"
  exit
end

fasta_path, qual_path, name = ARGV

out_fasta = "#{name}.fasta"
out_qual = "#{name}.fasta.qual"

puts "Opening #{fasta_path}, #{qual_path}"

fqr = FastaQualFile.new(fasta_path, qual_path, true)

out_f = File.new(out_fasta, 'w+')
out_q = File.new(out_qual, 'w+')

c = 0

fqr.each do |seq_name, fasta, qual|
  header = ">#{seq_name}"
  out_f.puts header
  out_q.puts header

  if seq_name.index('dir=F')
    fasta = fasta.reverse.tr('actgACTG', 'tgacTGAC')
    qual = qual.reverse
  end

  out_f.puts fasta
  out_q.puts qual.join(' ')

  c += 1
end

puts c

fqr.close

out_f.close
out_q.close
49
+