seqtrimnext 2.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +114 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +159 -0
  5. data/Rakefile +38 -0
  6. data/bin/create_graphs.rb +46 -0
  7. data/bin/extract_seqs.rb +45 -0
  8. data/bin/extract_seqs_from_fasta.rb +56 -0
  9. data/bin/extract_seqs_from_fastq.rb +45 -0
  10. data/bin/fasta2fastq.rb +38 -0
  11. data/bin/fastq2fasta.rb +35 -0
  12. data/bin/gen_qual.rb +46 -0
  13. data/bin/get_seq.rb +46 -0
  14. data/bin/group_by_range.rb +17 -0
  15. data/bin/join_ilumina_paired.rb +130 -0
  16. data/bin/parse_amplicons.rb +95 -0
  17. data/bin/parse_json_results.rb +66 -0
  18. data/bin/parse_params.rb +82 -0
  19. data/bin/resume_clusters.rb +48 -0
  20. data/bin/resume_rejected.sh +9 -0
  21. data/bin/reverse_paired.rb +49 -0
  22. data/bin/seqtrimnext +368 -0
  23. data/bin/split_fastq.rb +42 -0
  24. data/bin/split_ilumina_paired.rb +65 -0
  25. data/bin/split_paired.rb +70 -0
  26. data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
  27. data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
  28. data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
  29. data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
  30. data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
  31. data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
  32. data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
  33. data/lib/seqtrimnext/actions/action_insert.rb +32 -0
  34. data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
  35. data/lib/seqtrimnext/actions/action_key.rb +30 -0
  36. data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
  37. data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
  38. data/lib/seqtrimnext/actions/action_linker.rb +30 -0
  39. data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
  40. data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
  41. data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
  42. data/lib/seqtrimnext/actions/action_mid.rb +30 -0
  43. data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
  44. data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
  45. data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
  46. data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
  47. data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
  48. data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
  49. data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
  50. data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
  51. data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
  52. data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
  53. data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
  54. data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
  55. data/lib/seqtrimnext/classes/action_manager.rb +47 -0
  56. data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
  57. data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
  58. data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
  59. data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
  60. data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
  61. data/lib/seqtrimnext/classes/install_database.rb +43 -0
  62. data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
  63. data/lib/seqtrimnext/classes/list_db.rb +49 -0
  64. data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
  65. data/lib/seqtrimnext/classes/one_blast.rb +41 -0
  66. data/lib/seqtrimnext/classes/params.rb +387 -0
  67. data/lib/seqtrimnext/classes/piro.rb +78 -0
  68. data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
  69. data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
  70. data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
  71. data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
  72. data/lib/seqtrimnext/classes/sequence.rb +55 -0
  73. data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
  74. data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
  75. data/lib/seqtrimnext/plugins/plugin.rb +267 -0
  76. data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
  77. data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
  78. data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
  79. data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
  80. data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
  81. data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
  82. data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
  83. data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
  84. data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
  85. data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
  86. data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
  87. data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
  88. data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
  89. data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
  90. data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
  91. data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
  92. data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
  93. data/lib/seqtrimnext/templates/amplicons.txt +16 -0
  94. data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
  95. data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
  96. data/lib/seqtrimnext/templates/low_quality.txt +5 -0
  97. data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
  98. data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
  99. data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
  100. data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
  101. data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
  102. data/lib/seqtrimnext/utils/global_match.rb +65 -0
  103. data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
  104. data/lib/seqtrimnext/utils/json_utils.rb +50 -0
  105. data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
  106. data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
  107. data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
  108. data/lib/seqtrimnext/utils/string_utils.rb +56 -0
  109. data/lib/seqtrimnext.rb +37 -0
  110. data/script/console +10 -0
  111. data/script/destroy +14 -0
  112. data/script/generate +14 -0
  113. data/test/test_helper.rb +3 -0
  114. data/test/test_seqtrimnext.rb +11 -0
  115. metadata +318 -0
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
# Echo every line of the file named on the command line to standard output.
#
# Usage: <script> FASTA
if ARGV.count != 1
  puts "#{$0} FASTA "
  exit
end

file = ARGV.shift

# File.foreach closes the handle as soon as iteration ends; the previous
# File.open without a block kept it open until the process exited.
File.foreach(file) do |line|
  puts line
end
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_fastq'
4
+
5
# When true, every pairing decision is traced on standard output.
VERBOSE = false

# Either the three mandatory arguments or all five must be given.
unless [3, 5].include?(ARGV.count)
  puts "Usage: #{$0} paired1 paired2 output_base_name [paired1_tag paired2_tag]"

  exit
end

p1_path = ARGV[0]
p2_path = ARGV[1]
output_base_name = ARGV[2]

# Default read-name suffixes for each mate; replaced by the optional
# fourth and fifth arguments when they are supplied.
paired1_tag = '/1'
paired2_tag = '/2'

if ARGV.count == 5
  paired1_tag = ARGV[3]
  paired2_tag = ARGV[4]
end

# The tags are quoted so they match literally, and only at the end of a name.
PAIRED1_TAG_RE = /#{Regexp.quote(paired1_tag)}$/
PAIRED2_TAG_RE = /#{Regexp.quote(paired2_tag)}$/

# File.exist? is used because File.exists? was removed in Ruby 3.2.
[p1_path, p2_path].each do |path|
  next if File.exist?(path)

  puts "File #{path} doesn't exists"
  exit
end
39
+
40
# Load every record of a FASTQ file into memory, keyed by read name.
#
# The suffix matched by PAIRED2_TAG_RE is stripped from each name before it
# is used as a key, so a later record with the same stripped name replaces
# an earlier one. A progress line is printed every 10000 records.
#
# file - path handed to FastqFile.new (opened with 'r', :sanger, true;
#        NOTE(review): the meaning of the trailing `true` is defined by
#        scbi_fastq - confirm there).
#
# Returns a Hash of name => [f, q, c], the three remaining values yielded by
# FastqFile#each (presumably sequence, qualities and comments - confirm).
def read_to_file(file)
  res = {}
  f_file = FastqFile.new(file, 'r', :sanger, true)

  begin
    f_file.each do |n, f, q, c|
      res[n.gsub(PAIRED2_TAG_RE, '')] = [f, q, c]

      puts "Loading: #{f_file.num_seqs}" if (f_file.num_seqs % 10000) == 0
    end
  ensure
    # Release the reader even when iteration raises part-way through.
    f_file.close
  end

  res
end
59
+
60
+
61
+
62
# The first file is streamed; the second is held in memory keyed by read name
# so that each read of the first file can look up its mate directly.
reads1 = FastqFile.new(p1_path, 'r', :sanger, true)

mates = read_to_file(p2_path)

puts "Sequences from #{p2_path} loaded. Total: #{mates.count}"

unpaired_out = FastqFile.new("#{output_base_name}_normal.fastq", 'w', :sanger, true)
both_out     = FastqFile.new("#{output_base_name}_all_paired.fastq", 'w', :sanger, true)
first_out    = FastqFile.new("#{output_base_name}_paired1.fastq", 'w', :sanger, true)
second_out   = FastqFile.new("#{output_base_name}_paired2.fastq", 'w', :sanger, true)

reads1.each do |n1, f1, q1, c1|
  # Strip the mate suffix in place so the bare name can be looked up.
  n1.gsub!(PAIRED1_TAG_RE, '')
  puts "Find #{n1}" if VERBOSE

  mate = mates[n1]

  unless mate
    puts " ===> NOT PAIRED #{n1}" if VERBOSE
    unpaired_out.write_seq("#{n1}#{paired1_tag}", f1, q1, c1)
  else
    f2, q2, c2 = mate
    puts " ===> PAIRED #{n1}" if VERBOSE

    # Each mate goes to the combined file and to its own per-mate file.
    both_out.write_seq("#{n1}#{paired1_tag}", f1, q1, c1)
    first_out.write_seq("#{n1}#{paired1_tag}", f1, q1, c1)

    both_out.write_seq("#{n1}#{paired2_tag}", f2, q2, c2)
    second_out.write_seq("#{n1}#{paired2_tag}", f2, q2, c2)

    # Matched mates are removed so only orphans remain afterwards.
    mates.delete(n1)
  end

  puts reads1.num_seqs if (reads1.num_seqs % 10000) == 0
end

# Whatever is still in the hash had no partner in the first file.
mates.each do |orphan_name, (f2, q2, c2)|
  unpaired_out.write_seq("#{orphan_name}#{paired2_tag}", f2, q2, c2)
end

[reads1, unpaired_out, both_out, first_out, second_out].each(&:close)
127
+
128
+
129
+
130
+
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+ require 'scbi_fastq'
5
+
6
# Exactly two arguments are consumed further down (the input file and the
# repetition threshold). The body of this check used to be empty, so a wrong
# invocation carried on with nil arguments instead of reporting the usage.
if ARGV.count != 2
  puts "Usage: #{$0} input_file min_repetitions"
  exit
end
10
+
11
+
12
+ # >Cluster 0
13
+ # 0 216aa, >E9LAHD006DQKVK... *
14
+ # >Cluster 1
15
+ # 0 203aa, >E9LAHD006DODWR... *
16
+ # >Cluster 2
17
+ # 0 198aa, >E9LAHD006DQCDS... *
18
+ # >Cluster 3
19
+ # 0 195aa, >E9LAHD006DQURO... *
20
+ # 1 172aa, >E9LAHD006DOSHR... at 93.02%
21
+ # 2 172aa, >E9LAHD006DSV4P... at 93.02%
22
+ # 3 172aa, >E9LAHD006DI00Q... at 93.02%
23
+ # 4 172aa, >E9LAHD006DR7MR... at 93.02%
24
+ # 5 175aa, >E9LAHD006DTDA7... at 90.86%
25
+ # 6 172aa, >E9LAHD006DVCR3... at 93.02%
26
+ # 7 172aa, >E9LAHD006DHY3H... at 93.02%
27
+ # 8 177aa, >E9LAHD006DI52X... at 90.96%
28
+
29
+
30
+
31
# Collect the sequence names that belong to small clusters of a cluster
# listing in the format shown in the comment block above.
#
# A line starting with ">Cluster" opens a new cluster; any other line
# containing ">NAME... " contributes NAME to the current cluster. Clusters
# with no members are ignored.
#
# file_path       - path of the cluster listing
# min_repetitions - a cluster is selected when its member count is less than
#                   or equal to this value
#
# Returns an Array with the member names of every selected cluster, in file
# order; an empty Array when the file does not exist.
def load_repeated_seqs(file_path, min_repetitions)
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  return [] unless File.exist?(file_path)

  # Start with one open group so members seen before any header are kept.
  groups = [[]]

  # File.foreach closes the handle itself; File.open(...).each_line leaked it.
  File.foreach(file_path) do |line|
    if line =~ /^>Cluster/
      groups << []
    elsif (member = line.match(/>([^\.]+)\.\.\.\s/))
      groups.last << member[1]
    end
  end

  groups.select { |g| !g.empty? && g.count <= min_repetitions }.flatten
end
67
+
68
+
69
# Copy a FASTQ file, leaving out the reads whose name is listed in
# +singletons+. The copy is written next to the input, with the suffix
# "_without_singletons" appended to the input path.
#
# input_file_path - path of the FASTQ file to filter
# singletons      - enumerable of read names to drop
#
# The return value is not meaningful.
def remove_singletons_from_file(input_file_path, singletons)
  # Index the names once: Array#include? on every read made the filter
  # quadratic in the size of the input.
  discard = {}
  singletons.each { |name| discard[name] = true }

  fqr = FastqFile.new(input_file_path)
  begin
    out = FastqFile.new(input_file_path + '_without_singletons', 'w+')
    begin
      fqr.each do |n, f, q, c|
        out.write_seq(n, f, q, c) unless discard.key?(n)
      end
    ensure
      # Close the output even when reading or writing fails part-way.
      out.close
    end
  ensure
    fqr.close
  end
end
85
+
86
input_file_path = ARGV.shift
min_repetitions = ARGV.shift.to_i

# Run cd-hit with an argument vector rather than an interpolated shell
# string, so spaces or shell metacharacters in the path are passed through
# literally. Standard output is read and discarded, as the backticks did.
IO.popen(['cd-hit', '-i', input_file_path, '-o', 'clusters'], &:read)

# cd-hit writes its cluster listing to "<output>.clstr"; the previous
# "clusters.clrs" name never existed, so no read was ever filtered out.
singletons = load_repeated_seqs('clusters.clstr', min_repetitions)

remove_singletons_from_file(input_file_path, singletons)
94
+
95
+ # puts singletons.to_json
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'yajl'
4
+ require 'json'
5
+
6
# Count, in a JSON results file, how many sequences carry each of the actions
# named on the command line and how many carry all of them.
#
# Usage: <script> results.json action1 [action] ...
file = ARGV.shift
actions = ARGV

if file.nil? || actions.empty?
  # "#{$0}" is required here: a bare $0 inside a string literal is printed
  # verbatim, so the usage line used to show "$0" instead of the script name.
  puts "\nUsage: #{$0} results.json action1 [action] [action] [action] ...\n\n"
  exit(0)
end

total = 0
count = 0
separate_count = {}
actions.each { |a| separate_count[a] = 0 }
all_actions = []

# The block form closes the input once parsing is done (or fails).
File.open(file, 'r') do |json|
  puts "Counting sequences with these actions: #{actions.join(",")}"
  puts ""

  # The block is invoked once per object parsed from the input; each object
  # is read as seq['actions'], a list of hashes with a 'type' entry.
  Yajl::Parser.parse(json) do |seq|
    total += 1
    action_names = seq['actions'].map { |a| a['type'] }

    # A sequence qualifies when no requested action is missing. Comparing
    # counts could never succeed if an action was given twice.
    count += 1 if (actions - action_names).empty?

    action_names.each do |a|
      separate_count[a] += 1 if actions.include?(a)
    end

    all_actions |= action_names
  end
end

puts "="*20 + "Separate count" + "="*20
separate_count.each do |k, v|
  puts "#{k} = #{v}"
end
puts "="*20 + "Summarized" + "="*20

puts "Number of sequences with all actions: #{count}"
puts "Total sequences: #{total}"

puts "\n"
puts "="*20 + "Other used actions" + "="*20
puts((all_actions - actions).join(','))
65
+
66
+
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+
5
# Read a JSON document from disk and decode it.
#
# file_path - path of the JSON file
#
# Returns the Ruby object produced by JSON.parse.
# Raises Errno::ENOENT when the file is missing and JSON::ParserError when
# its content is not valid JSON.
def get_json_data(file_path)
  # File.read opens, reads and closes in one call, so the handle cannot be
  # left open when reading fails.
  JSON.parse(File.read(file_path))
end
20
+
21
+
22
+ # extract params loading to external file in ingebiol
23
+
24
# Maps a key of the input JSON document to the parameter name that is
# written to the seqtrim parameters file.
params = {
  'vector_db_field'              => 'vectors_db',
  'contaminants_db_field'        => 'contaminants_db',
  'species_field'                => 'genus',
  'min_insert_size_field'        => 'min_insert_size_trimmed',
  'min_paired_insert_size_field' => 'min_insert_size_paired',
  'min_quality_value_field'      => 'min_quality'
}

if ARGV.count != 2
  puts "#{$0} ingebiol_params_file.json seqtrim_params_file"
  exit(-1)
end

input_file = ARGV[0]
params_file = ARGV[1]

# File.exist? is used because File.exists? was removed in Ruby 3.2.
[input_file, params_file].each do |path|
  next if File.exist?(path)

  puts "File #{path} doesn't exists"
  exit(-1)
end

data = get_json_data(input_file)

# Append one "name=value" line per recognised, non-empty JSON entry. The
# block form closes the file; the read handle that used to be opened first
# and then overwritten (never closed) is gone.
File.open(params_file, 'a+') do |sq_params|
  data.each do |k, v|
    sq_name = params[k]
    next unless sq_name && v

    # Numbers and booleans have no #empty?; only skip values that can be
    # empty and are, instead of raising NoMethodError on the rest.
    next if v.respond_to?(:empty?) && v.empty?

    sq_params.puts "#{sq_name}=#{v}"
  end
end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'json'
4
+
5
# List the largest clusters of a cluster listing file.
#
# Usage: <script> cluster.fasta.clstr COUNT
#
# For each cluster, the last member line ("<index> ... ><name>... ") supplies
# both the reported name and, as index + 1, the reported size.
if ARGV.count != 2
  puts "#{$0} cluster.fasta.clstr COUNT"
  exit
end

path = ARGV.shift
list_max = ARGV.shift.to_i

h = {}

# [name, size] taken from the most recent member line of the open cluster.
last_member = nil

# File.foreach closes the handle even when iteration raises.
File.foreach(path) do |line|
  if line =~ />Cluster/
    h[last_member[0]] = last_member[1] if last_member
    last_member = nil
  elsif line =~ /^([\d]+)\s[^>]*>([^\s]*)\.\.\.\s/
    last_member = [$2, $1.to_i + 1]
  end
end

# The final cluster has no following ">Cluster" header, so it must be
# recorded here; it used to be dropped.
h[last_member[0]] = last_member[1] if last_member

# Largest first; equal sizes keep the order in which they were recorded,
# which is the order the repeated max_by/delete loop produced. A single sort
# replaces that quadratic loop. A negative COUNT prints nothing.
ranked = h.each_with_index.sort_by { |(_name, size), idx| [-size, idx] }
ranked.first([list_max, 0].max).each do |(name, size), _idx|
  puts "#{name} => #{size}"
end
46
+
47
+
48
+ # puts h.sort.to_json
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+
3
# Summarise a seqtrim rejected-sequences file: keep space-separated fields
# 2-20 of every line and print each distinct remainder with its count.
#
# "$1" is quoted so that a missing argument (or a path containing spaces) is
# tested as a single word; unquoted, an empty $1 made the test succeed and
# the pipeline then waited on standard input.
if [ ! -f "$1" ]; then
  echo "You must specify a file with seqtrim's rejected sequences"
  echo "Usage $0 rejected_seqtrim_file";
  exit;
fi

cut -d ' ' -f 2-20 "$1" | sort | uniq -c;
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_fasta'
4
+
5
# Rewrite a FASTA/QUAL pair: records whose name contains "dir=F" are written
# with the sequence reversed and complemented and the quality list reversed;
# all other records are copied unchanged. The record count is printed last.
#
# Usage: <script> fasta qual output_base_name
unless ARGV.count == 3
  puts "Usage: #{$0} fasta qual output_base_name"
  exit
end

fasta_path, qual_path, name = ARGV

out_fasta = "#{name}.fasta"
out_qual  = "#{name}.fasta.qual"

puts "Opening #{fasta_path}, #{qual_path}"

fqr = FastaQualFile.new(fasta_path, qual_path, true)

out_f = File.new(out_fasta, 'w+')
out_q = File.new(out_qual, 'w+')

written = 0

fqr.each do |n, f, q|
  header = ">#{n}"
  out_f.puts header
  out_q.puts header

  flip = n.index('dir=F')

  # tr swaps each base for its complement after the string is reversed.
  out_f.puts(flip ? f.reverse.tr('actgACTG', 'tgacTGAC') : f)
  out_q.puts((flip ? q.reverse : q).join(' '))

  written += 1
end

puts written

fqr.close

out_f.close
out_q.close
49
+