RubyGems - seqtrimnext - Versions diffs - 2.0.29 - Mend

seqtrimnext 2.0.29

Files changed (115) hide show

data/History.txt +3 -0
data/Manifest.txt +114 -0
data/PostInstall.txt +7 -0
data/README.rdoc +159 -0
data/Rakefile +38 -0
data/bin/create_graphs.rb +46 -0
data/bin/extract_seqs.rb +45 -0
data/bin/extract_seqs_from_fasta.rb +56 -0
data/bin/extract_seqs_from_fastq.rb +45 -0
data/bin/fasta2fastq.rb +38 -0
data/bin/fastq2fasta.rb +35 -0
data/bin/gen_qual.rb +46 -0
data/bin/get_seq.rb +46 -0
data/bin/group_by_range.rb +17 -0
data/bin/join_ilumina_paired.rb +130 -0
data/bin/parse_amplicons.rb +95 -0
data/bin/parse_json_results.rb +66 -0
data/bin/parse_params.rb +82 -0
data/bin/resume_clusters.rb +48 -0
data/bin/resume_rejected.sh +9 -0
data/bin/reverse_paired.rb +49 -0
data/bin/seqtrimnext +368 -0
data/bin/split_fastq.rb +42 -0
data/bin/split_ilumina_paired.rb +65 -0
data/bin/split_paired.rb +70 -0
data/lib/seqtrimnext/actions/action_ab_adapter.rb +32 -0
data/lib/seqtrimnext/actions/action_ab_far_adapter.rb +32 -0
data/lib/seqtrimnext/actions/action_ab_left_adapter.rb +32 -0
data/lib/seqtrimnext/actions/action_empty_insert.rb +22 -0
data/lib/seqtrimnext/actions/action_ignore_repeated.rb +24 -0
data/lib/seqtrimnext/actions/action_indetermination.rb +30 -0
data/lib/seqtrimnext/actions/action_induced_low_complexity.rb +29 -0
data/lib/seqtrimnext/actions/action_insert.rb +32 -0
data/lib/seqtrimnext/actions/action_is_contaminated.rb +30 -0
data/lib/seqtrimnext/actions/action_key.rb +30 -0
data/lib/seqtrimnext/actions/action_left_adapter.rb +32 -0
data/lib/seqtrimnext/actions/action_left_primer.rb +17 -0
data/lib/seqtrimnext/actions/action_linker.rb +30 -0
data/lib/seqtrimnext/actions/action_low_complexity.rb +30 -0
data/lib/seqtrimnext/actions/action_low_high_size.rb +31 -0
data/lib/seqtrimnext/actions/action_low_quality.rb +33 -0
data/lib/seqtrimnext/actions/action_mid.rb +30 -0
data/lib/seqtrimnext/actions/action_multiple_linker.rb +29 -0
data/lib/seqtrimnext/actions/action_paired_reads.rb +28 -0
data/lib/seqtrimnext/actions/action_poly_a.rb +29 -0
data/lib/seqtrimnext/actions/action_poly_t.rb +29 -0
data/lib/seqtrimnext/actions/action_rem_adit_artifacts.rb +32 -0
data/lib/seqtrimnext/actions/action_right_adapter.rb +29 -0
data/lib/seqtrimnext/actions/action_right_primer.rb +25 -0
data/lib/seqtrimnext/actions/action_short_insert.rb +32 -0
data/lib/seqtrimnext/actions/action_unexpected_poly_t.rb +29 -0
data/lib/seqtrimnext/actions/action_unexpected_vector.rb +31 -0
data/lib/seqtrimnext/actions/action_vectors.rb +31 -0
data/lib/seqtrimnext/actions/seqtrim_action.rb +136 -0
data/lib/seqtrimnext/classes/action_manager.rb +47 -0
data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +335 -0
data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +290 -0
data/lib/seqtrimnext/classes/extract_stats.rb +255 -0
data/lib/seqtrimnext/classes/gnu_plot_graph.rb +140 -0
data/lib/seqtrimnext/classes/graph_stats.rb +74 -0
data/lib/seqtrimnext/classes/install_database.rb +43 -0
data/lib/seqtrimnext/classes/install_requirements.rb +123 -0
data/lib/seqtrimnext/classes/list_db.rb +49 -0
data/lib/seqtrimnext/classes/make_blast_db.rb +113 -0
data/lib/seqtrimnext/classes/one_blast.rb +41 -0
data/lib/seqtrimnext/classes/params.rb +387 -0
data/lib/seqtrimnext/classes/piro.rb +78 -0
data/lib/seqtrimnext/classes/plugin_manager.rb +153 -0
data/lib/seqtrimnext/classes/scan_for_restr_site.rb +138 -0
data/lib/seqtrimnext/classes/scbi_stats.rb +68 -0
data/lib/seqtrimnext/classes/seqtrim.rb +317 -0
data/lib/seqtrimnext/classes/sequence.rb +55 -0
data/lib/seqtrimnext/classes/sequence_group.rb +72 -0
data/lib/seqtrimnext/classes/sequence_with_action.rb +503 -0
data/lib/seqtrimnext/plugins/plugin.rb +267 -0
data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +189 -0
data/lib/seqtrimnext/plugins/plugin_adapters.rb +165 -0
data/lib/seqtrimnext/plugins/plugin_amplicons.rb +221 -0
data/lib/seqtrimnext/plugins/plugin_contaminants.rb +209 -0
data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +438 -0
data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +393 -0
data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +101 -0
data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +199 -0
data/lib/seqtrimnext/plugins/plugin_key.rb +70 -0
data/lib/seqtrimnext/plugins/plugin_linker.rb +232 -0
data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +98 -0
data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +74 -0
data/lib/seqtrimnext/plugins/plugin_low_quality.rb +394 -0
data/lib/seqtrimnext/plugins/plugin_mids.rb +231 -0
data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +246 -0
data/lib/seqtrimnext/plugins/plugin_short_insert.rb +244 -0
data/lib/seqtrimnext/plugins/plugin_vectors.rb +191 -0
data/lib/seqtrimnext/templates/amplicons.txt +16 -0
data/lib/seqtrimnext/templates/genomics_454.txt +5 -0
data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +5 -0
data/lib/seqtrimnext/templates/low_quality.txt +5 -0
data/lib/seqtrimnext/templates/low_quality_and_low_complexity.txt +5 -0
data/lib/seqtrimnext/templates/transcriptomics_454.txt +8 -0
data/lib/seqtrimnext/templates/transcriptomics_plants.txt +8 -0
data/lib/seqtrimnext/utils/extract_samples.rb +52 -0
data/lib/seqtrimnext/utils/fasta2xml.rb +69 -0
data/lib/seqtrimnext/utils/global_match.rb +65 -0
data/lib/seqtrimnext/utils/hash_stats.rb +29 -0
data/lib/seqtrimnext/utils/json_utils.rb +50 -0
data/lib/seqtrimnext/utils/load_fasta_names_in_hash.rb +37 -0
data/lib/seqtrimnext/utils/load_qual_in_hash.rb +37 -0
data/lib/seqtrimnext/utils/recover_mid.rb +95 -0
data/lib/seqtrimnext/utils/string_utils.rb +56 -0
data/lib/seqtrimnext.rb +37 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/test/test_helper.rb +3 -0
data/test/test_seqtrimnext.rb +11 -0
metadata +318 -0

data/bin/group_by_range.rb ADDED Viewed

@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+# Echoes every line of the file named by the single command-line argument
+# to standard output.
+if ARGV.count != 1
+  puts "#{$0} FASTA "
+  exit
+end
+file = ARGV.shift
+# File.foreach opens, streams and closes the file itself; the previous
+# File.open handle was never closed.
+File.foreach(file) do |line|
+  puts line
+end

data/bin/join_ilumina_paired.rb ADDED Viewed

@@ -0,0 +1,130 @@
+#!/usr/bin/env ruby
+require 'scbi_fastq'
+VERBOSE=false
+# Command line: paired1 paired2 output_base_name [paired1_tag paired2_tag].
+# Exactly three or exactly five arguments are accepted.
+unless ARGV.count == 3 || ARGV.count == 5
+  puts "Usage: #{$0} paired1 paired2 output_base_name [paired1_tag paired2_tag]"
+  exit
+end
+p1_path = ARGV[0]
+p2_path = ARGV[1]
+output_base_name = ARGV[2]
+# Default suffixes appended to / stripped from read names of each input file.
+paired1_tag = '/1'
+paired2_tag = '/2'
+if ARGV.count == 5
+  paired1_tag = ARGV[3]
+  paired2_tag = ARGV[4]
+end
+# Match each tag literally (Regexp.quote) and only at the end of a line.
+PAIRED1_TAG_RE = /#{Regexp.quote(paired1_tag)}$/
+PAIRED2_TAG_RE = /#{Regexp.quote(paired2_tag)}$/
+# File.exist? replaces File.exists?, which was deprecated and then removed in
+# Ruby 3.2. The inputs are checked in the same order as before.
+[p1_path, p2_path].each do |path|
+  next if File.exist?(path)
+  puts "File #{path} doesn't exists"
+  exit
+end
+# Loads every record of the FASTQ file at +file+ into a Hash.
+# Keys are the record names with PAIRED2_TAG_RE removed; each value is the
+# three-element Array of the remaining fields yielded by FastqFile#each
+# (presumably sequence, qualities and comment - confirm against scbi_fastq).
+# A progress line is printed whenever num_seqs is a multiple of 10000.
+def read_to_file(file)
+  reads = {}
+  fastq = FastqFile.new(file, 'r', :sanger, true)
+  fastq.each do |name, fasta, qual, comment|
+    key = name.gsub(PAIRED2_TAG_RE, '')
+    reads[key] = [fasta, qual, comment]
+    puts "Loading: #{fastq.num_seqs}" if (fastq.num_seqs % 10000).zero?
+  end
+  fastq.close
+  reads
+end
+# The first file is streamed; the second is held in memory as a Hash keyed by
+# tag-less read name, so finding a mate is a single hash lookup.
+p1 = FastqFile.new(p1_path, 'r', :sanger, true)
+p2 = read_to_file(p2_path)
+puts "Sequences from #{p2_path} loaded. Total: #{p2.count}"
+# Outputs, in order: unpaired reads, interleaved pairs, first mates, second mates.
+normal_out, paired_out, paired1_out, paired2_out =
+  %w[_normal.fastq _all_paired.fastq _paired1.fastq _paired2.fastq].map do |suffix|
+    FastqFile.new(output_base_name + suffix, 'w', :sanger, true)
+  end
+p1.each do |n1, f1, q1, c1|
+  n1.gsub!(PAIRED1_TAG_RE, '')
+  puts "Find #{n1}" if VERBOSE
+  mate = p2[n1]
+  if mate
+    puts "  ===> PAIRED #{n1}" if VERBOSE
+    paired_out.write_seq(n1 + paired1_tag, f1, q1, c1)
+    paired1_out.write_seq(n1 + paired1_tag, f1, q1, c1)
+    paired_out.write_seq(n1 + paired2_tag, *mate)
+    paired2_out.write_seq(n1 + paired2_tag, *mate)
+    # Drop the mate so that only unmatched reads are left in p2 afterwards.
+    p2.delete(n1)
+  else
+    puts "  ===> NOT PAIRED #{n1}"  if VERBOSE
+    normal_out.write_seq(n1 + paired1_tag, f1, q1, c1)
+  end
+  puts p1.num_seqs if (p1.num_seqs % 10000).zero?
+end
+# Whatever is still in p2 had no mate in the first file.
+p2.each do |name, fields|
+  normal_out.write_seq(name + paired2_tag, *fields)
+end
+[p1, normal_out, paired_out, paired1_out, paired2_out].each(&:close)

data/bin/parse_amplicons.rb ADDED Viewed

@@ -0,0 +1,95 @@
+#!/usr/bin/env ruby
+require 'json'
+require 'scbi_fastq'
+# The argument check had an empty body, so a wrong invocation went on to run
+# with missing values. Report the expected arguments and stop instead.
+if ARGV.count != 2
+  puts "Usage: #{$0} input_fastq_file min_repetitions"
+  exit
+end
+# >Cluster 0
+# 0       216aa, >E9LAHD006DQKVK... *
+# >Cluster 1
+# 0       203aa, >E9LAHD006DODWR... *
+# >Cluster 2
+# 0       198aa, >E9LAHD006DQCDS... *
+# >Cluster 3
+# 0       195aa, >E9LAHD006DQURO... *
+# 1       172aa, >E9LAHD006DOSHR... at 93.02%
+# 2       172aa, >E9LAHD006DSV4P... at 93.02%
+# 3       172aa, >E9LAHD006DI00Q... at 93.02%
+# 4       172aa, >E9LAHD006DR7MR... at 93.02%
+# 5       175aa, >E9LAHD006DTDA7... at 90.86%
+# 6       172aa, >E9LAHD006DVCR3... at 93.02%
+# 7       172aa, >E9LAHD006DHY3H... at 93.02%
+# 8       177aa, >E9LAHD006DI52X... at 90.96%
+# Collects the sequence names that belong to small clusters of a cluster
+# listing in the format shown in the sample above.
+#
+# A line starting with ">Cluster" opens a new cluster; any other line
+# containing ">NAME... " adds NAME to the current cluster. A cluster is kept
+# when it holds at most +min_repetitions+ names.
+#
+# file_path       - path of the cluster listing.
+# min_repetitions - largest cluster size that is still reported.
+#
+# Returns an Array with the names of all kept clusters, in file order; the
+# Array is empty when the file does not exist.
+def load_repeated_seqs(file_path,min_repetitions)
+  clusters=[]
+  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
+  return clusters unless File.exist?(file_path)
+  current_cluster=[]
+  # Appends the pending cluster when it is non-empty and small enough.
+  flush = lambda do
+    if !current_cluster.empty? && (current_cluster.count <= min_repetitions)
+      clusters.concat(current_cluster)
+    end
+  end
+  # File.foreach closes the file when done; the previous handle was leaked.
+  File.foreach(file_path) do |line|
+    if line =~ /^>Cluster/
+      flush.call
+      current_cluster=[]
+    elsif line =~ />([^\.]+)\.\.\.\s/
+      current_cluster << $1
+    end
+  end
+  # The last cluster has no ">Cluster" header after it, so flush it here.
+  flush.call
+  clusters
+end
+# Copies the FASTQ file at +input_file_path+ to
+# "<input_file_path>_without_singletons", leaving out every record whose name
+# appears in +singletons+.
+#
+# input_file_path - path of the FASTQ file to filter.
+# singletons      - collection of record names to drop.
+def remove_singletons_from_file(input_file_path,singletons)
+  # A Hash makes each membership test constant-time; Array#include? rescanned
+  # the whole list for every record in the file.
+  excluded = {}
+  singletons.each { |name| excluded[name] = true }
+  fqr=FastqFile.new(input_file_path)
+  out=FastqFile.new(input_file_path+'_without_singletons','w+')
+  fqr.each do |n,f,q,c|
+    out.write_seq(n,f,q,c) unless excluded.key?(n)
+  end
+  out.close
+  fqr.close
+end
+input_file_path=ARGV.shift
+min_repetitions = ARGV.shift.to_i
+# Run cd-hit with an argument vector rather than an interpolated shell string,
+# so a path with spaces or shell metacharacters reaches it intact. Its standard
+# output is read and discarded, as the backticks did.
+IO.popen(['cd-hit', '-i', input_file_path, '-o', 'clusters'], &:read)
+# cd-hit writes its cluster listing to "<output>.clstr"; the former
+# 'clusters.clrs' name never existed, so no sequence was ever filtered out.
+singletons = load_repeated_seqs('clusters.clstr',min_repetitions)
+remove_singletons_from_file(input_file_path,singletons)
+# puts singletons.to_json

data/bin/parse_json_results.rb ADDED Viewed

@@ -0,0 +1,66 @@
+#!/usr/bin/env ruby
+require 'yajl'
+require 'json'
+# A bare $0 is not interpolated inside a string, so both usage messages used
+# to print a literal "$0"; #{$0} inserts the script name.
+usage = "\nUsage: #{$0} results.json action1 [action] [action] [action] ...\n\n"
+# First argument: the JSON results file.
+unless file = ARGV.shift
+  puts usage
+  exit(0)
+end
+# Every remaining argument is an action name to count; at least one is needed.
+actions = ARGV
+if actions.empty?
+  puts usage
+  exit(0)
+end
+# Stream the JSON results and tally how often the requested actions occur.
+json = File.new(file, 'r')
+puts "Counting sequences with these actions: #{actions.join(",")}"
+puts ""
+total = 0
+count = 0
+separate_count = {}
+actions.each { |name| separate_count[name] = 0 }
+all_actions = []
+Yajl::Parser.parse(json) do |seq|
+  total += 1
+  action_names = seq['actions'].map { |action| action['type'] }
+  # Counted when the sequence's action types include every requested action
+  # (this assumes the requested actions are distinct).
+  count += 1 if (action_names & actions).count == actions.count
+  action_names.each do |name|
+    separate_count[name] += 1 if actions.include?(name)
+  end
+  # Remember every action type seen, for the "Other used actions" report.
+  all_actions = (all_actions + action_names).uniq
+end
+rule = "=" * 20
+puts rule + "Separate count" + rule
+separate_count.each { |name, hits| puts "#{name} = #{hits}" }
+puts rule + "Summarized" + rule
+puts "Number of sequences with all actions: #{count}"
+puts "Total sequences: #{total}"
+puts "\n"
+puts rule + "Other used actions" + rule
+puts((all_actions - actions).join(','))

data/bin/parse_params.rb ADDED Viewed

@@ -0,0 +1,82 @@
+#!/usr/bin/env ruby
+require 'json'
+# Reads the whole file at +file_path+ and returns its contents parsed as JSON.
+def get_json_data(file_path)
+  contents = File.open(file_path) { |io| io.read }
+  JSON.parse(contents)
+end
+# Copies selected settings from an ingebiol JSON file into a seqtrim params
+# file. The keys below are the field names looked up in the JSON data; the
+# values are the parameter names appended to the params file.
+params = {
+  'vector_db_field' => 'vectors_db',
+  'contaminants_db_field' => 'contaminants_db',
+  'species_field' => 'genus',
+  'min_insert_size_field' => 'min_insert_size_trimmed',
+  'min_paired_insert_size_field' => 'min_insert_size_paired',
+  'min_quality_value_field' => 'min_quality'
+}
+if ARGV.count!=2
+  puts "#{$0} ingebiol_params_file.json seqtrim_params_file"
+  exit(-1)
+end
+input_file = ARGV[0]
+params_file=ARGV[1]
+# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
+[input_file, params_file].each do |path|
+  next if File.exist?(path)
+  puts "File #{path} doesn't exists"
+  exit(-1)
+end
+data=get_json_data(input_file)
+# Append one "name=value" line per recognised, non-empty field. The block form
+# closes the file even if writing fails; the extra read-only handle that used
+# to be opened here was never used or closed.
+File.open(params_file,'a+') do |sq_params|
+  data.each do |k,v|
+    sq_name=params[k]
+    next unless sq_name && v
+    # Only strings and collections respond to empty?; calling it on a numeric
+    # JSON value raised NoMethodError.
+    next if v.respond_to?(:empty?) && v.empty?
+    sq_params.puts "#{sq_name}=#{v}"
+  end
+end

data/bin/resume_clusters.rb ADDED Viewed

@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby
+require 'json'
+# Prints up to COUNT entries of a cluster listing as "name => size" lines,
+# largest size first.
+if ARGV.count != 2
+  puts "#{$0} cluster.fasta.clstr COUNT"
+  exit
+end
+path=ARGV.shift
+list_max=ARGV.shift.to_i
+h={}
+# Records a cluster from its final member line. Member lines start with an
+# index (apparently counted from 0), so index + 1 is taken as the cluster
+# size and stored under that member's name.
+record_cluster = lambda do |member_line|
+  if member_line =~ /^([\d]+)\s[^>]*>([^\s]*)\.\.\.\s/
+    h[$2]=$1.to_i+1
+  end
+end
+last_line = ''
+# File.foreach closes the file even if processing raises.
+File.foreach(path) do |line|
+  # A new header means the previous line was the last member of a cluster.
+  record_cluster.call(last_line) if line =~ />Cluster/
+  last_line=line
+end
+# The final cluster has no ">Cluster" header after it, so the loop above never
+# recorded it; do so here.
+record_cluster.call(last_line)
+list_max.times do
+  ma=h.max_by{|k,v| v}
+  # Stop early once every recorded cluster has been printed.
+  break unless ma
+  puts ma.join(' => ')
+  h.delete(ma[0])
+end
+# puts h.sort.to_json

data/bin/resume_rejected.sh ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Summarises a seqtrim rejected-sequences file: prints each distinct value of
+# space-separated fields 2-20 with the number of lines that carry it.
+# "$1" is quoted so that a missing argument or a path containing spaces fails
+# the file test; unquoted, `[ ! -f ]` is false and the check was skipped.
+if [ ! -f "$1" ]; then
+	echo "You must specify a file with seqtrim's rejected sequences"
+	echo "Usage $0 rejected_seqtrim_file";
+	exit;
+fi
+# cut reads the file directly; no separate cat process is needed.
+cut -d ' ' -f 2-20 "$1" | sort | uniq -c;

data/bin/reverse_paired.rb ADDED Viewed

@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+require 'scbi_fasta'
+# Rewrites a FASTA/QUAL pair as <output_base_name>.fasta and
+# <output_base_name>.fasta.qual. Records whose name contains 'dir=F' are
+# written with the sequence reversed and passed through
+# tr('actgACTG','tgacTGAC') and with their quality values reversed; all other
+# records are copied as they are. The number of records is printed at the end.
+unless ARGV.count == 3
+	puts "Usage: #{$0} fasta qual output_base_name"
+	exit
+end
+fasta_path, qual_path, name = ARGV
+out_fasta = "#{name}.fasta"
+out_qual = "#{name}.fasta.qual"
+puts "Opening #{fasta_path}, #{qual_path}"
+fqr = FastaQualFile.new(fasta_path, qual_path, true)
+out_f = File.new(out_fasta, 'w+')
+out_q = File.new(out_qual, 'w+')
+c = 0
+fqr.each do |n, f, q|
+	header = ">#{n}"
+	out_f.puts header
+	out_q.puts header
+	flip = n.index('dir=F')
+	out_f.puts(flip ? f.reverse.tr('actgACTG', 'tgacTGAC') : f)
+	out_q.puts((flip ? q.reverse : q).join(' '))
+	c += 1
+end
+puts c
+fqr.close
+out_f.close
+out_q.close