viral_seq 1.0.14 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '048e85ab67fbb667919d02d4509a15111798b116b3f927c921d203dc8565a1a2'
4
- data.tar.gz: 6951e410bd4f9b727a44fab1aa88f9cc263151cf9aed2a9c25ae9d866ed72450
3
+ metadata.gz: ea453e452e6832e942512cdb94462c33af89ffd8295017806c9aa6ff7ec77ad4
4
+ data.tar.gz: 2bb89d193e0e84ebe0791882c53e226a0a934ea3b9d1e61f87b8ffff6c22af1b
5
5
  SHA512:
6
- metadata.gz: 02cc87e245918a5c8f1b16b0db978da66e3bf7e83c6c6140c394c560d31c86ab1845b337a4f53b4b7883ff8e452e8caef0b036ea113c4416d6a29d16f419eb81
7
- data.tar.gz: 9f53bd6c46f4a49b5c14b8b8019ffa3e1abcf442bf0a6cc09a7dbc768a474f2afd81d5cf168f38eb8ab5abbd601c60f1f824f753bc77dbcc0d3c0d93568b9ae3
6
+ metadata.gz: 9dc0403ecaea119d3aa3e832305a0bd4f038fdb71789dcd036080fa89b0e454ee79001b6042df171364e4207a93b2d4d5747336b2fb7f8fb7d83103f5d641134
7
+ data.tar.gz: 510ccfce7d717b56d55e2477ae01124009d1f53f010635759cf2f69afe0132313e08db9abaae1ec6d8d894961beba1c2d70a637eafa9b57b05f0aac3372cd0ca
data/.gitignore CHANGED
@@ -2,7 +2,6 @@
2
2
  /.yardoc
3
3
  /_yardoc/
4
4
  /coverage/
5
- /doc/
6
5
  /pkg/
7
6
  /spec/reports/
8
7
  /tmp/
data/README.md CHANGED
@@ -2,7 +2,16 @@
2
2
 
3
3
  A Ruby Gem containing bioinformatics tools for processing viral NGS data.
4
4
 
5
- Specifically for Primer-ID sequencing and HIV drug resistance analysis.
5
+ Specifically for Primer ID sequencing and HIV drug resistance analysis.
6
+
7
+ ## Illustration of Primer ID Sequencing
8
+
9
+
10
+ ![Primer ID Sequencing](https://storage.googleapis.com/tcs-dr-public/pid.png)
11
+
12
+ ### Reference readings on the Primer ID sequencing
13
+ [Primer ID MiSeq protocol](https://doi.org/10.21769/BioProtoc.3938)
14
+ [Primer ID JVI paper](https://doi.org/10.1128/JVI.00522-15)
6
15
 
7
16
  ## Install
8
17
 
@@ -14,19 +23,45 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
14
23
 
15
24
  ### Executables
16
25
 
17
- Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
26
+ ### `tcs`
27
+ Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
18
28
 
29
+ Example commands:
19
30
  ```bash
20
- $ locator -i sequence.fasta -o sequence.fasta.csv
31
+ $ tcs -p params.json # run TCS pipeline with params.json
32
+ $ tcs -j # CLI to generate params.json
33
+ $ tcs -h # print out the help
21
34
  ```
35
+ ---
36
+ ### `tcs_log`
37
+
38
+ Use `tcs_log` script to pool run logs and TCS fasta files after one batch of `tcs` jobs.
22
39
 
23
- Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
24
40
 
41
+ Example file structure:
42
+ ```
43
+ batch_tcs_jobs/
44
+ ├── lib1
45
+ ├── lib2
46
+ ├── lib3
47
+ ├── lib4
48
+ ├── ...
49
+ ```
50
+
51
+ Example command:
25
52
  ```bash
26
- $ tcs -p params.json # run TCS pipeline with params.json
27
- $ tcs -j # CLI to generate params.json
28
- $ tcs -h # print out the help
53
+ $ tcs_log batch_tcs_jobs
54
+ ```
55
+
56
+ ---
57
+
58
+ ### `locator`
59
+ Use executable `locator` to get the coordinates of the sequences on the HIV/SIV reference genome from a FASTA file through a terminal.
60
+
61
+ ```bash
62
+ $ locator -i sequence.fasta -o sequence.fasta.csv
29
63
  ```
64
+ ---
30
65
 
31
66
  ## Some Examples
32
67
 
@@ -86,6 +121,16 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
86
121
 
87
122
  ## Updates
88
123
 
124
+ ### Version 1.1.0-03252021
125
+
126
+ 1. Optimized the algorithm of end-join.
127
+ 2. Fixed a bug in the `tcs` pipeline where combined tcs files were sometimes not saved.
128
+ 3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
129
+ 4. Added the preset of MPID-HIVDR params file ***dr.json*** in /doc.
130
+ 5. Added `platform_format` option in the json generator of the `tcs` pipeline.
131
+ Users can choose from 3 MiSeq platforms for processing their sequencing data.
132
+ MiSeq 300x7x300 is the default option.
133
+
89
134
  ### Version 1.0.14-03052021
90
135
 
91
136
  1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.
data/bin/tcs CHANGED
@@ -23,7 +23,7 @@
23
23
  # THE SOFTWARE.
24
24
 
25
25
  # Use JSON file as the run param
26
- # run tcs_json_generator.rb to generate param json file.
26
+ # run `tcs -j` to generate param json file.
27
27
 
28
28
  require 'viral_seq'
29
29
  require 'json'
@@ -115,6 +115,12 @@ else
115
115
  error_rate = 0.02
116
116
  end
117
117
 
118
+ if params[:platform_format]
119
+ $platform_sequencing_length = params[:platform_format]
120
+ else
121
+ $platform_sequencing_length = 300
122
+ end
123
+
118
124
  primers = params[:primer_pairs]
119
125
  if primers.empty?
120
126
  ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
@@ -273,7 +279,6 @@ primers.each do |primer|
273
279
  r1_sub_seq << bio_r1[seq_name]
274
280
  r2_sub_seq << bio_r2[seq_name]
275
281
  end
276
-
277
282
  #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
278
283
  consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
279
284
  r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
@@ -364,6 +369,7 @@ primers.each do |primer|
364
369
  shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
365
370
  joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
366
371
  log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
372
+
367
373
  summary_json[:combined_tcs] = joined_sh.size
368
374
 
369
375
  if export_raw
@@ -433,12 +439,15 @@ primers.each do |primer|
433
439
  trim_end = primer[:trim_ref_end]
434
440
  trim_ref = primer[:trim_ref].to_sym
435
441
  joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
436
- joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
437
442
  if export_raw
438
443
  joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
439
- joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
440
444
  end
441
445
  end
446
+
447
+ joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
448
+ if export_raw
449
+ joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
450
+ end
442
451
  end
443
452
 
444
453
  File.open(outfile_log, "w") do |f|
data/bin/tcs_log ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # pool run logs from one batch of tcs jobs
4
+ # file structure:
5
+ # batch_tcs_jobs/
6
+ # ├── lib1
7
+ # ├── lib2
8
+ # ├── lib3
9
+ # ├── lib4
10
+ # ├── ...
11
+ #
12
+ # command example:
13
+ # $ tcs_log batch_tcs_jobs
14
+
15
+ require 'viral_seq'
16
+ require 'pathname'
17
+ require 'json'
18
+ require 'fileutils'
19
+
20
+ indir = ARGV[0].chomp
21
+ indir_basename = File.basename(indir)
22
+ indir_dirname = File.dirname(indir)
23
+
24
+ tcs_dir = File.join(indir_dirname, (indir_basename + "_tcs"))
25
+ Dir.mkdir(tcs_dir) unless File.directory?(tcs_dir)
26
+
27
+ libs = []
28
+ Dir.chdir(indir) {libs = Dir.glob("*")}
29
+
30
+ outdir2 = File.join(tcs_dir, "combined_TCS_per_lib")
31
+ outdir3 = File.join(tcs_dir, "TCS_per_region")
32
+ outdir4 = File.join(tcs_dir, "combined_TCS_per_region")
33
+
34
+ Dir.mkdir(outdir2) unless File.directory?(outdir2)
35
+ Dir.mkdir(outdir3) unless File.directory?(outdir3)
36
+ Dir.mkdir(outdir4) unless File.directory?(outdir4)
37
+
38
+ log_file = File.join(tcs_dir,"log.csv")
39
+ log = File.open(log_file,'w')
40
+ log.puts "lib name,Region,Raw Sequences per barcode,R1 Raw,R2 Raw,Paired Raw,Cutoff,PID Length,Consensus1,Consensus2,Distinct to Raw,Resampling index,Combined TCS,Combined TCS after QC"
41
+
42
+ libs.each do |lib|
43
+ Dir.mkdir(File.join(outdir2, lib)) unless File.directory?(File.join(outdir2, lib))
44
+ fasta_files = []
45
+ json_files = []
46
+ Dir.chdir(File.join(indir, lib)) do
47
+ fasta_files = Dir.glob("**/*.fasta")
48
+ json_files = Dir.glob("**/log.json")
49
+ end
50
+ fasta_files.each do |f|
51
+ path_array = Pathname(f).each_filename.to_a
52
+ region = path_array[0]
53
+ if path_array[-1] == "combined.fasta"
54
+ FileUtils.cp(File.join(indir, lib, f), File.join(outdir2, lib, (lib + "_" + region)))
55
+ Dir.mkdir(File.join(outdir4,region)) unless File.directory?(File.join(outdir4,region))
56
+ FileUtils.cp(File.join(indir, lib, f), File.join(outdir4, region, (lib + "_" + region)))
57
+ else
58
+ Dir.mkdir(File.join(outdir3,region)) unless File.directory?(File.join(outdir3,region))
59
+ Dir.mkdir(File.join(outdir3,region, lib)) unless File.directory?(File.join(outdir3,region, lib))
60
+ FileUtils.cp(File.join(indir, lib, f), File.join(outdir3, region, lib, (lib + "_" + region + "_" + path_array[-1])))
61
+ end
62
+ end
63
+
64
+ json_files.each do |f|
65
+ json_log = JSON.parse(File.read(File.join(indir, lib, f)), symbolize_names: true)
66
+ log.print [lib,
67
+ json_log[:primer_set_name],
68
+ json_log[:total_raw_sequence],
69
+ json_log[:r1_filtered_raw],
70
+ json_log[:r2_filtered_raw],
71
+ json_log[:paired_raw_sequence],
72
+ json_log[:consensus_cutoff],
73
+ json_log[:length_of_pid],
74
+ json_log[:total_tcs_with_ambiguities],
75
+ json_log[:total_tcs],
76
+ json_log[:distinct_to_raw],
77
+ json_log[:resampling_param],
78
+ json_log[:combined_tcs],
79
+ json_log[:combined_tcs_after_qc],
80
+ ].join(',') + "\n"
81
+ end
82
+ end
83
+ log.close
data/doc/dr.json ADDED
@@ -0,0 +1,68 @@
1
+ {
2
+ "raw_sequence_dir": "MyExampleDir",
3
+ "platform_error_rate": 0.02,
4
+ "primer_pairs": [
5
+ {
6
+ "region": "RT",
7
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
8
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
9
+ "majority": 0.5,
10
+ "end_join": true,
11
+ "end_join_option": 1,
12
+ "overlap": 0,
13
+ "TCS_QC": true,
14
+ "ref_genome": "HXB2",
15
+ "ref_start": 2648,
16
+ "ref_end": 3257,
17
+ "indel": true,
18
+ "trim": false
19
+ },
20
+ {
21
+ "region": "PR",
22
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
23
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
24
+ "majority": 0.5,
25
+ "end_join": true,
26
+ "end_join_option": 3,
27
+ "TCS_QC": true,
28
+ "ref_genome": "HXB2",
29
+ "ref_start": 0,
30
+ "ref_end": 2591,
31
+ "indel": true,
32
+ "trim": true,
33
+ "trim_ref": "HXB2",
34
+ "trim_ref_start": 2253,
35
+ "trim_ref_end": 2549
36
+ },
37
+ {
38
+ "region": "IN",
39
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
40
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
41
+ "majority": 0.5,
42
+ "end_join": true,
43
+ "end_join_option": 3,
44
+ "overlap": 171,
45
+ "TCS_QC": true,
46
+ "ref_genome": "HXB2",
47
+ "ref_start": 4384,
48
+ "ref_end": 4751,
49
+ "indel": false,
50
+ "trim": false
51
+ },
52
+ {
53
+ "region": "V1V3",
54
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
55
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
56
+ "majority": 0.5,
57
+ "end_join": true,
58
+ "end_join_option": 1,
59
+ "overlap": 0,
60
+ "TCS_QC": true,
61
+ "ref_genome": "HXB2",
62
+ "ref_start": 6585,
63
+ "ref_end": 7208,
64
+ "indel": true,
65
+ "trim": false
66
+ }
67
+ ]
68
+ }
@@ -394,7 +394,6 @@ module ViralSeq
394
394
  end
395
395
  end
396
396
  end
397
-
398
397
  consensus_seq += call_consensus_base(max_base_list)
399
398
  end
400
399
  return consensus_seq
@@ -742,6 +741,7 @@ module ViralSeq
742
741
  seq_hash_unique_pass = []
743
742
 
744
743
  seq_hash_unique.each do |seq|
744
+ next if seq.nil?
745
745
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
746
  next unless loc # if locator tool fails, skip this seq.
747
747
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
@@ -110,19 +110,21 @@ module ViralSeq
110
110
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
111
111
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
112
112
  joined_seq = {}
113
- seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
113
+ seq_pair_hash.each do |seq_name,seq_pair|
114
114
  r1_seq = seq_pair[0]
115
115
  r2_seq = seq_pair[1]
116
116
  if overlap.zero?
117
117
  joined_sequence = r1_seq + r2_seq
118
+ elsif diff.zero?
119
+ if r1_seq[-overlap..-1] == r2_seq[0,overlap]
120
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
121
+ end
118
122
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
119
123
  joined_sequence= r1_seq + r2_seq[overlap..-1]
120
124
  else
121
125
  next
122
126
  end
123
- seq_names.each do |seq_name|
124
- joined_seq[seq_name] = joined_sequence
125
- end
127
+ joined_seq[seq_name] = joined_sequence if joined_sequence
126
128
  end
127
129
 
128
130
  joined_seq_hash = ViralSeq::SeqHash.new
@@ -305,7 +305,9 @@ module ViralSeq
305
305
  end
306
306
 
307
307
  def general_filter(seq)
308
- if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
308
+ if seq.size < $platform_sequencing_length
309
+ return false
310
+ elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
309
311
  return false
310
312
  elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
311
313
  return false
@@ -13,6 +13,22 @@ module ViralSeq
13
13
  print '> '
14
14
  param[:raw_sequence_dir] = gets.chomp.rstrip
15
15
 
16
+ puts "Choose MiSeq Platform (1-3):\n1. 150x7x150\n2. 250x7x250\n3. 300x7x300 (default)"
17
+ print "> "
18
+ pf_option = gets.chomp.rstrip
19
+ # while ![1,2,3].include?(pf_option.to_i)
20
+ # print "Entered MiSeq Platform #{pf_option.red.bold} not valid (choose 1-3), try again\n> "
21
+ # pf_option = gets.chomp.rstrip
22
+ # end
23
+ case pf_option.to_i
24
+ when 1
25
+ param[:platform_format] = 150
26
+ when 2
27
+ param[:platform_format] = 250
28
+ else
29
+ param[:platform_format] = 300
30
+ end
31
+
16
32
  puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
17
33
  print '> '
18
34
  input_error = gets.chomp.rstrip.to_f
@@ -52,12 +68,12 @@ module ViralSeq
52
68
  if ej =~ /y|yes/i
53
69
  data[:end_join] = true
54
70
 
55
- print "End-join option? Choose from (1-4):\n
56
- 1: simple join, no overlap
57
- 2: known overlap \n
58
- 3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
59
- 4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
60
- > "
71
+ puts "End-join option? Choose from (1-4):"
72
+ puts "1: simple join, no overlap"
73
+ puts "2: known overlap"
74
+ puts "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap"
75
+ puts "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap"
76
+ print "> "
61
77
  ej_option = gets.chomp.rstrip
62
78
  while ![1,2,3,4].include?(ej_option.to_i)
63
79
  puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
@@ -138,7 +154,12 @@ module ViralSeq
138
154
  if save_option =~ /y|yes/i
139
155
  print "Path to save JSON file:\n> "
140
156
  path = gets.chomp.rstrip
141
- File.open(path, 'w') {|f| f.puts JSON.pretty_generate(param)}
157
+ while !validate_path_name(path)
158
+ print "Entered path no valid, try again.\n".red.bold
159
+ print "Path to save JSON file:\n> "
160
+ path = gets.chomp.rstrip
161
+ end
162
+ File.open(validate_path_name(path), 'w') {|f| f.puts JSON.pretty_generate(param)}
142
163
  end
143
164
 
144
165
  print "\nDo you wish to execute tcs pipeline with the input params now? Y/N \n> "
@@ -147,7 +168,7 @@ module ViralSeq
147
168
  if rsp =~ /y/i
148
169
  return param
149
170
  else
150
- abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`"
171
+ abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`".blue
151
172
  end
152
173
 
153
174
  end
@@ -172,7 +193,17 @@ module ViralSeq
172
193
  when 3
173
194
  :MAC239
174
195
  end
175
- end
176
- end
196
+ end # end of get_ref
197
+
198
+ def validate_path_name(path)
199
+ if path.empty?
200
+ return false
201
+ elsif File.directory? path
202
+ return File.join(path, 'params.json')
203
+ elsif File.directory?(File.dirname(path))
204
+ return path
205
+ end
206
+ end # end of validate_path_name
207
+ end # end of class << self
177
208
  end # end TcsJson
178
209
  end # end main module
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.0.14"
6
- TCS_VERSION = "2.1.1"
5
+ VERSION = "1.1.0"
6
+ TCS_VERSION = "2.2.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.14
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-03-06 00:00:00.000000000 Z
12
+ date: 2021-03-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -90,6 +90,7 @@ email:
90
90
  executables:
91
91
  - locator
92
92
  - tcs
93
+ - tcs_log
93
94
  extensions: []
94
95
  extra_rdoc_files: []
95
96
  files:
@@ -104,6 +105,8 @@ files:
104
105
  - Rakefile
105
106
  - bin/locator
106
107
  - bin/tcs
108
+ - bin/tcs_log
109
+ - doc/dr.json
107
110
  - lib/viral_seq.rb
108
111
  - lib/viral_seq/constant.rb
109
112
  - lib/viral_seq/enumerable.rb