viral_seq 1.0.14 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '048e85ab67fbb667919d02d4509a15111798b116b3f927c921d203dc8565a1a2'
4
- data.tar.gz: 6951e410bd4f9b727a44fab1aa88f9cc263151cf9aed2a9c25ae9d866ed72450
3
+ metadata.gz: ea453e452e6832e942512cdb94462c33af89ffd8295017806c9aa6ff7ec77ad4
4
+ data.tar.gz: 2bb89d193e0e84ebe0791882c53e226a0a934ea3b9d1e61f87b8ffff6c22af1b
5
5
  SHA512:
6
- metadata.gz: 02cc87e245918a5c8f1b16b0db978da66e3bf7e83c6c6140c394c560d31c86ab1845b337a4f53b4b7883ff8e452e8caef0b036ea113c4416d6a29d16f419eb81
7
- data.tar.gz: 9f53bd6c46f4a49b5c14b8b8019ffa3e1abcf442bf0a6cc09a7dbc768a474f2afd81d5cf168f38eb8ab5abbd601c60f1f824f753bc77dbcc0d3c0d93568b9ae3
6
+ metadata.gz: 9dc0403ecaea119d3aa3e832305a0bd4f038fdb71789dcd036080fa89b0e454ee79001b6042df171364e4207a93b2d4d5747336b2fb7f8fb7d83103f5d641134
7
+ data.tar.gz: 510ccfce7d717b56d55e2477ae01124009d1f53f010635759cf2f69afe0132313e08db9abaae1ec6d8d894961beba1c2d70a637eafa9b57b05f0aac3372cd0ca
data/.gitignore CHANGED
@@ -2,7 +2,6 @@
2
2
  /.yardoc
3
3
  /_yardoc/
4
4
  /coverage/
5
- /doc/
6
5
  /pkg/
7
6
  /spec/reports/
8
7
  /tmp/
data/README.md CHANGED
@@ -2,7 +2,16 @@
2
2
 
3
3
  A Ruby Gem containing bioinformatics tools for processing viral NGS data.
4
4
 
5
- Specifically for Primer-ID sequencing and HIV drug resistance analysis.
5
+ Specifically for Primer ID sequencing and HIV drug resistance analysis.
6
+
7
+ ## Illustration of Primer ID Sequencing
8
+
9
+
10
+ ![Primer ID Sequencing](https://storage.googleapis.com/tcs-dr-public/pid.png)
11
+
12
+ ### Reference readings on the Primer ID sequencing
13
+ [Primer ID JID paper](https://doi.org/10.21769/BioProtoc.3938)
14
+ [Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
6
15
 
7
16
  ## Install
8
17
 
@@ -14,19 +23,45 @@ Specifically for Primer-ID sequencing and HIV drug resistance analysis.
14
23
 
15
24
  ### Executables
16
25
 
17
- Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
26
+ ### `tcs`
27
+ Use the `tcs` executable to run the TCS pipeline on **Primer ID MiSeq sequencing** data.
18
28
 
29
+ Example commands:
19
30
  ```bash
20
- $ locator -i sequence.fasta -o sequence.fasta.csv
31
+ $ tcs -p params.json # run TCS pipeline with params.json
32
+ $ tcs -j # CLI to generate params.json
33
+ $ tcs -h # print out the help
21
34
  ```
35
+ ---
36
+ ### `tcs_log`
37
+
38
+ Use the `tcs_log` script to pool the run logs and TCS FASTA files after a batch of `tcs` jobs has finished.
22
39
 
23
- Use executable `tcs` pipeline to process Primer ID MiSeq sequencing data.
24
40
 
41
+ Example file structure:
42
+ ```
43
+ batch_tcs_jobs/
44
+ ├── lib1
45
+ ├── lib2
46
+ ├── lib3
47
+ ├── lib4
48
+ ├── ...
49
+ ```
50
+
51
+ Example command:
25
52
  ```bash
26
- $ tcs -p params.json # run TCS pipeline with params.json
27
- $ tcs -j # CLI to generate params.json
28
- $ tcs -h # print out the help
53
+ $ tcs_log batch_tcs_jobs
54
+ ```
55
+
56
+ ---
57
+
58
+ ### `locator`
59
+ Use the `locator` executable from a terminal to get the coordinates of the sequences in a FASTA file on the HIV/SIV reference genome.
60
+
61
+ ```bash
62
+ $ locator -i sequence.fasta -o sequence.fasta.csv
29
63
  ```
64
+ ---
30
65
 
31
66
  ## Some Examples
32
67
 
@@ -86,6 +121,16 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
86
121
 
87
122
  ## Updates
88
123
 
124
+ ### Version 1.1.0-03252021
125
+
126
+ 1. Optimized the algorithm of end-join.
127
+ 2. Fixed a bug in the `tcs` pipeline where the combined TCS files were sometimes not saved.
128
+ 3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
129
+ 4. Added a preset MPID-HIVDR params file, ***dr.json***, in /doc.
130
+ 5. Added a `platform_format` option to the JSON generator of the `tcs` pipeline.
131
+ Users can choose from 3 MiSeq platforms for processing their sequencing data.
132
+ MiSeq 300x7x300 is the default option.
133
+
89
134
  ### Version 1.0.14-03052021
90
135
 
91
136
  1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.
data/bin/tcs CHANGED
@@ -23,7 +23,7 @@
23
23
  # THE SOFTWARE.
24
24
 
25
25
  # Use JSON file as the run param
26
- # run tcs_json_generator.rb to generate param json file.
26
+ # run `tcs -j` to generate param json file.
27
27
 
28
28
  require 'viral_seq'
29
29
  require 'json'
@@ -115,6 +115,12 @@ else
115
115
  error_rate = 0.02
116
116
  end
117
117
 
118
+ if params[:platform_format]
119
+ $platform_sequencing_length = params[:platform_format]
120
+ else
121
+ $platform_sequencing_length = 300
122
+ end
123
+
118
124
  primers = params[:primer_pairs]
119
125
  if primers.empty?
120
126
  ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
@@ -273,7 +279,6 @@ primers.each do |primer|
273
279
  r1_sub_seq << bio_r1[seq_name]
274
280
  r2_sub_seq << bio_r2[seq_name]
275
281
  end
276
-
277
282
  #consensus name including the Primer ID and number of raw sequences of that Primer ID, library name and setname.
278
283
  consensus_name = ">" + primer_id + "_" + seq_with_same_primer_id.size.to_s + "_" + libname + "_" + region
279
284
  r1_consensus = ViralSeq::SeqHash.array(r1_sub_seq).consensus(majority_cut_off)
@@ -364,6 +369,7 @@ primers.each do |primer|
364
369
  shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
365
370
  joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
366
371
  log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
372
+
367
373
  summary_json[:combined_tcs] = joined_sh.size
368
374
 
369
375
  if export_raw
@@ -433,12 +439,15 @@ primers.each do |primer|
433
439
  trim_end = primer[:trim_ref_end]
434
440
  trim_ref = primer[:trim_ref].to_sym
435
441
  joined_sh = joined_sh.trim(trim_start, trim_end, trim_ref)
436
- joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
437
442
  if export_raw
438
443
  joined_sh_raw = joined_sh_raw.trim(trim_start, trim_end, trim_ref)
439
- joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
440
444
  end
441
445
  end
446
+
447
+ joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
448
+ if export_raw
449
+ joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
450
+ end
442
451
  end
443
452
 
444
453
  File.open(outfile_log, "w") do |f|
data/bin/tcs_log ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # pool run logs from one batch of tcs jobs
4
+ # file structure:
5
+ # batch_tcs_jobs/
6
+ # ├── lib1
7
+ # ├── lib2
8
+ # ├── lib3
9
+ # ├── lib4
10
+ # ├── ...
11
+ #
12
+ # command example:
13
+ # $ tcs_log batch_tcs_jobs
14
+
15
+ require 'viral_seq'
16
+ require 'pathname'
17
+ require 'json'
18
+ require 'fileutils'
19
+
20
+ indir = ARGV[0].chomp
21
+ indir_basename = File.basename(indir)
22
+ indir_dirname = File.dirname(indir)
23
+
24
+ tcs_dir = File.join(indir_dirname, (indir_basename + "_tcs"))
25
+ Dir.mkdir(tcs_dir) unless File.directory?(tcs_dir)
26
+
27
+ libs = []
28
+ Dir.chdir(indir) {libs = Dir.glob("*")}
29
+
30
+ outdir2 = File.join(tcs_dir, "combined_TCS_per_lib")
31
+ outdir3 = File.join(tcs_dir, "TCS_per_region")
32
+ outdir4 = File.join(tcs_dir, "combined_TCS_per_region")
33
+
34
+ Dir.mkdir(outdir2) unless File.directory?(outdir2)
35
+ Dir.mkdir(outdir3) unless File.directory?(outdir3)
36
+ Dir.mkdir(outdir4) unless File.directory?(outdir4)
37
+
38
+ log_file = File.join(tcs_dir,"log.csv")
39
+ log = File.open(log_file,'w')
40
+ log.puts "lib name,Region,Raw Sequences per barcode,R1 Raw,R2 Raw,Paired Raw,Cutoff,PID Length,Consensus1,Consensus2,Distinct to Raw,Resampling index,Combined TCS,Combined TCS after QC"
41
+
42
+ libs.each do |lib|
43
+ Dir.mkdir(File.join(outdir2, lib)) unless File.directory?(File.join(outdir2, lib))
44
+ fasta_files = []
45
+ json_files = []
46
+ Dir.chdir(File.join(indir, lib)) do
47
+ fasta_files = Dir.glob("**/*.fasta")
48
+ json_files = Dir.glob("**/log.json")
49
+ end
50
+ fasta_files.each do |f|
51
+ path_array = Pathname(f).each_filename.to_a
52
+ region = path_array[0]
53
+ if path_array[-1] == "combined.fasta"
54
+ FileUtils.cp(File.join(indir, lib, f), File.join(outdir2, lib, (lib + "_" + region)))
55
+ Dir.mkdir(File.join(outdir4,region)) unless File.directory?(File.join(outdir4,region))
56
+ FileUtils.cp(File.join(indir, lib, f), File.join(outdir4, region, (lib + "_" + region)))
57
+ else
58
+ Dir.mkdir(File.join(outdir3,region)) unless File.directory?(File.join(outdir3,region))
59
+ Dir.mkdir(File.join(outdir3,region, lib)) unless File.directory?(File.join(outdir3,region, lib))
60
+ FileUtils.cp(File.join(indir, lib, f), File.join(outdir3, region, lib, (lib + "_" + region + "_" + path_array[-1])))
61
+ end
62
+ end
63
+
64
+ json_files.each do |f|
65
+ json_log = JSON.parse(File.read(File.join(indir, lib, f)), symbolize_names: true)
66
+ log.print [lib,
67
+ json_log[:primer_set_name],
68
+ json_log[:total_raw_sequence],
69
+ json_log[:r1_filtered_raw],
70
+ json_log[:r2_filtered_raw],
71
+ json_log[:paired_raw_sequence],
72
+ json_log[:consensus_cutoff],
73
+ json_log[:length_of_pid],
74
+ json_log[:total_tcs_with_ambiguities],
75
+ json_log[:total_tcs],
76
+ json_log[:distinct_to_raw],
77
+ json_log[:resampling_param],
78
+ json_log[:combined_tcs],
79
+ json_log[:combined_tcs_after_qc],
80
+ ].join(',') + "\n"
81
+ end
82
+ end
83
+ log.close
data/doc/dr.json ADDED
@@ -0,0 +1,68 @@
1
+ {
2
+ "raw_sequence_dir": "MyExampleDir",
3
+ "platform_error_rate": 0.02,
4
+ "primer_pairs": [
5
+ {
6
+ "region": "RT",
7
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCACTATAGGCTGTACTGTCCATTTATC",
8
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
9
+ "majority": 0.5,
10
+ "end_join": true,
11
+ "end_join_option": 1,
12
+ "overlap": 0,
13
+ "TCS_QC": true,
14
+ "ref_genome": "HXB2",
15
+ "ref_start": 2648,
16
+ "ref_end": 3257,
17
+ "indel": true,
18
+ "trim": false
19
+ },
20
+ {
21
+ "region": "PR",
22
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
23
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
24
+ "majority": 0.5,
25
+ "end_join": true,
26
+ "end_join_option": 3,
27
+ "TCS_QC": true,
28
+ "ref_genome": "HXB2",
29
+ "ref_start": 0,
30
+ "ref_end": 2591,
31
+ "indel": true,
32
+ "trim": true,
33
+ "trim_ref": "HXB2",
34
+ "trim_ref_start": 2253,
35
+ "trim_ref_end": 2549
36
+ },
37
+ {
38
+ "region": "IN",
39
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNATCGAATACTGCCATTTGTACTGC",
40
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNAAAAGGAGAAGCCATGCATG",
41
+ "majority": 0.5,
42
+ "end_join": true,
43
+ "end_join_option": 3,
44
+ "overlap": 171,
45
+ "TCS_QC": true,
46
+ "ref_genome": "HXB2",
47
+ "ref_start": 4384,
48
+ "ref_end": 4751,
49
+ "indel": false,
50
+ "trim": false
51
+ },
52
+ {
53
+ "region": "V1V3",
54
+ "cdna": "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
55
+ "forward": "GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
56
+ "majority": 0.5,
57
+ "end_join": true,
58
+ "end_join_option": 1,
59
+ "overlap": 0,
60
+ "TCS_QC": true,
61
+ "ref_genome": "HXB2",
62
+ "ref_start": 6585,
63
+ "ref_end": 7208,
64
+ "indel": true,
65
+ "trim": false
66
+ }
67
+ ]
68
+ }
@@ -394,7 +394,6 @@ module ViralSeq
394
394
  end
395
395
  end
396
396
  end
397
-
398
397
  consensus_seq += call_consensus_base(max_base_list)
399
398
  end
400
399
  return consensus_seq
@@ -742,6 +741,7 @@ module ViralSeq
742
741
  seq_hash_unique_pass = []
743
742
 
744
743
  seq_hash_unique.each do |seq|
744
+ next if seq.nil?
745
745
  loc = ViralSeq::Sequence.new('', seq).locator(ref_option, path_to_muscle)
746
746
  next unless loc # if locator tool fails, skip this seq.
747
747
  if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
@@ -110,19 +110,21 @@ module ViralSeq
110
110
  raise ArgumentError.new(":overlap has to be Integer, input #{overlap} invalid.") unless overlap.is_a? Integer
111
111
  raise ArgumentError.new(":diff has to be float or integer, input #{diff} invalid.") unless (diff.is_a? Integer or diff.is_a? Float)
112
112
  joined_seq = {}
113
- seq_pair_hash.uniq_hash.each do |seq_pair, seq_names|
113
+ seq_pair_hash.each do |seq_name,seq_pair|
114
114
  r1_seq = seq_pair[0]
115
115
  r2_seq = seq_pair[1]
116
116
  if overlap.zero?
117
117
  joined_sequence = r1_seq + r2_seq
118
+ elsif diff.zero?
119
+ if r1_seq[-overlap..-1] == r2_seq[0,overlap]
120
+ joined_sequence= r1_seq + r2_seq[overlap..-1]
121
+ end
118
122
  elsif r1_seq[-overlap..-1].compare_with(r2_seq[0,overlap]) <= (overlap * diff)
119
123
  joined_sequence= r1_seq + r2_seq[overlap..-1]
120
124
  else
121
125
  next
122
126
  end
123
- seq_names.each do |seq_name|
124
- joined_seq[seq_name] = joined_sequence
125
- end
127
+ joined_seq[seq_name] = joined_sequence if joined_sequence
126
128
  end
127
129
 
128
130
  joined_seq_hash = ViralSeq::SeqHash.new
@@ -305,7 +305,9 @@ module ViralSeq
305
305
  end
306
306
 
307
307
  def general_filter(seq)
308
- if seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
308
+ if seq.size < $platform_sequencing_length
309
+ return false
310
+ elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
309
311
  return false
310
312
  elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
311
313
  return false
@@ -13,6 +13,22 @@ module ViralSeq
13
13
  print '> '
14
14
  param[:raw_sequence_dir] = gets.chomp.rstrip
15
15
 
16
+ puts "Choose MiSeq Platform (1-3):\n1. 150x7x150\n2. 250x7x250\n3. 300x7x300 (default)"
17
+ print "> "
18
+ pf_option = gets.chomp.rstrip
19
+ # while ![1,2,3].include?(pf_option.to_i)
20
+ # print "Entered MiSeq Platform #{pf_option.red.bold} not valid (choose 1-3), try again\n> "
21
+ # pf_option = gets.chomp.rstrip
22
+ # end
23
+ case pf_option.to_i
24
+ when 1
25
+ param[:platform_format] = 150
26
+ when 2
27
+ param[:platform_format] = 250
28
+ else
29
+ param[:platform_format] = 300
30
+ end
31
+
16
32
  puts 'Enter the estimated platform error rate (for TCS cut-off calculation), default as ' + '0.02'.red.bold
17
33
  print '> '
18
34
  input_error = gets.chomp.rstrip.to_f
@@ -52,12 +68,12 @@ module ViralSeq
52
68
  if ej =~ /y|yes/i
53
69
  data[:end_join] = true
54
70
 
55
- print "End-join option? Choose from (1-4):\n
56
- 1: simple join, no overlap
57
- 2: known overlap \n
58
- 3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap\n
59
- 4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap\n
60
- > "
71
+ puts "End-join option? Choose from (1-4):"
72
+ puts "1: simple join, no overlap"
73
+ puts "2: known overlap"
74
+ puts "3: unknow overlap, use sample consensus to determine overlap, all sequence pairs have same overlap"
75
+ puts "4: unknow overlap, determine overlap by individual sequence pairs, sequence pairs can have different overlap"
76
+ print "> "
61
77
  ej_option = gets.chomp.rstrip
62
78
  while ![1,2,3,4].include?(ej_option.to_i)
63
79
  puts "Entered end-join option #{ej_option.red.bold} not valid (choose 1-4), try again"
@@ -138,7 +154,12 @@ module ViralSeq
138
154
  if save_option =~ /y|yes/i
139
155
  print "Path to save JSON file:\n> "
140
156
  path = gets.chomp.rstrip
141
- File.open(path, 'w') {|f| f.puts JSON.pretty_generate(param)}
157
+ while !validate_path_name(path)
158
+ print "Entered path no valid, try again.\n".red.bold
159
+ print "Path to save JSON file:\n> "
160
+ path = gets.chomp.rstrip
161
+ end
162
+ File.open(validate_path_name(path), 'w') {|f| f.puts JSON.pretty_generate(param)}
142
163
  end
143
164
 
144
165
  print "\nDo you wish to execute tcs pipeline with the input params now? Y/N \n> "
@@ -147,7 +168,7 @@ module ViralSeq
147
168
  if rsp =~ /y/i
148
169
  return param
149
170
  else
150
- abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`"
171
+ abort "Params json file generated. You can execute tcs pipeline using `tcs -p [params.json]`".blue
151
172
  end
152
173
 
153
174
  end
@@ -172,7 +193,17 @@ module ViralSeq
172
193
  when 3
173
194
  :MAC239
174
195
  end
175
- end
176
- end
196
+ end # end of get_ref
197
+
198
+ def validate_path_name(path)
199
+ if path.empty?
200
+ return false
201
+ elsif File.directory? path
202
+ return File.join(path, 'params.json')
203
+ elsif File.directory?(File.dirname(path))
204
+ return path
205
+ end
206
+ end # end of validate_path_name
207
+ end # end of class << self
177
208
  end # end TcsJson
178
209
  end # end main module
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.0.14"
6
- TCS_VERSION = "2.1.1"
5
+ VERSION = "1.1.0"
6
+ TCS_VERSION = "2.2.0"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.14
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-03-06 00:00:00.000000000 Z
12
+ date: 2021-03-26 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -90,6 +90,7 @@ email:
90
90
  executables:
91
91
  - locator
92
92
  - tcs
93
+ - tcs_log
93
94
  extensions: []
94
95
  extra_rdoc_files: []
95
96
  files:
@@ -104,6 +105,8 @@ files:
104
105
  - Rakefile
105
106
  - bin/locator
106
107
  - bin/tcs
108
+ - bin/tcs_log
109
+ - doc/dr.json
107
110
  - lib/viral_seq.rb
108
111
  - lib/viral_seq/constant.rb
109
112
  - lib/viral_seq/enumerable.rb