viral_seq 1.6.4 → 1.7.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 02d26d720fef0501d70b012d9919f932b23a21b1caabbf508a56fffa162c311b
4
- data.tar.gz: '0681add2b2fa2ca7dedffeaaf43bd8b0e2b6200dd38633e2eeda58946ced8238'
3
+ metadata.gz: 4e6d55ab37ecd3b9c5688c99772fc49792a5319bac853ac768367a8b42c0e0b6
4
+ data.tar.gz: a69e78c80f22848facb41ad4f9d9fb64e6d4e47ff6e18afa3421d64513ce6558
5
5
  SHA512:
6
- metadata.gz: 301c188d736c9812006d30db8995fa7df683cc63443c1370de3d487dae77b88cfc60c8b674abc93b35f7457ced536a6e374433e5fbe0f423e4a7993ea4240ebc
7
- data.tar.gz: 1118ab7b586da98bb2c3533f81c35a02f0efb1978c0b91e15b56218b05342ad83e73f96bd2df53f75e2e6cdd16c591582ebe864562a02d3dd78997678b706233
6
+ metadata.gz: ae34ac12bd2b86d4c7fc040765b26b94d41cfe239a206b2e84bf55841988826bcfbf685e788b93224ee78e29b1280454059991d644f81cbf24f1b97fff3f2294
7
+ data.tar.gz: 254993ea2126ca51d0ad5e2b6be2dca90e1b3ed817266e46b5ca46f91d2a69288c0a87c58906ef3da8ad6465e08788c850d6bc72013e30f1ead13e186ba16dfd
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- viral_seq (1.6.2)
4
+ viral_seq (1.6.5)
5
5
  colorize (~> 0.1)
6
6
  combine_pdf (~> 1.0, >= 1.0.0)
7
7
  muscle_bio (~> 0.5)
data/README.md CHANGED
@@ -10,6 +10,8 @@ A Ruby Gem containing bioinformatics tools for processing viral NGS data.
10
10
 
11
11
  Specifically for Primer ID sequencing and HIV drug resistance analysis.
12
12
 
13
+ CLI tools `tcs`, `tcs_sdrm`, `tcs_log` and `locator` included in the gem.
14
+
13
15
  #### tcs web app - https://primer-id.org/
14
16
 
15
17
  ## Illustration for the Primer ID Sequencing
@@ -22,6 +24,12 @@ Specifically for Primer ID sequencing and HIV drug resistance analysis.
22
24
  [Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
23
25
  [Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
24
26
 
27
+ ## Requirements
28
+
29
+ Required Ruby Version: >= 2.5
30
+
31
+ Required RubyGems version: >= 1.3.6
32
+
25
33
  ## Install
26
34
 
27
35
  ```bash
@@ -179,6 +187,18 @@ qc_seqhash.sdrm_hiv_pr(cut_off)
179
187
 
180
188
  ## Updates
181
189
 
190
+ ### Version-1.7.1-05122023
191
+
192
+ 1. Add a size check for the raw sequences. If the actual size is smaller than the input params, an error message will be sent to the user. If the actual size is greater than the input params, the extra bases will be truncated.
193
+ 2. Now allows mismatches in the primer region sequences. The forward primer region allows up to 2 nt differences and the cDNA primer region allows up to 3 nt differences.
194
+ 3. Bug fix.
195
+ 4. TCS version to 2.5.2
196
+
197
+ ### Version-1.7.0-08242022
198
+
199
+ 1. Add a warning if the `tcs` pipeline is executed from source instead of being installed with `gem`.
200
+ 2. Optimized the `ViralSeq::SeqHash#a3g` hypermut algorithm. It now allows an external reference sequence to be used instead of the sample consensus.
201
+
182
202
  ### Version-1.6.4-07182022
183
203
 
184
204
  1. Included region "P17" in the default `tcs -d` pipeline setting. `tcs` pipeline updated to version 2.5.1.
data/bin/tcs CHANGED
@@ -22,20 +22,38 @@
22
22
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
23
  # THE SOFTWARE.
24
24
 
25
+ # Install using `gem install viral_seq`
25
26
  # Use JSON file as the run param
26
27
  # run `tcs -j` to generate param json file.
27
28
 
28
- require 'viral_seq'
29
+ def gem_installed?(gem_name)
30
+ found_gem = false
31
+ begin
32
+ found_gem = Gem::Specification.find_by_name(gem_name)
33
+ rescue Gem::LoadError
34
+ return false
35
+ else
36
+ return true
37
+ end
38
+ end
39
+
40
+ if gem_installed?('viral_seq')
41
+ require 'viral_seq'
42
+ else
43
+ printf "\n****************************************************\n"
44
+ printf "**** THIS PACKAGE CANNOT BE RUN FROM SOURCE ********\n"
45
+ printf "**** PLEASE INSTALL USING `gem install viral_seq` **\n"
46
+ printf "****************************************************\n\n"
47
+ exit 1
48
+ end
49
+
50
+
29
51
  require 'json'
30
52
  require 'colorize'
31
53
  require 'optparse'
32
54
 
33
55
  options = {}
34
56
 
35
- # banner = '-'*50 + "\n" +
36
- # '| The TCS Pipeline ' + "Version #{ViralSeq::TCS_VERSION}".red.bold + " by " + "Shuntai Zhou".blue.bold + ' |' + "\n" +
37
- # '-'*50 + "\n"
38
-
39
57
  banner = "\n" +
40
58
  "████████ ██████ ███████ ██████ ██ ██████ ███████ ██ ██ ███ ██ ███████\n".light_red +
41
59
  " ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ████ ██ ██\n".light_yellow +
@@ -86,7 +104,7 @@ end.parse!
86
104
  if options[:json_generator]
87
105
  params = ViralSeq::TcsJson.generate
88
106
  elsif options[:dr]
89
- params = ViralSeq::TcsDr::PARAMS
107
+ params = ViralSeq::TcsDr::PARAMS
90
108
  elsif (options[:params_json] && File.exist?(options[:params_json]))
91
109
  params = JSON.parse(File.read(options[:params_json]), symbolize_names: true)
92
110
  else
@@ -145,6 +163,24 @@ begin
145
163
  $platform_sequencing_length = 300
146
164
  end
147
165
 
166
+ r1_raw_size = r1_fastq_sh.dna_hash.values[0].size
167
+ r2_raw_size = r2_fastq_sh.dna_hash.values[0].size
168
+
169
+ if r1_raw_size >= $platform_sequencing_length
170
+ r1_size_diff = r1_raw_size - $platform_sequencing_length
171
+ else
172
+ raise StandardError.new "R1 size smaller than the input platform format #{$platform_sequencing_length} bp."
173
+ end
174
+
175
+ if r2_raw_size >= $platform_sequencing_length
176
+ r2_size_diff = r2_raw_size - $platform_sequencing_length
177
+ else
178
+ raise StandardError.new "R2 size smaller than the input platform format #{$platform_sequencing_length} bp."
179
+ end
180
+
181
+ r1_truncate_base_number = 2 + r1_size_diff
182
+ r2_truncate_base_number = 2 + r2_size_diff
183
+
148
184
  primers = params[:primer_pairs]
149
185
  if primers.empty? or primers.nil?
150
186
  ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
@@ -217,8 +253,8 @@ begin
217
253
  r2_seq = r2_passed_seq[seqtag]
218
254
  pid = r2_seq[0, pid_length]
219
255
  id[seqtag] = pid
220
- bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-2]
221
- bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-2]
256
+ bio_r2[seqtag] = r2_seq[filter_r2[:reverse_starting_number]..-r2_truncate_base_number]
257
+ bio_r1[seqtag] = r1_seq[filter_r1[:forward_starting_number]..-r1_truncate_base_number]
222
258
  end
223
259
 
224
260
  # TCS cut-off
data/bin/tcs_log CHANGED
@@ -155,7 +155,7 @@ region_colors = {"Other" => "#808080"}
155
155
  CSV.foreach(log_file).each_with_index do |row, i|
156
156
  next if i == 0 || row[0] == nil
157
157
 
158
- lib_name = row[0]
158
+ lib_name = row[0].to_s
159
159
  region = row[1]
160
160
  raw_sequences_per_barcode = row[2].to_i
161
161
 
@@ -450,7 +450,7 @@ module ViralSeq
450
450
  # function to determine if the sequences have APOBEC3g/f hypermutation.
451
451
  # # APOBEC3G/F pattern: GRD -> ARD
452
452
  # # control pattern: G[YN|RC] -> A[YN|RC]
453
- # # use the sample consensus to determine potential a3g sites
453
+ # # use the sample consensus to determine potential a3g sites (default) or provide external reference sequences as a `String`
454
454
  # # Two criteria to identify hypermutation
455
455
  # # 1. Fisher's exact test on the frequencies of G to A mutation at A3G positions vs. non-A3G positions
456
456
  # # 2. Poisson distribution of G to A mutations at A3G positions, outliers sequences
@@ -486,7 +486,7 @@ module ViralSeq
486
486
  # # but it is still called as hypermutation sequence b/c it's Poisson outlier sequence.
487
487
  # @see https://www.hiv.lanl.gov/content/sequence/HYPERMUT/hypermut.html LANL Hypermut
488
488
 
489
- def a3g_hypermut
489
+ def a3g_hypermut(ref = nil)
490
490
  # mut_hash number of apobec3g/f mutations per sequence
491
491
  mut_hash = {}
492
492
  hm_hash = {}
@@ -495,8 +495,10 @@ module ViralSeq
495
495
  # total G->A mutations at apobec3g/f positions.
496
496
  total = 0
497
497
 
498
- # make consensus sequence for the input sequence hash
499
- ref = self.consensus
498
+ unless ref
499
+ # make consensus sequence for the input sequence hash
500
+ ref = self.consensus
501
+ end
500
502
 
501
503
  # obtain apobec3g positions and control positions
502
504
  apobec = apobec3gf(ref)
@@ -509,7 +511,6 @@ module ViralSeq
509
511
  c = 0 # control muts
510
512
  d = 0 # potenrial controls
511
513
  mut.each do |n|
512
- next if v[n] == "-"
513
514
  if v[n] == "A"
514
515
  a += 1
515
516
  b += 1
@@ -521,7 +522,6 @@ module ViralSeq
521
522
  total += a
522
523
 
523
524
  control.each do |n|
524
- next if v[n] == "-"
525
525
  if v[n] == "A"
526
526
  c += 1
527
527
  d += 1
@@ -544,7 +544,7 @@ module ViralSeq
544
544
  end
545
545
  end
546
546
 
547
- if self.dna_hash.size > 20
547
+ if self.dna_hash.size > 200
548
548
  rate = total.to_f/(self.dna_hash.size)
549
549
  count_mut = mut_hash.values.count_freq
550
550
  maxi_count = count_mut.values.max
@@ -566,10 +566,12 @@ module ViralSeq
566
566
  end
567
567
  end
568
568
  end
569
+
569
570
  hm_seq_hash = ViralSeq::SeqHash.new
570
571
  hm_hash.each do |k,_v|
571
572
  hm_seq_hash.dna_hash[k] = self.dna_hash[k]
572
573
  end
574
+
573
575
  hm_seq_hash.title = self.title + "_hypermut"
574
576
  hm_seq_hash.file = self.file
575
577
  filtered_seq_hash = self.sub(self.dna_hash.keys - hm_hash.keys)
@@ -1356,7 +1358,7 @@ module ViralSeq
1356
1358
  # APOBEC3G/F pattern: GRD -> ARD,
1357
1359
  # control pattern: G[YN|RC] -> A[YN|RC],
1358
1360
  def apobec3gf(seq = '')
1359
- seq.tr!("-", "")
1361
+ #seq.tr!("-", "")
1360
1362
  seq_length = seq.size
1361
1363
  apobec_position = []
1362
1364
  control_position = []
@@ -1368,6 +1370,7 @@ module ViralSeq
1368
1370
  control_position << n
1369
1371
  end
1370
1372
  end
1373
+
1371
1374
  return [apobec_position,control_position]
1372
1375
  end # end of #apobec3gf
1373
1376
 
@@ -180,7 +180,7 @@ module ViralSeq
180
180
  l1 = 0
181
181
  l2 = 0
182
182
 
183
- aln_seq = ViralSeq::Muscle.align(ori_ref, seq, :PPP, path_to_muscle)
183
+ aln_seq = ViralSeq::Muscle.align(ori_ref, seq, :Super5, path_to_muscle)
184
184
  aln_test = aln_seq[1]
185
185
  aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
186
186
  gap_begin = $1.size
@@ -214,7 +214,7 @@ module ViralSeq
214
214
  l2 = l2 + (post_aln - b2)
215
215
  end
216
216
 
217
- aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
217
+ aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
218
218
  aln_test = aln_seq[1]
219
219
  aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
220
220
  gap_begin = $1.size
@@ -263,7 +263,7 @@ module ViralSeq
263
263
  end
264
264
 
265
265
  while repeat == 1
266
- aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
266
+ aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
267
267
  aln_test = aln_seq[1]
268
268
  aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
269
269
  gap_begin = $1.size
@@ -293,7 +293,7 @@ module ViralSeq
293
293
  end
294
294
  ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
295
295
 
296
- aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
296
+ aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
297
297
  aln_test = aln_seq[1]
298
298
  ref = aln_seq[0]
299
299
 
@@ -307,7 +307,7 @@ module ViralSeq
307
307
 
308
308
  if (ori_ref_l - l2 - 1) >= l1
309
309
  ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
310
- aln_seq = ViralSeq::Muscle.align(ref, seq, :PPP, path_to_muscle)
310
+ aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
311
311
  aln_test = aln_seq[1]
312
312
  ref = aln_seq[0]
313
313
 
@@ -56,6 +56,43 @@ class String
56
56
  Regexp.new match
57
57
  end
58
58
 
59
+ # parse the nucleotide sequences as an Array of Array
60
+ # @return [Array] Array of Array at each position
61
+ # @example parse a sequence with ambiguities to Array of Array
62
+ # "ATRWCG".nt_to_array
63
+ # => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
64
+
65
+ def nt_to_array
66
+ return_array = []
67
+ self.each_char.each do |base|
68
+ base_array = base.to_list
69
+ return_array.append base_array
70
+ end
71
+ return return_array
72
+ end
73
+
74
+
75
+ # compare the given nt sequence string with the ref sequence string
76
+ # @param ref [String] the ref sequence string to compare with
77
+ # @return [Integer] Number of differences
78
+ # @example count the differences against a reference containing ambiguity codes
79
+ # "ATGCCG".nt_diff("ATRWCG")
80
+ # => 1
81
+
82
+ def nt_diff(ref)
83
+ count_diff = 0
84
+ self_array = self.split("")
85
+ ref_array = ref.nt_to_array
86
+ self_array.each_with_index do |nt, i|
87
+ ref_nt = ref_array[i]
88
+ unless ref_nt.include? nt
89
+ count_diff += 1
90
+ end
91
+ end
92
+ return count_diff
93
+ end
94
+
95
+
59
96
  # parse IUPAC nucleotide ambiguity codes (W S M K R Y B D H V N) as String if String.size == 1
60
97
  # @return [Array] parsed nt bases
61
98
  # @example parse IUPAC `R`
@@ -223,7 +223,7 @@ module ViralSeq
223
223
  end
224
224
  forward_bio_primer_size = forward_bio_primer.size
225
225
  forward_starting_number = forward_n + forward_bio_primer_size
226
- forward_primer_ref = forward_bio_primer.nt_parser
226
+ #forward_primer_ref = forward_bio_primer.nt_parser
227
227
 
228
228
  r1_passed_seq = {}
229
229
  r1_raw = r1_sh.dna_hash
@@ -232,7 +232,7 @@ module ViralSeq
232
232
  seq = r1_raw[name]
233
233
  next unless general_filter seq
234
234
  primer_region_seq = seq[forward_n, forward_bio_primer_size]
235
- if primer_region_seq =~ forward_primer_ref
235
+ if primer_region_seq.nt_diff(forward_bio_primer) < 3
236
236
  new_name = remove_tag name
237
237
  r1_passed_seq[new_name] = seq
238
238
  end
@@ -255,13 +255,13 @@ module ViralSeq
255
255
  cdna_bio_primer = $2
256
256
  cdna_bio_primer_size = cdna_bio_primer.size
257
257
  reverse_starting_number = pid_length + cdna_bio_primer_size
258
- cdna_primer_ref = cdna_bio_primer.nt_parser
258
+ # cdna_primer_ref = cdna_bio_primer.nt_to_array
259
259
  r2_passed_seq = {}
260
260
  proc_filter = proc do |name|
261
261
  seq = r2_raw[name]
262
262
  next unless general_filter seq
263
263
  primer_region_seq = seq[pid_length, cdna_bio_primer_size]
264
- if primer_region_seq =~ cdna_primer_ref
264
+ if primer_region_seq.nt_diff(cdna_bio_primer) < 4
265
265
  new_name = remove_tag name
266
266
  r2_passed_seq[new_name] = seq
267
267
  end
@@ -2,6 +2,6 @@
2
2
  # version info and history
3
3
 
4
4
  module ViralSeq
5
- VERSION = "1.6.4"
6
- TCS_VERSION = "2.5.1"
5
+ VERSION = "1.7.1"
6
+ TCS_VERSION = "2.5.2"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: viral_seq
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.4
4
+ version: 1.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuntai Zhou
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2022-07-19 00:00:00.000000000 Z
12
+ date: 2023-05-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -193,7 +193,6 @@ files:
193
193
  - lib/viral_seq/tcs_dr.rb
194
194
  - lib/viral_seq/tcs_json.rb
195
195
  - lib/viral_seq/version.rb
196
- - rc_swans.svc@longleaf.unc.edu
197
196
  - viral_seq.gemspec
198
197
  homepage: https://github.com/ViralSeq/viral_seq
199
198
  licenses:
Binary file