viral_seq 1.9.1 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -1
- data/README.md +130 -120
- data/bin/locator +2 -2
- data/bin/tcs +38 -38
- data/lib/viral_seq/R.rb +3 -1
- data/lib/viral_seq/seq_hash.rb +48 -12
- data/lib/viral_seq/sequence.rb +22 -171
- data/lib/viral_seq/string.rb +3 -6
- data/lib/viral_seq/tcs_core.rb +4 -0
- data/lib/viral_seq/tcs_dr.rb +82 -1
- data/lib/viral_seq/util/drm_versions_config.json +52 -0
- data/lib/viral_seq/version.rb +2 -2
- data/lib/viral_seq.rb +2 -0
- data/viral_seq.gemspec +5 -0
- metadata +31 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d940e5f465cba40def34166fe50e0a21b1c62a1fff8e0be8abdabb7b4c4aab77
|
4
|
+
data.tar.gz: 7e4be6ec82d9081a1ea3130eed49dcaac080608e481c7520b43c2e58a50e379d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15805b09c96b6d1bff023a82948f23ceb584c60ffb21b85e59d6f4ddc2e2394045a29788a7c5811c714afedfae6405c36b88e0bcadce0d1408068418c497e596
|
7
|
+
data.tar.gz: '0871676e5ee49fa14f84ec3c109172d964efac18f3f104ec38ad52daa69b9ef85a935c35ca2377d261b38edc5d8d438469b2360ec791a902116f60c8daeef5c2'
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
4
|
+
viral_seq (1.10.1)
|
5
5
|
colorize (~> 0.1)
|
6
6
|
combine_pdf (~> 1.0, >= 1.0.0)
|
7
7
|
muscle_bio (= 0.4)
|
8
8
|
prawn (~> 2.3, >= 2.3.0)
|
9
9
|
prawn-table (~> 0.2, >= 0.2.0)
|
10
|
+
shellwords (~> 0.2)
|
11
|
+
virust-locator-ruby (~> 0.3)
|
10
12
|
|
11
13
|
GEM
|
12
14
|
remote: https://rubygems.org/
|
@@ -41,8 +43,11 @@ GEM
|
|
41
43
|
rspec-support (~> 3.13.0)
|
42
44
|
rspec-support (3.13.1)
|
43
45
|
ruby-rc4 (0.1.5)
|
46
|
+
shellwords (0.2.0)
|
44
47
|
ttfunk (1.8.0)
|
45
48
|
bigdecimal (~> 3.1)
|
49
|
+
virust-locator-ruby (0.3.0)
|
50
|
+
shellwords (~> 0.2)
|
46
51
|
|
47
52
|
PLATFORMS
|
48
53
|
ruby
|
data/README.md
CHANGED
@@ -16,10 +16,10 @@ CLI tools `tcs`, `tcs_sdrm`, `tcs_log` and `locator` included in the gem.
|
|
16
16
|
|
17
17
|
## Illustration for the Primer ID Sequencing
|
18
18
|
|
19
|
-
|
20
19
|

|
21
20
|
|
22
21
|
### Reference readings on the Primer ID sequencing
|
22
|
+
|
23
23
|
[Explanation of Primer ID sequencing](https://doi.org/10.21769/BioProtoc.3938)
|
24
24
|
[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
|
25
25
|
[Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
|
@@ -41,11 +41,13 @@ Required RubyGems version: >= 1.3.6
|
|
41
41
|
### Executables
|
42
42
|
|
43
43
|
### `tcs`
|
44
|
+
|
44
45
|
Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
|
45
46
|
|
46
47
|
Web-based `tcs` analysis can be accessed at https://primer-id.org/
|
47
48
|
|
48
49
|
Example commands:
|
50
|
+
|
49
51
|
```bash
|
50
52
|
$ tcs -p params.json # run TCS pipeline with params.json
|
51
53
|
$ tcs -p params.json -i DIRECTORY
|
@@ -61,12 +63,13 @@ Example commands:
|
|
61
63
|
[sample params.json for the tcs-dr pipeline](./docs/dr.json)
|
62
64
|
|
63
65
|
---
|
66
|
+
|
64
67
|
### `tcs_log`
|
65
68
|
|
66
69
|
Use `tcs_log` script to pool run logs and TCS fasta files after one batch of `tcs` jobs. This command generates log.html to visualize the sequencing runs.
|
67
70
|
|
68
|
-
|
69
71
|
Example file structure:
|
72
|
+
|
70
73
|
```
|
71
74
|
batch_tcs_jobs/
|
72
75
|
├── lib1
|
@@ -77,21 +80,25 @@ batch_tcs_jobs/
|
|
77
80
|
```
|
78
81
|
|
79
82
|
Example command:
|
83
|
+
|
80
84
|
```bash
|
81
85
|
$ tcs_log batch_tcs_jobs
|
82
86
|
```
|
83
87
|
|
84
88
|
---
|
89
|
+
|
85
90
|
### `tcs_sdrm`
|
86
91
|
|
87
92
|
Use `tcs_sdrm` pipeline for HIV-1 drug resistance mutation and recency.
|
88
93
|
|
89
94
|
Example command:
|
95
|
+
|
90
96
|
```bash
|
91
97
|
$ tcs_sdrm libs_dir
|
92
98
|
```
|
93
99
|
|
94
100
|
lib_dir file structure:
|
101
|
+
|
95
102
|
```
|
96
103
|
libs_dir/
|
97
104
|
├── lib1
|
@@ -109,8 +116,8 @@ libs_dir/
|
|
109
116
|
|
110
117
|
Output data in a new dir as 'libs_dir_SDRM'
|
111
118
|
|
112
|
-
|
113
119
|
**Note: [R](https://www.r-project.org/) and the following R libraries are required:**
|
120
|
+
|
114
121
|
- phangorn
|
115
122
|
- ape
|
116
123
|
- scales
|
@@ -122,11 +129,13 @@ Output data in a new dir as 'libs_dir_SDRM'
|
|
122
129
|
---
|
123
130
|
|
124
131
|
### `locator`
|
132
|
+
|
125
133
|
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
126
134
|
|
127
135
|
```bash
|
128
136
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
129
137
|
```
|
138
|
+
|
130
139
|
---
|
131
140
|
|
132
141
|
## Some Examples
|
@@ -179,248 +188,249 @@ Examine for drug resistance mutations for HIV PR region
|
|
179
188
|
```ruby
|
180
189
|
qc_seqhash.sdrm_hiv_pr(cut_off)
|
181
190
|
```
|
182
|
-
## Known issues
|
183
|
-
|
184
|
-
1. ~~have a conflict with rails.~~
|
185
|
-
2. ~~Update on 03032021. Still have conflict. But in rails gem file, can just use `requires: false` globally and only require "viral_seq" when the module is needed in controller.~~
|
186
|
-
3. The conflict seems to be resovled. It was from a combination of using `!` as a function for factorial and the gem name `viral_seq`. @_@
|
187
191
|
|
188
192
|
## Updates
|
189
193
|
|
194
|
+
### Version-1.10.1
|
195
|
+
|
196
|
+
1. Added quality filter for Illumina 2-color sequencing platforms (filter poly-G and poly-C)
|
197
|
+
2. Replaced `MuscleBio` with [`VirustLocator`](https://github.com/ViralSeq/virust-locator-ruby) for faster and more accurate pairwise alignment.
|
198
|
+
3. Added DR primer version 4.
|
199
|
+
4. Added a helper function to properly treat input params for #hiv_seq_qc.
|
200
|
+
5. Solved the slow-performance issue when spawning a subprocess to call `VirustLocator` while holding a large amount of data in memory. When Ruby runs a shell command, a child process is spawned that shares the parent's memory pages. To set this up, the OS has to walk the parent's entire memory table, causing an incremental delay in each subsequent process spawn. To solve this, I redid the `VirustLocator` API to allow all the arguments to be processed with one shell command instead of spawning individual child processes.
|
201
|
+
|
190
202
|
### Version-1.9.1-12022024
|
191
203
|
|
192
|
-
|
204
|
+
1. Fixed a bug in the `tcs_sdrm` pipeline.
|
193
205
|
|
194
206
|
### Version-1.9.0-11132024
|
195
207
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
208
|
+
1. `ViralSeq::TcsCore::validate_file_name` will no longer report errors when there are non-sequence data files in the folder; instead, these files will be ignored.
|
209
|
+
2. Rewrote the APIs for DRM analysis for HIV. Now uses version config files for the sequencing information and DRM list config files for DRM interpretation. The two config files are located in `/lib/viral_seq/util/`.
|
210
|
+
3. `tcs_sdrm` will take a second argument for DRM config versions. Currently supports `["v1", "v2", "v3"]`. Refer to the documentations of the APIs for the details.
|
211
|
+
4. Next update will use secondary command `tcs sdrm` to replace `tcs_sdrm`, and `tcs log` to replace `tcs_log`.
|
200
212
|
|
201
213
|
### Version-1.8.1-06042024
|
202
214
|
|
203
|
-
|
215
|
+
1. Fixed a bug that causes `tcs_sdrm` pipeline to crash.
|
204
216
|
|
205
217
|
### Version-1.8.0-04052024
|
206
218
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
219
|
+
1. Use `muscle-v3.8.1` as default aligner because of the compatibility issues with `muscle-v5` on some platforms.
|
220
|
+
2. Adjust the end-join model for short inserts (insert size less than the read length minus the adaptor size).
|
221
|
+
3. Add an option in the DR pipeline for different versions of the pipeline, default version as "v1".
|
222
|
+
4. Add Days Post Infection (DPI) prediction model in the SDRM pipeline.
|
223
|
+
5. Re-organize the R scripts as stand-alone R files.
|
224
|
+
6. Bug fix.
|
225
|
+
7. **NOT SOLVED**: to include versions of DR in reports
|
214
226
|
|
215
227
|
### Version-1.7.1-05122023
|
216
228
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
229
|
+
1. Add a size check for the raw sequences. If the size is smaller than the input params, error messages will be sent to users. If the actual size is greater than the input params, extra bases will be truncated.
|
230
|
+
2. Now allows mismatch for the primer region sequences. Forward primer region allows 2 nt differences and cDNA primer region allows 3 nt differences.
|
231
|
+
3. Bug fix.
|
232
|
+
4. TCS version to 2.5.2
|
221
233
|
|
222
234
|
### Version-1.7.0-08242022
|
223
235
|
|
224
|
-
|
225
|
-
|
236
|
+
1. Add warnings if the `tcs` pipeline is executed from source instead of being installed from `gem`.
|
237
|
+
2. Optimized the `ViralSeq::SeqHash#a3g` hypermut algorithm. It now allows an external reference other than the sample reference.
|
226
238
|
|
227
239
|
### Version-1.6.4-07182022
|
228
240
|
|
229
|
-
|
230
|
-
|
231
|
-
|
241
|
+
1. Included region "P17" in the default `tcs -d` pipeline setting. `tcs` pipeline updated to version 2.5.1.
|
242
|
+
2. Loosen the locator params for the "V1V3" end region for rare alignment issues. Now the default "V1V3" region end with position 7205 to 7210 instead of 7208.
|
243
|
+
3. `tcs_sdrm` now analyse "P17" region for pairwise diversity.
|
232
244
|
|
233
245
|
### Version-1.6.3-02052022
|
234
246
|
|
235
|
-
|
236
|
-
|
237
|
-
|
247
|
+
1. Updated on `ViralSeq::Muscle` module along with the update of `muscle` from version 3.8.1 to 5.1.
|
248
|
+
2. Optimized the `locator` algorithm based on `muscle` v5.1.
|
249
|
+
3. Optimized the `tcs_sdrm` pipeline based on `muscle` v5.1.
|
238
250
|
|
239
251
|
### Version-1.6.1-02022022
|
240
252
|
|
241
|
-
|
242
|
-
|
253
|
+
1. Fixed the `nav bar` in tcs_log html file.
|
254
|
+
2. Fixed a typo in `tcs`.
|
243
255
|
|
244
256
|
### Version 1.6.0-01042022
|
245
257
|
|
246
|
-
|
247
|
-
|
258
|
+
1. Update the `ViralSeq::TcsCore::detection_limit` with pre-calculated values to save processing time.
|
259
|
+
2. Update `tcs` pipeline to v2.5.0. The HTML report will be generated by running the `tcs_log` script after the `tcs` pipeline.
|
248
260
|
|
249
261
|
### Version 1.5.0-01042022
|
250
262
|
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
263
|
+
1. Added a function to calculate detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
264
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
265
|
+
3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
|
266
|
+
4. Added a function for DNA variant analysis. Return a Hash object that can output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
267
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
256
268
|
|
257
269
|
### Version 1.4.0-10132021
|
258
270
|
|
259
|
-
|
260
|
-
|
271
|
+
1. Added a function to calculate false discovery rate (FDR, a.k.a. Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
272
|
+
2. Updated `bin/tcs_sdrm` script to add an FDR value to each DRM detected.
|
261
273
|
|
262
274
|
### Version 1.3.0-08302021
|
263
275
|
|
264
|
-
|
276
|
+
1. Fixed a bug in the `tcs` pipeline.
|
265
277
|
|
266
278
|
### Version 1.2.9-08022021
|
267
279
|
|
268
|
-
|
269
|
-
|
280
|
+
1. Fixed a bug when reading the input primer sequences in lowercases.
|
281
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
270
282
|
|
271
283
|
### Version 1.2.8-07292021
|
272
284
|
|
273
|
-
|
285
|
+
1. Fixed an issue when reading .fastq files containing blank_lines.
|
274
286
|
|
275
287
|
### Version 1.2.7-07152021
|
276
288
|
|
277
|
-
|
278
|
-
|
279
|
-
|
289
|
+
1. Optimized the workflow of the `tcs` pipeline on raw data with uneven lengths.
|
290
|
+
`tcs` version to v2.3.6.
|
280
291
|
|
281
292
|
### Version 1.2.6-07122021
|
282
293
|
|
283
|
-
|
284
|
-
|
285
|
-
|
294
|
+
1. Optimized the workflow of the `tcs` pipeline in the "end-join/QC/Trimming" section.
|
295
|
+
`tcs` version to v2.3.5.
|
286
296
|
|
287
297
|
### Version 1.2.5-06232021
|
288
298
|
|
289
|
-
|
290
|
-
|
291
|
-
|
299
|
+
1. Add error rescue and report in the `tcs` pipeline.
|
300
|
+
error messages are stored in the .tcs_error file. `tcs` pipeline updated to v2.3.4.
|
301
|
+
2. Use simple majority for the consensus cut-off in the default setting of the `tcs -dr` pipeline.
|
292
302
|
|
293
303
|
### Version 1.2.2-05272021
|
294
304
|
|
295
|
-
|
296
|
-
|
305
|
+
1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
|
306
|
+
`tcs` pipeline upgraded to v2.3.2
|
297
307
|
|
298
308
|
### Version 1.2.1-05172021
|
299
309
|
|
300
|
-
|
310
|
+
1. Added a function in R to check and install missing R packages for `tcs_sdrm` pipeline.
|
301
311
|
|
302
312
|
### Version 1.2.0-05102021
|
303
313
|
|
304
|
-
|
305
|
-
|
314
|
+
1. Added `tcs_sdrm` pipeline as an executable.
|
315
|
+
`tcs_sdrm` processes `tcs`-processed HIV MPID-NGS data for drug resistance mutations, recency and phylogenetic analysis.
|
306
316
|
|
307
|
-
|
317
|
+
2. Added function ViralSeq::SeqHash#sample.
|
308
318
|
|
309
|
-
|
319
|
+
3. Added recency determining function `ViralSeq::Recency::define`
|
310
320
|
|
311
|
-
|
321
|
+
4. Fixed a few bugs related to `tcs_sdrm`.
|
312
322
|
|
313
323
|
### Version 1.1.2-04262021
|
314
324
|
|
315
|
-
|
316
|
-
|
317
|
-
|
325
|
+
1. Added function `ViralSeq::DRMs.sdrm_json` to export SDRM as json object.
|
326
|
+
2. Added a random string to the temp file names for `muscle_bio` to avoid issues when running scripts in parallel.
|
327
|
+
3. Added `--keep-original` flag to the `tcs` pipeline.
|
318
328
|
|
319
329
|
### Version 1.1.1-04012021
|
320
330
|
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
331
|
+
1. Added warning when paired_raw_sequence less than 0.1% of total_raw_sequence.
|
332
|
+
2. Added option `-i WORKING_DIRECTORY` to the `tcs` script.
|
333
|
+
If the `params.json` file does not contain the path to the working directory, it will append path to the run params.
|
334
|
+
3. Added option `-dr` to the `tcs` script.
|
325
335
|
|
326
336
|
### Version 1.1.0-03252021
|
327
337
|
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
338
|
+
1. Optimized the algorithm of end-join.
|
339
|
+
2. Fixed a bug in the `tcs` pipeline that sometimes combined tcs files are not saved.
|
340
|
+
3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
|
341
|
+
4. Added the preset of MPID-HIVDR params file [**_dr.json_**](./docs/dr.json) in /docs.
|
342
|
+
5. Add `platform_format` option in the json generator of the `tcs` Pipeline.
|
343
|
+
Users can choose from 3 MiSeq platforms for processing their sequencing data.
|
344
|
+
MiSeq 300x7x300 is the default option.
|
335
345
|
|
336
346
|
### Version 1.0.14-03052021
|
337
347
|
|
338
|
-
|
348
|
+
1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.
|
339
349
|
|
340
350
|
### Version 1.0.13-03032021
|
341
351
|
|
342
|
-
|
352
|
+
1. Fixed the conflict with rails.
|
343
353
|
|
344
354
|
### Version 1.0.12-03032021
|
345
355
|
|
346
|
-
|
356
|
+
1. Fixed an issue that may cause conflicts with ActiveRecord.
|
347
357
|
|
348
358
|
### Version 1.0.11-03022021
|
349
359
|
|
350
|
-
|
351
|
-
|
360
|
+
1. Fixed an issue when calculating Poisson cutoff for minority mutations `ViralSeq::SeqHash.pm`.
|
361
|
+
2. Fixed an issue loading class 'OptionParser' in some ruby environments.
|
352
362
|
|
353
363
|
### Version 1.0.10-11112020:
|
354
364
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
365
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
366
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
367
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
368
|
+
4. a few optimizations.
|
369
|
+
5. TCS 2.1.0 delivered.
|
370
|
+
6. Tried parallel processing. Cannot achieve the goal because `parallel` gem by default can't pool data from memory of child processors and `in_threads` does not help with the speed.
|
361
371
|
|
362
372
|
### Version 1.0.9-07182020:
|
363
373
|
|
364
|
-
|
374
|
+
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
365
375
|
|
366
|
-
|
376
|
+
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
367
377
|
|
368
378
|
### Version 1.0.8-02282020:
|
369
379
|
|
370
|
-
|
371
|
-
|
372
|
-
|
380
|
+
1. TCS pipeline (version 2.0.0) added as executable.
|
381
|
+
tcs - main TCS pipeline script.
|
382
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
373
383
|
|
374
|
-
|
375
|
-
|
384
|
+
2. Methods added:
|
385
|
+
ViralSeq::SeqHash#trim
|
376
386
|
|
377
|
-
|
387
|
+
3. Bug fix for several methods.
|
378
388
|
|
379
389
|
### Version 1.0.7-01282020:
|
380
390
|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
391
|
+
1. Several methods added, including
|
392
|
+
ViralSeq::SeqHash#error_table
|
393
|
+
ViralSeq::SeqHash#random_select
|
394
|
+
2. Improved performance for several functions.
|
385
395
|
|
386
396
|
### Version 1.0.6-07232019:
|
387
397
|
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
398
|
+
1. Several methods added to ViralSeq::SeqHash, including
|
399
|
+
ViralSeq::SeqHash#size
|
400
|
+
ViralSeq::SeqHash#+
|
401
|
+
ViralSeq::SeqHash#write_nt_fa
|
402
|
+
ViralSeq::SeqHash#mutation
|
403
|
+
2. Update documentations and rspec samples.
|
394
404
|
|
395
405
|
### Version 1.0.5-07112019:
|
396
406
|
|
397
|
-
|
398
|
-
|
399
|
-
|
407
|
+
1. Update ViralSeq::SeqHash#sequence_locator.
|
408
|
+
Program will try to determine the direction (`+` or `-` of the query sequence)
|
409
|
+
2. update executable `locator` to have a column of `direction` in output .csv file
|
400
410
|
|
401
411
|
### Version 1.0.4-07102019:
|
402
412
|
|
403
|
-
|
404
|
-
|
413
|
+
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
414
|
+
2. Fix bugs in bin `locator`
|
405
415
|
|
406
416
|
### Version 1.0.3-07102019:
|
407
417
|
|
408
|
-
|
418
|
+
1. Bug fix.
|
409
419
|
|
410
420
|
### Version 1.0.2-07102019:
|
411
421
|
|
412
|
-
|
422
|
+
1. Fixed a gem loading issue.
|
413
423
|
|
414
424
|
### Version 1.0.1-07102019:
|
415
425
|
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
426
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
427
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
428
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
429
|
+
4. update documentations
|
420
430
|
|
421
431
|
### Version 1.0.0-07092019:
|
422
432
|
|
423
|
-
|
433
|
+
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
424
434
|
|
425
435
|
## Development
|
426
436
|
|
data/bin/locator
CHANGED
@@ -38,7 +38,7 @@ def myparser
|
|
38
38
|
options[:outfile] = o
|
39
39
|
end
|
40
40
|
|
41
|
-
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `
|
41
|
+
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `SIVmm239`".blue.bold}") do |o|
|
42
42
|
options[:ref_option] = o.to_sym
|
43
43
|
end
|
44
44
|
|
@@ -84,7 +84,7 @@ begin
|
|
84
84
|
seqs = ViralSeq::SeqHash.fa(seq_file)
|
85
85
|
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
86
86
|
|
87
|
-
unless [:HXB2, :
|
87
|
+
unless [:HXB2, :SIVmm239].include? opt
|
88
88
|
puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
|
89
89
|
opt = :HXB2
|
90
90
|
end
|
data/bin/tcs
CHANGED
@@ -27,9 +27,8 @@
|
|
27
27
|
# run `tcs -j` to generate param json file.
|
28
28
|
|
29
29
|
def gem_installed?(gem_name)
|
30
|
-
found_gem = false
|
31
30
|
begin
|
32
|
-
|
31
|
+
Gem::Specification.find_by_name(gem_name)
|
33
32
|
rescue Gem::LoadError
|
34
33
|
return false
|
35
34
|
else
|
@@ -217,8 +216,8 @@ begin
|
|
217
216
|
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
218
217
|
end
|
219
218
|
|
220
|
-
|
221
219
|
primers.each do |primer|
|
220
|
+
|
222
221
|
summary_json = {}
|
223
222
|
summary_json[:warnings] = []
|
224
223
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
@@ -470,6 +469,34 @@ begin
|
|
470
469
|
f.puts JSON.pretty_generate(pid_json)
|
471
470
|
end
|
472
471
|
|
472
|
+
filter_r1 = nil
|
473
|
+
filter_r2 = nil
|
474
|
+
r1_passed_seq = nil
|
475
|
+
r2_passed_seq = nil
|
476
|
+
r1_temp = nil
|
477
|
+
r2_temp = nil
|
478
|
+
r1_temp_sh = nil
|
479
|
+
r2_temp_sh = nil
|
480
|
+
r1_consensus_filtered = nil
|
481
|
+
r2_consensus_filtered = nil
|
482
|
+
consensus_filtered = nil
|
483
|
+
pid_json = nil
|
484
|
+
consensus = nil
|
485
|
+
r1_seq = nil
|
486
|
+
r2_seq = nil
|
487
|
+
bio_r1 = nil
|
488
|
+
bio_r2 = nil
|
489
|
+
id = nil
|
490
|
+
primer_id_count = nil
|
491
|
+
primer_id_dis = nil
|
492
|
+
primer_id_list = nil
|
493
|
+
primer_id_count_over_n = nil
|
494
|
+
r1_sub_seq = nil
|
495
|
+
r2_sub_seq = nil
|
496
|
+
common_keys = nil
|
497
|
+
|
498
|
+
GC.start
|
499
|
+
|
473
500
|
# start end-join
|
474
501
|
def end_join(dir, option, overlap)
|
475
502
|
shp = ViralSeq::SeqHashPair.fa(dir)
|
@@ -492,7 +519,6 @@ begin
|
|
492
519
|
|
493
520
|
if primer[:end_join]
|
494
521
|
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
495
|
-
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
496
522
|
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
497
523
|
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
498
524
|
|
@@ -502,6 +528,11 @@ begin
|
|
502
528
|
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
503
529
|
end
|
504
530
|
|
531
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
532
|
+
if export_raw
|
533
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
534
|
+
end
|
535
|
+
|
505
536
|
if primer[:TCS_QC]
|
506
537
|
ref_start = primer[:ref_start]
|
507
538
|
ref_end = primer[:ref_end]
|
@@ -513,42 +544,11 @@ begin
|
|
513
544
|
if ref_end == 0
|
514
545
|
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
515
546
|
end
|
516
|
-
if primer[:end_join_option] == 1
|
517
|
-
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
518
|
-
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
519
|
-
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
520
|
-
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
521
|
-
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
522
|
-
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
523
|
-
joined_seq = {}
|
524
|
-
new_r1_seq.each do |seq_name, seq|
|
525
|
-
next unless seq
|
526
|
-
next unless new_r2_seq[seq_name]
|
527
|
-
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
528
|
-
end
|
529
|
-
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
530
547
|
|
531
|
-
|
532
|
-
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
533
|
-
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
534
|
-
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
535
|
-
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
536
|
-
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
537
|
-
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
538
|
-
joined_seq_raw = {}
|
539
|
-
new_r1_seq_raw.each do |seq_name, seq|
|
540
|
-
next unless seq
|
541
|
-
next unless new_r2_seq_raw[seq_name]
|
542
|
-
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
543
|
-
end
|
544
|
-
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
545
|
-
end
|
546
|
-
else
|
547
|
-
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
548
|
+
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
548
549
|
|
549
|
-
|
550
|
-
|
551
|
-
end
|
550
|
+
if export_raw
|
551
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
552
552
|
end
|
553
553
|
|
554
554
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
data/lib/viral_seq/R.rb
CHANGED
@@ -14,7 +14,9 @@ module ViralSeq
|
|
14
14
|
|
15
15
|
# check if required R packages is installed.
|
16
16
|
def self.check_R_packages
|
17
|
-
|
17
|
+
file = File.join(ViralSeq.root, "viral_seq", "util", "check_env.r")
|
18
|
+
safe_file = Shellwords.escape(file)
|
19
|
+
if system "Rscript #{safe_file}"
|
18
20
|
return 0
|
19
21
|
else
|
20
22
|
raise "Non-zero exit code. Error happens when checking required R packages."
|
data/lib/viral_seq/seq_hash.rb
CHANGED
@@ -656,7 +656,7 @@ module ViralSeq
|
|
656
656
|
|
657
657
|
def nt_variants
|
658
658
|
return_obj = {}
|
659
|
-
|
659
|
+
|
660
660
|
tcs_number = self.size
|
661
661
|
dl = ViralSeq::TcsCore.detection_limit(tcs_number)
|
662
662
|
fdr_hash = self.fdr
|
@@ -869,7 +869,7 @@ module ViralSeq
|
|
869
869
|
# @param start_nt [Integer,Range,Array] start nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
|
870
870
|
# @param end_nt [Integer,Range,Array] end nt position(s) on the reference genome, can be a single number (Integer), a range of Integers (Range), or an Array
|
871
871
|
# @param indel [Boolean] allow indels or not, `true` or `false`
|
872
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
872
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
873
873
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
874
874
|
# @return [ViralSeq::SeqHash] a new ViralSeq::SeqHash object with only the sequences that meet the QC criteria
|
875
875
|
# @example QC for sequences in a FASTA files
|
@@ -880,17 +880,19 @@ module ViralSeq
|
|
880
880
|
# filtered_seqhash.dna_hash.size
|
881
881
|
# => 4
|
882
882
|
|
883
|
-
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2
|
884
|
-
start_nt = start_nt
|
885
|
-
end_nt = end_nt
|
883
|
+
def hiv_seq_qc(start_nt, end_nt, indel=true, ref_option = :HXB2)
|
884
|
+
start_nt = position_helper(start_nt)
|
885
|
+
end_nt = position_helper(end_nt)
|
886
|
+
|
886
887
|
seq_hash = self.dna_hash.dup
|
887
888
|
seq_hash_unique = seq_hash.values.uniq
|
888
889
|
seq_hash_unique_pass = []
|
889
890
|
|
890
|
-
seq_hash_unique.
|
891
|
-
|
892
|
-
loc =
|
893
|
-
|
891
|
+
batch_locator = VirustLocator::Locator.exec(seq_hash_unique.join("\s"), "nt", 1, ref_option).split("\n")
|
892
|
+
seq_hash_unique.each_with_index do |seq, i|
|
893
|
+
loc = batch_locator[i]
|
894
|
+
loc = locator_helper(loc)
|
895
|
+
next unless loc
|
894
896
|
if start_nt.include?(loc[0]) && end_nt.include?(loc[1])
|
895
897
|
if indel
|
896
898
|
seq_hash_unique_pass << seq
|
@@ -898,8 +900,11 @@ module ViralSeq
|
|
898
900
|
seq_hash_unique_pass << seq
|
899
901
|
end
|
900
902
|
end
|
903
|
+
|
901
904
|
end
|
905
|
+
|
902
906
|
seq_pass = []
|
907
|
+
|
903
908
|
seq_hash_unique_pass.each do |seq|
|
904
909
|
seq_hash.each do |seq_name, orginal_seq|
|
905
910
|
if orginal_seq == seq
|
@@ -909,10 +914,10 @@ module ViralSeq
|
|
909
914
|
end
|
910
915
|
end
|
911
916
|
self.sub(seq_pass)
|
912
|
-
end # end of #hiv_seq_qc
|
917
|
+
end # end of #hiv_seq_qc # end of #hiv_seq_qc
|
913
918
|
|
914
919
|
# sequence locator for SeqHash object, resembling HIV Sequence Locator from LANL
|
915
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
920
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
916
921
|
# @return [Array] two dimensional array `[[],[],[],...]` for each sequence, including the following information:
|
917
922
|
#
|
918
923
|
# title of the SeqHash object (String)
|
@@ -1341,7 +1346,7 @@ module ViralSeq
|
|
1341
1346
|
seq_hash_unique = seq_hash.uniq_hash
|
1342
1347
|
trimmed_seq_hash = {}
|
1343
1348
|
seq_hash_unique.each do |seq, names|
|
1344
|
-
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option
|
1349
|
+
trimmed_seq = ViralSeq::Sequence.new('', seq).sequence_clip(start_nt, end_nt, ref_option).dna
|
1345
1350
|
names.each do |name|
|
1346
1351
|
trimmed_seq_hash[name] = trimmed_seq
|
1347
1352
|
end
|
@@ -1431,6 +1436,37 @@ module ViralSeq
|
|
1431
1436
|
var_count.sort_by{|key,_value|key}.to_h
|
1432
1437
|
end # end of #varaint_for_poisson
|
1433
1438
|
|
1439
|
+
# helper for start/end position for #hiv_seq_qc
|
1440
|
+
def position_helper(position)
|
1441
|
+
if position.is_a?(Range)
|
1442
|
+
return position
|
1443
|
+
elsif position.is_a?(Integer)
|
1444
|
+
return position..position
|
1445
|
+
elsif position.is_a?(String)
|
1446
|
+
return position.to_i..position.to_i
|
1447
|
+
elsif position.is_a?(Array)
|
1448
|
+
return position[0].to_i..position[1].to_i
|
1449
|
+
else
|
1450
|
+
raise "Position #{position} not recognized"
|
1451
|
+
end
|
1452
|
+
end # position_helper
|
1453
|
+
|
1454
|
+
# helper for batch locator
|
1455
|
+
# @param loc [String] the output of batch locator
|
1456
|
+
# @return [Array] the locator information in an array
|
1457
|
+
def locator_helper(loc)
|
1458
|
+
loc = loc.split("\t")
|
1459
|
+
loc[0] = loc[0].to_i
|
1460
|
+
loc[1] = loc[1].to_i
|
1461
|
+
loc[2] = loc[2].to_f.round(1)
|
1462
|
+
if loc[3].to_s.downcase == "true"
|
1463
|
+
loc[3] = true
|
1464
|
+
else
|
1465
|
+
loc[3] = false
|
1466
|
+
end
|
1467
|
+
return loc
|
1468
|
+
end
|
1469
|
+
|
1434
1470
|
end # end of SeqHash
|
1435
1471
|
|
1436
1472
|
end # end of ViralSeq
|
data/lib/viral_seq/sequence.rb
CHANGED
@@ -165,7 +165,7 @@ module ViralSeq
|
|
165
165
|
|
166
166
|
# HIV sequence locator function, resembling HIV Sequence Locator from LANL
|
167
167
|
# # current version only supports nucleotide sequence, not for amino acid sequence.
|
168
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
168
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
169
169
|
# @param path_to_muscle [String], path to the muscle executable, if not provided, use MuscleBio to run Muscle
|
170
170
|
# @return [Array] an array of the following info:
|
171
171
|
#
|
@@ -181,182 +181,32 @@ module ViralSeq
|
|
181
181
|
#
|
182
182
|
# aligned_reference_sequence (String)
|
183
183
|
#
|
184
|
-
# @example identify the location of the input sequence on the
|
184
|
+
# @example identify the location of the input sequence on the HXB2 genome
|
185
185
|
# sequence = 'AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC'
|
186
186
|
# s = ViralSeq::Sequence.new('my_sequence', sequence)
|
187
|
-
# loc = s.locator(:
|
188
|
-
# h = ViralSeq::SeqHash.new; h.dna_hash['
|
187
|
+
# loc = s.locator(:HXB2)
|
188
|
+
# h = ViralSeq::SeqHash.new; h.dna_hash['HXB2'] = loc[5]; h.dna_hash[s.name] = loc[4]
|
189
189
|
# rs_string = h.to_rsphylip.split("\n")[1..-1].join("\n") # get a relaxed phylip format string for display of alignment.
|
190
|
-
# puts "The input sequence \"#{s.name}\" is located on the
|
191
|
-
# => The input sequence "my_sequence" is located on the
|
192
|
-
# => It is
|
190
|
+
# puts "The input sequence \"#{s.name}\" is located on the HXB2 nt sequence from #{loc[0].to_s} to #{loc[1].to_s}.\nIt is #{loc[2].round(1).to_s}% similar to the reference.\nIt #{loc[3]? "does" : "does not"} have indels.\nThe alignment is\n#{rs_string}"
|
191
|
+
# => The input sequence "my_sequence" is located on the HXB2 nt sequence from 2333 to 2433.
|
192
|
+
# => It is 97.0% similar to the reference.
|
193
193
|
# => It does not have indels.
|
194
194
|
# => The alignment is
|
195
|
-
# =>
|
195
|
+
# => HXB2 AGCAGATGAT ACAGTATTAG AAGAAATGAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAGTATGAT C
|
196
196
|
# => my_sequence AGCAGATGAT ACAGTATTAG AAGAAATAAA TTTGCCAGGA AGATGGAAAC CAAAAATGAT AGGGGGAATT GGAGGTTTTA TCAAAGTAAG ACAATATGAT C
|
197
197
|
# @see https://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html LANL Sequence Locator
|
198
|
-
|
199
|
-
def locator(ref_option = :HXB2, path_to_muscle = false)
|
198
|
+
def locator(ref_option = :HXB2, algorithm = 1)
|
200
199
|
seq = self.dna
|
201
|
-
|
202
|
-
|
200
|
+
ref = ref_option.to_s
|
203
201
|
begin
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
211
|
-
gap_begin = $1.size
|
212
|
-
gap_end = $3.size
|
213
|
-
aln_test2 = $2
|
214
|
-
ref = aln_seq[0]
|
215
|
-
ref = ref[gap_begin..(-gap_end-1)]
|
216
|
-
ref_size = ref.size
|
217
|
-
if ref_size > 1.3*(seq.size)
|
218
|
-
l1 = l1 + gap_begin
|
219
|
-
l2 = l2 + gap_end
|
220
|
-
max_seq = aln_test2.scan(/[ACGT]+/).max_by(&:length)
|
221
|
-
aln_test2 =~ /#{max_seq}/
|
222
|
-
before_aln_seq = $`
|
223
|
-
before_aln = $`.size
|
224
|
-
post_aln_seq = $'
|
225
|
-
post_aln = $'.size
|
226
|
-
before_aln_seq_size = before_aln_seq.scan(/[ACGT]+/).join("").size
|
227
|
-
b1 = (1.3 * before_aln_seq_size).to_i
|
228
|
-
post_aln_seq_size = post_aln_seq.scan(/[ACGT]+/).join("").size
|
229
|
-
b2 = (1.3 * post_aln_seq_size).to_i
|
230
|
-
if (before_aln > seq.size) and (post_aln <= seq.size)
|
231
|
-
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1)]
|
232
|
-
l1 = l1 + (before_aln - b1)
|
233
|
-
elsif (post_aln > seq.size) and (before_aln <= seq.size)
|
234
|
-
ref = ref[before_aln..(ref_size - post_aln - 1 + b2)]
|
235
|
-
l2 = l2 + post_aln - b2
|
236
|
-
elsif (post_aln > seq.size) and (before_aln > seq.size)
|
237
|
-
ref = ref[(before_aln - b1)..(ref_size - post_aln - 1 + b2)]
|
238
|
-
l1 = l1 + (before_aln - b1)
|
239
|
-
l2 = l2 + (post_aln - b2)
|
240
|
-
end
|
241
|
-
|
242
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
243
|
-
aln_test = aln_seq[1]
|
244
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
245
|
-
gap_begin = $1.size
|
246
|
-
gap_end = $3.size
|
247
|
-
ref = aln_seq[0]
|
248
|
-
ref = ref[gap_begin..(-gap_end-1)]
|
249
|
-
end
|
250
|
-
|
251
|
-
aln_test = aln_seq[1]
|
252
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
253
|
-
gap_begin = $1.size
|
254
|
-
gap_end = $3.size
|
255
|
-
aln_test = $2
|
256
|
-
aln_test =~ /^(\w+)(\-*)\w/
|
257
|
-
s1 = $1.size
|
258
|
-
g1 = $2.size
|
259
|
-
aln_test =~ /\w(\-*)(\w+)$/
|
260
|
-
s2 = $2.size
|
261
|
-
g2 = $1.size
|
262
|
-
|
263
|
-
l1 = l1 + gap_begin
|
264
|
-
l2 = l2 + gap_end
|
265
|
-
repeat = 0
|
266
|
-
|
267
|
-
if g1 == g2 and (s1 + g1 + s2) == ref.size
|
268
|
-
if s1 > s2 and g2 >= s2
|
269
|
-
ref = ref[0..(-g2-1)]
|
270
|
-
repeat = 1
|
271
|
-
l2 = l2 + g2
|
272
|
-
elsif s1 < s2 and g1 >= s1
|
273
|
-
ref = ref[g1..-1]
|
274
|
-
repeat = 1
|
275
|
-
l1 = l1 + g1
|
276
|
-
end
|
277
|
-
else
|
278
|
-
if g1 >= s1
|
279
|
-
ref = ref[g1..-1]
|
280
|
-
repeat = 1
|
281
|
-
l1 = l1 + g1
|
282
|
-
end
|
283
|
-
if g2 >= s2
|
284
|
-
ref = ref[0..(-g2 - 1)]
|
285
|
-
repeat = 1
|
286
|
-
l2 = l2 + g2
|
287
|
-
end
|
288
|
-
end
|
289
|
-
|
290
|
-
while repeat == 1
|
291
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
292
|
-
aln_test = aln_seq[1]
|
293
|
-
aln_test =~ /^(\-*)(\w.*\w)(\-*)$/
|
294
|
-
gap_begin = $1.size
|
295
|
-
gap_end = $3.size
|
296
|
-
aln_test = $2
|
297
|
-
aln_test =~ /^(\w+)(\-*)\w/
|
298
|
-
s1 = $1.size
|
299
|
-
g1 = $2.size
|
300
|
-
aln_test =~ /\w(\-*)(\w+)$/
|
301
|
-
s2 = $2.size
|
302
|
-
g2 = $1.size
|
303
|
-
ref = aln_seq[0]
|
304
|
-
ref = ref[gap_begin..(-gap_end-1)]
|
305
|
-
l1 = l1 + gap_begin
|
306
|
-
l2 = l2 + gap_end
|
307
|
-
repeat = 0
|
308
|
-
if g1 >= s1
|
309
|
-
ref = ref[g1..-1]
|
310
|
-
repeat = 1
|
311
|
-
l1 = l1 + g1
|
312
|
-
end
|
313
|
-
if g2 >= s2
|
314
|
-
ref = ref[0..(-g2 - 1)]
|
315
|
-
repeat = 1
|
316
|
-
l2 = l2 + g2
|
317
|
-
end
|
318
|
-
end
|
319
|
-
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
320
|
-
|
321
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
322
|
-
aln_test = aln_seq[1]
|
323
|
-
ref = aln_seq[0]
|
324
|
-
|
325
|
-
#refine alignment
|
326
|
-
|
327
|
-
if ref =~ /^(\-+)/
|
328
|
-
l1 = l1 - $1.size
|
329
|
-
elsif ref =~ /(\-+)$/
|
330
|
-
l2 = l2 - $1.size
|
331
|
-
end
|
332
|
-
|
333
|
-
if (ori_ref_l - l2 - 1) >= l1
|
334
|
-
ref = ori_ref[l1..(ori_ref_l - l2 - 1)]
|
335
|
-
aln_seq = ViralSeq::Muscle.align(ref, seq, :Super5, path_to_muscle)
|
336
|
-
aln_test = aln_seq[1]
|
337
|
-
ref = aln_seq[0]
|
338
|
-
|
339
|
-
ref_size = ref.size
|
340
|
-
sim_count = 0
|
341
|
-
(0..(ref_size-1)).each do |n|
|
342
|
-
ref_base = ref[n]
|
343
|
-
test_base = aln_test[n]
|
344
|
-
sim_count += 1 if ref_base == test_base
|
345
|
-
end
|
346
|
-
similarity = (sim_count/ref_size.to_f*100).round(1)
|
347
|
-
|
348
|
-
loc_p1 = l1 + 1
|
349
|
-
loc_p2 = ori_ref_l - l2
|
350
|
-
if seq.size != (loc_p2 - loc_p1 + 1)
|
351
|
-
indel = true
|
352
|
-
elsif aln_test.include?("-")
|
353
|
-
indel = true
|
354
|
-
else
|
355
|
-
indel = false
|
356
|
-
end
|
357
|
-
return [loc_p1,loc_p2,similarity,indel,aln_test,ref]
|
202
|
+
loc = VirustLocator::Locator.exec(seq, "nt", algorithm, ref).split("\t")
|
203
|
+
loc[0] = loc[0].to_i
|
204
|
+
loc[1] = loc[1].to_i
|
205
|
+
loc[2] = loc[2].to_f.round(1)
|
206
|
+
if loc[3].to_s.downcase == "true"
|
207
|
+
loc[3] = true
|
358
208
|
else
|
359
|
-
|
209
|
+
loc[3] = false
|
360
210
|
end
|
361
211
|
rescue => e
|
362
212
|
puts "Unexpected error occured."
|
@@ -366,12 +216,13 @@ module ViralSeq
|
|
366
216
|
puts "ViralSeq.sequence_locator returns nil"
|
367
217
|
return nil
|
368
218
|
end
|
369
|
-
|
219
|
+
return loc
|
220
|
+
end #end of locator
|
370
221
|
|
371
222
|
# Given start and end positions on the reference genome, return a sub-sequence of the target sequence in that range
|
372
223
|
# @param p1 [Integer] start position number on the reference genome
|
373
224
|
# @param p2 [Integer] end position number on the reference genome
|
374
|
-
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:
|
225
|
+
# @param ref_option [Symbol], name of reference genomes, options are `:HXB2`, `:SIVmm239`
|
375
226
|
# @note the former path_to_muscle parameter is no longer accepted; the location is obtained by calling #locator with ref_option
|
376
227
|
# @return [ViralSeq::Sequence, nil] a new ViralSeq::Sequence object covering the input range on the reference genome, or nil
|
377
228
|
# if either the start or end position is beyond the range of the target sequence.
|
@@ -381,8 +232,8 @@ module ViralSeq
|
|
381
232
|
# s.sequence_clip(2333, 2433, :HXB2).dna
|
382
233
|
# => "AGCAGATGATACAGTATTAGAAGAAATAAATTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAATATGATC"
|
383
234
|
|
384
|
-
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2
|
385
|
-
loc = self.locator(ref_option
|
235
|
+
def sequence_clip(p1 = 0, p2 = 0, ref_option = :HXB2)
|
236
|
+
loc = self.locator(ref_option)
|
386
237
|
l1 = loc[0]
|
387
238
|
l2 = loc[1]
|
388
239
|
if (p1 >= l1) & (p2 <= l2)
|
data/lib/viral_seq/string.rb
CHANGED
@@ -56,13 +56,13 @@ class String
|
|
56
56
|
Regexp.new match
|
57
57
|
end
|
58
58
|
|
59
|
-
# parse the nucleotide sequences as an Array of Array
|
59
|
+
# parse the nucleotide sequences as an Array of Array
|
60
60
|
# @return [Array] Array of Array at each position
|
61
61
|
# @example parse a sequence with ambiguities to Array of Array
|
62
62
|
# "ATRWCG".nt_to_array
|
63
63
|
# => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
|
64
|
-
|
65
|
-
def nt_to_array
|
64
|
+
|
65
|
+
def nt_to_array
|
66
66
|
return_array = []
|
67
67
|
self.each_char.each do |base|
|
68
68
|
base_array = base.to_list
|
@@ -75,9 +75,6 @@ class String
|
|
75
75
|
# compare the given nt sequence string with the ref sequence string
|
76
76
|
# @param ref [String] the ref sequence string to compare with
|
77
77
|
# @return [Integer] Number of differences
|
78
|
-
# @example parse a sequence with ambiguities to Array of Array
|
79
|
-
# "ATRWCG".nt_to_array
|
80
|
-
# => [["A"], ["T"], ["A", "G"], ["A", "T"], ["C"], ["G"]]
|
81
78
|
|
82
79
|
def nt_diff(ref)
|
83
80
|
count_diff = 0
|
data/lib/viral_seq/tcs_core.rb
CHANGED
@@ -331,6 +331,10 @@ module ViralSeq
|
|
331
331
|
return false
|
332
332
|
elsif seq[1..-2] =~ /N/ # sequences with ambiguities except the 1st and last position removed
|
333
333
|
return false
|
334
|
+
elsif seq =~ /G{11}/ # a string of poly-G indicates poor quality in 2-color chemistry
|
335
|
+
return false
|
336
|
+
elsif seq =~ /C{11}/ # a string of poly-C indicates poor quality in 2-color chemistry
|
337
|
+
return false
|
334
338
|
elsif seq =~ /A{11}/ # a string of poly-A indicates adaptor sequence
|
335
339
|
return false
|
336
340
|
elsif seq =~ /T{11}/ # a string of poly-T indicates adaptor sequence
|
data/lib/viral_seq/tcs_dr.rb
CHANGED
@@ -186,7 +186,7 @@ module ViralSeq
|
|
186
186
|
:trim=>false},
|
187
187
|
{:region=>"PR",
|
188
188
|
:cdna=>
|
189
|
-
"
|
189
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
|
190
190
|
:forward=>
|
191
191
|
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
|
192
192
|
:majority=>0,
|
@@ -247,6 +247,87 @@ module ViralSeq
|
|
247
247
|
]
|
248
248
|
},
|
249
249
|
|
250
|
+
"v4" => {:platform_error_rate=>0.01,
|
251
|
+
:primer_pairs=>
|
252
|
+
[{:region=>"RT",
|
253
|
+
:cdna=>
|
254
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTAAGGAATGGAGGTTCTTTCTGATG",
|
255
|
+
:forward=>
|
256
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGGCCATTGACAGAAGAAAAAATAAAAGC",
|
257
|
+
:majority=>0,
|
258
|
+
:end_join=>true,
|
259
|
+
:end_join_option=>1,
|
260
|
+
:overlap=>0,
|
261
|
+
:TCS_QC=>true,
|
262
|
+
:ref_genome=>"HXB2",
|
263
|
+
:ref_start=>2648,
|
264
|
+
:ref_end=>3209,
|
265
|
+
:indel=>true,
|
266
|
+
:trim=>false},
|
267
|
+
{:region=>"PR",
|
268
|
+
:cdna=>
|
269
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNCAGTTTAACTTTTGGGCCATCCATTCC",
|
270
|
+
:forward=>
|
271
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTCAGAGCAGACCAGAGCCAACAGCCCCA",
|
272
|
+
:majority=>0,
|
273
|
+
:end_join=>true,
|
274
|
+
:end_join_option=>3,
|
275
|
+
:TCS_QC=>true,
|
276
|
+
:ref_genome=>"HXB2",
|
277
|
+
:ref_start=>0,
|
278
|
+
:ref_end=>2591,
|
279
|
+
:indel=>true,
|
280
|
+
:trim=>true,
|
281
|
+
:trim_ref=>"HXB2",
|
282
|
+
:trim_ref_start=>2253,
|
283
|
+
:trim_ref_end=>2549},
|
284
|
+
{:region=>"IN",
|
285
|
+
:cdna=>
|
286
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCATCACCTGCCATCTGTTTTCCAT",
|
287
|
+
:forward=>"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGCAGAAGTTATYCCAGCAGAAACA",
|
288
|
+
:majority=>0,
|
289
|
+
:end_join=>true,
|
290
|
+
:end_join_option=>2,
|
291
|
+
:overlap=>3,
|
292
|
+
:TCS_QC=>true,
|
293
|
+
:ref_genome=>"HXB2",
|
294
|
+
:ref_start=>4509,
|
295
|
+
:ref_end=>5040,
|
296
|
+
:indel=>true,
|
297
|
+
:trim=>false},
|
298
|
+
{:region=>"V1V3",
|
299
|
+
:cdna=>
|
300
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCCATTTTGCTYTAYTRABVTTACAATRTGC",
|
301
|
+
:forward=>
|
302
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNTTATGGGATCAAAGCCTAAAGCCATGTGTA",
|
303
|
+
:majority=>0,
|
304
|
+
:end_join=>true,
|
305
|
+
:end_join_option=>1,
|
306
|
+
:overlap=>0,
|
307
|
+
:TCS_QC=>true,
|
308
|
+
:ref_genome=>"HXB2",
|
309
|
+
:ref_start=>6585,
|
310
|
+
:ref_end=>7205..7210,
|
311
|
+
:indel=>true,
|
312
|
+
:trim=>false},
|
313
|
+
{:region=>"CA",
|
314
|
+
:cdna=>
|
315
|
+
"GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTNNNNNNNNNNNCAGTCAACAAGGTTTCTGTCATCCAATTTTTTAC",
|
316
|
+
:forward=>
|
317
|
+
"GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAGNNNNGTCAGCCAAAATTACCCTATAGTGC",
|
318
|
+
:majority=>0,
|
319
|
+
:end_join=>true,
|
320
|
+
:end_join_option=>1,
|
321
|
+
:overlap=>0,
|
322
|
+
:TCS_QC=>true,
|
323
|
+
:ref_genome=>"HXB2",
|
324
|
+
:ref_start=>1196,
|
325
|
+
:ref_end=>1725,
|
326
|
+
:indel=>true,
|
327
|
+
:trim=>false}
|
328
|
+
]
|
329
|
+
},
|
330
|
+
|
250
331
|
|
251
332
|
}
|
252
333
|
|
@@ -54,6 +54,58 @@
|
|
54
54
|
}
|
55
55
|
}
|
56
56
|
|
57
|
+
},
|
58
|
+
{
|
59
|
+
"version": "v4",
|
60
|
+
"DRM_range": {
|
61
|
+
"CAI": [56,57, 66, 67, 70, 74, 105, 107],
|
62
|
+
"PI": [23, 24, 30, 32, 46, 47, 48, 50, 53, 54, 73, 76, 82, 83, 84, 88, 90],
|
63
|
+
"NRTI": [41, 65, 67, 69, 70, 74, 75, 77, 115, 116, 151, 184, 210, 215, 219],
|
64
|
+
"NNRTI": [98, 100, 101, 103, 106, 138, 179, 181, 188, 190],
|
65
|
+
"INSTI": [95, 97, 121, 140, 143, 147, 148, 155, 263]
|
66
|
+
},
|
67
|
+
"seq_coord": {
|
68
|
+
"CA": {
|
69
|
+
"minimum": 1196,
|
70
|
+
"maximum": 1725,
|
71
|
+
"gap": {
|
72
|
+
"minimum": 1466,
|
73
|
+
"maximum": 1471
|
74
|
+
}
|
75
|
+
},
|
76
|
+
"PR": {
|
77
|
+
"minimum": 2253,
|
78
|
+
"maximum": 2549
|
79
|
+
},
|
80
|
+
"RT": {
|
81
|
+
"minimum": 2648,
|
82
|
+
"maximum": 3209,
|
83
|
+
"gap": {
|
84
|
+
"minimum": 2915,
|
85
|
+
"maximum": 2949
|
86
|
+
}
|
87
|
+
},
|
88
|
+
"IN": {
|
89
|
+
"minimum": 4509,
|
90
|
+
"maximum": 5040
|
91
|
+
}
|
92
|
+
},
|
93
|
+
"seq_drm_correlation": {
|
94
|
+
"CA": ["CAI"],
|
95
|
+
"RT": ["NRTI", "NNRTI"],
|
96
|
+
"PR": ["PI"],
|
97
|
+
"IN": ["INSTI"]
|
98
|
+
},
|
99
|
+
"ref_info": {
|
100
|
+
"ref_type": "HXB2",
|
101
|
+
"ref_coord": {
|
102
|
+
"CA": [1186,1878],
|
103
|
+
"PR": [2253,2549],
|
104
|
+
"RT": [2550,3869],
|
105
|
+
"IN": [4230,5096]
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
57
109
|
},
|
58
110
|
{
|
59
111
|
"version": "v1",
|
data/lib/viral_seq/version.rb
CHANGED
data/lib/viral_seq.rb
CHANGED
data/viral_seq.gemspec
CHANGED
@@ -37,6 +37,9 @@ Gem::Specification.new do |spec|
|
|
37
37
|
# muscle_bio gem required
|
38
38
|
spec.add_runtime_dependency "muscle_bio", "= 0.4"
|
39
39
|
|
40
|
+
# virust-locator-ruby required
|
41
|
+
spec.add_runtime_dependency "virust-locator-ruby", "~> 0.3"
|
42
|
+
|
40
43
|
# colorize gem required
|
41
44
|
spec.add_runtime_dependency "colorize", "~> 0.1"
|
42
45
|
|
@@ -47,4 +50,6 @@ Gem::Specification.new do |spec|
|
|
47
50
|
spec.add_runtime_dependency "combine_pdf", "~> 1.0", '>= 1.0.0'
|
48
51
|
|
49
52
|
spec.requirements << 'R required for some functions'
|
53
|
+
|
54
|
+
spec.add_dependency "shellwords", "~> 0.2"
|
50
55
|
end
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: viral_seq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuntai Zhou
|
8
8
|
- Michael Clark
|
9
|
-
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: bundler
|
@@ -67,6 +66,20 @@ dependencies:
|
|
67
66
|
- - '='
|
68
67
|
- !ruby/object:Gem::Version
|
69
68
|
version: '0.4'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: virust-locator-ruby
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.3'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.3'
|
70
83
|
- !ruby/object:Gem::Dependency
|
71
84
|
name: colorize
|
72
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -141,6 +154,20 @@ dependencies:
|
|
141
154
|
- - ">="
|
142
155
|
- !ruby/object:Gem::Version
|
143
156
|
version: 1.0.0
|
157
|
+
- !ruby/object:Gem::Dependency
|
158
|
+
name: shellwords
|
159
|
+
requirement: !ruby/object:Gem::Requirement
|
160
|
+
requirements:
|
161
|
+
- - "~>"
|
162
|
+
- !ruby/object:Gem::Version
|
163
|
+
version: '0.2'
|
164
|
+
type: :runtime
|
165
|
+
prerelease: false
|
166
|
+
version_requirements: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - "~>"
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0.2'
|
144
171
|
description: |-
|
145
172
|
A Ruby Gem with bioinformatics tools for processing viral NGS data.
|
146
173
|
Specifically for Primer-ID sequencing and HIV drug resistance analysis.
|
@@ -226,8 +253,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
226
253
|
version: 1.3.6
|
227
254
|
requirements:
|
228
255
|
- R required for some functions
|
229
|
-
rubygems_version: 3.
|
230
|
-
signing_key:
|
256
|
+
rubygems_version: 3.6.7
|
231
257
|
specification_version: 4
|
232
258
|
summary: A Ruby Gem containing bioinformatics tools for processing viral NGS data.
|
233
259
|
test_files: []
|