viral_seq 1.9.1 → 1.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/Gemfile.lock +6 -1
- data/README.md +134 -120
- data/bin/locator +2 -2
- data/bin/tcs +38 -38
- data/lib/viral_seq/R.rb +3 -1
- data/lib/viral_seq/seq_hash.rb +50 -12
- data/lib/viral_seq/sequence.rb +22 -171
- data/lib/viral_seq/string.rb +3 -6
- data/lib/viral_seq/tcs_core.rb +4 -0
- data/lib/viral_seq/tcs_dr.rb +85 -4
- data/lib/viral_seq/tcs_json.rb +2 -2
- data/lib/viral_seq/util/drm_versions_config.json +52 -0
- data/lib/viral_seq/version.rb +2 -2
- data/lib/viral_seq.rb +2 -0
- data/viral_seq.gemspec +5 -0
- metadata +31 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 280f17e2219e2ddeccc8ba9904289c71339e5a956be850fc80bc63ad89f39a33
|
4
|
+
data.tar.gz: 07efd4b233f28e8d19cd31b588b1b5d430fbdbccaddae0ffccf6b32b9419de75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 83b0fdee89ea5a75227fcdc5e5e58bad83102f27ac29506f81be71785a61caf0815bdc75d488253c0f88392df8e5ed9d43ea480d5d1b2333365088d023da756b
|
7
|
+
data.tar.gz: 90af1682bc4718be2261fdca138bdf445af38d35785b4b7d291107b4e07b1c75a6cdc75db83e17eee129f7a6113687ef822c42eb783515b9f9653df26c726d19
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
4
|
+
viral_seq (1.10.0)
|
5
5
|
colorize (~> 0.1)
|
6
6
|
combine_pdf (~> 1.0, >= 1.0.0)
|
7
7
|
muscle_bio (= 0.4)
|
8
8
|
prawn (~> 2.3, >= 2.3.0)
|
9
9
|
prawn-table (~> 0.2, >= 0.2.0)
|
10
|
+
shellwords (~> 0.2)
|
11
|
+
virust-locator-ruby (~> 0.3)
|
10
12
|
|
11
13
|
GEM
|
12
14
|
remote: https://rubygems.org/
|
@@ -41,8 +43,11 @@ GEM
|
|
41
43
|
rspec-support (~> 3.13.0)
|
42
44
|
rspec-support (3.13.1)
|
43
45
|
ruby-rc4 (0.1.5)
|
46
|
+
shellwords (0.2.0)
|
44
47
|
ttfunk (1.8.0)
|
45
48
|
bigdecimal (~> 3.1)
|
49
|
+
virust-locator-ruby (0.3.0)
|
50
|
+
shellwords (~> 0.2)
|
46
51
|
|
47
52
|
PLATFORMS
|
48
53
|
ruby
|
data/README.md
CHANGED
@@ -16,10 +16,10 @@ CLI tools `tcs`, `tcs_sdrm`, `tcs_log` and `locator` included in the gem.
|
|
16
16
|
|
17
17
|
## Illustration for the Primer ID Sequencing
|
18
18
|
|
19
|
-
|
20
19
|

|
21
20
|
|
22
21
|
### Reference readings on the Primer ID sequencing
|
22
|
+
|
23
23
|
[Explanation of Primer ID sequencing](https://doi.org/10.21769/BioProtoc.3938)
|
24
24
|
[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
|
25
25
|
[Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
|
@@ -41,11 +41,13 @@ Required RubyGems version: >= 1.3.6
|
|
41
41
|
### Executables
|
42
42
|
|
43
43
|
### `tcs`
|
44
|
+
|
44
45
|
Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
|
45
46
|
|
46
47
|
Web-based `tcs` analysis can be accessed at https://primer-id.org/
|
47
48
|
|
48
49
|
Example commands:
|
50
|
+
|
49
51
|
```bash
|
50
52
|
$ tcs -p params.json # run TCS pipeline with params.json
|
51
53
|
$ tcs -p params.json -i DIRECTORY
|
@@ -61,12 +63,13 @@ Example commands:
|
|
61
63
|
[sample params.json for the tcs-dr pipeline](./docs/dr.json)
|
62
64
|
|
63
65
|
---
|
66
|
+
|
64
67
|
### `tcs_log`
|
65
68
|
|
66
69
|
Use `tcs_log` script to pool run logs and TCS fasta files after one batch of `tcs` jobs. This command generates log.html to visualize the sequencing runs.
|
67
70
|
|
68
|
-
|
69
71
|
Example file structure:
|
72
|
+
|
70
73
|
```
|
71
74
|
batch_tcs_jobs/
|
72
75
|
├── lib1
|
@@ -77,21 +80,25 @@ batch_tcs_jobs/
|
|
77
80
|
```
|
78
81
|
|
79
82
|
Example command:
|
83
|
+
|
80
84
|
```bash
|
81
85
|
$ tcs_log batch_tcs_jobs
|
82
86
|
```
|
83
87
|
|
84
88
|
---
|
89
|
+
|
85
90
|
### `tcs_sdrm`
|
86
91
|
|
87
92
|
Use `tcs_sdrm` pipeline for HIV-1 drug resistance mutation and recency.
|
88
93
|
|
89
94
|
Example command:
|
95
|
+
|
90
96
|
```bash
|
91
97
|
$ tcs_sdrm libs_dir
|
92
98
|
```
|
93
99
|
|
94
100
|
libs_dir file structure:
|
101
|
+
|
95
102
|
```
|
96
103
|
libs_dir/
|
97
104
|
├── lib1
|
@@ -109,8 +116,8 @@ libs_dir/
|
|
109
116
|
|
110
117
|
Output data in a new dir as 'libs_dir_SDRM'
|
111
118
|
|
112
|
-
|
113
119
|
**Note: [R](https://www.r-project.org/) and the following R libraries are required:**
|
120
|
+
|
114
121
|
- phangorn
|
115
122
|
- ape
|
116
123
|
- scales
|
@@ -122,11 +129,13 @@ Output data in a new dir as 'libs_dir_SDRM'
|
|
122
129
|
---
|
123
130
|
|
124
131
|
### `locator`
|
132
|
+
|
125
133
|
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
126
134
|
|
127
135
|
```bash
|
128
136
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
129
137
|
```
|
138
|
+
|
130
139
|
---
|
131
140
|
|
132
141
|
## Some Examples
|
@@ -179,248 +188,253 @@ Examine for drug resistance mutations for HIV PR region
|
|
179
188
|
```ruby
|
180
189
|
qc_seqhash.sdrm_hiv_pr(cut_off)
|
181
190
|
```
|
182
|
-
## Known issues
|
183
|
-
|
184
|
-
1. ~~have a conflict with rails.~~
|
185
|
-
2. ~~Update on 03032021. Still have conflict. But in rails gem file, can just use `requires: false` globally and only require "viral_seq" when the module is needed in controller.~~
|
186
|
-
3. The conflict seems to be resovled. It was from a combination of using `!` as a function for factorial and the gem name `viral_seq`. @_@
|
187
191
|
|
188
192
|
## Updates
|
189
193
|
|
194
|
+
### Version-1.10.2-07210225
|
195
|
+
|
196
|
+
1. Fixed a bug processing parameters for HIV sequence QC.
|
197
|
+
|
198
|
+
### Version-1.10.1-05012025
|
199
|
+
|
200
|
+
1. Added quality filter for Illumina 2-color sequencing platforms (filter poly-G and poly-C)
|
201
|
+
2. Replaced `MuscleBio` with [`VirustLocator`](https://github.com/ViralSeq/virust-locator-ruby) for faster and more accurate pairwise alignment.
|
202
|
+
3. Added DR primer version 4.
|
203
|
+
4. Added a helper function to properly treat input params for #hiv_seq_qc.
|
204
|
+
5. Solved the slow-performance issue when spawning a subprocess to call `VirustLocator` while holding a large amount of data in memory. When Ruby runs shell commands, a child process is spawned and shares the parent's memory pages. To set it up, the OS has to walk the parent's entire memory table, causing an incremental delay in each subsequent process spawning. To solve this, I redid the `VirustLocator` API to allow all the arguments to be processed with one shell command instead of spawning individual child processes.
|
205
|
+
|
190
206
|
### Version-1.9.1-12022024
|
191
207
|
|
192
|
-
|
208
|
+
1. Fixed a bug in the `tcs_sdrm` pipeline.
|
193
209
|
|
194
210
|
### Version-1.9.0-11132024
|
195
211
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
212
|
+
1. `ViralSeq::TcsCore::validate_file_name` will not report errors when non-sequence data in the folder, instead these files will be ignored.
|
213
|
+
2. Rewrote the APIs for DRM analysis for HIV. Now uses version config files for the sequencing information and DRM list config files for DRM interpretation. The two config files are located in `/lib/viral_seq/util/`
|
214
|
+
3. `tcs_sdrm` will take a second argument for DRM config versions. Currently supports `["v1", "v2", "v3"]`. Refer to the documentations of the APIs for the details.
|
215
|
+
4. Next update will use secondary command `tcs sdrm` to replace `tcs_sdrm`, and `tcs log` to replace `tcs_log`.
|
200
216
|
|
201
217
|
### Version-1.8.1-06042024
|
202
218
|
|
203
|
-
|
219
|
+
1. Fixed a bug that causes `tcs_sdrm` pipeline to crash.
|
204
220
|
|
205
221
|
### Version-1.8.0-04052024
|
206
222
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
223
|
+
1. Use `muscle-v3.8.1` as default aligner because of the compatibility issues with `muscle-v5` on some platforms.
|
224
|
+
2. Adjust the end-join model for short inserts (insert size less than the read length minus the adaptor size)
|
225
|
+
3. Add an option in the DR pipeline for different versions of the pipeline, default version as "v1".
|
226
|
+
4. Add Days Post Infection (DPI) prediction model in the SDRM pipeline.
|
227
|
+
5. Re-organize the R scripts as stand-alone R files.
|
228
|
+
6. Bug fix.
|
229
|
+
7. **NOT SOLVED**: to include versions of DR in reports
|
214
230
|
|
215
231
|
### Version-1.7.1-05122023
|
216
232
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
233
|
+
1. Add a size check for the raw sequences. If the size is smaller than the input params, error messages will be sent to users. If the actual size is greater than the input params, extra bases will be truncated.
|
234
|
+
2. Now allows mismatch for the primer region sequences. Forward primer region allows 2 nt differences and cDNA primer region allows 3 nt differences.
|
235
|
+
3. Bug fix.
|
236
|
+
4. TCS version to 2.5.2
|
221
237
|
|
222
238
|
### Version-1.7.0-08242022
|
223
239
|
|
224
|
-
|
225
|
-
|
240
|
+
1. Add warnings if `tcs` pipeline is executing through source instead of installing from `gem`.
|
241
|
+
2. Optimized `ViralSeq::SeqHash#a3g` hypermut algorithm. Allowing an external reference other than the sample reference.
|
226
242
|
|
227
243
|
### Version-1.6.4-07182022
|
228
244
|
|
229
|
-
|
230
|
-
|
231
|
-
|
245
|
+
1. Included region "P17" in the default `tcs -d` pipeline setting. `tcs` pipeline updated to version 2.5.1.
|
246
|
+
2. Loosen the locator params for the "V1V3" end region for rare alignment issues. Now the default "V1V3" region end with position 7205 to 7210 instead of 7208.
|
247
|
+
3. `tcs_sdrm` now analyse "P17" region for pairwise diversity.
|
232
248
|
|
233
249
|
### Version-1.6.3-02052022
|
234
250
|
|
235
|
-
|
236
|
-
|
237
|
-
|
251
|
+
1. Updated on `ViralSeq::Muscle` module along with the update of `muscle` from version 3.8.1 to 5.1.
|
252
|
+
2. Optimized the `locator` algorithm based on `muscle` v5.1.
|
253
|
+
3. Optimized the `tcs_sdrm` pipeline based on `muscle` v5.1.
|
238
254
|
|
239
255
|
### Version-1.6.1-02022022
|
240
256
|
|
241
|
-
|
242
|
-
|
257
|
+
1. Fixed the `nav bar` in tcs_log html file.
|
258
|
+
2. Fixed a typo in `tcs`.
|
243
259
|
|
244
260
|
### Version 1.6.0-01042022
|
245
261
|
|
246
|
-
|
247
|
-
|
262
|
+
1. Update the `ViralSeq::TcsCore::detection_limit` with pre-calculated values to save processing time.
|
263
|
+
2. Update `tcs` pipeline to v2.5.0. HTML report will be generated after running `tcs_log` script after `tcs` pipeline.
|
248
264
|
|
249
265
|
### Version 1.5.0-01042022
|
250
266
|
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
267
|
+
1. Added a function to calculate detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
268
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
269
|
+
3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
|
270
|
+
4. Added a function for DNA variant analysis. Return a Hash object that can output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
271
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
256
272
|
|
257
273
|
### Version 1.4.0-10132021
|
258
274
|
|
259
|
-
|
260
|
-
|
275
|
+
1. Added a function to calculate false discovery rate (FDR, aka, Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
276
|
+
2. Updated `bin/tcs_sdrm` script to add FDR value to each DRM detected.
|
261
277
|
|
262
278
|
### Version 1.3.0-08302021
|
263
279
|
|
264
|
-
|
280
|
+
1. Fixed a bug in the `tcs` pipeline.
|
265
281
|
|
266
282
|
### Version 1.2.9-08022021
|
267
283
|
|
268
|
-
|
269
|
-
|
284
|
+
1. Fixed a bug when reading the input primer sequences in lowercases.
|
285
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
270
286
|
|
271
287
|
### Version 1.2.8-07292021
|
272
288
|
|
273
|
-
|
289
|
+
1. Fixed an issue when reading .fastq files containing blank_lines.
|
274
290
|
|
275
291
|
### Version 1.2.7-07152021
|
276
292
|
|
277
|
-
|
278
|
-
|
279
|
-
|
293
|
+
1. Optimized the workflow of the `tcs` pipeline on raw data with uneven lengths.
|
294
|
+
`tcs` version to v2.3.6.
|
280
295
|
|
281
296
|
### Version 1.2.6-07122021
|
282
297
|
|
283
|
-
|
284
|
-
|
285
|
-
|
298
|
+
1. Optimized the workflow of the `tcs` pipeline in the "end-join/QC/Trimming" section.
|
299
|
+
`tcs` version to v2.3.5.
|
286
300
|
|
287
301
|
### Version 1.2.5-06232021
|
288
302
|
|
289
|
-
|
290
|
-
|
291
|
-
|
303
|
+
1. Add error rescue and report in the `tcs` pipeline.
|
304
|
+
error messages are stored in the .tcs_error file. `tcs` pipeline updated to v2.3.4.
|
305
|
+
2. Use simple majority for the consensus cut-off in the default setting of the `tcs -dr` pipeline.
|
292
306
|
|
293
307
|
### Version 1.2.2-05272021
|
294
308
|
|
295
|
-
|
296
|
-
|
309
|
+
1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
|
310
|
+
`tcs` pipeline upgraded to v2.3.2
|
297
311
|
|
298
312
|
### Version 1.2.1-05172021
|
299
313
|
|
300
|
-
|
314
|
+
1. Added a function in R to check and install missing R packages for `tcs_sdrm` pipeline.
|
301
315
|
|
302
316
|
### Version 1.2.0-05102021
|
303
317
|
|
304
|
-
|
305
|
-
|
318
|
+
1. Added `tcs_sdrm` pipeline as an executable.
|
319
|
+
`tcs_sdrm` processes `tcs`-processed HIV MPID-NGS data for drug resistance mutations, recency and phylogenetic analysis.
|
306
320
|
|
307
|
-
|
321
|
+
2. Added function ViralSeq::SeqHash#sample.
|
308
322
|
|
309
|
-
|
323
|
+
3. Added recency determining function `ViralSeq::Recency::define`
|
310
324
|
|
311
|
-
|
325
|
+
4. Fixed a few bugs related to `tcs_sdrm`.
|
312
326
|
|
313
327
|
### Version 1.1.2-04262021
|
314
328
|
|
315
|
-
|
316
|
-
|
317
|
-
|
329
|
+
1. Added function `ViralSeq::DRMs.sdrm_json` to export SDRM as json object.
|
330
|
+
2. Added a random string to the temp file names for `muscle_bio` to avoid issues when running scripts in parallel.
|
331
|
+
3. Added `--keep-original` flag to the `tcs` pipeline.
|
318
332
|
|
319
333
|
### Version 1.1.1-04012021
|
320
334
|
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
335
|
+
1. Added warning when paired_raw_sequence less than 0.1% of total_raw_sequence.
|
336
|
+
2. Added option `-i WORKING_DIRECTORY` to the `tcs` script.
|
337
|
+
If the `params.json` file does not contain the path to the working directory, it will append path to the run params.
|
338
|
+
3. Added option `-dr` to the `tcs` script.
|
325
339
|
|
326
340
|
### Version 1.1.0-03252021
|
327
341
|
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
342
|
+
1. Optimized the algorithm of end-join.
|
343
|
+
2. Fixed a bug in the `tcs` pipeline that sometimes combined tcs files are not saved.
|
344
|
+
3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
|
345
|
+
4. Added the preset of MPID-HIVDR params file [**_dr.json_**](./docs/dr.json) in /docs.
|
346
|
+
5. Add `platform_format` option in the json generator of the `tcs` Pipeline.
|
347
|
+
Users can choose from 3 MiSeq platforms for processing their sequencing data.
|
348
|
+
MiSeq 300x7x300 is the default option.
|
335
349
|
|
336
350
|
### Version 1.0.14-03052021
|
337
351
|
|
338
|
-
|
352
|
+
1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.
|
339
353
|
|
340
354
|
### Version 1.0.13-03032021
|
341
355
|
|
342
|
-
|
356
|
+
1. Fixed the conflict with rails.
|
343
357
|
|
344
358
|
### Version 1.0.12-03032021
|
345
359
|
|
346
|
-
|
360
|
+
1. Fixed an issue that may cause conflicts with ActiveRecord.
|
347
361
|
|
348
362
|
### Version 1.0.11-03022021
|
349
363
|
|
350
|
-
|
351
|
-
|
364
|
+
1. Fixed an issue when calculating Poisson cutoff for minority mutations `ViralSeq::SeqHash.pm`.
|
365
|
+
2. Fixed an issue loading class 'OptionParser' in some ruby environments.
|
352
366
|
|
353
367
|
### Version 1.0.10-11112020:
|
354
368
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
369
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
370
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
371
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
372
|
+
4. a few optimizations.
|
373
|
+
5. TCS 2.1.0 delivered.
|
374
|
+
6. Tried parallel processing. Cannot achieve the goal because `parallel` gem by default can't pool data from memory of child processors and `in_threads` does not help with the speed.
|
361
375
|
|
362
376
|
### Version 1.0.9-07182020:
|
363
377
|
|
364
|
-
|
378
|
+
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
365
379
|
|
366
|
-
|
380
|
+
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
367
381
|
|
368
382
|
### Version 1.0.8-02282020:
|
369
383
|
|
370
|
-
|
371
|
-
|
372
|
-
|
384
|
+
1. TCS pipeline (version 2.0.0) added as executable.
|
385
|
+
tcs - main TCS pipeline script.
|
386
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
373
387
|
|
374
|
-
|
375
|
-
|
388
|
+
2. Methods added:
|
389
|
+
ViralSeq::SeqHash#trim
|
376
390
|
|
377
|
-
|
391
|
+
3. Bug fix for several methods.
|
378
392
|
|
379
393
|
### Version 1.0.7-01282020:
|
380
394
|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
395
|
+
1. Several methods added, including
|
396
|
+
ViralSeq::SeqHash#error_table
|
397
|
+
ViralSeq::SeqHash#random_select
|
398
|
+
2. Improved performance for several functions.
|
385
399
|
|
386
400
|
### Version 1.0.6-07232019:
|
387
401
|
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
402
|
+
1. Several methods added to ViralSeq::SeqHash, including
|
403
|
+
ViralSeq::SeqHash#size
|
404
|
+
ViralSeq::SeqHash#+
|
405
|
+
ViralSeq::SeqHash#write_nt_fa
|
406
|
+
ViralSeq::SeqHash#mutation
|
407
|
+
2. Update documentations and rspec samples.
|
394
408
|
|
395
409
|
### Version 1.0.5-07112019:
|
396
410
|
|
397
|
-
|
398
|
-
|
399
|
-
|
411
|
+
1. Update ViralSeq::SeqHash#sequence_locator.
|
412
|
+
Program will try to determine the direction (`+` or `-` of the query sequence)
|
413
|
+
2. update executable `locator` to have a column of `direction` in output .csv file
|
400
414
|
|
401
415
|
### Version 1.0.4-07102019:
|
402
416
|
|
403
|
-
|
404
|
-
|
417
|
+
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
418
|
+
2. Fix bugs in bin `locator`
|
405
419
|
|
406
420
|
### Version 1.0.3-07102019:
|
407
421
|
|
408
|
-
|
422
|
+
1. Bug fix.
|
409
423
|
|
410
424
|
### Version 1.0.2-07102019:
|
411
425
|
|
412
|
-
|
426
|
+
1. Fixed a gem loading issue.
|
413
427
|
|
414
428
|
### Version 1.0.1-07102019:
|
415
429
|
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
430
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
431
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
432
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
433
|
+
4. update documentations
|
420
434
|
|
421
435
|
### Version 1.0.0-07092019:
|
422
436
|
|
423
|
-
|
437
|
+
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
424
438
|
|
425
439
|
## Development
|
426
440
|
|
data/bin/locator
CHANGED
@@ -38,7 +38,7 @@ def myparser
|
|
38
38
|
options[:outfile] = o
|
39
39
|
end
|
40
40
|
|
41
|
-
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `
|
41
|
+
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `SIVmm239`".blue.bold}") do |o|
|
42
42
|
options[:ref_option] = o.to_sym
|
43
43
|
end
|
44
44
|
|
@@ -84,7 +84,7 @@ begin
|
|
84
84
|
seqs = ViralSeq::SeqHash.fa(seq_file)
|
85
85
|
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
86
86
|
|
87
|
-
unless [:HXB2, :
|
87
|
+
unless [:HXB2, :SIVmm239].include? opt
|
88
88
|
puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
|
89
89
|
opt = :HXB2
|
90
90
|
end
|
data/bin/tcs
CHANGED
@@ -27,9 +27,8 @@
|
|
27
27
|
# run `tcs -j` to generate param json file.
|
28
28
|
|
29
29
|
def gem_installed?(gem_name)
|
30
|
-
found_gem = false
|
31
30
|
begin
|
32
|
-
|
31
|
+
Gem::Specification.find_by_name(gem_name)
|
33
32
|
rescue Gem::LoadError
|
34
33
|
return false
|
35
34
|
else
|
@@ -217,8 +216,8 @@ begin
|
|
217
216
|
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
218
217
|
end
|
219
218
|
|
220
|
-
|
221
219
|
primers.each do |primer|
|
220
|
+
|
222
221
|
summary_json = {}
|
223
222
|
summary_json[:warnings] = []
|
224
223
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
@@ -470,6 +469,34 @@ begin
|
|
470
469
|
f.puts JSON.pretty_generate(pid_json)
|
471
470
|
end
|
472
471
|
|
472
|
+
filter_r1 = nil
|
473
|
+
filter_r2 = nil
|
474
|
+
r1_passed_seq = nil
|
475
|
+
r2_passed_seq = nil
|
476
|
+
r1_temp = nil
|
477
|
+
r2_temp = nil
|
478
|
+
r1_temp_sh = nil
|
479
|
+
r2_temp_sh = nil
|
480
|
+
r1_consensus_filtered = nil
|
481
|
+
r2_consensus_filtered = nil
|
482
|
+
consensus_filtered = nil
|
483
|
+
pid_json = nil
|
484
|
+
consensus = nil
|
485
|
+
r1_seq = nil
|
486
|
+
r2_seq = nil
|
487
|
+
bio_r1 = nil
|
488
|
+
bio_r2 = nil
|
489
|
+
id = nil
|
490
|
+
primer_id_count = nil
|
491
|
+
primer_id_dis = nil
|
492
|
+
primer_id_list = nil
|
493
|
+
primer_id_count_over_n = nil
|
494
|
+
r1_sub_seq = nil
|
495
|
+
r2_sub_seq = nil
|
496
|
+
common_keys = nil
|
497
|
+
|
498
|
+
GC.start
|
499
|
+
|
473
500
|
# start end-join
|
474
501
|
def end_join(dir, option, overlap)
|
475
502
|
shp = ViralSeq::SeqHashPair.fa(dir)
|
@@ -492,7 +519,6 @@ begin
|
|
492
519
|
|
493
520
|
if primer[:end_join]
|
494
521
|
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
495
|
-
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
496
522
|
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
497
523
|
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
498
524
|
|
@@ -502,6 +528,11 @@ begin
|
|
502
528
|
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
503
529
|
end
|
504
530
|
|
531
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
532
|
+
if export_raw
|
533
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
534
|
+
end
|
535
|
+
|
505
536
|
if primer[:TCS_QC]
|
506
537
|
ref_start = primer[:ref_start]
|
507
538
|
ref_end = primer[:ref_end]
|
@@ -513,42 +544,11 @@ begin
|
|
513
544
|
if ref_end == 0
|
514
545
|
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
515
546
|
end
|
516
|
-
if primer[:end_join_option] == 1
|
517
|
-
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
518
|
-
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
519
|
-
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
520
|
-
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
521
|
-
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
522
|
-
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
523
|
-
joined_seq = {}
|
524
|
-
new_r1_seq.each do |seq_name, seq|
|
525
|
-
next unless seq
|
526
|
-
next unless new_r2_seq[seq_name]
|
527
|
-
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
528
|
-
end
|
529
|
-
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
530
547
|
|
531
|
-
|
532
|
-
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
533
|
-
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
534
|
-
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
535
|
-
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
536
|
-
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
537
|
-
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
538
|
-
joined_seq_raw = {}
|
539
|
-
new_r1_seq_raw.each do |seq_name, seq|
|
540
|
-
next unless seq
|
541
|
-
next unless new_r2_seq_raw[seq_name]
|
542
|
-
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
543
|
-
end
|
544
|
-
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
545
|
-
end
|
546
|
-
else
|
547
|
-
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
548
|
+
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
548
549
|
|
549
|
-
|
550
|
-
|
551
|
-
end
|
550
|
+
if export_raw
|
551
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
552
552
|
end
|
553
553
|
|
554
554
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
data/lib/viral_seq/R.rb
CHANGED
@@ -14,7 +14,9 @@ module ViralSeq
|
|
14
14
|
|
15
15
|
# check if required R packages are installed.
|
16
16
|
def self.check_R_packages
|
17
|
-
|
17
|
+
file = File.join(ViralSeq.root, "viral_seq", "util", "check_env.r")
|
18
|
+
safe_file = Shellwords.escape(file)
|
19
|
+
if system "Rscript #{safe_file}"
|
18
20
|
return 0
|
19
21
|
else
|
20
22
|
raise "Non-zero exit code. Error happens when checking required R packages."
|