viral_seq 1.9.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -1
- data/README.md +133 -119
- data/bin/locator +2 -2
- data/bin/tcs +38 -38
- data/bin/tcs_sdrm +2 -2
- data/lib/viral_seq/R.rb +3 -1
- data/lib/viral_seq/pid.rb +1 -4
- data/lib/viral_seq/seq_hash.rb +48 -12
- data/lib/viral_seq/sequence.rb +22 -171
- data/lib/viral_seq/string.rb +3 -6
- data/lib/viral_seq/tcs_core.rb +4 -0
- data/lib/viral_seq/tcs_dr.rb +82 -1
- data/lib/viral_seq/util/drm_versions_config.json +52 -0
- data/lib/viral_seq/version.rb +2 -2
- data/lib/viral_seq.rb +2 -0
- data/viral_seq.gemspec +5 -0
- metadata +31 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d940e5f465cba40def34166fe50e0a21b1c62a1fff8e0be8abdabb7b4c4aab77
|
4
|
+
data.tar.gz: 7e4be6ec82d9081a1ea3130eed49dcaac080608e481c7520b43c2e58a50e379d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15805b09c96b6d1bff023a82948f23ceb584c60ffb21b85e59d6f4ddc2e2394045a29788a7c5811c714afedfae6405c36b88e0bcadce0d1408068418c497e596
|
7
|
+
data.tar.gz: '0871676e5ee49fa14f84ec3c109172d964efac18f3f104ec38ad52daa69b9ef85a935c35ca2377d261b38edc5d8d438469b2360ec791a902116f60c8daeef5c2'
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
viral_seq (1.
|
4
|
+
viral_seq (1.10.1)
|
5
5
|
colorize (~> 0.1)
|
6
6
|
combine_pdf (~> 1.0, >= 1.0.0)
|
7
7
|
muscle_bio (= 0.4)
|
8
8
|
prawn (~> 2.3, >= 2.3.0)
|
9
9
|
prawn-table (~> 0.2, >= 0.2.0)
|
10
|
+
shellwords (~> 0.2)
|
11
|
+
virust-locator-ruby (~> 0.3)
|
10
12
|
|
11
13
|
GEM
|
12
14
|
remote: https://rubygems.org/
|
@@ -41,8 +43,11 @@ GEM
|
|
41
43
|
rspec-support (~> 3.13.0)
|
42
44
|
rspec-support (3.13.1)
|
43
45
|
ruby-rc4 (0.1.5)
|
46
|
+
shellwords (0.2.0)
|
44
47
|
ttfunk (1.8.0)
|
45
48
|
bigdecimal (~> 3.1)
|
49
|
+
virust-locator-ruby (0.3.0)
|
50
|
+
shellwords (~> 0.2)
|
46
51
|
|
47
52
|
PLATFORMS
|
48
53
|
ruby
|
data/README.md
CHANGED
@@ -16,10 +16,10 @@ CLI tools `tcs`, `tcs_sdrm`, `tcs_log` and `locator` included in the gem.
|
|
16
16
|
|
17
17
|
## Illustration for the Primer ID Sequencing
|
18
18
|
|
19
|
-
|
20
19
|

|
21
20
|
|
22
21
|
### Reference readings on the Primer ID sequencing
|
22
|
+
|
23
23
|
[Explanation of Primer ID sequencing](https://doi.org/10.21769/BioProtoc.3938)
|
24
24
|
[Primer ID MiSeq protocol](https://doi.org/10.1128/JVI.00522-15)
|
25
25
|
[Application of Primer ID sequencing in COVID-19 research](https://doi.org/10.1126/scitranslmed.abb5883)
|
@@ -41,11 +41,13 @@ Required RubyGems version: >= 1.3.6
|
|
41
41
|
### Executables
|
42
42
|
|
43
43
|
### `tcs`
|
44
|
+
|
44
45
|
Use executable `tcs` pipeline to process **Primer ID MiSeq sequencing** data.
|
45
46
|
|
46
47
|
Web-based `tcs` analysis can be accessed at https://primer-id.org/
|
47
48
|
|
48
49
|
Example commands:
|
50
|
+
|
49
51
|
```bash
|
50
52
|
$ tcs -p params.json # run TCS pipeline with params.json
|
51
53
|
$ tcs -p params.json -i DIRECTORY
|
@@ -61,12 +63,13 @@ Example commands:
|
|
61
63
|
[sample params.json for the tcs-dr pipeline](./docs/dr.json)
|
62
64
|
|
63
65
|
---
|
66
|
+
|
64
67
|
### `tcs_log`
|
65
68
|
|
66
69
|
Use `tcs_log` script to pool run logs and TCS fasta files after one batch of `tcs` jobs. This command generates log.html to visualize the sequencing runs.
|
67
70
|
|
68
|
-
|
69
71
|
Example file structure:
|
72
|
+
|
70
73
|
```
|
71
74
|
batch_tcs_jobs/
|
72
75
|
├── lib1
|
@@ -77,21 +80,25 @@ batch_tcs_jobs/
|
|
77
80
|
```
|
78
81
|
|
79
82
|
Example command:
|
83
|
+
|
80
84
|
```bash
|
81
85
|
$ tcs_log batch_tcs_jobs
|
82
86
|
```
|
83
87
|
|
84
88
|
---
|
89
|
+
|
85
90
|
### `tcs_sdrm`
|
86
91
|
|
87
92
|
Use `tcs_sdrm` pipeline for HIV-1 drug resistance mutation and recency.
|
88
93
|
|
89
94
|
Example command:
|
95
|
+
|
90
96
|
```bash
|
91
97
|
$ tcs_sdrm libs_dir
|
92
98
|
```
|
93
99
|
|
94
100
|
lib_dir file structure:
|
101
|
+
|
95
102
|
```
|
96
103
|
libs_dir/
|
97
104
|
├── lib1
|
@@ -109,8 +116,8 @@ libs_dir/
|
|
109
116
|
|
110
117
|
Output data in a new dir as 'libs_dir_SDRM'
|
111
118
|
|
112
|
-
|
113
119
|
**Note: [R](https://www.r-project.org/) and the following R libraries are required:**
|
120
|
+
|
114
121
|
- phangorn
|
115
122
|
- ape
|
116
123
|
- scales
|
@@ -122,11 +129,13 @@ Output data in a new dir as 'libs_dir_SDRM'
|
|
122
129
|
---
|
123
130
|
|
124
131
|
### `locator`
|
132
|
+
|
125
133
|
Use executable `locator` to get the coordinates of the sequences on HIV/SIV reference genome from a FASTA file through a terminal
|
126
134
|
|
127
135
|
```bash
|
128
136
|
$ locator -i sequence.fasta -o sequence.fasta.csv
|
129
137
|
```
|
138
|
+
|
130
139
|
---
|
131
140
|
|
132
141
|
## Some Examples
|
@@ -179,244 +188,249 @@ Examine for drug resistance mutations for HIV PR region
|
|
179
188
|
```ruby
|
180
189
|
qc_seqhash.sdrm_hiv_pr(cut_off)
|
181
190
|
```
|
182
|
-
## Known issues
|
183
|
-
|
184
|
-
1. ~~have a conflict with rails.~~
|
185
|
-
2. ~~Update on 03032021. Still have conflict. But in rails gem file, can just use `requires: false` globally and only require "viral_seq" when the module is needed in controller.~~
|
186
|
-
3. The conflict seems to be resovled. It was from a combination of using `!` as a function for factorial and the gem name `viral_seq`. @_@
|
187
191
|
|
188
192
|
## Updates
|
189
193
|
|
194
|
+
### Version-1.10.1
|
195
|
+
|
196
|
+
1. Added quality filter for Illumina 2-color sequencing platforms (filter poly-G and poly-C)
|
197
|
+
2. Replaced `MuscleBio` with [`VirustLocator`](https://github.com/ViralSeq/virust-locator-ruby) for faster and more accurate pairwise alignment.
|
198
|
+
3. Added DR primer version 4.
|
199
|
+
4. Added a helper function to properly treat input params for #hiv_seq_qc.
|
200
|
+
5. Solved the slow-performance issue when spawning a subprocess to call `VirustLocator` while holding a large amount of data in memory. When Ruby runs a shell command, a child process is spawned and shares the parent's memory pages. To set it up, the OS has to walk the parent's entire memory table, causing an incremental delay in each subsequent process spawn. To solve this, I redid the `VirustLocator` API to allow all the arguments to be processed with one shell command instead of spawning an individual child process for each.
|
201
|
+
|
202
|
+
### Version-1.9.1-12022024
|
203
|
+
|
204
|
+
1. Fixed a bug in the `tcs_sdrm` pipeline.
|
205
|
+
|
190
206
|
### Version-1.9.0-11132024
|
191
207
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
208
|
+
1. `ViralSeq::TcsCore::validate_file_name` will no longer report errors when non-sequence data are in the folder; instead, these files will be ignored.
|
209
|
+
2. Rewrote the APIs for DRM analysis for HIV. Now uses version config files for the sequencing information and DRM list config files for DRM interpretation. The two config files are located in `/lib/viral_seq/util/`.
|
210
|
+
3. `tcs_sdrm` will take a second argument for DRM config versions. Currently supports `["v1", "v2", "v3"]`. Refer to the documentations of the APIs for the details.
|
211
|
+
4. Next update will use secondary command `tcs sdrm` to replace `tcs_sdrm`, and `tcs log` to replace `tcs_log`.
|
196
212
|
|
197
213
|
### Version-1.8.1-06042024
|
198
214
|
|
199
|
-
|
215
|
+
1. Fixed a bug that causes `tcs_sdrm` pipeline to crash.
|
200
216
|
|
201
217
|
### Version-1.8.0-04052024
|
202
218
|
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
219
|
+
1. Use `muscle-v3.8.1` as default aligner because of the compatibility issues with `muscle-v5` on some platforms.
|
220
|
+
2. Adjust the end-join model for short inserts (insert size less than the read length minus the adaptor size).
|
221
|
+
3. Add an option in the DR pipeline for different versions of the pipeline, default version as "v1".
|
222
|
+
4. Add Days Post Infection (DPI) prediction model in the SDRM pipeline.
|
223
|
+
5. Re-organize the R scripts as stand-alone R files.
|
224
|
+
6. Bug fix.
|
225
|
+
7. **NOT SOLVED**: to include versions of DR in reports
|
210
226
|
|
211
227
|
### Version-1.7.1-05122023
|
212
228
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
229
|
+
1. Add a size check for the raw sequences. If the size is smaller than the input params, error messages will be sent to users. If the actual size is greater than the input params, extra bases will be truncated.
|
230
|
+
2. Now allows mismatch for the primer region sequences. Forward primer region allows 2 nt differences and cDNA primer region allows 3 nt differences.
|
231
|
+
3. Bug fix.
|
232
|
+
4. TCS version to 2.5.2
|
217
233
|
|
218
234
|
### Version-1.7.0-08242022
|
219
235
|
|
220
|
-
|
221
|
-
|
236
|
+
1. Add warnings if the `tcs` pipeline is executing from source instead of being installed from `gem`.
|
237
|
+
2. Optimized the `ViralSeq::SeqHash#a3g` hypermut algorithm, allowing an external reference other than the sample reference.
|
222
238
|
|
223
239
|
### Version-1.6.4-07182022
|
224
240
|
|
225
|
-
|
226
|
-
|
227
|
-
|
241
|
+
1. Included region "P17" in the default `tcs -d` pipeline setting. `tcs` pipeline updated to version 2.5.1.
|
242
|
+
2. Loosen the locator params for the "V1V3" end region for rare alignment issues. Now the default "V1V3" region end with position 7205 to 7210 instead of 7208.
|
243
|
+
3. `tcs_sdrm` now analyse "P17" region for pairwise diversity.
|
228
244
|
|
229
245
|
### Version-1.6.3-02052022
|
230
246
|
|
231
|
-
|
232
|
-
|
233
|
-
|
247
|
+
1. Updated on `ViralSeq::Muscle` module along with the update of `muscle` from version 3.8.1 to 5.1.
|
248
|
+
2. Optimized the `locator` algorithm based on `muscle` v5.1.
|
249
|
+
3. Optimized the `tcs_sdrm` pipeline based on `muscle` v5.1.
|
234
250
|
|
235
251
|
### Version-1.6.1-02022022
|
236
252
|
|
237
|
-
|
238
|
-
|
253
|
+
1. Fixed the `nav bar` in tcs_log html file.
|
254
|
+
2. Fixed a typo in `tcs`.
|
239
255
|
|
240
256
|
### Version 1.6.0-01042022
|
241
257
|
|
242
|
-
|
243
|
-
|
258
|
+
1. Update the `ViralSeq::TcsCore::detection_limit` with pre-calculated values to save processing time.
|
259
|
+
2. Update `tcs` pipeline to v2.5.0. HTML report will be generated after running the `tcs_log` script after the `tcs` pipeline.
|
244
260
|
|
245
261
|
### Version 1.5.0-01042022
|
246
262
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
263
|
+
1. Added a function to calculate detection limit/sensitivity for minority variants (R required). `ViralSeq::TcsCore::detection_limit`
|
264
|
+
2. Added a function to get a sub SeqHash object given a range of nt positions. `ViralSeq::SeqHash#nt_range`
|
265
|
+
3. Added a function to quality check dna sequences comparing with sample consensus for indels. `ViralSeq::SeqHash#qc_indel`
|
266
|
+
4. Added a function for DNA variant analysis. Return a Hash object that can output as a JSON file. `ViralSeq::SeqHash#nt_variants`
|
267
|
+
5. Added a function to check the size of sequences of a SeqHash object. `ViralSeq::SeqHash#check_nt_size`
|
252
268
|
|
253
269
|
### Version 1.4.0-10132021
|
254
270
|
|
255
|
-
|
256
|
-
|
271
|
+
1. Added a function to calculate false discovery rate (FDR, aka Benjamini-Hochberg correction) for minority mutations detected in the sequences. `ViralSeq::SeqHash#fdr`
|
272
|
+
2. Updated `bin\tcs_sdrm` script to add FDR value to each DRMs detected.
|
257
273
|
|
258
274
|
### Version 1.3.0-08302021
|
259
275
|
|
260
|
-
|
276
|
+
1. Fixed a bug in the `tcs` pipeline.
|
261
277
|
|
262
278
|
### Version 1.2.9-08022021
|
263
279
|
|
264
|
-
|
265
|
-
|
280
|
+
1. Fixed a bug when reading the input primer sequences in lowercases.
|
281
|
+
2. Fixed a bug in the method ViralSeq::Math::RandomGaussian
|
266
282
|
|
267
283
|
### Version 1.2.8-07292021
|
268
284
|
|
269
|
-
|
285
|
+
1. Fixed an issue when reading .fastq files containing blank_lines.
|
270
286
|
|
271
287
|
### Version 1.2.7-07152021
|
272
288
|
|
273
|
-
|
274
|
-
|
275
|
-
|
289
|
+
1. Optimized the workflow of the `tcs` pipeline on raw data with uneven lengths.
|
290
|
+
`tcs` version to v2.3.6.
|
276
291
|
|
277
292
|
### Version 1.2.6-07122021
|
278
293
|
|
279
|
-
|
280
|
-
|
281
|
-
|
294
|
+
1. Optimized the workflow of the `tcs` pipeline in the "end-join/QC/Trimming" section.
|
295
|
+
`tcs` version to v2.3.5.
|
282
296
|
|
283
297
|
### Version 1.2.5-06232021
|
284
298
|
|
285
|
-
|
286
|
-
|
287
|
-
|
299
|
+
1. Add error rescue and report in the `tcs` pipeline.
|
300
|
+
error messages are stored in the .tcs_error file. `tcs` pipeline updated to v2.3.4.
|
301
|
+
2. Use simple majority for the consensus cut-off in the default setting of the `tcs -dr` pipeline.
|
288
302
|
|
289
303
|
### Version 1.2.2-05272021
|
290
304
|
|
291
|
-
|
292
|
-
|
305
|
+
1. Fixed a bug in the `tcs` pipeline that sometimes causes `SystemStackError`.
|
306
|
+
`tcs` pipeline upgraded to v2.3.2
|
293
307
|
|
294
308
|
### Version 1.2.1-05172021
|
295
309
|
|
296
|
-
|
310
|
+
1. Added a function in R to check and install missing R packages for `tcs_sdrm` pipeline.
|
297
311
|
|
298
312
|
### Version 1.2.0-05102021
|
299
313
|
|
300
|
-
|
301
|
-
|
314
|
+
1. Added `tcs_sdrm` pipeline as an executable.
|
315
|
+
`tcs_sdrm` processes `tcs`-processed HIV MPID-NGS data for drug resistance mutations, recency and phylogenetic analysis.
|
302
316
|
|
303
|
-
|
317
|
+
2. Added function ViralSeq::SeqHash#sample.
|
304
318
|
|
305
|
-
|
319
|
+
3. Added recency determining function `ViralSeq::Recency::define`
|
306
320
|
|
307
|
-
|
321
|
+
4. Fixed a few bugs related to `tcs_sdrm`.
|
308
322
|
|
309
323
|
### Version 1.1.2-04262021
|
310
324
|
|
311
|
-
|
312
|
-
|
313
|
-
|
325
|
+
1. Added function `ViralSeq::DRMs.sdrm_json` to export SDRM as json object.
|
326
|
+
2. Added a random string to the temp file names for `muscle_bio` to avoid issues when running scripts in parallel.
|
327
|
+
3. Added `--keep-original` flag to the `tcs` pipeline.
|
314
328
|
|
315
329
|
### Version 1.1.1-04012021
|
316
330
|
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
331
|
+
1. Added warning when paired_raw_sequence less than 0.1% of total_raw_sequence.
|
332
|
+
2. Added option `-i WORKING_DIRECTORY` to the `tcs` script.
|
333
|
+
If the `params.json` file does not contain the path to the working directory, it will append path to the run params.
|
334
|
+
3. Added option `-dr` to the `tcs` script.
|
321
335
|
|
322
336
|
### Version 1.1.0-03252021
|
323
337
|
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
338
|
+
1. Optimized the algorithm of end-join.
|
339
|
+
2. Fixed a bug in the `tcs` pipeline that sometimes combined tcs files are not saved.
|
340
|
+
3. Added `tcs_log` command to pool run logs and tcs files from one batch of tcs jobs.
|
341
|
+
4. Added the preset of MPID-HIVDR params file [**_dr.json_**](./docs/dr.json) in /docs.
|
342
|
+
5. Add `platform_format` option in the json generator of the `tcs` Pipeline.
|
343
|
+
Users can choose from 3 MiSeq platforms for processing their sequencing data.
|
344
|
+
MiSeq 300x7x300 is the default option.
|
331
345
|
|
332
346
|
### Version 1.0.14-03052021
|
333
347
|
|
334
|
-
|
348
|
+
1. Add a function `ViralSeq::TcsCore.validate_file_name` to check MiSeq paired-end file names.
|
335
349
|
|
336
350
|
### Version 1.0.13-03032021
|
337
351
|
|
338
|
-
|
352
|
+
1. Fixed the conflict with rails.
|
339
353
|
|
340
354
|
### Version 1.0.12-03032021
|
341
355
|
|
342
|
-
|
356
|
+
1. Fixed an issue that may cause conflicts with ActiveRecord.
|
343
357
|
|
344
358
|
### Version 1.0.11-03022021
|
345
359
|
|
346
|
-
|
347
|
-
|
360
|
+
1. Fixed an issue when calculating Poisson cutoff for minority mutations `ViralSeq::SeqHash.pm`.
|
361
|
+
2. Fixed an issue loading class 'OptionParser' in some ruby environments.
|
348
362
|
|
349
363
|
### Version 1.0.10-11112020:
|
350
364
|
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
365
|
+
1. Modularize TCS pipeline. Move key functions into /viral_seq/tcs_core.rb
|
366
|
+
2. `tcs_json_generator` is removed. This CLI is delivered within the `tcs` pipeline, by running `tcs -j`. The scripts are included in the /viral_seq/tcs_json.rb
|
367
|
+
3. consensus model now includes a true simple majority model, where no nt needs to be over 50% to be called.
|
368
|
+
4. a few optimizations.
|
369
|
+
5. TCS 2.1.0 delivered.
|
370
|
+
6. Tried parallel processing. Cannot achieve the goal because `parallel` gem by default can't pool data from memory of child processors and `in_threads` does not help with the speed.
|
357
371
|
|
358
372
|
### Version 1.0.9-07182020:
|
359
373
|
|
360
|
-
|
374
|
+
1. Change ViralSeq::SeqHash#stop_codon and ViralSeq::SeqHash#a3g_hypermut return value to hash object.
|
361
375
|
|
362
|
-
|
376
|
+
2. TCS pipeline updated to version 2.0.1. Add optional `export_raw: TRUE/FALSE` in json params. If `export_raw` is `TRUE`, raw sequence reads (have to pass quality filters) will be exported, along with TCS reads.
|
363
377
|
|
364
378
|
### Version 1.0.8-02282020:
|
365
379
|
|
366
|
-
|
367
|
-
|
368
|
-
|
380
|
+
1. TCS pipeline (version 2.0.0) added as executable.
|
381
|
+
tcs - main TCS pipeline script.
|
382
|
+
tcs_json_generator - step-by-step script to generate json file for tcs pipeline.
|
369
383
|
|
370
|
-
|
371
|
-
|
384
|
+
2. Methods added:
|
385
|
+
ViralSeq::SeqHash#trim
|
372
386
|
|
373
|
-
|
387
|
+
3. Bug fix for several methods.
|
374
388
|
|
375
389
|
### Version 1.0.7-01282020:
|
376
390
|
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
391
|
+
1. Several methods added, including
|
392
|
+
ViralSeq::SeqHash#error_table
|
393
|
+
ViralSeq::SeqHash#random_select
|
394
|
+
2. Improved performance for several functions.
|
381
395
|
|
382
396
|
### Version 1.0.6-07232019:
|
383
397
|
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
398
|
+
1. Several methods added to ViralSeq::SeqHash, including
|
399
|
+
ViralSeq::SeqHash#size
|
400
|
+
ViralSeq::SeqHash#+
|
401
|
+
ViralSeq::SeqHash#write_nt_fa
|
402
|
+
ViralSeq::SeqHash#mutation
|
403
|
+
2. Update documentations and rspec samples.
|
390
404
|
|
391
405
|
### Version 1.0.5-07112019:
|
392
406
|
|
393
|
-
|
394
|
-
|
395
|
-
|
407
|
+
1. Update ViralSeq::SeqHash#sequence_locator.
|
408
|
+
Program will try to determine the direction (`+` or `-` of the query sequence)
|
409
|
+
2. update executable `locator` to have a column of `direction` in output .csv file
|
396
410
|
|
397
411
|
### Version 1.0.4-07102019:
|
398
412
|
|
399
|
-
|
400
|
-
|
413
|
+
1. Use home directory (Dir.home) instead of the directory of the script file for temp MUSCLE file.
|
414
|
+
2. Fix bugs in bin `locator`
|
401
415
|
|
402
416
|
### Version 1.0.3-07102019:
|
403
417
|
|
404
|
-
|
418
|
+
1. Bug fix.
|
405
419
|
|
406
420
|
### Version 1.0.2-07102019:
|
407
421
|
|
408
|
-
|
422
|
+
1. Fixed a gem loading issue.
|
409
423
|
|
410
424
|
### Version 1.0.1-07102019:
|
411
425
|
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
426
|
+
1. Add keyword argument :model to ViralSeq::SeqHashPair#join2.
|
427
|
+
2. Add method ViralSeq::SeqHash#sequence_locator (also: #loc), a function to locate sequences on HIV/SIV reference genomes, as HIV Sequence Locator from LANL.
|
428
|
+
3. Add executable 'locator'. An HIV/SIV sequence locator tool similar to LANL Sequence Locator.
|
429
|
+
4. update documentations
|
416
430
|
|
417
431
|
### Version 1.0.0-07092019:
|
418
432
|
|
419
|
-
|
433
|
+
1. Rewrote the whole ViralSeq gem, grouping methods into modules and classes under main Module::ViralSeq
|
420
434
|
|
421
435
|
## Development
|
422
436
|
|
data/bin/locator
CHANGED
@@ -38,7 +38,7 @@ def myparser
|
|
38
38
|
options[:outfile] = o
|
39
39
|
end
|
40
40
|
|
41
|
-
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `
|
41
|
+
opts.on('-r', '--ref_option OPTION', "reference genome option, choose from #{"`HXB2` (default), `SIVmm239`".blue.bold}") do |o|
|
42
42
|
options[:ref_option] = o.to_sym
|
43
43
|
end
|
44
44
|
|
@@ -84,7 +84,7 @@ begin
|
|
84
84
|
seqs = ViralSeq::SeqHash.fa(seq_file)
|
85
85
|
opt = options[:ref_option] ? options[:ref_option] : :HXB2
|
86
86
|
|
87
|
-
unless [:HXB2, :
|
87
|
+
unless [:HXB2, :SIVmm239].include? opt
|
88
88
|
puts "Reference option `#{opt}` not recognized, using `HXB2` as the reference genome.".red.bold
|
89
89
|
opt = :HXB2
|
90
90
|
end
|
data/bin/tcs
CHANGED
@@ -27,9 +27,8 @@
|
|
27
27
|
# run `tcs -j` to generate param json file.
|
28
28
|
|
29
29
|
def gem_installed?(gem_name)
|
30
|
-
found_gem = false
|
31
30
|
begin
|
32
|
-
|
31
|
+
Gem::Specification.find_by_name(gem_name)
|
33
32
|
rescue Gem::LoadError
|
34
33
|
return false
|
35
34
|
else
|
@@ -217,8 +216,8 @@ begin
|
|
217
216
|
ViralSeq::TcsCore.log_and_abort log, "No primer information. Script terminated."
|
218
217
|
end
|
219
218
|
|
220
|
-
|
221
219
|
primers.each do |primer|
|
220
|
+
|
222
221
|
summary_json = {}
|
223
222
|
summary_json[:warnings] = []
|
224
223
|
summary_json[:tcs_version] = ViralSeq::TCS_VERSION
|
@@ -470,6 +469,34 @@ begin
|
|
470
469
|
f.puts JSON.pretty_generate(pid_json)
|
471
470
|
end
|
472
471
|
|
472
|
+
filter_r1 = nil
|
473
|
+
filter_r2 = nil
|
474
|
+
r1_passed_seq = nil
|
475
|
+
r2_passed_seq = nil
|
476
|
+
r1_temp = nil
|
477
|
+
r2_temp = nil
|
478
|
+
r1_temp_sh = nil
|
479
|
+
r2_temp_sh = nil
|
480
|
+
r1_consensus_filtered = nil
|
481
|
+
r2_consensus_filtered = nil
|
482
|
+
consensus_filtered = nil
|
483
|
+
pid_json = nil
|
484
|
+
consensus = nil
|
485
|
+
r1_seq = nil
|
486
|
+
r2_seq = nil
|
487
|
+
bio_r1 = nil
|
488
|
+
bio_r2 = nil
|
489
|
+
id = nil
|
490
|
+
primer_id_count = nil
|
491
|
+
primer_id_dis = nil
|
492
|
+
primer_id_list = nil
|
493
|
+
primer_id_count_over_n = nil
|
494
|
+
r1_sub_seq = nil
|
495
|
+
r2_sub_seq = nil
|
496
|
+
common_keys = nil
|
497
|
+
|
498
|
+
GC.start
|
499
|
+
|
473
500
|
# start end-join
|
474
501
|
def end_join(dir, option, overlap)
|
475
502
|
shp = ViralSeq::SeqHashPair.fa(dir)
|
@@ -492,7 +519,6 @@ begin
|
|
492
519
|
|
493
520
|
if primer[:end_join]
|
494
521
|
log.puts Time.now.to_s + "\t" + "Start end-pairing for TCS..."
|
495
|
-
shp = ViralSeq::SeqHashPair.fa(out_dir_consensus)
|
496
522
|
joined_sh = end_join(out_dir_consensus, primer[:end_join_option], primer[:overlap])
|
497
523
|
log.puts Time.now.to_s + "\t" + "Paired TCS number: " + joined_sh.size.to_s
|
498
524
|
|
@@ -502,6 +528,11 @@ begin
|
|
502
528
|
joined_sh_raw = end_join(out_dir_raw, primer[:end_join_option], primer[:overlap])
|
503
529
|
end
|
504
530
|
|
531
|
+
joined_sh.write_nt_fa(File.join(out_dir_consensus, "combined.fasta"))
|
532
|
+
if export_raw
|
533
|
+
joined_sh_raw.write_nt_fa(File.join(out_dir_raw, "combined.raw.fasta"))
|
534
|
+
end
|
535
|
+
|
505
536
|
if primer[:TCS_QC]
|
506
537
|
ref_start = primer[:ref_start]
|
507
538
|
ref_end = primer[:ref_end]
|
@@ -513,42 +544,11 @@ begin
|
|
513
544
|
if ref_end == 0
|
514
545
|
ref_end = 0..(ViralSeq::RefSeq.get(ref_genome).size - 1)
|
515
546
|
end
|
516
|
-
if primer[:end_join_option] == 1
|
517
|
-
r1_sh = ViralSeq::SeqHash.fa(outfile_r1)
|
518
|
-
r2_sh = ViralSeq::SeqHash.fa(outfile_r2)
|
519
|
-
r1_sh = r1_sh.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
520
|
-
r2_sh = r2_sh.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
521
|
-
new_r1_seq = r1_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
522
|
-
new_r2_seq = r2_sh.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
523
|
-
joined_seq = {}
|
524
|
-
new_r1_seq.each do |seq_name, seq|
|
525
|
-
next unless seq
|
526
|
-
next unless new_r2_seq[seq_name]
|
527
|
-
joined_seq[seq_name] = seq + new_r2_seq[seq_name]
|
528
|
-
end
|
529
|
-
joined_sh = ViralSeq::SeqHash.new(joined_seq)
|
530
547
|
|
531
|
-
|
532
|
-
r1_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r1)
|
533
|
-
r2_sh_raw = ViralSeq::SeqHash.fa(outfile_raw_r2)
|
534
|
-
r1_sh_raw = r1_sh_raw.hiv_seq_qc(ref_start, (0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), indel, ref_genome)
|
535
|
-
r2_sh_raw = r2_sh_raw.hiv_seq_qc((0..(ViralSeq::RefSeq.get(ref_genome).size - 1)), ref_end, indel, ref_genome)
|
536
|
-
new_r1_seq_raw = r1_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
537
|
-
new_r2_seq_raw = r2_sh_raw.dna_hash.each_with_object({}) {|(k, v), h| h[k[0..-4]] = v}
|
538
|
-
joined_seq_raw = {}
|
539
|
-
new_r1_seq_raw.each do |seq_name, seq|
|
540
|
-
next unless seq
|
541
|
-
next unless new_r2_seq_raw[seq_name]
|
542
|
-
joined_seq_raw[seq_name] = seq + new_r2_seq_raw[seq_name]
|
543
|
-
end
|
544
|
-
joined_sh_raw = ViralSeq::SeqHash.new(joined_seq_raw)
|
545
|
-
end
|
546
|
-
else
|
547
|
-
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
548
|
+
joined_sh = joined_sh.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
548
549
|
|
549
|
-
|
550
|
-
|
551
|
-
end
|
550
|
+
if export_raw
|
551
|
+
joined_sh_raw = joined_sh_raw.hiv_seq_qc(ref_start, ref_end, indel, ref_genome)
|
552
552
|
end
|
553
553
|
|
554
554
|
log.puts Time.now.to_s + "\t" + "Paired TCS number after QC based on reference genome: " + joined_sh.size.to_s
|
data/bin/tcs_sdrm
CHANGED
@@ -215,8 +215,8 @@ libs.each do |lib|
|
|
215
215
|
tag = data[0].split("_")[-1].gsub(/\W/,"")
|
216
216
|
summary_hash[tag] += "," + data[1].to_f.round(4).to_s + "," + data[2].to_f.round(4).to_s
|
217
217
|
end
|
218
|
-
|
219
|
-
|
218
|
+
regions_for_summary = regions.dup.push("V1V3")
|
219
|
+
regions_for_summary.each do |region|
|
220
220
|
next unless summary_hash[region]
|
221
221
|
seq_summary_out.puts region.to_s + "," + summary_hash[region]
|
222
222
|
end
|
data/lib/viral_seq/R.rb
CHANGED
@@ -14,7 +14,9 @@ module ViralSeq
|
|
14
14
|
|
15
15
|
# check if required R packages are installed.
|
16
16
|
def self.check_R_packages
|
17
|
-
|
17
|
+
file = File.join(ViralSeq.root, "viral_seq", "util", "check_env.r")
|
18
|
+
safe_file = Shellwords.escape(file)
|
19
|
+
if system "Rscript #{safe_file}"
|
18
20
|
return 0
|
19
21
|
else
|
20
22
|
raise "Non-zero exit code. Error happens when checking required R packages."
|