seqtrimnext 2.0.51 → 2.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +3 -3
- data/README.rdoc +18 -3
- data/Rakefile +2 -1
- data/bin/parse_params.rb +5 -1
- data/bin/seqtrimnext +53 -21
- data/lib/seqtrimnext/actions/{action_classify.rb → action_user_contaminant.rb} +2 -2
- data/lib/seqtrimnext/classes/em_classes/seqtrim_work_manager.rb +64 -20
- data/lib/seqtrimnext/classes/em_classes/seqtrim_worker.rb +375 -240
- data/lib/seqtrimnext/classes/extract_stats.rb +26 -23
- data/lib/seqtrimnext/classes/params.rb +109 -123
- data/lib/seqtrimnext/classes/plugin_manager.rb +2 -4
- data/lib/seqtrimnext/classes/seqtrim.rb +24 -29
- data/lib/seqtrimnext/classes/sequence.rb +2 -2
- data/lib/seqtrimnext/classes/sequence_group.rb +21 -1
- data/lib/seqtrimnext/classes/sequence_with_action.rb +25 -13
- data/lib/seqtrimnext/plugins/plugin.rb +42 -12
- data/lib/seqtrimnext/plugins/plugin_ab_adapters.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_adapters.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_amplicons.rb +0 -12
- data/lib/seqtrimnext/plugins/plugin_contaminants.rb +5 -8
- data/lib/seqtrimnext/plugins/plugin_extract_inserts.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_find_poly_at.rb +1 -11
- data/lib/seqtrimnext/plugins/plugin_ignore_repeated.rb +1 -7
- data/lib/seqtrimnext/plugins/plugin_indeterminations.rb +1 -8
- data/lib/seqtrimnext/plugins/plugin_key.rb +1 -9
- data/lib/seqtrimnext/plugins/plugin_linker.rb +0 -9
- data/lib/seqtrimnext/plugins/plugin_low_complexity.rb +6 -21
- data/lib/seqtrimnext/plugins/plugin_low_high_size.rb +3 -13
- data/lib/seqtrimnext/plugins/plugin_low_quality.rb +126 -330
- data/lib/seqtrimnext/plugins/plugin_mids.rb +0 -11
- data/lib/seqtrimnext/plugins/plugin_short_insert.rb +1 -10
- data/lib/seqtrimnext/plugins/plugin_user_contaminants.rb +40 -32
- data/lib/seqtrimnext/plugins/plugin_vectors.rb +0 -9
- data/lib/seqtrimnext/templates/amplicons.txt +1 -8
- data/lib/seqtrimnext/templates/genomics_454.txt +12 -8
- data/lib/seqtrimnext/templates/genomics_454_with_paired.txt +19 -1
- data/lib/seqtrimnext/templates/genomics_short_reads.txt +26 -1
- data/lib/seqtrimnext/templates/genomics_short_reads_2.txt +24 -1
- data/lib/seqtrimnext/templates/only_quality.txt +24 -0
- data/lib/seqtrimnext/templates/sanger.txt +25 -0
- data/lib/seqtrimnext/templates/transcriptomics_454.txt +18 -1
- data/lib/seqtrimnext/templates/transcriptomics_plants.txt +22 -1
- data/lib/seqtrimnext/templates/transcriptomics_short_reads.txt +23 -1
- data/lib/seqtrimnext.rb +1 -1
- metadata +20 -7
- data/lib/seqtrimnext/plugins/plugin_adapters_old.rb +0 -165
- data/lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb +0 -245
@@ -9,18 +9,11 @@
|
|
9
9
|
# Help: <ul>
|
10
10
|
# Help: <li>PluginLowHighSize: rejecting too short or too long sequences</li>
|
11
11
|
# Help: <li>PluginKey: trimming Roche 454 sequencing keys (typically 4 first nucleotides)</li>
|
12
|
-
# Help: <li>PluginMids: trimming Roche 454 MIDs</li>
|
12
|
+
# Help: <li>PluginMids: trimming Roche 454 MIDs and keys</li>
|
13
13
|
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
14
14
|
# Help: <li>PluginAbAdapters: trimming the Roche 454 AB adapters</li>
|
15
|
-
# Help: <li>PluginAdapters: trimming the adapters found in SeqTrimNEXT database</li>
|
16
|
-
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
17
|
-
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
18
|
-
# Help: <li>PluginFindPolyAt: trimming PolyA and PolyT. After a PolyT, the sequence is checked for low complexity. </li>
|
19
15
|
# Help: <li>PluginAmplicons: getting sequences contained between two primers. Sequences with less than two primers are rejected. </li>
|
20
|
-
# Help: <li>PluginLinker: splits Roche 454 paired-end sequences by any linker found in linkers database. Linker is removed.</li>
|
21
|
-
# Help: <li>PluginLowComplexity: sequences with low complexity are stored on a separate file. </li>
|
22
16
|
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
23
|
-
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
24
17
|
# Help: </ul>
|
25
18
|
|
26
19
|
plugin_list = PluginLowHighSize,PluginKey,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginAmplicons,PluginLowQuality
|
@@ -5,14 +5,18 @@
|
|
5
5
|
# Help: <br/>This template is used to preprocess Roche 454 genomic data <br/>
|
6
6
|
|
7
7
|
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
8
|
+
|
8
9
|
# Help: <ul>
|
9
|
-
# Help: <li>PluginLowHighSize</li>
|
10
|
-
# Help: <li>PluginMids</li>
|
11
|
-
# Help: <li>PluginIndeterminations</li>
|
12
|
-
# Help: <li>PluginAbAdapters</li>
|
13
|
-
|
14
|
-
# Help: <li>
|
15
|
-
|
10
|
+
# Help: <li>PluginLowHighSize: rejecting too short or too long sequences</li>
|
11
|
+
# Help: <li>PluginMids: trimming Roche 454 MIDs and keys</li>
|
12
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
13
|
+
# Help: <li>PluginAbAdapters: trimming the Roche 454 AB adapters</li>
|
14
|
+
|
15
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
16
|
+
|
17
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
18
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
19
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
16
20
|
# Help: </ul>
|
17
21
|
|
18
|
-
plugin_list = PluginLowHighSize,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginContaminants,PluginVectors,PluginLowQuality
|
22
|
+
plugin_list = PluginLowHighSize,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginUserContaminants,PluginContaminants,PluginVectors,PluginLowQuality
|
@@ -4,4 +4,22 @@
|
|
4
4
|
|
5
5
|
# Help: <br/>This template is used to preprocess Roche 454 genomic data including paired-end <br/>
|
6
6
|
|
7
|
-
|
7
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
8
|
+
|
9
|
+
# Help: <ul>
|
10
|
+
# Help: <li>PluginLowHighSize: rejecting too short or too long sequences</li>
|
11
|
+
# Help: <li>PluginMids: trimming Roche 454 MIDs and keys</li>
|
12
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
13
|
+
# Help: <li>PluginAbAdapters: trimming the Roche 454 AB adapters</li>
|
14
|
+
|
15
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
16
|
+
|
17
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
18
|
+
# Help: <li>PluginLinker: splits Roche 454 paired-end sequences by any linker found in linkers database. Linker is removed.</li>
|
19
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
20
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
21
|
+
# Help: </ul>
|
22
|
+
|
23
|
+
|
24
|
+
plugin_list = PluginLowHighSize,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginUserContaminants,PluginContaminants,PluginLinker,PluginVectors,PluginLowQuality
|
25
|
+
|
@@ -3,5 +3,30 @@
|
|
3
3
|
# ==============================================================
|
4
4
|
|
5
5
|
# Help: <br/>This template is used to preprocess short reads for genomics <br/>
|
6
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
7
|
+
|
8
|
+
# Help: <ul>
|
9
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
10
|
+
|
11
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
12
|
+
|
13
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
14
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
15
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
16
|
+
# Help: </ul>
|
17
|
+
|
18
|
+
plugin_list = PluginIndeterminations,PluginUserContaminants,PluginContaminants,PluginVectors,PluginLowQuality
|
19
|
+
|
20
|
+
generate_initial_stats = false
|
21
|
+
|
22
|
+
# Minimum insert size for every trimmed sequence
|
23
|
+
|
24
|
+
min_insert_size_trimmed = 30
|
25
|
+
min_quality=20
|
26
|
+
min_insert_size_trimmed=40
|
27
|
+
min_insert_size_paired=40
|
28
|
+
|
29
|
+
|
30
|
+
# do not remove cloned sequences
|
31
|
+
remove_clonality=false
|
6
32
|
|
7
|
-
plugin_list = PluginIndeterminations,PluginContaminants,PluginVectors,PluginLowQuality
|
@@ -3,5 +3,28 @@
|
|
3
3
|
# ==============================================================
|
4
4
|
|
5
5
|
# Help: <br/>This template is used to preprocess short reads for genomics including low complexity removal<br/>
|
6
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
7
|
+
|
8
|
+
# Help: <ul>
|
9
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
10
|
+
|
11
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
12
|
+
|
13
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
14
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
15
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
16
|
+
# Help: <li>PluginLowComplexity: sequences with low complexity are stored on a separate file. </li>
|
17
|
+
# Help: </ul>
|
18
|
+
|
19
|
+
plugin_list = PluginIndeterminations,PluginUserContaminants,PluginContaminants,PluginVectors,PluginLowQuality,PluginLowComplexity
|
20
|
+
|
21
|
+
generate_initial_stats = false
|
22
|
+
|
23
|
+
# Minimum insert size for every trimmed sequence
|
24
|
+
|
25
|
+
min_insert_size_trimmed = 30
|
26
|
+
|
27
|
+
|
28
|
+
# do not remove cloned sequences
|
29
|
+
remove_clonality=false
|
6
30
|
|
7
|
-
plugin_list = PluginIndeterminations,PluginContaminants,PluginVectors,PluginLowQuality,PluginLowComplexity
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# ======================================
|
2
|
+
# General parameters to extract Amplicons
|
3
|
+
# ======================================
|
4
|
+
|
5
|
+
# Help: <br/>This template is used to remove only bad quality regions<br/>
|
6
|
+
|
7
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
8
|
+
|
9
|
+
# Help: <ul>
|
10
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
11
|
+
# Help: </ul>
|
12
|
+
|
13
|
+
plugin_list = PluginLowQuality
|
14
|
+
|
15
|
+
# do not remove cloned sequences
|
16
|
+
remove_clonality=false
|
17
|
+
|
18
|
+
# remove amplicons containing less or equal number of sequences indicated
|
19
|
+
|
20
|
+
generate_initial_stats = false
|
21
|
+
|
22
|
+
# Minimum insert size for every trimmed sequence
|
23
|
+
|
24
|
+
min_insert_size_trimmed = 30
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# ======================================
|
2
|
+
# General parameters SANGER
|
3
|
+
# ======================================
|
4
|
+
|
5
|
+
# Help: <br/>This template is used to preprocess Sanger genomic data <br/>
|
6
|
+
|
7
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
8
|
+
|
9
|
+
# Help: <ul>
|
10
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
11
|
+
# Help: <li>PluginAdapters: trimming adapters</li>
|
12
|
+
|
13
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
14
|
+
|
15
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
16
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
17
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
18
|
+
# Help: </ul>
|
19
|
+
|
20
|
+
plugin_list = PluginIndeterminations,PluginAdapters,PluginFindPolyAt,PluginContaminants,PluginVectors,PluginLowQuality
|
21
|
+
|
22
|
+
|
23
|
+
# do not remove cloned sequences
|
24
|
+
remove_clonality=false
|
25
|
+
|
@@ -4,7 +4,24 @@
|
|
4
4
|
|
5
5
|
# Help: <br/>This template is used to preprocess Roche 454 transcriptomic data <br/>
|
6
6
|
|
7
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
7
8
|
|
8
|
-
|
9
|
+
# Help: <ul>
|
10
|
+
# Help: <li>PluginLowHighSize: rejecting too short or too long sequences</li>
|
11
|
+
# Help: <li>PluginMids: trimming Roche 454 MIDs and keys</li>
|
12
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
13
|
+
# Help: <li>PluginAbAdapters: trimming the Roche 454 AB adapters</li>
|
14
|
+
# Help: <li>PluginAdapters: trimming the adapters found in SeqTrimNEXT database</li>
|
15
|
+
# Help: <li>PluginFindPolyAt: trimming PolyA and PolyT. After a PolyT, the sequence is checked for low complexity. </li>
|
16
|
+
|
17
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
18
|
+
|
19
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
20
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
21
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
22
|
+
# Help: <li>PluginLowComplexity: sequences with low complexity are stored on a separate file. </li>
|
23
|
+
# Help: </ul>
|
24
|
+
|
25
|
+
plugin_list = PluginLowHighSize,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginAdapters,PluginFindPolyAt,PluginUserContaminants,PluginContaminants,PluginVectors,PluginLowQuality,PluginLowComplexity
|
9
26
|
|
10
27
|
contaminants_db="contaminants.fasta cont_ribosome.fasta"
|
@@ -3,7 +3,28 @@
|
|
3
3
|
# ======================================
|
4
4
|
|
5
5
|
# Help: <br/>This template is used to preprocess Roche 454 transcriptomic data. Customized for plants.<br/>
|
6
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
6
7
|
|
7
|
-
|
8
|
+
# Help: <ul>
|
9
|
+
# Help: <li>PluginLowHighSize: rejecting too short or too long sequences</li>
|
10
|
+
# Help: <li>PluginMids: trimming Roche 454 MIDs and keys</li>
|
11
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
12
|
+
# Help: <li>PluginAbAdapters: trimming the Roche 454 AB adapters</li>
|
13
|
+
# Help: <li>PluginAdapters: trimming the adapters found in SeqTrimNEXT database</li>
|
14
|
+
# Help: <li>PluginFindPolyAt: trimming PolyA and PolyT. After a PolyT, the sequence is checked for low complexity. </li>
|
15
|
+
|
16
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
17
|
+
|
18
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
19
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
20
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
21
|
+
# Help: <li>PluginLowComplexity: sequences with low complexity are stored on a separate file. </li>
|
22
|
+
# Help: </ul>
|
23
|
+
|
24
|
+
plugin_list = PluginLowHighSize,PluginMids,PluginIndeterminations,PluginAbAdapters,PluginAdapters,PluginFindPolyAt,PluginUserContaminants,PluginContaminants,PluginVectors,PluginLowQuality,PluginLowComplexity
|
8
25
|
|
9
26
|
contaminants_db="contaminants.fasta cont_ribosome.fasta cont_mitochondrias.fasta cont_plastids.fasta"
|
27
|
+
|
28
|
+
|
29
|
+
# do not remove cloned sequences
|
30
|
+
remove_clonality=false
|
@@ -3,7 +3,29 @@
|
|
3
3
|
# ======================================
|
4
4
|
|
5
5
|
# Help: <br/>This template is used to preprocess short reads for transcriptomics<br/>
|
6
|
+
# Help: <br/><b>Plugin list and aplication order:</b><br/>
|
6
7
|
|
7
|
-
|
8
|
+
# Help: <ul>
|
9
|
+
# Help: <li>PluginIndeterminations: retaining the longest sequence fragment without indeterminations (N)</li>
|
10
|
+
# Help: <li>PluginFindPolyAt: trimming PolyA and PolyT. After a PolyT, the sequence is checked for low complexity. </li>
|
11
|
+
|
12
|
+
# Help: <li>PluginUserContaminants: discarding sequences matching any entry in the user contaminant database saving them in a separate file</li>
|
13
|
+
|
14
|
+
# Help: <li>PluginContaminants: trimming the contaminant fragments found in the contaminant database. When contamination is prevalent, sequences are rejected. </li>
|
15
|
+
# Help: <li>PluginVectors: trimming any cloning vector found in SeqTrimNEXT database. </li>
|
16
|
+
# Help: <li>PluginLowQuality: trimming low quality regions from sequences. </li>
|
17
|
+
# Help: <li>PluginLowComplexity: sequences with low complexity are stored on a separate file. </li>
|
18
|
+
# Help: </ul>
|
19
|
+
|
20
|
+
plugin_list = PluginIndeterminations,PluginFindPolyAt,PluginUserContaminants,PluginContaminants,PluginVectors,PluginLowQuality,PluginLowComplexity
|
8
21
|
|
9
22
|
contaminants_db="contaminants.fasta cont_ribosome.fasta"
|
23
|
+
|
24
|
+
generate_initial_stats = false
|
25
|
+
|
26
|
+
# Minimum insert size for every trimmed sequence
|
27
|
+
|
28
|
+
min_insert_size_trimmed = 30
|
29
|
+
|
30
|
+
# do not remove cloned sequences
|
31
|
+
remove_clonality=false
|
data/lib/seqtrimnext.rb
CHANGED
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: seqtrimnext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 2.0.51
|
5
|
+
version: 2.0.52
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dario Guerrero & Almudena Bocinos
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-07-16 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: narray
|
@@ -123,16 +123,27 @@ dependencies:
|
|
123
123
|
type: :runtime
|
124
124
|
version_requirements: *id010
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name: hoe
|
126
|
+
name: scbi_headers
|
127
127
|
prerelease: false
|
128
128
|
requirement: &id011 !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: 0.0.2
|
134
|
+
type: :runtime
|
135
|
+
version_requirements: *id011
|
136
|
+
- !ruby/object:Gem::Dependency
|
137
|
+
name: hoe
|
138
|
+
prerelease: false
|
139
|
+
requirement: &id012 !ruby/object:Gem::Requirement
|
129
140
|
none: false
|
130
141
|
requirements:
|
131
142
|
- - ">="
|
132
143
|
- !ruby/object:Gem::Version
|
133
144
|
version: 2.8.0
|
134
145
|
type: :development
|
135
|
-
version_requirements: *id011
|
146
|
+
version_requirements: *id012
|
136
147
|
description: SeqtrimNEXT is a customizable and distributed pre-processing software for NGS (Next Generation Sequencing) biological data. It makes use of scbi_mapreduce gem to be able to run in parallel and distributed environments. It is specially suited for Roche 454 (normal and paired-end) & Ilumina datasets, although it could be easyly adapted to any other situation.
|
137
148
|
email:
|
138
149
|
- dariogf@gmail.com & alkoke@gmail.com
|
@@ -164,6 +175,8 @@ extensions: []
|
|
164
175
|
extra_rdoc_files:
|
165
176
|
- History.txt
|
166
177
|
- lib/seqtrimnext/templates/amplicons.txt
|
178
|
+
- lib/seqtrimnext/templates/sanger.txt
|
179
|
+
- lib/seqtrimnext/templates/only_quality.txt
|
167
180
|
- lib/seqtrimnext/templates/genomics_454.txt
|
168
181
|
- lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
169
182
|
- lib/seqtrimnext/templates/genomics_short_reads.txt
|
@@ -200,7 +213,7 @@ files:
|
|
200
213
|
- lib/seqtrimnext/actions/action_ab_adapter.rb
|
201
214
|
- lib/seqtrimnext/actions/action_ab_far_adapter.rb
|
202
215
|
- lib/seqtrimnext/actions/action_ab_left_adapter.rb
|
203
|
-
- lib/seqtrimnext/actions/action_classify.rb
|
216
|
+
- lib/seqtrimnext/actions/action_user_contaminant.rb
|
204
217
|
- lib/seqtrimnext/actions/action_empty_insert.rb
|
205
218
|
- lib/seqtrimnext/actions/action_ignore_repeated.rb
|
206
219
|
- lib/seqtrimnext/actions/action_indetermination.rb
|
@@ -251,7 +264,6 @@ files:
|
|
251
264
|
- lib/seqtrimnext/plugins/plugin.rb
|
252
265
|
- lib/seqtrimnext/plugins/plugin_ab_adapters.rb
|
253
266
|
- lib/seqtrimnext/plugins/plugin_adapters.rb
|
254
|
-
- lib/seqtrimnext/plugins/plugin_adapters_old.rb
|
255
267
|
- lib/seqtrimnext/plugins/plugin_amplicons.rb
|
256
268
|
- lib/seqtrimnext/plugins/plugin_contaminants.rb
|
257
269
|
- lib/seqtrimnext/plugins/plugin_user_contaminants.rb
|
@@ -265,10 +277,11 @@ files:
|
|
265
277
|
- lib/seqtrimnext/plugins/plugin_low_high_size.rb
|
266
278
|
- lib/seqtrimnext/plugins/plugin_low_quality.rb
|
267
279
|
- lib/seqtrimnext/plugins/plugin_mids.rb
|
268
|
-
- lib/seqtrimnext/plugins/plugin_rem_adit_artifacts.rb
|
269
280
|
- lib/seqtrimnext/plugins/plugin_short_insert.rb
|
270
281
|
- lib/seqtrimnext/plugins/plugin_vectors.rb
|
271
282
|
- lib/seqtrimnext/templates/amplicons.txt
|
283
|
+
- lib/seqtrimnext/templates/sanger.txt
|
284
|
+
- lib/seqtrimnext/templates/only_quality.txt
|
272
285
|
- lib/seqtrimnext/templates/genomics_454.txt
|
273
286
|
- lib/seqtrimnext/templates/genomics_454_with_paired.txt
|
274
287
|
- lib/seqtrimnext/templates/genomics_short_reads.txt
|
@@ -1,165 +0,0 @@
|
|
1
|
-
require "plugin"
|
2
|
-
|
3
|
-
########################################################
|
4
|
-
# Author: Almudena Bocinos Rioboo
|
5
|
-
#
|
6
|
-
# Defines the main methods that are necessary to execute PluginAdapters
|
7
|
-
# Inherit: Plugin
|
8
|
-
########################################################
|
9
|
-
|
10
|
-
class PluginAdaptersOld < Plugin
|
11
|
-
|
12
|
-
def get_type_adapter(p_start,p_end,seq)
|
13
|
-
#if q_beg is nearer the left, add adapter action by the left,
|
14
|
-
#if q_end esta is nearer the right , add adapter action by the right
|
15
|
-
#NOTE: If the adapter is very near from left and rigth,
|
16
|
-
#then the sequence isn't valid, because almost sequence is adapter.
|
17
|
-
|
18
|
-
|
19
|
-
v1= p_end.to_i
|
20
|
-
v2= p_start.to_i
|
21
|
-
|
22
|
-
# puts " startadapter #{v2} endadapter #{v1} insert_start #{seq.insert_start} insert_end #{seq.insert_end}"
|
23
|
-
|
24
|
-
# puts " #{v2+seq.insert_start} <? #{seq.seq_fasta.length - v1 - 1 + seq.seq_fasta_orig.length - seq.insert_end-1}"
|
25
|
-
if (v2+seq.insert_start < (seq.seq_fasta.length - v1 - 1+ seq.seq_fasta_orig.length - seq.insert_end-1)) #IF THE NEAREST ONE IS THE LEFT
|
26
|
-
type = "ActionLeftAdapter"
|
27
|
-
|
28
|
-
else
|
29
|
-
type = "ActionRightAdapter"
|
30
|
-
|
31
|
-
end
|
32
|
-
return type
|
33
|
-
end
|
34
|
-
|
35
|
-
|
36
|
-
def cut_by_right(adapter,seq)
|
37
|
-
|
38
|
-
left_size = adapter.q_beg-seq.insert_start+1
|
39
|
-
right_size = seq.insert_end-adapter.q_end+1
|
40
|
-
left_size=0 if (left_size<0)
|
41
|
-
right_size=0 if (right_size<0)
|
42
|
-
|
43
|
-
return (left_size>(right_size/2).to_i)
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
#Begins the plugin1's execution to warn that there are contaminants in the sequence "seq"
|
48
|
-
def execute(seqs)
|
49
|
-
blasts= do_blasts(seqs)
|
50
|
-
|
51
|
-
seqs.each_with_index do |s,i|
|
52
|
-
exec_seq(s,blasts.querys[i])
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def do_blasts(seqs)
|
57
|
-
# find MIDS with less results than max_target_seqs value
|
58
|
-
blast=BatchBlast.new("-db #{@params.get_param('adapters_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}")
|
59
|
-
$LOG.debug('BLAST:'+blast.get_blast_cmd)
|
60
|
-
|
61
|
-
fastas=[]
|
62
|
-
|
63
|
-
seqs.each do |seq|
|
64
|
-
fastas.push ">"+seq.seq_name
|
65
|
-
fastas.push seq.seq_fasta
|
66
|
-
end
|
67
|
-
|
68
|
-
# fastas=fastas.join("\n")
|
69
|
-
|
70
|
-
blast_table_results = blast.do_blast(fastas)
|
71
|
-
|
72
|
-
# puts blast_table_results.inspect
|
73
|
-
|
74
|
-
return blast_table_results
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
def exec_seq(seq,blast_query)
|
79
|
-
if blast_query.query_id != seq.seq_name
|
80
|
-
raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}"
|
81
|
-
end
|
82
|
-
|
83
|
-
$LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for adapters into the sequence"
|
84
|
-
|
85
|
-
|
86
|
-
# blast=BatchBlast.new("-db #{File.join($FORMATTED_DB_PATH,'adapters.fasta')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_adapters')} -perc_identity #{@params.get_param('blast_percent_adapters')}")
|
87
|
-
|
88
|
-
# blast with only one sequence, no with many sequences from a database
|
89
|
-
#---------------------------------------------------------------------
|
90
|
-
|
91
|
-
# blast_table_results = blast.do_blast(seq.seq_fasta) #rise seq to adapterss executing over blast
|
92
|
-
|
93
|
-
#blast_table_results = BlastTableResult.new(res)
|
94
|
-
|
95
|
-
# blast_table_results.inspect
|
96
|
-
|
97
|
-
adapters=[]
|
98
|
-
# blast_table_results.querys.each do |query| # first round to save adapters without overlap
|
99
|
-
merge_hits(blast_query,adapters)
|
100
|
-
# end
|
101
|
-
|
102
|
-
begin
|
103
|
-
adapters2=adapters # second round to save adapters without overlap
|
104
|
-
adapters = []
|
105
|
-
merge_hits(adapters2,adapters)
|
106
|
-
end until (adapters2.count == adapters.count)
|
107
|
-
|
108
|
-
actions=[]
|
109
|
-
adapter_size=0
|
110
|
-
# @stats['adapter_size']={}
|
111
|
-
adapters.each do |ad| # adds the correspondent action to the sequence
|
112
|
-
|
113
|
-
type = get_type_adapter(ad.q_beg,ad.q_end,seq)
|
114
|
-
a = seq.new_action(ad.q_beg,ad.q_end,type)
|
115
|
-
# puts " state left_action #{a.left_action} right_action #{a.right_action}"
|
116
|
-
|
117
|
-
|
118
|
-
adapter_size=ad.q_end-ad.q_beg+1
|
119
|
-
|
120
|
-
if cut_by_right(ad,seq)
|
121
|
-
|
122
|
-
# puts "action right end1 #{seq.insert_end}"
|
123
|
-
|
124
|
-
a.right_action=true #mark rigth action to get the left insert
|
125
|
-
else
|
126
|
-
|
127
|
-
# puts " cut1 by left #{seq.insert_start} ad #{ad.q_beg+seq.insert_start} #{ad.q_end+seq.insert_start}"
|
128
|
-
|
129
|
-
a.left_action = true #mark left action to get the right insert
|
130
|
-
|
131
|
-
end
|
132
|
-
|
133
|
-
a.message = ad.subject_id
|
134
|
-
a.reversed = ad.reversed
|
135
|
-
actions.push a
|
136
|
-
|
137
|
-
# @stats[:adapter_size]={adapter_size => 1}
|
138
|
-
add_stats('adapter_size',adapter_size)
|
139
|
-
|
140
|
-
end
|
141
|
-
seq.add_actions(actions)
|
142
|
-
#
|
143
|
-
end
|
144
|
-
|
145
|
-
#Returns an array with the errors due to parameters are missing
|
146
|
-
def self.check_params(params)
|
147
|
-
errors=[]
|
148
|
-
|
149
|
-
comment='Blast E-value used as cut-off when searching for adapters or primers'
|
150
|
-
default_value = 1e-6
|
151
|
-
params.check_param(errors,'blast_evalue_adapters','Float',default_value,comment)
|
152
|
-
|
153
|
-
comment='Minimum required identity (%) for a reliable adapter'
|
154
|
-
default_value = 95
|
155
|
-
params.check_param(errors,'blast_percent_adapters','Integer',default_value,comment)
|
156
|
-
|
157
|
-
comment='Path for adapter database'
|
158
|
-
default_value = File.join($FORMATTED_DB_PATH,'adapters.fasta')
|
159
|
-
params.check_param(errors,'adapters_db','DB',default_value,comment)
|
160
|
-
|
161
|
-
return errors
|
162
|
-
end
|
163
|
-
|
164
|
-
|
165
|
-
end
|