genfrag 0.0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.bnsignore +16 -0
- data/History.txt +4 -0
- data/LICENSE.txt +58 -0
- data/README.rdoc +40 -0
- data/Rakefile +53 -0
- data/bin/genfrag +8 -0
- data/lib/genfrag.rb +129 -0
- data/lib/genfrag/app.rb +105 -0
- data/lib/genfrag/app/command.rb +145 -0
- data/lib/genfrag/app/index_command.rb +227 -0
- data/lib/genfrag/app/index_command/db.rb +105 -0
- data/lib/genfrag/app/search_command.rb +298 -0
- data/lib/genfrag/app/search_command/match.rb +165 -0
- data/lib/genfrag/app/search_command/process_file.rb +125 -0
- data/lib/genfrag/app/search_command/trim.rb +121 -0
- data/lib/genfrag/debug.rb +0 -0
- data/spec/data/index_command/in/a.fasta +109 -0
- data/spec/data/index_command/out/1-a_lookup.tdf +4 -0
- data/spec/data/index_command/out/2-a_lookup.db +0 -0
- data/spec/data/index_command/out/3-a_lookup.tdf +2 -0
- data/spec/data/index_command/out/4-a_lookup.db +0 -0
- data/spec/data/index_command/out/5-a_lookup.tdf +4 -0
- data/spec/data/index_command/out/6-a_lookup.db +0 -0
- data/spec/data/index_command/out/a.fasta.db +0 -0
- data/spec/data/index_command/out/a.fasta.tdf +6 -0
- data/spec/genfrag/app/command_spec.rb +55 -0
- data/spec/genfrag/app/index_command_spec.rb +258 -0
- data/spec/genfrag/app/search_command/match_spec.rb +77 -0
- data/spec/genfrag/app/search_command/process_file_spec.rb +185 -0
- data/spec/genfrag/app/search_command/trim_spec.rb +75 -0
- data/spec/genfrag/app/search_command_spec.rb +260 -0
- data/spec/genfrag/app_spec.rb +77 -0
- data/spec/genfrag_spec.rb +87 -0
- data/spec/spec_helper.rb +56 -0
- data/tasks/ann.rake +80 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +201 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +50 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/setup.rb +300 -0
- data/tasks/spec.rake +54 -0
- data/tasks/svn.rake +47 -0
- data/tasks/test.rake +40 -0
- metadata +136 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
|
2
|
+
module Genfrag
|
3
|
+
class App
|
4
|
+
|
5
|
+
class IndexCommand < Command
|
6
|
+
|
7
|
+
attr_reader :sizes
|
8
|
+
|
9
|
+
# Run from command-line
|
10
|
+
#
|
11
|
+
# Command-line entry point for the index command.
#
# Parses +args+, validates the resulting options and then delegates to #run
# with the CLI flag set. When the :tracktime option is set the run is
# wrapped in Genfrag.tracktime.
#
# NOTE(review): the input filenames are taken from the global ARGV, not from
# what is left in +args+ after parsing -- confirm that the caller always
# passes ARGV itself.
def cli_run( args )
  parse args

  @input_filenames = ARGV

  validate_options(options)

  # Plain run unless timing was requested.
  return run(options, @input_filenames, true) unless options[:tracktime]

  Genfrag.tracktime do
    run(options, @input_filenames, true)
  end
end
|
27
|
+
|
28
|
+
# Main class for creating the index - accepts multiple input files. Either an SQLite database or
|
29
|
+
# a flat file index is created (extension .tdf) which is unique for the input file combination.
|
30
|
+
# This file is used by the Search routine later.
|
31
|
+
#
|
32
|
+
# Create the index for one or more input files.
#
# Every entry of every input file is read, entries with an identical
# (down-cased) sequence are merged into one normalized record, and for each
# normalized record the fragment between the right-most RE5 match and the
# first following RE3 match is located. The normalized records and a
# size => positions lookup are written through IndexCommand::DB.
#
# ops             - OpenStruct or Hash of options (anything else raises
#                   ArgumentError). Read here: verbose, re5, re3, indir,
#                   filefasta; the whole struct is handed to the DB writer.
# input_filenames - Array of file names relative to ops.indir; when empty,
#                   [ops.filefasta] is used instead.
# cli             - passed through unchanged to cli_p for every message.
#
# After the call, #sizes maps fragment size => Array of
# [position, normalized_fasta_id] pairs.
#
# Returns whatever db.close returns.
def run(ops=@ops, input_filenames=[], cli=false)
  case ops
  when OpenStruct then @ops = ops.dup
  when Hash       then @ops = OpenStruct.new(ops)
  else raise ArgumentError
  end

  # Set defaults
  @ops.verbose    ||= false
  @ops.quiet      ||= false
  @ops.sqlite     ||= false
  @ops.filelookup ||= nil
  @ops.filefasta  ||= nil
  @ops.re5        ||= nil
  @ops.re3        ||= nil
  @ops.indir      ||= '.'
  @ops.outdir     ||= '.'

  @input_filenames = input_filenames.empty? ? [@ops.filefasta] : input_filenames
  @sizes = {}
  db = IndexCommand::DB.new(@ops, @input_filenames)
  @re5_ds, @re3_ds = [@ops.re5, @ops.re3].map {|x| Bio::RestrictionEnzyme::DoubleStranded.new(x)}
  db.write_headers

  if @ops.verbose
    cli_p(cli, <<-END
RE5: #{@ops.re5}
#{@re5_ds.aligned_strands_with_cuts.primary}
#{@re5_ds.aligned_strands_with_cuts.complement}

RE3: #{@ops.re3}
#{@re3_ds.aligned_strands_with_cuts.primary}
#{@re3_ds.aligned_strands_with_cuts.complement}
    END
    )
  end

  # unit test with aasi, aari, and ppii
  re5_regexp, re3_regexp = [@ops.re5, @ops.re3].map {|x| Bio::Sequence::NA.new( Bio::RestrictionEnzyme::DoubleStranded.new(x).aligned_strands.primary ).to_re }

  # Account for exact duplicate sequences: key on the down-cased sequence
  # and collect every definition line that carries it.
  entries = {}
  @input_filenames.each do |input_filename|
    Bio::FlatFile.auto(File.join(@ops.indir, input_filename)).each_entry do |e|
      # Tabs are stripped from the definition before it is stored.
      e.definition.tr!("\t",'')
      s = e.seq.to_s.downcase
      (entries[s] ||= []) << e.definition
    end
  end

  a_re = /(.*)(#{re5_regexp})/   # greedy prefix: matches the right-most RE5 site
  b_re = /(.*?)(#{re3_regexp})/  # lazy prefix: matches the first RE3 site

  normalized_fasta_id = 0
  entries.each do |seq, definitions|
    normalized_fasta_id += 1
    db.write_entry_to_fasta(normalized_fasta_id, seq, definitions)

    # NOTE the index command is slow because of the match functions, compare with ruby 1.9
    m1 = a_re.match(seq)
    next unless m1

    # Find the fragment 'frag1' cut most right in seq with re5_regexp
    frag1    = m1[2] + m1.post_match
    position = m1[1].size

    # Now cut frag1 with re3_regexp resulting in frag2
    m2 = b_re.match(frag1)
    next unless m2

    @frag2 = m2[1] + m2[2]
    if @ops.verbose
      cli_p(cli, <<-END
---
#{definitions.join("\n")}
#{@frag2}
      END
      )
    end
    (@sizes[@frag2.size] ||= []) << [position, normalized_fasta_id]
  end

  i = 0
  @sizes.each do |size, info|
    i += 1
    db.write_entry_to_freq(i, size, info.map {|x| x.join(' ')}.join(', ') )
  end

  if @ops.verbose
    # The block parameter must be a plain local: Ruby 1.9+ rejects an
    # instance variable in this position with a syntax error.
    @sizes.each { |entry| cli_p(cli, entry.inspect) }
  else
    # Every hit is a [position, id] pair, hence two flattened values per hit.
    cli_p(cli, "Cut sites found: #{@sizes.values.flatten.size / 2}")
  end

  db.close
end
|
138
|
+
|
139
|
+
|
140
|
+
############
|
141
|
+
# Command-line
|
142
|
+
############
|
143
|
+
|
144
|
+
|
145
|
+
# Option parser for command-line
|
146
|
+
#
|
147
|
+
# Build the OptionParser for `genfrag index`.
#
# The option definitions themselves come from standard_options (defined
# outside this method); this method only selects which ones apply and lays
# out the help text.
#
# Returns the configured OptionParser.
def opt_parser
  std_opts = standard_options

  parser = OptionParser.new
  parser.banner = 'Usage: genfrag index [options]'

  # Description block shown under the banner.
  [
    '',
    " Create a database of sequence fragments that match the last 5' fragment",
    " cut by two restricting enzymes RE3 and RE5.",
    " The Fasta file defined by the --fasta option is taken as input.",
    " Two files are created for the search function - a lookup file, and",
    " the contents of the Fasta file rewritten in a special format. You can",
    " specify the name of the lookup file with the --lookup option.",
    ''
  ].each { |text| parser.separator text }

  # Options supported by this command, in help-output order.
  [
    :verbose, :quiet, :tracktime, :indir, :outdir, :sqlite, :re5, :re3,
    :filelookup, :filefasta
  ].each do |key|
    parser.on(*std_opts[key])
  end

  parser.separator ''
  parser.separator ' Common Options:'
  parser.on( '-h', '--help', 'show this message' ) do
    @out.puts parser
    exit 1
  end
  parser.separator ' Examples:'
  parser.separator ' genfrag index -f example.fasta --re5 BstYI --re3 MseI'
  parser.separator ' genfrag index --out /tmp --in . -f example.fasta --re5 BstYI --re3 MseI'

  parser
end
|
176
|
+
|
177
|
+
# Parse options passed from command-line
|
178
|
+
#
|
179
|
+
# Parse the command-line arguments for the index command.
#
# args - Array of argument strings; it is modified in place by
#        OptionParser#parse!.
#
# With no arguments at all the usage text is written to @out and the
# process exits with status 1.
def parse( args )
  parser = opt_parser

  no_arguments = args.empty?
  if no_arguments
    @out.puts parser
    exit 1
  end

  # parse the command line arguments
  parser.parse!(args)
end
|
190
|
+
|
191
|
+
# Validate options passed from the command-line
|
192
|
+
# Validate the options supplied on the command line.
#
# o - option container indexed with [:filefasta], [:re5] and [:re3].
#
# Each failed check reports its message through clierr_p and exits with
# status 1. The checks run in this order: presence of :filefasta, :re5 and
# :re3, then :re3 and :re5 must each be accepted by
# Bio::RestrictionEnzyme::DoubleStranded.new.
def validate_options(o)
  required = [
    [:filefasta, "missing option: must supply fasta filename"],
    [:re5,       "missing option: re5"],
    [:re3,       "missing option: re3"]
  ]
  required.each do |key, message|
    next unless o[key].nil?
    clierr_p message
    exit 1
  end

  # Any error raised while constructing the enzyme is reported as an
  # unknown enzyme name.
  check_enzyme = lambda do |key, message|
    begin
      Bio::RestrictionEnzyme::DoubleStranded.new(o[key])
    rescue
      clierr_p message
      exit 1
    end
  end

  check_enzyme.call(:re3, "re3 is not an enzyme name")
  check_enzyme.call(:re5, "re5 is not an enzyme name")
end
|
222
|
+
|
223
|
+
end # class IndexCommand
|
224
|
+
end # class App
|
225
|
+
end # module Genfrag
|
226
|
+
|
227
|
+
# EOF
|
@@ -0,0 +1,105 @@
|
|
1
|
+
module Genfrag
|
2
|
+
class App
|
3
|
+
|
4
|
+
class IndexCommand < Command
|
5
|
+
|
6
|
+
# Output back-end for the index command.
#
# Writes two artefacts into ops.outdir: the "normalized fasta" (id,
# definitions, sequence) and the "freq lookup" (id, size, positions). When
# ops.sqlite is truthy both are SQLite databases ('.db'), otherwise both are
# tab-delimited text files ('.tdf'). Every public operation dispatches to
# its *_sqlite or *_csv variant based on #sc.
class DB
  attr_accessor :ops # an OpenStruct of the options
  attr_accessor :input_filenames
  attr_accessor :normalized_fasta
  attr_accessor :freq_lookup

  # ops             - options; read here: sqlite, outdir, filefasta,
  #                   filelookup, re5, re3.
  # input_filenames - Array of input file names, used to derive the output
  #                   file names.
  def initialize( ops, input_filenames )
    @normalized_fasta = nil
    @freq_lookup = nil
    @ops = ops
    @input_filenames = input_filenames
  end

  # Suffix selecting the back-end specific method: 'sqlite' or 'csv'.
  def sc
    @ops.sqlite ? 'sqlite' : 'csv'
  end

  # Open both outputs and write their headers / create their tables.
  def write_headers
    self.send("write_headers_#{sc}")
  end

  # Creates both databases; existing tables of the same name are dropped.
  def write_headers_sqlite
    @normalized_fasta = SQLite3::Database.new( File.join(@ops.outdir, Genfrag.name_normalized_fasta(@input_filenames,@ops.filefasta) + '.db') )
    sql = <<-SQL
drop table if exists db_normalized_fasta;
create table db_normalized_fasta (
id integer,
definitions text,
sequence text
);
create unique index db_normalized_fasta_idx on db_normalized_fasta(id);
    SQL
    @normalized_fasta.execute_batch( sql )
    @freq_lookup = SQLite3::Database.new( File.join(@ops.outdir, Genfrag.name_freq_lookup(@input_filenames,@ops.filefasta,@ops.filelookup,@ops.re5,@ops.re3) + '.db') )
    sql = <<-SQL
drop table if exists db_freq_lookup;
create table db_freq_lookup (
id integer,
size integer,
positions text
);
create unique index db_freq_lookup_idx on db_freq_lookup(id);
    SQL
    @freq_lookup.execute_batch( sql )
  end

  # Opens both '.tdf' files for writing (truncating) and writes the
  # tab-separated column headers.
  def write_headers_csv
    @normalized_fasta = File.new(File.join(@ops.outdir,Genfrag.name_normalized_fasta(@input_filenames,@ops.filefasta) + '.tdf'), 'w')
    @normalized_fasta.puts %w(id Definitions Sequence).join("\t")
    @freq_lookup = File.new( File.join(@ops.outdir,Genfrag.name_freq_lookup(@input_filenames,@ops.filefasta,@ops.filelookup,@ops.re5,@ops.re3) + '.tdf'), 'w')
    @freq_lookup.puts %w(id Size Positions).join("\t")
  end

  # Store one normalized sequence record.
  #
  # normalized_fasta_id - record id
  # seq                 - sequence string
  # definitions         - Array of definition strings sharing this sequence;
  #                       stored as one CSV-encoded field
  def write_entry_to_fasta(normalized_fasta_id, seq, definitions)
    self.send("write_entry_to_fasta_#{sc}", normalized_fasta_id, seq, definitions)
  end

  def write_entry_to_fasta_sqlite(normalized_fasta_id, seq, definitions)
    @normalized_fasta.execute( "insert into db_normalized_fasta values ( ?, ?, ? )", normalized_fasta_id, definitions_field(definitions), seq )
  end

  def write_entry_to_fasta_csv(normalized_fasta_id, seq, definitions)
    @normalized_fasta.puts [normalized_fasta_id,definitions_field(definitions),seq].join("\t")
  end

  # Store one lookup record: id +i+, fragment +size+ and the already
  # formatted positions string +str+.
  def write_entry_to_freq(i, size, str)
    self.send("write_entry_to_freq_#{sc}", i, size, str)
  end

  def write_entry_to_freq_sqlite(i, size, str)
    @freq_lookup.execute( "insert into db_freq_lookup values ( ?, ?, ? )", i, size, str )
  end

  def write_entry_to_freq_csv(i, size, str)
    @freq_lookup.puts [i,size,str].join("\t")
  end

  # Release both outputs.
  def close
    self.send("close_#{sc}")
  end

  # Close both database handles so they are not left open for the rest of
  # the process. Handles that were never opened or are already closed are
  # skipped.
  def close_sqlite
    [@normalized_fasta, @freq_lookup].each do |handle|
      handle.close if handle && !handle.closed?
    end
  end

  def close_csv
    @normalized_fasta.close
    @freq_lookup.close
  end

  private

  # CSV-encode the definitions as a single field without a record
  # terminator. CSV.generate_line appends a row separator on Ruby 1.9+,
  # which would otherwise break each tab-delimited record in two.
  def definitions_field(definitions)
    CSV.generate_line(definitions).chomp
  end
end
|
100
|
+
|
101
|
+
end # class IndexCommand
|
102
|
+
end # class App
|
103
|
+
end # module Genfrag
|
104
|
+
|
105
|
+
# EOF
|
@@ -0,0 +1,298 @@
|
|
1
|
+
|
2
|
+
module Genfrag
|
3
|
+
class App
|
4
|
+
|
5
|
+
class SearchCommand < Command
|
6
|
+
|
7
|
+
# Command-line entry point for the search command.
#
# Parses and validates the options, loads the normalized fasta and the
# frequency lookup written by the index command (SQLite '.db' files when the
# :sqlite option is set, tab-delimited '.tdf' files otherwise), optionally
# loads the adapters file, and hands everything to #run with the CLI flag
# set.
#
# NOTE(review): the input filenames come from the global ARGV, and the file
# names are used as returned by the Genfrag.name_* helpers without any
# directory prefix -- confirm this is intended.
def cli_run( args )
  parse args

  @input_filenames = ARGV
  input_filenames = [@input_filenames].flatten

  validate_options(options)

  loader = SearchCommand::ProcessFile

  if options[:sqlite]
    fasta_db = SQLite3::Database.new( Genfrag.name_normalized_fasta(input_filenames,options[:filefasta]) + '.db' )
    processed_fasta_file = loader.process_db_fasta_file( fasta_db )

    lookup_db = SQLite3::Database.new( Genfrag.name_freq_lookup(input_filenames,options[:filefasta],options[:filelookup],options[:re5],options[:re3]) + '.db' )
    processed_freq_lookup = loader.process_db_freq_lookup( lookup_db )
  else
    fasta_lines = IO.readlines( Genfrag.name_normalized_fasta(input_filenames,options[:filefasta]) + '.tdf' )
    processed_fasta_file = loader.process_tdf_fasta_file( fasta_lines )

    lookup_lines = IO.readlines( Genfrag.name_freq_lookup(input_filenames,options[:filefasta],options[:filelookup],options[:re5],options[:re3]) + '.tdf' )
    processed_freq_lookup = loader.process_tdf_freq_lookup( lookup_lines )
  end

  # The adapters file is optional; without it nil is passed on.
  processed_adapters = nil
  if options[:fileadapters]
    adapter_lines = IO.readlines( Genfrag.name_adapters(options[:fileadapters]) + '.tdf' )
    processed_adapters = loader.process_tdf_adapters( adapter_lines, options[:named_adapter5], options[:named_adapter3] )
  end

  run(options, processed_fasta_file, processed_freq_lookup, processed_adapters, true)
end
|
31
|
+
|
32
|
+
# Build the OptionParser for `genfrag search`.
#
# The option definitions come from standard_options (defined outside this
# method); this method selects the ones that apply and lays out the help
# text.
#
# Returns the configured OptionParser.
def opt_parser
  std_opts = standard_options

  parser = OptionParser.new
  parser.banner = 'Usage: genfrag search [options]'

  # Description block shown under the banner.
  [
    '',
    " Search a database of sequence fragments that match the last 5'",
    " fragment cut by two restricting enzymes RE3 and RE5, as created by the",
    " index function. Next, adapters are applied to search a subset of",
    " fragments, as is used in some protocols.",
    ''
  ].each { |text| parser.separator text }

  # Options supported by this command, in help-output order.
  [
    :verbose, :quiet, :tracktime, :indir, :outdir, :sqlite, :re5, :re3,
    :filelookup, :filefasta, :fileadapters, :adapter5_sequence, :adapter3_sequence,
    :adapter5_size, :adapter3_size, :named_adapter5, :named_adapter3,
    :adapter5, :adapter3
  ].each do |key|
    parser.on(*std_opts[key])
  end

  parser.separator ''
  parser.separator ' Common Options:'
  parser.on( '-h', '--help', 'show this message' ) do
    @out.puts parser
    exit
  end

  [
    ' Examples:',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5 tt',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --add 26 --adapter5 ct --adapter3 aa --size 190,215',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5-size 11 --adapter5 tt --adapter3-size 15 --size 168',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5-sequence GACTGCGTAGTGATC --adapter5 tt --adapter3-size 15 --size 168',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5-size 11 --adapter5 ct --adapter3-size 15 --adapter3 aa --size 190,215',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --add 26 --named-adapter5 BstYI-T4 --named-adapter3 MseI-21 --size 190,215'
  ].each { |text| parser.separator text }

  parser
end
|
65
|
+
|
66
|
+
# Parse the command-line arguments for the search command.
#
# args - Array of argument strings; it is modified in place by
#        OptionParser#parse!.
#
# With no arguments at all the usage text is written to @out and the
# process exits with status 1.
def parse( args )
  parser = opt_parser

  no_arguments = args.empty?
  if no_arguments
    @out.puts parser
    exit 1
  end

  # parse the command line arguments
  parser.parse!(args)
end
|
78
|
+
|
79
|
+
# Validate the options supplied on the command line.
#
# o - option container indexed with [:filefasta], [:re5] and [:re3].
#
# Each failed check reports its message through clierr_p and exits with
# status 1. The checks run in this order: presence of :filefasta, :re5 and
# :re3, then :re3 and :re5 must each be accepted by
# Bio::RestrictionEnzyme::DoubleStranded.new.
def validate_options(o)
  required = [
    [:filefasta, "missing option: must supply fasta filename"],
    [:re5,       "missing option: re5"],
    [:re3,       "missing option: re3"]
  ]
  required.each do |key, message|
    next unless o[key].nil?
    clierr_p message
    exit 1
  end

  # Any error raised while constructing the enzyme is reported as an
  # unknown enzyme name.
  check_enzyme = lambda do |key, message|
    begin
      Bio::RestrictionEnzyme::DoubleStranded.new(o[key])
    rescue
      clierr_p message
      exit 1
    end
  end

  check_enzyme.call(:re3, "re3 is not an enzyme name")
  check_enzyme.call(:re5, "re5 is not an enzyme name")
end
|
109
|
+
|
110
|
+
# Search the index for fragments and apply the configured adapters.
#
# ops                   - OpenStruct or Hash of options (anything else
#                         raises ArgumentError).
# processed_fasta_file  - sequence records, indexed by fasta id; each record
#                         is read with [:sequence].
# processed_freq_lookup - size lookup, handed to find_matching_fragments.
# processed_adapters    - adapters loaded from an adapters file, or nil.
# cli                   - passed through unchanged to cli_p.
#
# Raises ArgumentError when an adapter is given both by name and directly,
# when a named adapter is used without an adapters file, or when an
# explicit adapter size disagrees with the size derived from its sequence.
#
# Returns the Array of result hashes (one per fragment that passed every
# configured adapter check).
#
# NOTE(review): the trimming and matching helpers called here are defined
# elsewhere; their exact semantics are not visible from this method.
def run(ops=OpenStruct.new, processed_fasta_file=nil, processed_freq_lookup=nil, processed_adapters=nil, cli=false)
  case ops
  when OpenStruct then @ops = ops.dup
  when Hash       then @ops = OpenStruct.new(ops)
  else raise ArgumentError
  end

  # Set defaults
  @ops.verbose ||= false
  @ops.quiet ||= false
  @ops.sqlite ||= false
  @ops.re5 ||= nil
  @ops.re3 ||= nil
  @ops.size ||= [0]
  @ops.adapter5_size ||= nil
  @ops.adapter3_size ||= nil
  @ops.adapter5 ||= nil
  @ops.adapter3 ||= nil

  @sizes = processed_freq_lookup
  @sequences = processed_fasta_file
  @adapters = {}
  @re5_ds, @re3_ds = [@ops.re5, @ops.re3].map {|x| Bio::RestrictionEnzyme::DoubleStranded.new(x)}

  if @ops.verbose
    summary = <<-END
RE5: #{@ops.re5}
#{@re5_ds.aligned_strands_with_cuts.primary}
#{@re5_ds.aligned_strands_with_cuts.complement}

RE3: #{@ops.re3}
#{@re3_ds.aligned_strands_with_cuts.primary}
#{@re3_ds.aligned_strands_with_cuts.complement}

adapter5: #{@ops.adapter5}
adapter3: #{@ops.adapter3}
    END
    cli_p(cli, summary)
  end

  # An adapter may be given directly or by name, never both.
  raise ArgumentError, "Cannot have both 'adapter5' and 'named_adapter5'" if @ops.named_adapter5 && @ops.adapter5
  raise ArgumentError, "Cannot have both 'adapter3' and 'named_adapter3'" if @ops.named_adapter3 && @ops.adapter3

  # Named adapters can only be resolved through an adapters file.
  if !processed_adapters && (@ops.named_adapter5 || @ops.named_adapter3)
    raise ArgumentError, "Must specify --fileadapters when using a named_adapter"
  end

  if processed_adapters
    adapter_setup_1(processed_adapters)
  else
    adapter_setup_2
  end

  # translated adapter 3' if given in reverse orientation - e.g. _tt is
  # translated to aa (reversed) and _tct returns the primary strand
  # ending in specific 'tct'
  if @adapters[:adapter3_specificity] =~ /^_/
    without_marker = @adapters[:adapter3_specificity][1..-1]
    seq3 = Bio::Sequence::NA.new(without_marker).downcase
    @adapters[:adapter3_specificity] = seq3.complement.to_s
  end

  # An explicit size is only tolerated next to a sequence when both agree.
  if @ops.adapter5_size && @ops.adapter5_sequence && (@ops.adapter5_size != @adapters[:adapter5_size])
    raise ArgumentError, "--adapter5-sequence and --adapter5-size both supplied"
  end
  if @ops.adapter3_size && @ops.adapter3_sequence && (@ops.adapter3_size != @adapters[:adapter3_size])
    raise ArgumentError, "--adapter3-sequence and --adapter3-size both supplied"
  end

  @trim = calculate_trim_for_nucleotides(@re5_ds, @re3_ds)

  # ------
  # Start calculations
  #
  left_trim, right_trim = calculate_left_and_right_trims(@trim)

  matching_fragments = find_matching_fragments(@sizes, left_trim, right_trim)
  results = []

  matching_fragments.each do |hit|
    hit.each do |entry|
      seq = @sequences[entry[:fasta_id]][:sequence]
      first = entry[:offset]
      last  = entry[:offset]+entry[:raw_size]-1
      raw_frag = seq[first..last]

      primary_frag, complement_frag = trim_sequences(raw_frag, Bio::Sequence::NA.new(raw_frag).forward_complement, left_trim, right_trim, @trim)

      with_adapters_primary = primary_frag.dup
      with_adapters_complement = complement_frag.dup

      # note the next two if-statements at this level chain together: the
      # output of the 5' step is the input of the 3' step
      if @adapters[:adapter5_specificity]
        with_adapters_primary, with_adapters_complement = matches_adapter(5, with_adapters_primary, with_adapters_complement, raw_frag, @trim)
        next unless with_adapters_primary # returned false -- no match
      end

      if @adapters[:adapter3_specificity]
        with_adapters_primary, with_adapters_complement = matches_adapter(3, with_adapters_primary, with_adapters_complement, raw_frag, @trim)
        next unless with_adapters_primary # returned false -- no match
      end

      results << {
        :raw_frag => raw_frag,
        :primary_frag => primary_frag,
        :primary_frag_with_adapters => with_adapters_primary,
        :complement_frag => complement_frag,
        :complement_frag_with_adapters => with_adapters_complement,
        :entry => entry,
        :seq => seq
      } # FIXME
    end
  end

  cli_p(cli,"Nothing found") if results.size == 0 && @ops.verbose

  # Index the results by their full source sequence; two results for the
  # same sequence are treated as an internal error.
  sorted_results = {}
  results.sort {|a,b| a[:seq] <=> b[:seq]}.each do |r|
    raise "shouldn't happen" unless sorted_results[r[:seq]].nil?
    sorted_results[r[:seq]] = {
      'sequence size' => r[:seq].size,
      'fragment - primary strand' => r[:primary_frag],
      'fragment - complement strand' => r[:complement_frag],
      'fragment with adapters - primary strand' => r[:primary_frag_with_adapters],
      'fragment with adapters - complement strand' => r[:complement_frag_with_adapters]
    }
  end

  # Verbose output lists every field, normal output only the final strands.
  ary = ['fragment with adapters - primary strand', 'fragment with adapters - complement strand']
  if @ops.verbose
    ary = ['sequence size', 'fragment - primary strand', 'fragment - complement strand'] + ary
  end

  sorted_results.each do |k,v|
    cli_p(cli, '---')
    if @ops.verbose
      cli_p(cli, '- sequence')
      cli_p(cli, " #{k}")
    end

    ary.each do |a|
      cli_p(cli, "- #{a}")
      cli_p(cli, " #{v[a]}")
    end
  end

  return results
end
|
257
|
+
|
258
|
+
# Fill @adapters for the 5' and 3' adapters when an adapters file was
# loaded.
#
# hsh - Hash from the adapters file, read with the keys
#       :adapterN_specificity and :adapterN_sequence (N = 5, 3).
#
# For each end, an adapter given directly in the options (@ops.adapterN)
# wins; otherwise the entry from +hsh+ is used when it has a specificity.
# When neither is present nothing is stored for that end.
def adapter_setup_1(hsh)
  [5, 3].each do |i|
    if @ops.send("adapter#{i}")
      assign_adapter_from_options(i)
    elsif hsh["adapter#{i}_specificity".to_sym]
      assign_adapter_from_file(i, hsh)
    end
  end
end

# Fill @adapters for the 5' and 3' adapters from the options alone (no
# adapters file).
def adapter_setup_2
  [5, 3].each { |i| assign_adapter_from_options(i) }
end

# Store adapter +i+ (5 or 3) from the options into @adapters.
#
# The specificity is @ops.adapterN (may be nil). When @ops.adapterN_sequence
# is given, a trailing "|" followed by any number of N/n is stripped from it
# and the size is the stripped sequence length plus the specificity length;
# a missing specificity counts as length 0 instead of raising on nil.
# Without a sequence the size is @ops.adapterN_size.
def assign_adapter_from_options(i)
  specificity = @ops.send("adapter#{i}")
  @adapters["adapter#{i}_specificity".to_sym] = specificity

  raw_sequence = @ops.send("adapter#{i}_sequence")
  if raw_sequence
    sequence = raw_sequence.gsub(/\|N*$/i,'')
    @adapters["adapter#{i}_sequence".to_sym] = sequence
    @adapters["adapter#{i}_size".to_sym] = sequence.size + specificity.to_s.size
  else
    @adapters["adapter#{i}_size".to_sym] = @ops.send("adapter#{i}_size")
  end
end
private :assign_adapter_from_options

# Store adapter +i+ (5 or 3) from the adapters-file hash +hsh+ into
# @adapters; the size is the sequence length plus the specificity length.
def assign_adapter_from_file(i, hsh)
  specificity = hsh["adapter#{i}_specificity".to_sym]
  sequence    = hsh["adapter#{i}_sequence".to_sym]

  @adapters["adapter#{i}_specificity".to_sym] = specificity
  @adapters["adapter#{i}_sequence".to_sym] = sequence
  @adapters["adapter#{i}_size".to_sym] = sequence.size + specificity.size
end
private :assign_adapter_from_file
|
292
|
+
|
293
|
+
|
294
|
+
end # class SearchCommand
|
295
|
+
end # class App
|
296
|
+
end # module Genfrag
|
297
|
+
|
298
|
+
# EOF
|