genfrag 0.0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.bnsignore +16 -0
- data/History.txt +4 -0
- data/LICENSE.txt +58 -0
- data/README.rdoc +40 -0
- data/Rakefile +53 -0
- data/bin/genfrag +8 -0
- data/lib/genfrag.rb +129 -0
- data/lib/genfrag/app.rb +105 -0
- data/lib/genfrag/app/command.rb +145 -0
- data/lib/genfrag/app/index_command.rb +227 -0
- data/lib/genfrag/app/index_command/db.rb +105 -0
- data/lib/genfrag/app/search_command.rb +298 -0
- data/lib/genfrag/app/search_command/match.rb +165 -0
- data/lib/genfrag/app/search_command/process_file.rb +125 -0
- data/lib/genfrag/app/search_command/trim.rb +121 -0
- data/lib/genfrag/debug.rb +0 -0
- data/spec/data/index_command/in/a.fasta +109 -0
- data/spec/data/index_command/out/1-a_lookup.tdf +4 -0
- data/spec/data/index_command/out/2-a_lookup.db +0 -0
- data/spec/data/index_command/out/3-a_lookup.tdf +2 -0
- data/spec/data/index_command/out/4-a_lookup.db +0 -0
- data/spec/data/index_command/out/5-a_lookup.tdf +4 -0
- data/spec/data/index_command/out/6-a_lookup.db +0 -0
- data/spec/data/index_command/out/a.fasta.db +0 -0
- data/spec/data/index_command/out/a.fasta.tdf +6 -0
- data/spec/genfrag/app/command_spec.rb +55 -0
- data/spec/genfrag/app/index_command_spec.rb +258 -0
- data/spec/genfrag/app/search_command/match_spec.rb +77 -0
- data/spec/genfrag/app/search_command/process_file_spec.rb +185 -0
- data/spec/genfrag/app/search_command/trim_spec.rb +75 -0
- data/spec/genfrag/app/search_command_spec.rb +260 -0
- data/spec/genfrag/app_spec.rb +77 -0
- data/spec/genfrag_spec.rb +87 -0
- data/spec/spec_helper.rb +56 -0
- data/tasks/ann.rake +80 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +201 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +50 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/setup.rb +300 -0
- data/tasks/spec.rake +54 -0
- data/tasks/svn.rake +47 -0
- data/tasks/test.rake +40 -0
- metadata +136 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
|
2
|
+
module Genfrag
|
3
|
+
class App
|
4
|
+
|
5
|
+
class IndexCommand < Command
|
6
|
+
|
7
|
+
attr_reader :sizes
|
8
|
+
|
9
|
+
# Run from command-line
|
10
|
+
#
|
11
|
+
# Command-line entry point for the index command.
#
# Parses +args+, checks the resulting options with validate_options and then
# calls #run in CLI mode. When the :tracktime option is set the call to #run
# is wrapped in Genfrag.tracktime.
#
# args - Array of command-line arguments handed to #parse
#
# Returns whatever #run (or Genfrag.tracktime) returns.
def cli_run( args )
  parse args

  # NOTE(review): the input file names come from the global ARGV rather than
  # from +args+ -- confirm that the caller always passes ARGV itself.
  @input_filenames = ARGV

  validate_options(options)

  return run(options, @input_filenames, true) unless options[:tracktime]

  Genfrag.tracktime { run(options, @input_filenames, true) }
end
|
27
|
+
|
28
|
+
# Main method for creating the index - accepts multiple input files. Either an SQLite database or
|
29
|
+
# a flat file index is created (extension .tdf) which is unique for the input file combination.
|
30
|
+
# This file is used by the Search routine later.
|
31
|
+
#
|
32
|
+
# Build the fragment index for the given FASTA input files.
#
# Every distinct sequence (compared after downcasing) is written once to the
# normalized FASTA store together with all of its definitions. Each sequence
# is then cut at the right-most RE5 recognition site and, starting from that
# site, at the first RE3 recognition site. The size of the resulting fragment
# is recorded in @sizes (and in the frequency lookup) together with the cut
# position and the normalized sequence id.
#
# ops             - OpenStruct or Hash of options (see "Set defaults" below)
# input_filenames - Array of file names relative to ops.indir; when empty,
#                   ops.filefasta is used as the only input file
# cli             - flag passed through to cli_p with every message
#
# Raises ArgumentError when +ops+ is neither an OpenStruct nor a Hash.
def run(ops=@ops, input_filenames=[], cli=false)
  if ops.kind_of? OpenStruct
    @ops = ops.dup
  elsif ops.kind_of? Hash
    @ops = OpenStruct.new(ops)
  else
    raise ArgumentError
  end

  # Set defaults
  @ops.verbose    ||= false
  @ops.quiet      ||= false
  @ops.sqlite     ||= false
  @ops.filelookup ||= nil
  @ops.filefasta  ||= nil
  @ops.re5        ||= nil
  @ops.re3        ||= nil
  @ops.indir      ||= '.'
  @ops.outdir     ||= '.'

  @input_filenames = input_filenames.empty? ? [@ops.filefasta] : input_filenames
  @sizes = {}
  db = IndexCommand::DB.new(@ops, @input_filenames)
  @re5_ds, @re3_ds = [@ops.re5, @ops.re3].map {|x| Bio::RestrictionEnzyme::DoubleStranded.new(x)}
  db.write_headers

  if @ops.verbose
    cli_p(cli, <<-END
RE5: #{@ops.re5}
#{@re5_ds.aligned_strands_with_cuts.primary}
#{@re5_ds.aligned_strands_with_cuts.complement}

RE3: #{@ops.re3}
#{@re3_ds.aligned_strands_with_cuts.primary}
#{@re3_ds.aligned_strands_with_cuts.complement}
    END
    )
  end

  # unit test with aasi, aari, and ppii
  re5_regexp, re3_regexp = [@ops.re5, @ops.re3].map {|x| Bio::Sequence::NA.new( Bio::RestrictionEnzyme::DoubleStranded.new(x).aligned_strands.primary ).to_re }

  # Account for exact duplicate sequences: one entry per distinct sequence,
  # collecting the definitions of every record that carries it.
  entries = {}
  @input_filenames.each do |input_filename|
    Bio::FlatFile.auto(File.join(@ops.indir, input_filename)).each_entry do |e|
      # Tabs are the field separator of the flat-file output, so drop them.
      e.definition.tr!("\t",'')
      s = e.seq.to_s.downcase
      (entries[s] ||= []) << e.definition
    end
  end

  a_re = /(.*)(#{re5_regexp})/   # greedy prefix => right-most RE5 site
  b_re = /(.*?)(#{re3_regexp})/  # lazy prefix   => first RE3 site

  normalized_fasta_id=0
  entries.each do |seq, definitions|
    normalized_fasta_id+=1
    db.write_entry_to_fasta(normalized_fasta_id, seq, definitions)

    # NOTE the index command is slow because of the match functions, compare with ruby 1.9
    m1 = a_re.match(seq)
    next unless m1

    # Find the fragment 'frag1' cut most right in seq with re5_regexp
    frag1 = m1[2] + m1.post_match
    position = m1[1].size

    # Now cut frag1 with re3_regexp resulting in frag2
    m2 = b_re.match( frag1 )
    next unless m2

    @frag2 = m2[1] + m2[2]
    if @ops.verbose
      cli_p(cli, <<-END
---
#{definitions.join("\n")}
#{@frag2}
      END
      )
    end
    @sizes[@frag2.size] ||= []
    @sizes[@frag2.size] << [position, normalized_fasta_id]
  end

  i=0
  @sizes.each do |size,info|
    i+=1
    db.write_entry_to_freq(i, size, info.map {|x| x.join(' ')}.join(', ') )
  end

  if @ops.verbose
    # A block parameter must be a plain local variable: the former `|@entry|`
    # is a syntax error on Ruby >= 1.9 and prevented this file from loading.
    @sizes.each { |entry| cli_p(cli, entry.inspect) }
  else
    # Each stored hit is a [position, id] pair, hence the division by two.
    cli_p(cli, "Cut sites found: #{@sizes.values.flatten.size / 2}")
  end

  db.close
end
|
138
|
+
|
139
|
+
|
140
|
+
############
|
141
|
+
# Command-line
|
142
|
+
############
|
143
|
+
|
144
|
+
|
145
|
+
# Option parser for command-line
|
146
|
+
#
|
147
|
+
# Build the OptionParser for `genfrag index`.
#
# The option definitions themselves come from standard_options; this method
# only selects which of them apply to the index command and adds the help
# text around them.
#
# Returns the configured OptionParser.
def opt_parser
  std_opts = standard_options

  opts = OptionParser.new
  opts.banner = 'Usage: genfrag index [options]'

  [ '',
    " Create a database of sequence fragments that match the last 5' fragment",
    " cut by two restricting enzymes RE3 and RE5.",
    " The Fasta file defined by the --fasta option is taken as input.",
    " Two files are created for the search function - a lookup file, and",
    " the contents of the Fasta file rewritten in a special format. You can",
    " specify the name of the lookup file with the --lookup option.",
    ''
  ].each { |line| opts.separator line }

  [ :verbose, :quiet, :tracktime, :indir, :outdir, :sqlite, :re5, :re3,
    :filelookup, :filefasta
  ].each { |name| opts.on(*std_opts[name]) }

  opts.separator ''
  opts.separator ' Common Options:'
  # Prints the usage text and terminates the process with status 1.
  opts.on( '-h', '--help', 'show this message' ) do
    @out.puts opts
    exit 1
  end

  [ ' Examples:',
    ' genfrag index -f example.fasta --re5 BstYI --re3 MseI',
    ' genfrag index --out /tmp --in . -f example.fasta --re5 BstYI --re3 MseI'
  ].each { |line| opts.separator line }

  opts
end
|
176
|
+
|
177
|
+
# Parse options passed from command-line
|
178
|
+
#
|
179
|
+
# Parse the command-line arguments of the index command.
#
# args - Array of arguments; recognised options are removed from it in place
#        by OptionParser#parse!
#
# With an empty +args+ the usage text is written to @out and the process
# exits with status 1.
def parse( args )
  parser = opt_parser
  return parser.parse!(args) unless args.empty?

  @out.puts parser
  exit 1
end
|
190
|
+
|
191
|
+
# Validate options passed from the command-line
|
192
|
+
# Validate the options collected from the command line.
#
# o - option container indexed with symbols (:filefasta, :re5, :re3)
#
# Reports the first problem found through clierr_p and exits with status 1.
# Checks run in this order: presence of :filefasta, :re5 and :re3, then
# whether :re3 and :re5 can each be turned into a
# Bio::RestrictionEnzyme::DoubleStranded.
def validate_options(o)
  [ [:filefasta, "missing option: must supply fasta filename"],
    [:re5,       "missing option: re5"],
    [:re3,       "missing option: re3"]
  ].each do |key, message|
    next unless o[key].nil?
    clierr_p message
    exit 1
  end

  # Any StandardError raised while building the enzyme counts as "unknown name".
  build_enzyme = lambda do |key, message|
    begin
      Bio::RestrictionEnzyme::DoubleStranded.new(o[key])
    rescue
      clierr_p message
      exit 1
    end
  end

  build_enzyme.call(:re3, "re3 is not an enzyme name")
  build_enzyme.call(:re5, "re5 is not an enzyme name")
end
|
222
|
+
|
223
|
+
end # class IndexCommand
|
224
|
+
end # class App
|
225
|
+
end # module Genfrag
|
226
|
+
|
227
|
+
# EOF
|
@@ -0,0 +1,105 @@
|
|
1
|
+
module Genfrag
|
2
|
+
class App
|
3
|
+
|
4
|
+
class IndexCommand < Command
|
5
|
+
|
6
|
+
# Output back-end of the index command.
#
# Holds two stores - the normalized FASTA data and the fragment-size
# frequency lookup - and writes them either as SQLite databases (.db) or as
# tab-delimited flat files (.tdf), depending on ops.sqlite. Each public
# operation dispatches to its "_sqlite" or "_csv" variant.
class DB
  attr_accessor :ops # an OpenStruct of the options
  attr_accessor :input_filenames
  attr_accessor :normalized_fasta # SQLite3::Database or File, set by write_headers
  attr_accessor :freq_lookup      # SQLite3::Database or File, set by write_headers

  # ops             - options object; sqlite, outdir, filefasta, filelookup,
  #                   re5 and re3 are read from it
  # input_filenames - Array of input file names, used to derive output names
  def initialize( ops, input_filenames )
    @normalized_fasta = nil
    @freq_lookup = nil
    @ops = ops
    @input_filenames = input_filenames
  end

  # Suffix selecting the back-end specific method variant.
  def sc
    @ops.sqlite ? 'sqlite' : 'csv'
  end

  # Create both output stores and write their table definition / header row.
  def write_headers
    send("write_headers_#{sc}")
  end

  # Creates (or recreates) the two SQLite databases and their tables.
  def write_headers_sqlite
    @normalized_fasta = SQLite3::Database.new( File.join(@ops.outdir, Genfrag.name_normalized_fasta(@input_filenames,@ops.filefasta) + '.db') )
    sql = <<-SQL
drop table if exists db_normalized_fasta;
create table db_normalized_fasta (
id integer,
definitions text,
sequence text
);
create unique index db_normalized_fasta_idx on db_normalized_fasta(id);
    SQL
    @normalized_fasta.execute_batch( sql )
    @freq_lookup = SQLite3::Database.new( File.join(@ops.outdir, Genfrag.name_freq_lookup(@input_filenames,@ops.filefasta,@ops.filelookup,@ops.re5,@ops.re3) + '.db') )
    sql = <<-SQL
drop table if exists db_freq_lookup;
create table db_freq_lookup (
id integer,
size integer,
positions text
);
create unique index db_freq_lookup_idx on db_freq_lookup(id);
    SQL
    @freq_lookup.execute_batch( sql )
  end

  # Opens the two .tdf files for writing and emits their header rows.
  def write_headers_csv
    @normalized_fasta = File.new(File.join(@ops.outdir,Genfrag.name_normalized_fasta(@input_filenames,@ops.filefasta) + '.tdf'), 'w')
    @normalized_fasta.puts %w(id Definitions Sequence).join("\t")
    @freq_lookup = File.new( File.join(@ops.outdir,Genfrag.name_freq_lookup(@input_filenames,@ops.filefasta,@ops.filelookup,@ops.re5,@ops.re3) + '.tdf'), 'w')
    @freq_lookup.puts %w(id Size Positions).join("\t")
  end

  # Store one normalized sequence.
  #
  # normalized_fasta_id - Integer id of the sequence
  # seq                 - the sequence String
  # definitions         - Array of definition Strings sharing this sequence
  def write_entry_to_fasta(normalized_fasta_id, seq, definitions)
    send("write_entry_to_fasta_#{sc}", normalized_fasta_id, seq, definitions)
  end

  def write_entry_to_fasta_sqlite(normalized_fasta_id, seq, definitions)
    @normalized_fasta.execute( "insert into db_normalized_fasta values ( ?, ?, ? )", normalized_fasta_id, definitions_field(definitions), seq )
  end

  def write_entry_to_fasta_csv(normalized_fasta_id, seq, definitions)
    @normalized_fasta.puts [normalized_fasta_id,definitions_field(definitions),seq].join("\t")
  end

  # Store one row of the frequency lookup.
  #
  # i    - Integer row id
  # size - Integer fragment size
  # str  - String describing the positions for that size
  def write_entry_to_freq(i, size, str)
    send("write_entry_to_freq_#{sc}", i, size, str)
  end

  def write_entry_to_freq_sqlite(i, size, str)
    @freq_lookup.execute( "insert into db_freq_lookup values ( ?, ?, ? )", i, size, str )
  end

  def write_entry_to_freq_csv(i, size, str)
    @freq_lookup.puts [i,size,str].join("\t")
  end

  # Release both output stores.
  def close
    send("close_#{sc}")
  end

  # Closes the SQLite handles (previously they were left open). Handles that
  # were never opened are skipped.
  def close_sqlite
    [@normalized_fasta, @freq_lookup].each { |handle| handle.close if handle }
  end

  def close_csv
    @normalized_fasta.close
    @freq_lookup.close
  end

  private

  # Encode the definitions as a single CSV field list.
  #
  # CSV.generate_line appends a row separator on Ruby >= 1.9, which would put
  # a line break in the middle of a tab-delimited record; chomp removes it
  # and is a no-op where no separator is appended.
  def definitions_field(definitions)
    CSV.generate_line(definitions).chomp
  end
end
|
100
|
+
|
101
|
+
end # class IndexCommand
|
102
|
+
end # class App
|
103
|
+
end # module Genfrag
|
104
|
+
|
105
|
+
# EOF
|
@@ -0,0 +1,298 @@
|
|
1
|
+
|
2
|
+
module Genfrag
|
3
|
+
class App
|
4
|
+
|
5
|
+
class SearchCommand < Command
|
6
|
+
|
7
|
+
# Command-line entry point for the search command.
#
# Parses +args+, validates the options, loads the normalized FASTA data and
# the frequency lookup (from .db files when the :sqlite option is set,
# otherwise from .tdf files), optionally loads the adapters file, and hands
# everything to #run in CLI mode.
#
# args - Array of command-line arguments handed to #parse
def cli_run( args )
  parse args

  # NOTE(review): the input file names come from the global ARGV rather than
  # from +args+ -- confirm that the caller always passes ARGV itself.
  @input_filenames = ARGV
  files = [@input_filenames].flatten

  validate_options(options)

  use_sqlite = options[:sqlite]
  extension  = use_sqlite ? '.db' : '.tdf'

  fasta_path = Genfrag.name_normalized_fasta(files,options[:filefasta]) + extension
  fasta =
    if use_sqlite
      SearchCommand::ProcessFile.process_db_fasta_file( SQLite3::Database.new( fasta_path ) )
    else
      SearchCommand::ProcessFile.process_tdf_fasta_file( IO.readlines( fasta_path ) )
    end

  lookup_path = Genfrag.name_freq_lookup(files,options[:filefasta],options[:filelookup],options[:re5],options[:re3]) + extension
  lookup =
    if use_sqlite
      SearchCommand::ProcessFile.process_db_freq_lookup( SQLite3::Database.new( lookup_path ) )
    else
      SearchCommand::ProcessFile.process_tdf_freq_lookup( IO.readlines( lookup_path ) )
    end

  # The adapters file is optional; without it #run receives nil.
  adapters = nil
  if options[:fileadapters]
    adapter_lines = IO.readlines( Genfrag.name_adapters(options[:fileadapters]) + '.tdf' )
    adapters = SearchCommand::ProcessFile.process_tdf_adapters( adapter_lines, options[:named_adapter5], options[:named_adapter3] )
  end

  run(options, fasta, lookup, adapters, true)
end
|
31
|
+
|
32
|
+
# Build the OptionParser for `genfrag search`.
#
# The option definitions themselves come from standard_options; this method
# only selects which of them apply to the search command and adds the help
# text around them.
#
# Returns the configured OptionParser.
def opt_parser
  std_opts = standard_options

  opts = OptionParser.new
  opts.banner = 'Usage: genfrag search [options]'

  [ '',
    " Search a database of sequence fragments that match the last 5'",
    " fragment cut by two restricting enzymes RE3 and RE5, as created by the",
    " index function. Next, adapters are applied to search a subset of",
    " fragments, as is used in some protocols.",
    ''
  ].each { |line| opts.separator line }

  [ :verbose, :quiet, :tracktime, :indir, :outdir, :sqlite, :re5, :re3,
    :filelookup, :filefasta, :fileadapters, :adapter5_sequence, :adapter3_sequence,
    :adapter5_size, :adapter3_size, :named_adapter5, :named_adapter3,
    :adapter5, :adapter3
  ].each { |name| opts.on(*std_opts[name]) }

  opts.separator ''
  opts.separator ' Common Options:'
  # Prints the usage text and terminates the process with the default status.
  opts.on( '-h', '--help', 'show this message' ) do
    @out.puts opts
    exit
  end

  [ ' Examples:',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5 tt',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --add 26 --adapter5 ct --adapter3 aa --size 190,215',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5-size 11 --adapter5 tt --adapter3-size 15 --size 168',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5-sequence GACTGCGTAGTGATC --adapter5 tt --adapter3-size 15 --size 168',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --adapter5-size 11 --adapter5 ct --adapter3-size 15 --adapter3 aa --size 190,215',
    ' genfrag search -f example.fasta --re5 BstYI --re3 MseI --add 26 --named-adapter5 BstYI-T4 --named-adapter3 MseI-21 --size 190,215'
  ].each { |line| opts.separator line }

  opts
end
|
65
|
+
|
66
|
+
# Parse the command-line arguments of the search command.
#
# args - Array of arguments; recognised options are removed from it in place
#        by OptionParser#parse!
#
# With an empty +args+ the usage text is written to @out and the process
# exits with status 1.
def parse( args )
  parser = opt_parser
  return parser.parse!(args) unless args.empty?

  @out.puts parser
  exit 1
end
|
78
|
+
|
79
|
+
# Validate the options collected from the command line.
#
# o - option container indexed with symbols (:filefasta, :re5, :re3)
#
# Reports the first problem found through clierr_p and exits with status 1.
# Checks run in this order: presence of :filefasta, :re5 and :re3, then
# whether :re3 and :re5 can each be turned into a
# Bio::RestrictionEnzyme::DoubleStranded.
def validate_options(o)
  [ [:filefasta, "missing option: must supply fasta filename"],
    [:re5,       "missing option: re5"],
    [:re3,       "missing option: re3"]
  ].each do |key, message|
    next unless o[key].nil?
    clierr_p message
    exit 1
  end

  # Any StandardError raised while building the enzyme counts as "unknown name".
  build_enzyme = lambda do |key, message|
    begin
      Bio::RestrictionEnzyme::DoubleStranded.new(o[key])
    rescue
      clierr_p message
      exit 1
    end
  end

  build_enzyme.call(:re3, "re3 is not an enzyme name")
  build_enzyme.call(:re5, "re5 is not an enzyme name")
end
|
109
|
+
|
110
|
+
# Search the index for fragments and report them.
#
# ops                   - OpenStruct or Hash of options (see "Set defaults")
# processed_fasta_file  - sequence lookup, indexed by fasta id; each value
#                         is read through its :sequence key
# processed_freq_lookup - size lookup handed to find_matching_fragments
# processed_adapters    - adapter data from an adapters file, or nil
# cli                   - flag passed through to cli_p with every message
#
# Returns an Array with one Hash per matching fragment (keys :raw_frag,
# :primary_frag, :primary_frag_with_adapters, :complement_frag,
# :complement_frag_with_adapters, :entry and :seq).
#
# Raises ArgumentError when +ops+ is neither an OpenStruct nor a Hash, when a
# plain and a named adapter are given for the same end, when a named adapter
# is requested without adapter data, or when an adapter size option disagrees
# with the size derived from the adapter sequence.
def run(ops=OpenStruct.new, processed_fasta_file=nil, processed_freq_lookup=nil, processed_adapters=nil, cli=false)
  @ops = case ops
         when OpenStruct then ops.dup
         when Hash       then OpenStruct.new(ops)
         else raise ArgumentError
         end

  # Set defaults
  @ops.verbose       ||= false
  @ops.quiet         ||= false
  @ops.sqlite        ||= false
  @ops.re5           ||= nil
  @ops.re3           ||= nil
  @ops.size          ||= [0]
  @ops.adapter5_size ||= nil
  @ops.adapter3_size ||= nil
  @ops.adapter5      ||= nil
  @ops.adapter3      ||= nil

  @sizes     = processed_freq_lookup
  @sequences = processed_fasta_file
  @adapters  = {}
  enzymes = [@ops.re5, @ops.re3].map { |name| Bio::RestrictionEnzyme::DoubleStranded.new(name) }
  @re5_ds, @re3_ds = enzymes

  if @ops.verbose
    summary = <<-END
RE5: #{@ops.re5}
#{@re5_ds.aligned_strands_with_cuts.primary}
#{@re5_ds.aligned_strands_with_cuts.complement}

RE3: #{@ops.re3}
#{@re3_ds.aligned_strands_with_cuts.primary}
#{@re3_ds.aligned_strands_with_cuts.complement}

adapter5: #{@ops.adapter5}
adapter3: #{@ops.adapter3}
    END
    cli_p(cli, summary)
  end

  # A plain and a named adapter for the same end are mutually exclusive.
  raise ArgumentError, "Cannot have both 'adapter5' and 'named_adapter5'" if @ops.named_adapter5 && @ops.adapter5
  raise ArgumentError, "Cannot have both 'adapter3' and 'named_adapter3'" if @ops.named_adapter3 && @ops.adapter3

  wants_named_adapter = @ops.named_adapter5 || @ops.named_adapter3
  if wants_named_adapter && !processed_adapters
    raise ArgumentError, "Must specify --fileadapters when using a named_adapter"
  end

  processed_adapters ? adapter_setup_1(processed_adapters) : adapter_setup_2

  # translated adapter 3' if given in reverse orientation - e.g. _tt is
  # translated to aa (reversed) and _tct returns the primary strand
  # ending in specific 'tct'
  if @adapters[:adapter3_specificity] =~ /^_/
    without_marker = @adapters[:adapter3_specificity][1..-1]
    reversed = Bio::Sequence::NA.new(without_marker).downcase
    @adapters[:adapter3_specificity] = reversed.complement.to_s
  end

  # An explicit size must agree with the size derived from the sequence.
  if @ops.adapter5_size && @ops.adapter5_sequence && @ops.adapter5_size != @adapters[:adapter5_size]
    raise ArgumentError, "--adapter5-sequence and --adapter5-size both supplied"
  end
  if @ops.adapter3_size && @ops.adapter3_sequence && @ops.adapter3_size != @adapters[:adapter3_size]
    raise ArgumentError, "--adapter3-sequence and --adapter3-size both supplied"
  end

  @trim = calculate_trim_for_nucleotides(@re5_ds, @re3_ds)

  # ------
  # Start calculations
  #
  left_trim, right_trim = calculate_left_and_right_trims(@trim)

  matching_fragments = find_matching_fragments(@sizes, left_trim, right_trim)
  results = []

  matching_fragments.each do |hit|
    hit.each do |entry|
      seq      = @sequences[entry[:fasta_id]][:sequence]
      first    = entry[:offset]
      raw_frag = seq[first..(first + entry[:raw_size] - 1)]

      primary_frag, complement_frag = trim_sequences(raw_frag, Bio::Sequence::NA.new(raw_frag).forward_complement, left_trim, right_trim, @trim)

      primary    = primary_frag.dup
      complement = complement_frag.dup

      # The two adapter steps chain: the 3' step works on the strands
      # returned by the 5' step. A falsy primary strand means "no match".
      if @adapters[:adapter5_specificity]
        primary, complement = matches_adapter(5, primary, complement, raw_frag, @trim)
        next unless primary
      end

      if @adapters[:adapter3_specificity]
        primary, complement = matches_adapter(3, primary, complement, raw_frag, @trim)
        next unless primary
      end

      # FIXME (marker carried over from the original code; reason not recorded)
      results << {
        :raw_frag => raw_frag,
        :primary_frag => primary_frag,
        :primary_frag_with_adapters => primary,
        :complement_frag => complement_frag,
        :complement_frag_with_adapters => complement,
        :entry => entry,
        :seq => seq
      }
    end
  end

  cli_p(cli,"Nothing found") if results.empty? && @ops.verbose

  # One report record per source sequence, visited in sequence order.
  sorted_results = {}
  results.sort_by { |r| r[:seq] }.each do |r|
    raise "shouldn't happen" unless sorted_results[r[:seq]].nil?
    sorted_results[r[:seq]] = {
      'sequence size' => r[:seq].size,
      'fragment - primary strand' => r[:primary_frag],
      'fragment - complement strand' => r[:complement_frag],
      'fragment with adapters - primary strand' => r[:primary_frag_with_adapters],
      'fragment with adapters - complement strand' => r[:complement_frag_with_adapters]
    }
  end

  # Verbose output additionally lists the size and the bare fragments.
  fields = ['fragment with adapters - primary strand', 'fragment with adapters - complement strand']
  if @ops.verbose
    fields = ['sequence size', 'fragment - primary strand', 'fragment - complement strand'] + fields
  end

  sorted_results.each do |sequence, record|
    cli_p(cli, '---')
    if @ops.verbose
      cli_p(cli, '- sequence')
      cli_p(cli, " #{sequence}")
    end

    fields.each do |field|
      cli_p(cli, "- #{field}")
      cli_p(cli, " #{record[field]}")
    end
  end

  return results
end
|
257
|
+
|
258
|
+
# Fill @adapters for the 5' and 3' ends when adapter data from an adapters
# file is available.
#
# For each end, an adapter given directly in @ops wins: its specificity is
# taken from @ops.adapterN, and its size is either derived from
# @ops.adapterN_sequence (with a trailing "|N..." marker removed) plus the
# specificity length, or taken from @ops.adapterN_size. Otherwise, if +hsh+
# has a specificity for that end, specificity and sequence are copied from
# +hsh+ and the size is the sum of their lengths.
#
# hsh - Hash with :adapterN_specificity / :adapterN_sequence keys (N = 5, 3)
def adapter_setup_1(hsh)
  [5, 3].map do |i|
    spec_key = "adapter#{i}_specificity".to_sym
    seq_key  = "adapter#{i}_sequence".to_sym
    size_key = "adapter#{i}_size".to_sym

    given_specificity = @ops.send("adapter#{i}")
    if given_specificity
      @adapters[spec_key] = given_specificity
      given_sequence = @ops.send("adapter#{i}_sequence")
      if given_sequence
        @adapters[seq_key]  = given_sequence.gsub(/\|N*$/i,'')
        @adapters[size_key] = @adapters[seq_key].size + @adapters[spec_key].size
      else
        @adapters[size_key] = @ops.send("adapter#{i}_size")
      end
    elsif hsh[spec_key]
      @adapters[spec_key] = hsh[spec_key]
      @adapters[seq_key]  = hsh[seq_key]
      @adapters[size_key] = hsh[seq_key].size + hsh[spec_key].size
    end
  end.last
end
|
278
|
+
|
279
|
+
# Fill @adapters for the 5' and 3' ends from the options alone (no adapters
# file).
#
# For each end the specificity is @ops.adapterN (possibly nil). When
# @ops.adapterN_sequence is given, a trailing "|N..." marker is removed from
# it and the adapter size is the sequence length plus the specificity length;
# otherwise the size is taken from @ops.adapterN_size.
def adapter_setup_2
  l = lambda do |i|
    specificity = @ops.send("adapter#{i}")
    sequence    = @ops.send("adapter#{i}_sequence")

    @adapters["adapter#{i}_specificity".to_sym] = specificity
    if sequence
      @adapters["adapter#{i}_sequence".to_sym] = sequence.gsub(/\|N*$/i,'')
      # The specificity is optional: count a missing one as zero-length
      # instead of failing with NoMethodError on nil.size.
      @adapters["adapter#{i}_size".to_sym] = @adapters["adapter#{i}_sequence".to_sym].size + specificity.to_s.size
    else
      @adapters["adapter#{i}_size".to_sym] = @ops.send("adapter#{i}_size")
    end
  end
  # set adapter 5' and 3' respectively
  l.call(5)
  l.call(3)
end
|
292
|
+
|
293
|
+
|
294
|
+
end # class SearchCommand
|
295
|
+
end # class App
|
296
|
+
end # module Genfrag
|
297
|
+
|
298
|
+
# EOF
|