protk 1.3.0 → 1.3.1.pre2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/make_decoy.rb +1 -2
- data/bin/mascot_search.rb +2 -0
- data/bin/msgfplus_search.rb +1 -1
- data/bin/protxml_to_gff.rb +94 -115
- data/bin/protxml_to_psql.rb +3 -2
- data/bin/sixframe.rb +15 -8
- data/bin/swissprot_to_table.rb +120 -0
- data/lib/protk.rb +0 -1
- data/lib/protk/bio_gff3_extensions.rb +22 -0
- data/lib/protk/bio_sptr_extensions.rb +19 -4
- data/lib/protk/constants.rb +19 -11
- data/lib/protk/gffdb.rb +60 -0
- data/lib/protk/peptide.rb +158 -0
- data/lib/protk/protein.rb +72 -0
- data/lib/protk/protein_to_genome_mapper.rb +8 -0
- data/lib/protk/protxml_to_gff_tool.rb +3 -1
- data/lib/protk/search_tool.rb +3 -24
- data/lib/protk/swissprot_database.rb +8 -20
- data/lib/protk/tool.rb +36 -1
- metadata +68 -41
- data/lib/protk/protxml.rb +0 -141
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 888f8ebff75c2c33497c9bf4f7aeec182311b7e3
|
4
|
+
data.tar.gz: 4102a91afbee688babe093df8a53b84b097ba0c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e67189a07c6ac237a4def19ad90043ab8919d5492fd43b67cfa5fc3285819b2fd62671375283c5d9dd05618c746603ba829c70225b140b3a52ccba9fafb24f8
|
7
|
+
data.tar.gz: 354a9eb2499d3f8b194ccdef82f06692672435a0e47b5b49197b9f1fba2c27181275d38c98fe6f644750b554b4b1601b873f6a51a0318853b86b17d7783f2e57
|
data/bin/make_decoy.rb
CHANGED
@@ -49,10 +49,9 @@ if (tool.reverse_only)
|
|
49
49
|
Bio::FastaFormat.open(input_file).each do |seq|
|
50
50
|
id=nil
|
51
51
|
begin
|
52
|
-
# require 'debugger';debugger
|
53
52
|
id=seq.definition.chomp.scan(/#{tool.id_regex}/)[0][0]
|
54
53
|
revdef=seq.definition.sub(id,"#{tool.prefix_string}#{id}")
|
55
|
-
decoys_out.write ">#{revdef}\n#{seq.aaseq}\n"
|
54
|
+
decoys_out.write ">#{revdef}\n#{seq.aaseq.reverse}\n"
|
56
55
|
rescue
|
57
56
|
puts "Unable to parse id for #{seq.definition}. Skipping" if (id==nil)
|
58
57
|
end
|
data/bin/mascot_search.rb
CHANGED
@@ -103,6 +103,7 @@ def search_params_dictionary(search_tool,input_file)
|
|
103
103
|
postdict[:FILE]=File.new(input_file)
|
104
104
|
postdict[:FORMVER]='1.01'
|
105
105
|
postdict[:INTERMEDIATE]=''
|
106
|
+
postdict[:QUANTITATION]=search_tool.quantitation
|
106
107
|
|
107
108
|
postdict
|
108
109
|
end
|
@@ -134,6 +135,7 @@ search_tool.options.output_suffix="_mascot"
|
|
134
135
|
|
135
136
|
search_tool.add_value_option(:mascot_server,"#{$genv.default_mascot_server}/mascot/cgi",['-S', '--server url', 'The url to the cgi directory of the mascot server'])
|
136
137
|
search_tool.add_value_option(:allowed_charges,"1+,2+,3+",['--allowed-charges ac', 'Allowed precursor ion charges.'])
|
138
|
+
search_tool.add_value_option(:quantitation,"",['--quantitation method','Mascot quant method'])
|
137
139
|
search_tool.add_value_option(:email,"",['--email em', 'User email.'])
|
138
140
|
search_tool.add_value_option(:username,"",['--username un', 'Username.'])
|
139
141
|
search_tool.add_value_option(:httpproxy,nil,['--proxy url', 'The url to a proxy server'])
|
data/bin/msgfplus_search.rb
CHANGED
data/bin/protxml_to_gff.rb
CHANGED
@@ -1,44 +1,51 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
3
|
# This file is part of protk
|
4
|
-
#
|
5
|
-
# Translated to ruby by Ira Cooke 29/1/2013
|
4
|
+
# Created by Ira Cooke 3/8/2014
|
6
5
|
#
|
7
6
|
#
|
8
7
|
|
9
8
|
require 'protk/constants'
|
10
|
-
require 'protk/protxml_to_gff_tool'
|
11
9
|
require 'protk/fastadb'
|
10
|
+
require 'protk/gffdb'
|
11
|
+
require 'protk/protein'
|
12
|
+
require 'protk/peptide'
|
13
|
+
require 'protk/tool'
|
12
14
|
require 'libxml'
|
13
15
|
require 'bio'
|
14
16
|
|
15
17
|
include LibXML
|
16
18
|
|
17
|
-
tool=ProtXMLToGFFTool.new()
|
18
19
|
|
19
|
-
|
20
|
-
@output_suffix=""
|
21
|
-
|
22
|
-
exit unless tool.check_options(true,[:database])
|
23
|
-
|
24
|
-
input_proxml=ARGV[0]
|
25
|
-
|
26
|
-
if ( tool.explicit_output!=nil)
|
27
|
-
gff_out_file=tool.explicit_output
|
28
|
-
else
|
29
|
-
gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
|
20
|
+
class NoGFFEntryFoundError < StandardError
|
30
21
|
end
|
31
22
|
|
32
|
-
|
33
|
-
|
23
|
+
class ProteinNotInDBError < StandardError
|
24
|
+
end
|
34
25
|
|
26
|
+
class MultipleGFFEntriesForProteinError < StandardError
|
27
|
+
end
|
35
28
|
|
36
29
|
def parse_proteins(protxml_file)
|
37
|
-
puts "Parsing proteins from protxml"
|
38
30
|
protxml_parser=XML::Parser.file(protxml_file)
|
39
31
|
protxml_doc=protxml_parser.parse
|
40
32
|
proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
41
|
-
proteins
|
33
|
+
proteins.collect { |node| Protein.from_protxml(node) }
|
34
|
+
end
|
35
|
+
|
36
|
+
def protein_id_to_gffid(protein_id,gff_idregex)
|
37
|
+
return protein_id if gff_idregex.nil?
|
38
|
+
return protein_id.match(/#{gff_idregex}/)[1]
|
39
|
+
end
|
40
|
+
|
41
|
+
def protein_id_to_genomeid(protein_id,genome_idregex)
|
42
|
+
return protein_id if genome_idregex.nil?
|
43
|
+
return protein_id.match(/#{genome_idregex}/)[1]
|
44
|
+
end
|
45
|
+
|
46
|
+
def protein_id_to_protdbid(protein_id)
|
47
|
+
# return protein_id.sub(/^lcl\|/,"")
|
48
|
+
return protein_id
|
42
49
|
end
|
43
50
|
|
44
51
|
def prepare_fasta(database_path,type)
|
@@ -50,134 +57,106 @@ def prepare_fasta(database_path,type)
|
|
50
57
|
db_filename=Constants.new.current_database_for_name(database_path)
|
51
58
|
end
|
52
59
|
|
53
|
-
|
60
|
+
|
61
|
+
db_indexfilename = type=='prot' ? "#{db_filename}.pin" : "#{db_filename}.nhr"
|
54
62
|
|
55
63
|
if File.exist?(db_indexfilename)
|
56
|
-
puts "Using existing indexed database"
|
57
64
|
orf_lookup = FastaDB.new(db_filename)
|
58
65
|
else
|
59
|
-
puts "Indexing database"
|
60
66
|
orf_lookup = FastaDB.create(db_filename,db_filename,type)
|
61
67
|
end
|
62
68
|
orf_lookup
|
63
69
|
end
|
64
70
|
|
65
|
-
proteins = parse_proteins(input_proxml)
|
66
|
-
fastadb = prepare_fasta(tool.database,'prot')
|
67
|
-
genomedb = nil
|
68
|
-
if tool.genome
|
69
|
-
genomedb = prepare_fasta(tool.genome,'nucl')
|
70
|
-
end
|
71
|
-
|
72
|
-
puts "Aligning peptides and writing GFF data..."
|
73
|
-
|
74
|
-
low_prob = 0
|
75
|
-
skipped = 0
|
76
|
-
peptide_count = 0
|
77
|
-
protein_count = 0
|
78
|
-
total_peptides = 0
|
79
|
-
|
80
|
-
peptides_covered_genome={}
|
81
|
-
|
82
|
-
for prot in proteins
|
83
|
-
prot_prob = prot['probability']
|
84
|
-
if ( prot_prob.to_f < tool.protein_probability_threshold )
|
85
|
-
next
|
86
|
-
end
|
87
|
-
|
88
|
-
# Gets identifiers of all proteins (includeing indistinguishable ones)
|
89
|
-
prot_names=tool.protein_names(prot)
|
90
71
|
|
91
72
|
|
92
|
-
|
93
|
-
|
94
|
-
end
|
73
|
+
tool=Tool.new([:explicit_output,:debug])
|
74
|
+
tool.option_parser.banner = "Map proteins and peptides to genomic coordinates.\n\nUsage: protxml_to_gff.rb [options] proteins.<protXML>"
|
95
75
|
|
76
|
+
tool.add_value_option(:database,nil,['-d filename','--database filename','Database used for ms/ms searches (Fasta Format)'])
|
77
|
+
# tool.add_value_option(:genome,nil,['-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)'])
|
78
|
+
tool.add_value_option(:coords_file,nil,['-c filename','--coords-file filename.gff3', 'A file containing genomic coordinates for predicted proteins and/or 6-frame translations'])
|
79
|
+
tool.add_boolean_option(:stack_charge_states,false,['--stack-charge-states','Different peptide charge states get separate gff entries'])
|
80
|
+
tool.add_value_option(:peptide_probability_threshold,0.95,['--threshold prob','Peptide Probability Threshold (Default 0.95)'])
|
81
|
+
tool.add_value_option(:protein_probability_threshold,0.99,['--prot-threshold prob','Protein Probability Threshold (Default 0.99)'])
|
82
|
+
tool.add_value_option(:gff_idregex,nil,['--gff-idregex pre','Regex with capture group for parsing gff ids from protein ids'])
|
83
|
+
tool.add_value_option(:genome_idregex,nil,['--genome-idregex pre','Regex with capture group for parsing genomic ids from protein ids'])
|
96
84
|
|
97
|
-
|
98
|
-
entries_covered=[]
|
99
|
-
for protein_name in prot_names
|
100
|
-
protein_count += 1
|
101
|
-
prot_id = "pr#{protein_count.to_s}"
|
102
|
-
begin
|
85
|
+
exit unless tool.check_options(true,[:database,:coords_file])
|
103
86
|
|
104
|
-
|
105
|
-
|
87
|
+
$protk = Constants.new
|
88
|
+
log_level = tool.debug ? "info" : "warn"
|
89
|
+
$protk.info_level= log_level
|
106
90
|
|
107
|
-
unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
|
108
91
|
|
109
|
-
|
92
|
+
input_file=ARGV[0]
|
110
93
|
|
111
|
-
|
94
|
+
if tool.explicit_output
|
95
|
+
output_fh=File.new("#{tool.explicit_output}",'w')
|
96
|
+
else
|
97
|
+
output_fh=$stdout
|
98
|
+
end
|
112
99
|
|
113
|
-
|
114
|
-
throw "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s
|
100
|
+
should_ = tool.debug || (output_fh!=$stdout)
|
115
101
|
|
116
|
-
|
117
|
-
peptide_count=1
|
118
|
-
for peptide in peptides
|
102
|
+
input_protxml=ARGV[0]
|
119
103
|
|
120
|
-
|
121
|
-
# puts peptide
|
122
|
-
# puts pprob
|
123
|
-
pep_seq = peptide['peptide_sequence']
|
104
|
+
gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
|
124
105
|
|
125
|
-
|
106
|
+
# genome_db = prepare_fasta(tool.genome,'nucl')
|
107
|
+
prot_db = prepare_fasta(tool.database,'prot')
|
126
108
|
|
127
|
-
|
128
|
-
if !protein_info.is_sixframe
|
129
|
-
throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
|
130
|
-
dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
|
131
|
-
end
|
109
|
+
proteins = parse_proteins(input_protxml)
|
132
110
|
|
111
|
+
num_missing_gff_entries = 0
|
133
112
|
|
134
|
-
|
113
|
+
proteins.each do |protein|
|
135
114
|
|
136
|
-
|
115
|
+
begin
|
116
|
+
# Get the full protein sequence
|
117
|
+
#
|
118
|
+
parsed_name_for_protdb = protein_id_to_protdbid(protein.protein_name)
|
119
|
+
protein_entry = prot_db.get_by_id parsed_name_for_protdb
|
120
|
+
raise ProteinNotInDBError if ( protein_entry == nil)
|
137
121
|
|
138
|
-
|
122
|
+
protein.sequence = protein_entry.aaseq
|
139
123
|
|
140
|
-
|
124
|
+
# Get the CDS and parent entries from the gff file
|
125
|
+
#
|
126
|
+
parsed_name_for_gffid = protein_id_to_gffid(protein.protein_name,tool.gff_idregex)
|
127
|
+
gff_parent_entries = gffdb.get_by_id(parsed_name_for_gffid)
|
128
|
+
raise NoGFFEntryFoundError if gff_parent_entries.nil? || gff_parent_entries.length==0
|
129
|
+
raise MultipleGFFEntriesForProteinError if gff_parent_entries.length > 1
|
141
130
|
|
142
|
-
|
143
|
-
|
131
|
+
gff_parent_entry = gff_parent_entries.first
|
132
|
+
gff_cds_entries = gffdb.get_cds_by_parent_id(parsed_name_for_gffid)
|
144
133
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
puts "Duplicate peptide #{peptide_gff[0]}"
|
149
|
-
end
|
150
|
-
# puts gff_db.records.last
|
151
|
-
end
|
152
|
-
end
|
153
|
-
else
|
154
|
-
puts "Skipping redundant entry #{protein_name}"
|
155
|
-
protein_count-=1 # To counter +1 prior to begin rescue end block
|
156
|
-
end
|
134
|
+
# Account for sixframe case. Parent is CDS and there are no children
|
135
|
+
#
|
136
|
+
gff_cds_entries=[gff_parent_entry] if gff_cds_entries.nil? && gff_parent_entry.feature=="CDS"
|
157
137
|
|
158
|
-
|
138
|
+
peptides = tool.stack_charge_states ? protein.peptides : protein.representative_peptides
|
159
139
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
140
|
+
peptides.each do |peptide|
|
141
|
+
peptide_entries = peptide.to_gff3_records(protein_entry.aaseq,gff_parent_entry,gff_cds_entries)
|
142
|
+
peptide_entries.each do |peptide_entry|
|
143
|
+
output_fh.write peptide_entry.to_s
|
144
|
+
end
|
145
|
+
end
|
165
146
|
|
166
|
-
|
167
|
-
|
147
|
+
rescue NoGFFEntryFoundError
|
148
|
+
$protk.log "No gff entry for #{parsed_name_for_gffid}", :info
|
149
|
+
num_missing_gff_entries+=1
|
150
|
+
rescue ProteinNotInDBError
|
151
|
+
$protk.log "No entry for #{parsed_name_for_protdb}", :info
|
152
|
+
rescue MultipleGFFEntriesForProteinError
|
153
|
+
$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :info
|
154
|
+
rescue PeptideNotInProteinError
|
155
|
+
$protk.log "A peptide was not found in its parent protein #{protein.protein_name}" , :warn
|
156
|
+
end
|
157
|
+
end
|
168
158
|
|
159
|
+
if num_missing_gff_entries>0
|
160
|
+
$protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
|
169
161
|
end
|
170
162
|
|
171
|
-
f = open(gff_out_file,'w+')
|
172
|
-
gff_db.records.each { |rec|
|
173
|
-
f.write(rec.to_s)
|
174
|
-
}
|
175
|
-
f.close
|
176
|
-
|
177
|
-
p "Finished."
|
178
|
-
p "Proteins: #{protein_count}"
|
179
|
-
p "Skipped Decoys: #{skipped}"
|
180
|
-
p "Total Peptides: #{total_peptides}"
|
181
|
-
p "Peptides Written: #{total_peptides - low_prob}"
|
182
|
-
p "Peptides Culled: #{low_prob}"
|
183
|
-
exit(0)
|
data/bin/protxml_to_psql.rb
CHANGED
@@ -242,7 +242,7 @@ def insert_psms_from_file(filepath)
|
|
242
242
|
|
243
243
|
spectrum_queries.each do |query|
|
244
244
|
|
245
|
-
spectrum_name = query.attributes['spectrum'].chomp.gsub(
|
245
|
+
spectrum_name = query.attributes['spectrum'].chomp.gsub(/\.0+/,"\.").sub(/\.\d+$/,"")
|
246
246
|
|
247
247
|
start_scan=query.attributes['start_scan'].to_i
|
248
248
|
end_scan=query.attributes['end_scan'].to_i
|
@@ -318,7 +318,8 @@ def lookup_spectra_from_files(file_list,matched_spectra)
|
|
318
318
|
SQL
|
319
319
|
|
320
320
|
else
|
321
|
-
|
321
|
+
# require 'debugger';debugger
|
322
|
+
# puts "Unmatched spectrum #{spec[:title]}"
|
322
323
|
end
|
323
324
|
spec = mzml_parser.next_spectrum
|
324
325
|
end
|
data/bin/sixframe.rb
CHANGED
@@ -29,6 +29,7 @@ tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage
|
|
29
29
|
tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
|
30
30
|
tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
|
31
31
|
tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
|
32
|
+
tool.add_boolean_option(:write_gff,false,['--gff3','Output gff3 instead of fasta'])
|
32
33
|
|
33
34
|
exit unless tool.check_options(true)
|
34
35
|
|
@@ -38,6 +39,9 @@ output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
|
|
38
39
|
|
39
40
|
output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
|
40
41
|
|
42
|
+
if tool.write_gff
|
43
|
+
output_fh.write "##gff-version 3\n"
|
44
|
+
end
|
41
45
|
|
42
46
|
file = Bio::FastaFormat.open(input_file)
|
43
47
|
|
@@ -66,13 +70,11 @@ file.each do |entry|
|
|
66
70
|
position_end=forward_position_end
|
67
71
|
end
|
68
72
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
73
|
# Create accession compliant with NCBI naming standard
|
73
74
|
# See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
|
74
75
|
ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')
|
75
76
|
ncbi_accession = "lcl|#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
|
77
|
+
gff_id = "#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
|
76
78
|
|
77
79
|
defline=">#{ncbi_accession}"
|
78
80
|
|
@@ -84,11 +86,16 @@ file.each do |entry|
|
|
84
86
|
defline << " #{entry.definition}"
|
85
87
|
end
|
86
88
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
89
|
+
if tool.write_gff
|
90
|
+
strand = frame>3 ? "-" : "+"
|
91
|
+
# score = self.nsp_adjusted_probability.nil? ? "." : self.nsp_adjusted_probability.to_s
|
92
|
+
# gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
|
93
|
+
output_fh.write("#{ncbi_scaffold_id}\tsixframe\tCDS\t#{position_start}\t#{position_end}\t.\t#{strand}\t0\tID=#{gff_id}\n")
|
94
|
+
else
|
95
|
+
# Output in fasta format
|
96
|
+
# start and end positions are always relative to the forward strand
|
97
|
+
output_fh.write("#{defline}\n#{orf}\n")
|
98
|
+
end
|
92
99
|
end
|
93
100
|
position += orf.length*3+3
|
94
101
|
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Convert a pepXML file to a tab delimited table
|
7
|
+
#
|
8
|
+
#
|
9
|
+
require 'protk/tool'
|
10
|
+
require 'protk/swissprot_database'
|
11
|
+
require 'protk/bio_sptr_extensions'
|
12
|
+
require 'protk/fastadb'
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
columns={'recname'=>"Primary Name",'cd'=>"CD Antigen Name",'altnames'=>"Alternate Names",
|
17
|
+
'location' => "Subcellular Location",
|
18
|
+
'function' => "Known Function",
|
19
|
+
'similarity' => "Similarity",
|
20
|
+
'tissues' => "Tissue Specificity",
|
21
|
+
'disease' => "Disease Association",
|
22
|
+
'domain' => "Domain",
|
23
|
+
'subunit' => "Sub Unit",
|
24
|
+
'nextbio' => "NextBio",
|
25
|
+
'ipi' => "IPI",
|
26
|
+
'intact' => "Interactions",
|
27
|
+
'pride' => 'Pride',
|
28
|
+
'ensembl'=> 'Ensembl',
|
29
|
+
'num_transmem'=>"Transmembrane Regions",
|
30
|
+
'signalp'=>'Signal Peptide',
|
31
|
+
'go_terms'=>"GO Terms",
|
32
|
+
'go_entries'=>"GO Entries",
|
33
|
+
'accessions'=>"Uniprot Accessions",
|
34
|
+
'ncbi_taxon_id'=>"NCBI Taxon ID"
|
35
|
+
}
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
42
|
+
#
|
43
|
+
tool=Tool.new([:explicit_output,:debug])
|
44
|
+
tool.option_parser.banner = "Query a swissprot flat file and output to tab delimited table.\n\nUsage: swissprot_to_table.rb [options] -d flatfile.dat queries.txt"
|
45
|
+
|
46
|
+
tool.add_value_option(:database,nil,['-d','--database file','Uniprot flatfile database containing full records for proteins'])
|
47
|
+
tool.add_value_option(:output_keys,nil,['-K','--keys keys','Filter output to only the specified keys (comma separated)'])
|
48
|
+
tool.add_boolean_option(:show_keys,false,['--show-keys','Print a list of possible values for the keys field and exit'])
|
49
|
+
tool.add_value_option(:separator,"\t",['-S','--separator sep','Separator character for output, default (tab)'])
|
50
|
+
tool.add_value_option(:array_separator,",",['-A','--array-separator sep','Array Separator character, default ,'])
|
51
|
+
tool.add_value_option(:query_separator,"\t",['--query-separator sep','Separator character for queries.txt, default is tab'])
|
52
|
+
tool.add_value_option(:id_column,1,['--id-column num','Column in queries.txt in which Uniprot Accessions are found'])
|
53
|
+
|
54
|
+
|
55
|
+
if ARGV.include? "--show-keys"
|
56
|
+
columns.each_pair { |name, val| $stdout.write "#{name} (#{val})\n" }
|
57
|
+
exit
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
exit unless tool.check_options(true,[:database])
|
62
|
+
|
63
|
+
|
64
|
+
$protk = Constants.new
|
65
|
+
log_level = tool.debug ? :debug : :fatal
|
66
|
+
$protk.info_level= log_level
|
67
|
+
|
68
|
+
|
69
|
+
if tool.explicit_output
|
70
|
+
output_fh=File.new("#{tool.explicit_output}",'w')
|
71
|
+
else
|
72
|
+
output_fh=$stdout
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
if tool.output_keys
|
77
|
+
output_keys=tool.output_keys.split(",").collect { |k| k.strip }
|
78
|
+
columns.delete_if { |key, value| !output_keys.include? key }
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
db_info=tool.database_info
|
83
|
+
database_path=db_info.path
|
84
|
+
|
85
|
+
database_index_path = "#{Pathname.new(database_path).dirname}/config.dat"
|
86
|
+
|
87
|
+
skip_index = File.exists?(database_index_path) ? true : false
|
88
|
+
|
89
|
+
|
90
|
+
swissprotdb=SwissprotDatabase.new(database_path,skip_index)
|
91
|
+
|
92
|
+
|
93
|
+
def write_entry(item_name,item,columns,tool,output_fh)
|
94
|
+
row=[item_name]
|
95
|
+
row << columns.keys.collect do |name|
|
96
|
+
colvalue = item.send(name)
|
97
|
+
colvalue = "" unless colvalue
|
98
|
+
colvalue = colvalue.join(tool.array_separator) if colvalue.class==Array
|
99
|
+
colvalue
|
100
|
+
end
|
101
|
+
output_fh.write "#{row.join(tool.separator)}\n"
|
102
|
+
end
|
103
|
+
|
104
|
+
File.open(ARGV[0]).each_line do |line|
|
105
|
+
|
106
|
+
begin
|
107
|
+
query_id = line.chomp.split(tool.query_separator)[tool.id_column.to_i-1]
|
108
|
+
rescue
|
109
|
+
query_id = line.chomp
|
110
|
+
end
|
111
|
+
|
112
|
+
begin
|
113
|
+
item = swissprotdb.get_entry_for_name(query_id)
|
114
|
+
write_entry(query_id,item,columns,tool,output_fh)
|
115
|
+
rescue
|
116
|
+
$protk.log "Unable to retrieve entry for #{query_id}" , :debug
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
120
|
+
|