protk 1.3.0 → 1.3.1.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/make_decoy.rb +1 -2
- data/bin/mascot_search.rb +2 -0
- data/bin/msgfplus_search.rb +1 -1
- data/bin/protxml_to_gff.rb +94 -115
- data/bin/protxml_to_psql.rb +3 -2
- data/bin/sixframe.rb +15 -8
- data/bin/swissprot_to_table.rb +120 -0
- data/lib/protk.rb +0 -1
- data/lib/protk/bio_gff3_extensions.rb +22 -0
- data/lib/protk/bio_sptr_extensions.rb +19 -4
- data/lib/protk/constants.rb +19 -11
- data/lib/protk/gffdb.rb +60 -0
- data/lib/protk/peptide.rb +158 -0
- data/lib/protk/protein.rb +72 -0
- data/lib/protk/protein_to_genome_mapper.rb +8 -0
- data/lib/protk/protxml_to_gff_tool.rb +3 -1
- data/lib/protk/search_tool.rb +3 -24
- data/lib/protk/swissprot_database.rb +8 -20
- data/lib/protk/tool.rb +36 -1
- metadata +68 -41
- data/lib/protk/protxml.rb +0 -141
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 888f8ebff75c2c33497c9bf4f7aeec182311b7e3
|
4
|
+
data.tar.gz: 4102a91afbee688babe093df8a53b84b097ba0c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e67189a07c6ac237a4def19ad90043ab8919d5492fd43b67cfa5fc3285819b2fd62671375283c5d9dd05618c746603ba829c70225b140b3a52ccba9fafb24f8
|
7
|
+
data.tar.gz: 354a9eb2499d3f8b194ccdef82f06692672435a0e47b5b49197b9f1fba2c27181275d38c98fe6f644750b554b4b1601b873f6a51a0318853b86b17d7783f2e57
|
data/bin/make_decoy.rb
CHANGED
@@ -49,10 +49,9 @@ if (tool.reverse_only)
|
|
49
49
|
Bio::FastaFormat.open(input_file).each do |seq|
|
50
50
|
id=nil
|
51
51
|
begin
|
52
|
-
# require 'debugger';debugger
|
53
52
|
id=seq.definition.chomp.scan(/#{tool.id_regex}/)[0][0]
|
54
53
|
revdef=seq.definition.sub(id,"#{tool.prefix_string}#{id}")
|
55
|
-
decoys_out.write ">#{revdef}\n#{seq.aaseq}\n"
|
54
|
+
decoys_out.write ">#{revdef}\n#{seq.aaseq.reverse}\n"
|
56
55
|
rescue
|
57
56
|
puts "Unable to parse id for #{seq.definition}. Skipping" if (id==nil)
|
58
57
|
end
|
data/bin/mascot_search.rb
CHANGED
@@ -103,6 +103,7 @@ def search_params_dictionary(search_tool,input_file)
|
|
103
103
|
postdict[:FILE]=File.new(input_file)
|
104
104
|
postdict[:FORMVER]='1.01'
|
105
105
|
postdict[:INTERMEDIATE]=''
|
106
|
+
postdict[:QUANTITATION]=search_tool.quantitation
|
106
107
|
|
107
108
|
postdict
|
108
109
|
end
|
@@ -134,6 +135,7 @@ search_tool.options.output_suffix="_mascot"
|
|
134
135
|
|
135
136
|
search_tool.add_value_option(:mascot_server,"#{$genv.default_mascot_server}/mascot/cgi",['-S', '--server url', 'The url to the cgi directory of the mascot server'])
|
136
137
|
search_tool.add_value_option(:allowed_charges,"1+,2+,3+",['--allowed-charges ac', 'Allowed precursor ion charges.'])
|
138
|
+
search_tool.add_value_option(:quantitation,"",['--quantitation method','Mascot quant method'])
|
137
139
|
search_tool.add_value_option(:email,"",['--email em', 'User email.'])
|
138
140
|
search_tool.add_value_option(:username,"",['--username un', 'Username.'])
|
139
141
|
search_tool.add_value_option(:httpproxy,nil,['--proxy url', 'The url to a proxy server'])
|
data/bin/msgfplus_search.rb
CHANGED
data/bin/protxml_to_gff.rb
CHANGED
@@ -1,44 +1,51 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
3
|
# This file is part of protk
|
4
|
-
#
|
5
|
-
# Translated to ruby by Ira Cooke 29/1/2013
|
4
|
+
# Created by Ira Cooke 3/8/2014
|
6
5
|
#
|
7
6
|
#
|
8
7
|
|
9
8
|
require 'protk/constants'
|
10
|
-
require 'protk/protxml_to_gff_tool'
|
11
9
|
require 'protk/fastadb'
|
10
|
+
require 'protk/gffdb'
|
11
|
+
require 'protk/protein'
|
12
|
+
require 'protk/peptide'
|
13
|
+
require 'protk/tool'
|
12
14
|
require 'libxml'
|
13
15
|
require 'bio'
|
14
16
|
|
15
17
|
include LibXML
|
16
18
|
|
17
|
-
tool=ProtXMLToGFFTool.new()
|
18
19
|
|
19
|
-
|
20
|
-
@output_suffix=""
|
21
|
-
|
22
|
-
exit unless tool.check_options(true,[:database])
|
23
|
-
|
24
|
-
input_proxml=ARGV[0]
|
25
|
-
|
26
|
-
if ( tool.explicit_output!=nil)
|
27
|
-
gff_out_file=tool.explicit_output
|
28
|
-
else
|
29
|
-
gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
|
20
|
+
class NoGFFEntryFoundError < StandardError
|
30
21
|
end
|
31
22
|
|
32
|
-
|
33
|
-
|
23
|
+
class ProteinNotInDBError < StandardError
|
24
|
+
end
|
34
25
|
|
26
|
+
class MultipleGFFEntriesForProteinError < StandardError
|
27
|
+
end
|
35
28
|
|
36
29
|
def parse_proteins(protxml_file)
|
37
|
-
puts "Parsing proteins from protxml"
|
38
30
|
protxml_parser=XML::Parser.file(protxml_file)
|
39
31
|
protxml_doc=protxml_parser.parse
|
40
32
|
proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
41
|
-
proteins
|
33
|
+
proteins.collect { |node| Protein.from_protxml(node) }
|
34
|
+
end
|
35
|
+
|
36
|
+
def protein_id_to_gffid(protein_id,gff_idregex)
|
37
|
+
return protein_id if gff_idregex.nil?
|
38
|
+
return protein_id.match(/#{gff_idregex}/)[1]
|
39
|
+
end
|
40
|
+
|
41
|
+
def protein_id_to_genomeid(protein_id,genome_idregex)
|
42
|
+
return protein_id if genome_idregex.nil?
|
43
|
+
return protein_id.match(/#{genome_idregex}/)[1]
|
44
|
+
end
|
45
|
+
|
46
|
+
def protein_id_to_protdbid(protein_id)
|
47
|
+
# return protein_id.sub(/^lcl\|/,"")
|
48
|
+
return protein_id
|
42
49
|
end
|
43
50
|
|
44
51
|
def prepare_fasta(database_path,type)
|
@@ -50,134 +57,106 @@ def prepare_fasta(database_path,type)
|
|
50
57
|
db_filename=Constants.new.current_database_for_name(database_path)
|
51
58
|
end
|
52
59
|
|
53
|
-
|
60
|
+
|
61
|
+
db_indexfilename = type=='prot' ? "#{db_filename}.pin" : "#{db_filename}.nhr"
|
54
62
|
|
55
63
|
if File.exist?(db_indexfilename)
|
56
|
-
puts "Using existing indexed database"
|
57
64
|
orf_lookup = FastaDB.new(db_filename)
|
58
65
|
else
|
59
|
-
puts "Indexing database"
|
60
66
|
orf_lookup = FastaDB.create(db_filename,db_filename,type)
|
61
67
|
end
|
62
68
|
orf_lookup
|
63
69
|
end
|
64
70
|
|
65
|
-
proteins = parse_proteins(input_proxml)
|
66
|
-
fastadb = prepare_fasta(tool.database,'prot')
|
67
|
-
genomedb = nil
|
68
|
-
if tool.genome
|
69
|
-
genomedb = prepare_fasta(tool.genome,'nucl')
|
70
|
-
end
|
71
|
-
|
72
|
-
puts "Aligning peptides and writing GFF data..."
|
73
|
-
|
74
|
-
low_prob = 0
|
75
|
-
skipped = 0
|
76
|
-
peptide_count = 0
|
77
|
-
protein_count = 0
|
78
|
-
total_peptides = 0
|
79
|
-
|
80
|
-
peptides_covered_genome={}
|
81
|
-
|
82
|
-
for prot in proteins
|
83
|
-
prot_prob = prot['probability']
|
84
|
-
if ( prot_prob.to_f < tool.protein_probability_threshold )
|
85
|
-
next
|
86
|
-
end
|
87
|
-
|
88
|
-
# Gets identifiers of all proteins (including indistinguishable ones)
|
89
|
-
prot_names=tool.protein_names(prot)
|
90
71
|
|
91
72
|
|
92
|
-
|
93
|
-
|
94
|
-
end
|
73
|
+
tool=Tool.new([:explicit_output,:debug])
|
74
|
+
tool.option_parser.banner = "Map proteins and peptides to genomic coordinates.\n\nUsage: protxml_to_gff.rb [options] proteins.<protXML>"
|
95
75
|
|
76
|
+
tool.add_value_option(:database,nil,['-d filename','--database filename','Database used for ms/ms searches (Fasta Format)'])
|
77
|
+
# tool.add_value_option(:genome,nil,['-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)'])
|
78
|
+
tool.add_value_option(:coords_file,nil,['-c filename','--coords-file filename.gff3', 'A file containing genomic coordinates for predicted proteins and/or 6-frame translations'])
|
79
|
+
tool.add_boolean_option(:stack_charge_states,false,['--stack-charge-states','Different peptide charge states get separate gff entries'])
|
80
|
+
tool.add_value_option(:peptide_probability_threshold,0.95,['--threshold prob','Peptide Probability Threshold (Default 0.95)'])
|
81
|
+
tool.add_value_option(:protein_probability_threshold,0.99,['--prot-threshold prob','Protein Probability Threshold (Default 0.99)'])
|
82
|
+
tool.add_value_option(:gff_idregex,nil,['--gff-idregex pre','Regex with capture group for parsing gff ids from protein ids'])
|
83
|
+
tool.add_value_option(:genome_idregex,nil,['--genome-idregex pre','Regex with capture group for parsing genomic ids from protein ids'])
|
96
84
|
|
97
|
-
|
98
|
-
entries_covered=[]
|
99
|
-
for protein_name in prot_names
|
100
|
-
protein_count += 1
|
101
|
-
prot_id = "pr#{protein_count.to_s}"
|
102
|
-
begin
|
85
|
+
exit unless tool.check_options(true,[:database,:coords_file])
|
103
86
|
|
104
|
-
|
105
|
-
|
87
|
+
$protk = Constants.new
|
88
|
+
log_level = tool.debug ? "info" : "warn"
|
89
|
+
$protk.info_level= log_level
|
106
90
|
|
107
|
-
unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
|
108
91
|
|
109
|
-
|
92
|
+
input_file=ARGV[0]
|
110
93
|
|
111
|
-
|
94
|
+
if tool.explicit_output
|
95
|
+
output_fh=File.new("#{tool.explicit_output}",'w')
|
96
|
+
else
|
97
|
+
output_fh=$stdout
|
98
|
+
end
|
112
99
|
|
113
|
-
|
114
|
-
throw "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s
|
100
|
+
should_ = tool.debug || (output_fh!=$stdout)
|
115
101
|
|
116
|
-
|
117
|
-
peptide_count=1
|
118
|
-
for peptide in peptides
|
102
|
+
input_protxml=ARGV[0]
|
119
103
|
|
120
|
-
|
121
|
-
# puts peptide
|
122
|
-
# puts pprob
|
123
|
-
pep_seq = peptide['peptide_sequence']
|
104
|
+
gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
|
124
105
|
|
125
|
-
|
106
|
+
# genome_db = prepare_fasta(tool.genome,'nucl')
|
107
|
+
prot_db = prepare_fasta(tool.database,'prot')
|
126
108
|
|
127
|
-
|
128
|
-
if !protein_info.is_sixframe
|
129
|
-
throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
|
130
|
-
dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
|
131
|
-
end
|
109
|
+
proteins = parse_proteins(input_protxml)
|
132
110
|
|
111
|
+
num_missing_gff_entries = 0
|
133
112
|
|
134
|
-
|
113
|
+
proteins.each do |protein|
|
135
114
|
|
136
|
-
|
115
|
+
begin
|
116
|
+
# Get the full protein sequence
|
117
|
+
#
|
118
|
+
parsed_name_for_protdb = protein_id_to_protdbid(protein.protein_name)
|
119
|
+
protein_entry = prot_db.get_by_id parsed_name_for_protdb
|
120
|
+
raise ProteinNotInDBError if ( protein_entry == nil)
|
137
121
|
|
138
|
-
|
122
|
+
protein.sequence = protein_entry.aaseq
|
139
123
|
|
140
|
-
|
124
|
+
# Get the CDS and parent entries from the gff file
|
125
|
+
#
|
126
|
+
parsed_name_for_gffid = protein_id_to_gffid(protein.protein_name,tool.gff_idregex)
|
127
|
+
gff_parent_entries = gffdb.get_by_id(parsed_name_for_gffid)
|
128
|
+
raise NoGFFEntryFoundError if gff_parent_entries.nil? || gff_parent_entries.length==0
|
129
|
+
raise MultipleGFFEntriesForProteinError if gff_parent_entries.length > 1
|
141
130
|
|
142
|
-
|
143
|
-
|
131
|
+
gff_parent_entry = gff_parent_entries.first
|
132
|
+
gff_cds_entries = gffdb.get_cds_by_parent_id(parsed_name_for_gffid)
|
144
133
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
puts "Duplicate peptide #{peptide_gff[0]}"
|
149
|
-
end
|
150
|
-
# puts gff_db.records.last
|
151
|
-
end
|
152
|
-
end
|
153
|
-
else
|
154
|
-
puts "Skipping redundant entry #{protein_name}"
|
155
|
-
protein_count-=1 # To counter +1 prior to begin rescue end block
|
156
|
-
end
|
134
|
+
# Account for sixframe case. Parent is CDS and there are no children
|
135
|
+
#
|
136
|
+
gff_cds_entries=[gff_parent_entry] if gff_cds_entries.nil? && gff_parent_entry.feature=="CDS"
|
157
137
|
|
158
|
-
|
138
|
+
peptides = tool.stack_charge_states ? protein.peptides : protein.representative_peptides
|
159
139
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
140
|
+
peptides.each do |peptide|
|
141
|
+
peptide_entries = peptide.to_gff3_records(protein_entry.aaseq,gff_parent_entry,gff_cds_entries)
|
142
|
+
peptide_entries.each do |peptide_entry|
|
143
|
+
output_fh.write peptide_entry.to_s
|
144
|
+
end
|
145
|
+
end
|
165
146
|
|
166
|
-
|
167
|
-
|
147
|
+
rescue NoGFFEntryFoundError
|
148
|
+
$protk.log "No gff entry for #{parsed_name_for_gffid}", :info
|
149
|
+
num_missing_gff_entries+=1
|
150
|
+
rescue ProteinNotInDBError
|
151
|
+
$protk.log "No entry for #{parsed_name_for_protdb}", :info
|
152
|
+
rescue MultipleGFFEntriesForProteinError
|
153
|
+
$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :info
|
154
|
+
rescue PeptideNotInProteinError
|
155
|
+
$protk.log "A peptide was not found in its parent protein #{protein.protein_name}" , :warn
|
156
|
+
end
|
157
|
+
end
|
168
158
|
|
159
|
+
if num_missing_gff_entries>0
|
160
|
+
$protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
|
169
161
|
end
|
170
162
|
|
171
|
-
f = open(gff_out_file,'w+')
|
172
|
-
gff_db.records.each { |rec|
|
173
|
-
f.write(rec.to_s)
|
174
|
-
}
|
175
|
-
f.close
|
176
|
-
|
177
|
-
p "Finished."
|
178
|
-
p "Proteins: #{protein_count}"
|
179
|
-
p "Skipped Decoys: #{skipped}"
|
180
|
-
p "Total Peptides: #{total_peptides}"
|
181
|
-
p "Peptides Written: #{total_peptides - low_prob}"
|
182
|
-
p "Peptides Culled: #{low_prob}"
|
183
|
-
exit(0)
|
data/bin/protxml_to_psql.rb
CHANGED
@@ -242,7 +242,7 @@ def insert_psms_from_file(filepath)
|
|
242
242
|
|
243
243
|
spectrum_queries.each do |query|
|
244
244
|
|
245
|
-
spectrum_name = query.attributes['spectrum'].chomp.gsub(
|
245
|
+
spectrum_name = query.attributes['spectrum'].chomp.gsub(/\.0+/,"\.").sub(/\.\d+$/,"")
|
246
246
|
|
247
247
|
start_scan=query.attributes['start_scan'].to_i
|
248
248
|
end_scan=query.attributes['end_scan'].to_i
|
@@ -318,7 +318,8 @@ def lookup_spectra_from_files(file_list,matched_spectra)
|
|
318
318
|
SQL
|
319
319
|
|
320
320
|
else
|
321
|
-
|
321
|
+
# require 'debugger';debugger
|
322
|
+
# puts "Unmatched spectrum #{spec[:title]}"
|
322
323
|
end
|
323
324
|
spec = mzml_parser.next_spectrum
|
324
325
|
end
|
data/bin/sixframe.rb
CHANGED
@@ -29,6 +29,7 @@ tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage
|
|
29
29
|
tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
|
30
30
|
tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
|
31
31
|
tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
|
32
|
+
tool.add_boolean_option(:write_gff,false,['--gff3','Output gff3 instead of fasta'])
|
32
33
|
|
33
34
|
exit unless tool.check_options(true)
|
34
35
|
|
@@ -38,6 +39,9 @@ output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
|
|
38
39
|
|
39
40
|
output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
|
40
41
|
|
42
|
+
if tool.write_gff
|
43
|
+
output_fh.write "##gff-version 3\n"
|
44
|
+
end
|
41
45
|
|
42
46
|
file = Bio::FastaFormat.open(input_file)
|
43
47
|
|
@@ -66,13 +70,11 @@ file.each do |entry|
|
|
66
70
|
position_end=forward_position_end
|
67
71
|
end
|
68
72
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
73
|
# Create accession compliant with NCBI naming standard
|
73
74
|
# See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
|
74
75
|
ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')
|
75
76
|
ncbi_accession = "lcl|#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
|
77
|
+
gff_id = "#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
|
76
78
|
|
77
79
|
defline=">#{ncbi_accession}"
|
78
80
|
|
@@ -84,11 +86,16 @@ file.each do |entry|
|
|
84
86
|
defline << " #{entry.definition}"
|
85
87
|
end
|
86
88
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
89
|
+
if tool.write_gff
|
90
|
+
strand = frame>3 ? "-" : "+"
|
91
|
+
# score = self.nsp_adjusted_probability.nil? ? "." : self.nsp_adjusted_probability.to_s
|
92
|
+
# gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
|
93
|
+
output_fh.write("#{ncbi_scaffold_id}\tsixframe\tCDS\t#{position_start}\t#{position_end}\t.\t#{strand}\t0\tID=#{gff_id}\n")
|
94
|
+
else
|
95
|
+
# Output in fasta format
|
96
|
+
# start and end positions are always relative to the forward strand
|
97
|
+
output_fh.write("#{defline}\n#{orf}\n")
|
98
|
+
end
|
92
99
|
end
|
93
100
|
position += orf.length*3+3
|
94
101
|
end
|
data/bin/swissprot_to_table.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Query a swissprot flat file and output the results to a tab delimited table
|
7
|
+
#
|
8
|
+
#
|
9
|
+
require 'protk/tool'
|
10
|
+
require 'protk/swissprot_database'
|
11
|
+
require 'protk/bio_sptr_extensions'
|
12
|
+
require 'protk/fastadb'
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
columns={'recname'=>"Primary Name",'cd'=>"CD Antigen Name",'altnames'=>"Alternate Names",
|
17
|
+
'location' => "Subcellular Location",
|
18
|
+
'function' => "Known Function",
|
19
|
+
'similarity' => "Similarity",
|
20
|
+
'tissues' => "Tissue Specificity",
|
21
|
+
'disease' => "Disease Association",
|
22
|
+
'domain' => "Domain",
|
23
|
+
'subunit' => "Sub Unit",
|
24
|
+
'nextbio' => "NextBio",
|
25
|
+
'ipi' => "IPI",
|
26
|
+
'intact' => "Interactions",
|
27
|
+
'pride' => 'Pride',
|
28
|
+
'ensembl'=> 'Ensembl',
|
29
|
+
'num_transmem'=>"Transmembrane Regions",
|
30
|
+
'signalp'=>'Signal Peptide',
|
31
|
+
'go_terms'=>"GO Terms",
|
32
|
+
'go_entries'=>"GO Entries",
|
33
|
+
'accessions'=>"Uniprot Accessions",
|
34
|
+
'ncbi_taxon_id'=>"NCBI Taxon ID"
|
35
|
+
}
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
# Set up specific command-line options for this tool. Other options are inherited from Tool
|
42
|
+
#
|
43
|
+
tool=Tool.new([:explicit_output,:debug])
|
44
|
+
tool.option_parser.banner = "Query a swissprot flat file and output to tab delimited table.\n\nUsage: swissprot_to_table.rb [options] -d flatfile.dat queries.txt"
|
45
|
+
|
46
|
+
tool.add_value_option(:database,nil,['-d','--database file','Uniprot flatfile database containing full records for proteins'])
|
47
|
+
tool.add_value_option(:output_keys,nil,['-K','--keys keys','Filter output to only the specified keys (comma separated)'])
|
48
|
+
tool.add_boolean_option(:show_keys,false,['--show-keys','Print a list of possible values for the keys field and exit'])
|
49
|
+
tool.add_value_option(:separator,"\t",['-S','--separator sep','Separator character for output, default (tab)'])
|
50
|
+
tool.add_value_option(:array_separator,",",['-A','--array-separator sep','Array Separator character, default ,'])
|
51
|
+
tool.add_value_option(:query_separator,"\t",['--query-separator sep','Separator character for queries.txt, default is tab'])
|
52
|
+
tool.add_value_option(:id_column,1,['--id-column num','Column in queries.txt in which Uniprot Accessions are found'])
|
53
|
+
|
54
|
+
|
55
|
+
if ARGV.include? "--show-keys"
|
56
|
+
columns.each_pair { |name, val| $stdout.write "#{name} (#{val})\n" }
|
57
|
+
exit
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
exit unless tool.check_options(true,[:database])
|
62
|
+
|
63
|
+
|
64
|
+
$protk = Constants.new
|
65
|
+
log_level = tool.debug ? :debug : :fatal
|
66
|
+
$protk.info_level= log_level
|
67
|
+
|
68
|
+
|
69
|
+
if tool.explicit_output
|
70
|
+
output_fh=File.new("#{tool.explicit_output}",'w')
|
71
|
+
else
|
72
|
+
output_fh=$stdout
|
73
|
+
end
|
74
|
+
|
75
|
+
|
76
|
+
if tool.output_keys
|
77
|
+
output_keys=tool.output_keys.split(",").collect { |k| k.strip }
|
78
|
+
columns.delete_if { |key, value| !output_keys.include? key }
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
db_info=tool.database_info
|
83
|
+
database_path=db_info.path
|
84
|
+
|
85
|
+
database_index_path = "#{Pathname.new(database_path).dirname}/config.dat"
|
86
|
+
|
87
|
+
skip_index = File.exists?(database_index_path) ? true : false
|
88
|
+
|
89
|
+
|
90
|
+
swissprotdb=SwissprotDatabase.new(database_path,skip_index)
|
91
|
+
|
92
|
+
|
93
|
+
def write_entry(item_name,item,columns,tool,output_fh)
|
94
|
+
row=[item_name]
|
95
|
+
row << columns.keys.collect do |name|
|
96
|
+
colvalue = item.send(name)
|
97
|
+
colvalue = "" unless colvalue
|
98
|
+
colvalue = colvalue.join(tool.array_separator) if colvalue.class==Array
|
99
|
+
colvalue
|
100
|
+
end
|
101
|
+
output_fh.write "#{row.join(tool.separator)}\n"
|
102
|
+
end
|
103
|
+
|
104
|
+
File.open(ARGV[0]).each_line do |line|
|
105
|
+
|
106
|
+
begin
|
107
|
+
query_id = line.chomp.split(tool.query_separator)[tool.id_column.to_i-1]
|
108
|
+
rescue
|
109
|
+
query_id = line.chomp
|
110
|
+
end
|
111
|
+
|
112
|
+
begin
|
113
|
+
item = swissprotdb.get_entry_for_name(query_id)
|
114
|
+
write_entry(query_id,item,columns,tool,output_fh)
|
115
|
+
rescue
|
116
|
+
$protk.log "Unable to retrieve entry for #{query_id}" , :debug
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
120
|
+
|