protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
@@ -24,7 +24,9 @@
|
|
24
24
|
#define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
|
25
25
|
|
26
26
|
|
27
|
-
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
27
|
+
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
28
|
+
VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
|
29
|
+
|
28
30
|
char *input_file = RSTRING_PTR(input_file_in);
|
29
31
|
long sequences_to_generate = NUM2INT(db_length_in);
|
30
32
|
char * output_file = RSTRING_PTR(output_file_in);
|
@@ -50,10 +52,26 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
50
52
|
/* scanning sequence database */
|
51
53
|
|
52
54
|
strcpy(infile,input_file);
|
53
|
-
|
54
|
-
|
55
|
+
|
56
|
+
if ((inp = fopen(infile, "r"))==NULL) {
|
57
|
+
printf("error opening sequence database %s\n",infile);return -1;
|
58
|
+
}
|
59
|
+
|
60
|
+
printf("scanning sequence database \n%s\n",infile);
|
61
|
+
fflush(stdout);
|
62
|
+
|
55
63
|
i=0;n=0;k=0;
|
56
|
-
|
64
|
+
|
65
|
+
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
|
66
|
+
i++;
|
67
|
+
if(line[0]=='>') {
|
68
|
+
if (!(n%1000)) {
|
69
|
+
printf(".");
|
70
|
+
fflush(stdout);
|
71
|
+
n++;
|
72
|
+
}
|
73
|
+
}
|
74
|
+
}
|
57
75
|
|
58
76
|
n_sequences=n;
|
59
77
|
|
@@ -65,11 +83,17 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
65
83
|
index=(char**)malloc(sizeof(char*)*n_sequences);
|
66
84
|
index[0]=sequence; /* set first index pointer to beginning of first database sequence */
|
67
85
|
|
68
|
-
if ((inp = fopen(infile, "r"))==NULL) {
|
86
|
+
if ((inp = fopen(infile, "r"))==NULL) {
|
87
|
+
printf("error opening sequence database %s\n",infile);
|
88
|
+
return -1;
|
89
|
+
}
|
69
90
|
|
70
|
-
printf("done\nreading sequence database %s",infile);
|
91
|
+
printf("done\nreading sequence database \n%s\n",infile);
|
92
|
+
fflush(stdout);
|
93
|
+
|
71
94
|
n=-1;
|
72
95
|
strcpy(temp_sequence,"\0");
|
96
|
+
|
73
97
|
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
|
74
98
|
{
|
75
99
|
if (strcmp(line,"\n")==0) {
|
@@ -98,18 +122,21 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
98
122
|
}
|
99
123
|
|
100
124
|
strcpy(index[n],temp_sequence);
|
125
|
+
|
101
126
|
fclose(inp);
|
102
127
|
|
103
128
|
n_sequences=n+1;
|
104
129
|
|
105
130
|
printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
|
131
|
+
|
106
132
|
measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
|
107
133
|
|
108
134
|
|
109
135
|
|
110
136
|
/* generating Markov probabilities */
|
111
137
|
|
112
|
-
printf("generating Markov probability matrix...");
|
138
|
+
printf("generating Markov probability matrix...");
|
139
|
+
fflush(stdout);
|
113
140
|
|
114
141
|
srand(time(0)); /* replace with constant to re-generate identical random databases */
|
115
142
|
|
@@ -124,7 +151,11 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
124
151
|
|
125
152
|
for(protein=0;protein<n_sequences;protein++)
|
126
153
|
{
|
127
|
-
if (!(protein%1000)) {
|
154
|
+
if (!(protein%1000)) {
|
155
|
+
printf(".");
|
156
|
+
fflush(stdout);
|
157
|
+
}
|
158
|
+
|
128
159
|
if (protein<(n_sequences-1))
|
129
160
|
{
|
130
161
|
strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
|
@@ -142,35 +173,56 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
142
173
|
{
|
143
174
|
printf("Unknown amino acid %c",one_sequence[i]);
|
144
175
|
} else {
|
145
|
-
|
146
|
-
|
147
|
-
|
176
|
+
a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
|
177
|
+
MP[a][i]++;
|
178
|
+
measured_aa_freq[a]++;
|
148
179
|
}
|
149
180
|
}
|
150
|
-
else {
|
181
|
+
else {
|
182
|
+
a=floor(20*(float)rand()/RAND_MAX);
|
183
|
+
MP[a][i]++;
|
184
|
+
measured_aa_freq[a]++;
|
185
|
+
} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
151
186
|
}
|
152
187
|
MP[20][pl]++;
|
153
188
|
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
154
189
|
}
|
155
|
-
|
190
|
+
|
191
|
+
printf("done\n");
|
192
|
+
fflush(stdout);
|
156
193
|
|
157
194
|
|
158
195
|
|
159
|
-
for(i=0;i<MAX_SEQUENCE_LENGTH;i++)
|
160
|
-
|
196
|
+
for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
|
197
|
+
row_sum[i]=0;
|
198
|
+
}
|
199
|
+
|
200
|
+
for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
|
201
|
+
for(j=0;j<=20;j++){
|
202
|
+
row_sum[i]+=MP[j][i];
|
203
|
+
}
|
204
|
+
}
|
161
205
|
|
162
206
|
|
163
207
|
/* generate random protein sequences through Markov chain */
|
164
208
|
|
165
|
-
|
166
|
-
|
209
|
+
strcpy(outfile,output_file);
|
210
|
+
|
211
|
+
if ((outp = fopen(outfile, "w"))==NULL) {
|
212
|
+
printf("error opening output file %s\n",outfile);
|
213
|
+
return -1;
|
214
|
+
}
|
215
|
+
|
167
216
|
printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
|
168
217
|
|
169
218
|
strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
|
170
219
|
|
171
220
|
for(protein=0;protein<sequences_to_generate;protein++)
|
172
221
|
{
|
173
|
-
if (!(protein%1000)) {
|
222
|
+
if (!(protein%1000)) {
|
223
|
+
printf(".");fflush(stdout);
|
224
|
+
}
|
225
|
+
|
174
226
|
i=0; j=0;
|
175
227
|
while (1)
|
176
228
|
{
|
@@ -213,9 +265,10 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
213
265
|
|
214
266
|
k=0;l=0;
|
215
267
|
for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
|
216
|
-
printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
|
217
|
-
for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
|
218
|
-
|
268
|
+
// printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
|
269
|
+
// for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
|
270
|
+
|
271
|
+
printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
|
219
272
|
|
220
273
|
return 0;
|
221
274
|
|
data/lib/protk/constants.rb
CHANGED
@@ -7,7 +7,6 @@
|
|
7
7
|
require 'yaml'
|
8
8
|
require 'logger'
|
9
9
|
require 'pathname'
|
10
|
-
require 'ftools'
|
11
10
|
|
12
11
|
class Constants
|
13
12
|
|
@@ -77,6 +76,21 @@ class Constants
|
|
77
76
|
"#{@protk_dir}/tools/tandem"
|
78
77
|
end
|
79
78
|
|
79
|
+
def makeblastdb
|
80
|
+
makeblastdbpath=%x[which makeblastdb]
|
81
|
+
makeblastdbpath.chomp
|
82
|
+
end
|
83
|
+
|
84
|
+
def blastdbcmd
|
85
|
+
path=%x[which blastdbcmd]
|
86
|
+
path.chomp
|
87
|
+
end
|
88
|
+
|
89
|
+
def mascot2xml
|
90
|
+
path=%x[which Mascot2XML]
|
91
|
+
path.chomp
|
92
|
+
end
|
93
|
+
|
80
94
|
def protein_database_root
|
81
95
|
path=@env['protein_database_root']
|
82
96
|
if ( path =~ /^\// )
|
@@ -154,7 +168,7 @@ class Constants
|
|
154
168
|
|
155
169
|
ENV['PATH']=protk_paths.join(":")
|
156
170
|
|
157
|
-
|
171
|
+
# puts "Path #{ENV['PATH']}"
|
158
172
|
throw "No data found in config file" unless @env!=nil
|
159
173
|
@info_level=default_config_yml['message_level']
|
160
174
|
|
@@ -0,0 +1,175 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<?xml-stylesheet type="text/xsl" href="tandem-input-style.xsl"?>
|
3
|
+
<bioml>
|
4
|
+
|
5
|
+
<note>spectrum parameters</note>
|
6
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
|
7
|
+
<note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
|
8
|
+
<note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
|
9
|
+
<note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
|
10
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
|
11
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
12
|
+
<note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
|
13
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
14
|
+
<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
|
15
|
+
<note>values are monoisotopic|average </note>
|
16
|
+
|
17
|
+
<note>spectrum conditioning parameters</note>
|
18
|
+
<note type="input" label="spectrum, dynamic range">100.0</note>
|
19
|
+
<note>The peaks read in are normalized so that the most intense peak
|
20
|
+
is set to the dynamic range value. All peaks with values of less than
|
21
|
+
1, using this normalization, are not used. This normalization has the
|
22
|
+
overall effect of setting a threshold value for peak intensities.</note>
|
23
|
+
<note type="input" label="spectrum, total peaks">50</note>
|
24
|
+
<note>If this value is 0, it is ignored. If it is greater than zero (let's say 50),
|
25
|
+
then the number of peaks in the spectrum will be limited to the 50 most intense
|
26
|
+
peaks in the spectrum. X! tandem does not do any peak finding: it only
|
27
|
+
limits the peaks used by this parameter, and the dynamic range parameter.</note>
|
28
|
+
<note type="input" label="spectrum, maximum parent charge">4</note>
|
29
|
+
<note type="input" label="spectrum, use noise suppression">yes</note>
|
30
|
+
<note type="input" label="spectrum, minimum parent m+h">500.0</note>
|
31
|
+
<note type="input" label="spectrum, minimum fragment mz">150.0</note>
|
32
|
+
<note type="input" label="spectrum, minimum peaks">15</note>
|
33
|
+
<note type="input" label="spectrum, threads">1</note>
|
34
|
+
<note type="input" label="spectrum, sequence batch size">1000</note>
|
35
|
+
|
36
|
+
<note>residue modification parameters</note>
|
37
|
+
<note type="input" label="residue, modification mass">57.022@C</note>
|
38
|
+
<note>The format of this parameter is m@X, where m is the modification
|
39
|
+
mass in Daltons and X is the appropriate residue to modify. Lists of
|
40
|
+
modifications are separated by commas. For example, to modify M and C
|
41
|
+
with the addition of 16.0 Daltons, the parameter line would be
|
42
|
+
+16.0@M,+16.0@C
|
43
|
+
Positive and negative values are allowed.
|
44
|
+
</note>
|
45
|
+
<note type="input" label="residue, potential modification mass"></note>
|
46
|
+
<note>The format of this parameter is the same as the format
|
47
|
+
for residue, modification mass (see above).</note>
|
48
|
+
<note type="input" label="residue, potential modification motif"></note>
|
49
|
+
<note>The format of this parameter is similar to residue, modification mass,
|
50
|
+
with the addition of a modified PROSITE notation sequence motif specification.
|
51
|
+
For example, a value of 80@[ST!]PX[KR] indicates a modification
|
52
|
+
of either S or T when followed by P, any residue, and then a K or an R.
|
53
|
+
A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
|
54
|
+
is NOT followed by a P, then either an S or a T, NOT followed by a P.
|
55
|
+
Positive and negative values are allowed.
|
56
|
+
</note>
|
57
|
+
|
58
|
+
<note>protein parameters</note>
|
59
|
+
<note type="input" label="protein, taxon">other mammals</note>
|
60
|
+
<note>This value is interpreted using the information in taxonomy.xml.</note>
|
61
|
+
<note type="input" label="protein, cleavage site">[RK]|{P}</note>
|
62
|
+
<note>this setting corresponds to the enzyme trypsin. The first characters
|
63
|
+
in brackets represent residues N-terminal to the bond - the '|' pipe -
|
64
|
+
and the second set of characters represent residues C-terminal to the
|
65
|
+
bond. The characters must be in square brackets (denoting that only
|
66
|
+
these residues are allowed for a cleavage) or french brackets (denoting
|
67
|
+
that these residues cannot be in that position). Use UPPERCASE characters.
|
68
|
+
To denote cleavage at any residue, use [X]|[X] and reset the
|
69
|
+
scoring, maximum missed cleavage site parameter (see below) to something like 50.
|
70
|
+
</note>
|
71
|
+
<note type="input" label="protein, modified residue mass file"></note>
|
72
|
+
<note type="input" label="protein, cleavage C-terminal mass change">+17.002735</note>
|
73
|
+
<note type="input" label="protein, cleavage N-terminal mass change">+1.007825</note>
|
74
|
+
<note type="input" label="protein, N-terminal residue modification mass">0.0</note>
|
75
|
+
<note type="input" label="protein, C-terminal residue modification mass">0.0</note>
|
76
|
+
<note type="input" label="protein, homolog management">no</note>
|
77
|
+
<note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
|
78
|
+
|
79
|
+
<note>model refinement parameters</note>
|
80
|
+
<note type="input" label="refine">yes</note>
|
81
|
+
<note type="input" label="refine, modification mass"></note>
|
82
|
+
<note type="input" label="refine, sequence path"></note>
|
83
|
+
<note type="input" label="refine, tic percent">20</note>
|
84
|
+
<note type="input" label="refine, spectrum synthesis">yes</note>
|
85
|
+
<note type="input" label="refine, maximum valid expectation value">0.1</note>
|
86
|
+
<note type="input" label="refine, potential N-terminus modifications">+42.010565@[</note>
|
87
|
+
<note type="input" label="refine, potential C-terminus modifications"></note>
|
88
|
+
<note type="input" label="refine, unanticipated cleavage">yes</note>
|
89
|
+
<note type="input" label="refine, potential modification mass"></note>
|
90
|
+
<note type="input" label="refine, point mutations">no</note>
|
91
|
+
<note type="input" label="refine, use potential modifications for full refinement">no</note>
|
92
|
+
<note type="input" label="refine, point mutations">no</note>
|
93
|
+
<note type="input" label="refine, potential modification motif"></note>
|
94
|
+
<note>The format of this parameter is similar to residue, modification mass,
|
95
|
+
with the addition of a modified PROSITE notation sequence motif specification.
|
96
|
+
For example, a value of 80@[ST!]PX[KR] indicates a modification
|
97
|
+
of either S or T when followed by P, any residue, and then a K or an R.
|
98
|
+
A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
|
99
|
+
is NOT followed by a P, then either an S or a T, NOT followed by a P.
|
100
|
+
Positive and negative values are allowed.
|
101
|
+
</note>
|
102
|
+
|
103
|
+
<note>scoring parameters</note>
|
104
|
+
<note type="input" label="scoring, minimum ion count">4</note>
|
105
|
+
<note type="input" label="scoring, maximum missed cleavage sites">1</note>
|
106
|
+
<note type="input" label="scoring, x ions">no</note>
|
107
|
+
<note type="input" label="scoring, y ions">yes</note>
|
108
|
+
<note type="input" label="scoring, z ions">no</note>
|
109
|
+
<note type="input" label="scoring, a ions">no</note>
|
110
|
+
<note type="input" label="scoring, b ions">yes</note>
|
111
|
+
<note type="input" label="scoring, c ions">no</note>
|
112
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
113
|
+
<note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
|
114
|
+
<note type="input" label="scoring, include reverse">no</note>
|
115
|
+
<note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
|
116
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
117
|
+
<note type="input" label="scoring, include reverse">no</note>
|
118
|
+
|
119
|
+
<note>output parameters</note>
|
120
|
+
<note type="input" label="output, log path"></note>
|
121
|
+
<note type="input" label="output, message">testing 1 2 3</note>
|
122
|
+
<note type="input" label="output, one sequence copy">no</note>
|
123
|
+
<note type="input" label="output, sequence path"></note>
|
124
|
+
<note type="input" label="output, path">output.xml</note>
|
125
|
+
<note type="input" label="output, sort results by">protein</note>
|
126
|
+
<note>values = protein|spectrum (spectrum is the default)</note>
|
127
|
+
<note type="input" label="output, path hashing">yes</note>
|
128
|
+
<note>values = yes|no</note>
|
129
|
+
<note type="input" label="output, xsl path">tandem-style.xsl</note>
|
130
|
+
<note type="input" label="output, parameters">yes</note>
|
131
|
+
<note>values = yes|no</note>
|
132
|
+
<note type="input" label="output, performance">yes</note>
|
133
|
+
<note>values = yes|no</note>
|
134
|
+
<note type="input" label="output, spectra">yes</note>
|
135
|
+
<note>values = yes|no</note>
|
136
|
+
<note type="input" label="output, histograms">yes</note>
|
137
|
+
<note>values = yes|no</note>
|
138
|
+
<note type="input" label="output, proteins">yes</note>
|
139
|
+
<note>values = yes|no</note>
|
140
|
+
<note type="input" label="output, sequences">yes</note>
|
141
|
+
<note>values = yes|no</note>
|
142
|
+
<note type="input" label="output, one sequence copy">no</note>
|
143
|
+
<note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
|
144
|
+
<note type="input" label="output, results">valid</note>
|
145
|
+
<note>values = all|valid|stochastic</note>
|
146
|
+
<note type="input" label="output, maximum valid expectation value">0.1</note>
|
147
|
+
<note>value is used in the valid|stochastic setting of output, results</note>
|
148
|
+
<note type="input" label="output, histogram column width">30</note>
|
149
|
+
<note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms
|
150
|
+
into spread sheet programs easier.</note>
|
151
|
+
<note type="description">ADDITIONAL EXPLANATIONS</note>
|
152
|
+
<note type="description">Each one of the parameters for X! tandem is entered as a labeled note
|
153
|
+
node. In the current version of X!, keep those note nodes
|
154
|
+
on a single line.
|
155
|
+
</note>
|
156
|
+
<note type="description">The presence of the type 'input' is necessary if a note is to be considered
|
157
|
+
an input parameter.
|
158
|
+
</note>
|
159
|
+
<note type="description">Any of the parameters that are paths to files may require alteration for a
|
160
|
+
particular installation. Full path names usually cause the least trouble,
|
161
|
+
but there is no reason not to use relative path names, if that is the
|
162
|
+
most convenient.
|
163
|
+
</note>
|
164
|
+
<note type="description">Any parameter values set in the 'list path, default parameters' file are
|
165
|
+
reset by entries in the normal input file, if they are present. Otherwise,
|
166
|
+
the default set is used.
|
167
|
+
</note>
|
168
|
+
<note type="description">The 'list path, taxonomy information' file must exist.
|
169
|
+
</note>
|
170
|
+
<note type="description">The directory containing the 'output, path' file must exist: it will not be created.
|
171
|
+
</note>
|
172
|
+
<note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.
|
173
|
+
</note>
|
174
|
+
|
175
|
+
</bioml>
|
@@ -0,0 +1,123 @@
|
|
1
|
+
<bioml>
|
2
|
+
|
3
|
+
<note>spectrum parameters</note>
|
4
|
+
<note type="input" label="spectrum, parent monoisotopic mass error minus">2.0</note>
|
5
|
+
<note type="input" label="spectrum, parent monoisotopic mass error plus">4.0</note>
|
6
|
+
<note>PRECURSOR MASS TOLERANCE. This is monoisotopic mass, so for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly).</note>
|
7
|
+
<note type="input" label="spectrum, parent monoisotopic mass isotope error">no</note>
|
8
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
|
9
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
10
|
+
<note type="input" label="spectrum, parent monoisotopic mass error units">Daltons</note>
|
11
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
12
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
|
13
|
+
<note>This parameter has no effect in k-score scoring.</note>
|
14
|
+
<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
|
15
|
+
<note>values are monoisotopic|average </note>
|
16
|
+
|
17
|
+
<note>spectrum conditioning parameters</note>
|
18
|
+
<note type="input" label="spectrum, use conditioning">no</note>
|
19
|
+
<note>For k-score scoring, it is recommended spectrum conditioning be turned OFF for best performance. All of the spectrum filtering and preprocessing options below in this section will be inactive.</note>
|
20
|
+
<note type="input" label="spectrum, dynamic range">10000.0</note>
|
21
|
+
<note type="input" label="spectrum, total peaks">400</note>
|
22
|
+
<note type="input" label="spectrum, maximum parent charge">5</note>
|
23
|
+
<note type="input" label="spectrum, use noise suppression">yes</note>
|
24
|
+
<note type="input" label="spectrum, minimum parent m+h">600.0</note>
|
25
|
+
<note type="input" label="spectrum, minimum fragment mz">125.0</note>
|
26
|
+
<note type="input" label="spectrum, minimum peaks">10</note>
|
27
|
+
<note type="input" label="spectrum, threads">1</note>
|
28
|
+
<note type="input" label="spectrum, sequence batch size">1000</note>
|
29
|
+
|
30
|
+
<note>residue modification parameters</note>
|
31
|
+
<note type="input" label="residue, modification mass"></note>
|
32
|
+
<note>STATIC MODIFICATION. The format of this parameter is m@X, where m is the modification mass in Daltons and X is the appropriate residue to modify. Lists of modifications are separated by commas. For example, to modify M and C with the addition of 16.0 Daltons, the parameter line would be +16.0@M,+16.0@C. Positive and negative values are allowed.</note>
|
33
|
+
<note type="input" label="residue, potential modification mass"></note>
|
34
|
+
<note>VARIABLE MODIFICATION. The format of this parameter is the same as the format for residue, modification mass (see above).</note>
|
35
|
+
<note type="input" label="residue, potential modification motif"></note>
|
36
|
+
<note>VARIABLE MODIFICATION IN A MOTIF. The format of this parameter is similar to residue, modification mass, with the addition of a modified PROSITE notation sequence motif specification. For example, a value of 80@[ST!]PX[KR] indicates a modification of either S or T when followed by P, any residue, and then a K or an R. A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it is NOT followed by a P, then either an S or a T, NOT followed by a P. Positive and negative values are allowed. </note>
|
37
|
+
|
38
|
+
<note>protein parameters</note>
|
39
|
+
<note type="input" label="protein, taxon">no default</note>
|
40
|
+
<note>SEQUENCE DATABASE TO SEARCH. This refers to identifiers in taxonomy.xml.</note>
|
41
|
+
<note type="input" label="protein, cleavage site">[RK]|{P}</note>
|
42
|
+
<note>ENZYME SPECIFICITY. This setting corresponds to the enzyme trypsin. The first characters in brackets represent residues N-terminal to the bond - the '|' pipe - and the second set of characters represent residues C-terminal to the bond. The characters must be in square brackets (denoting that only these residues are allowed for a cleavage) or french brackets (denoting that these residues cannot be in that position). Use UPPERCASE characters. To denote cleavage at any residue, use [X]|[X] and reset the scoring, maximum missed cleavage site parameter (see below) to something like 50. </note>
|
43
|
+
<note type="input" label="protein, modified residue mass file"></note>
|
44
|
+
<note type="input" label="protein, N-terminal residue modification mass"></note>
|
45
|
+
<note type="input" label="protein, C-terminal residue modification mass"></note>
|
46
|
+
<note type="input" label="protein, homolog management">no</note>
|
47
|
+
<note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
|
48
|
+
|
49
|
+
<note>model refinement parameters</note>
|
50
|
+
<note type="input" label="refine">no</note>
|
51
|
+
<note type="input" label="refine, modification mass"></note>
|
52
|
+
<note type="input" label="refine, sequence path"></note>
|
53
|
+
<note type="input" label="refine, tic percent">10</note>
|
54
|
+
<note type="input" label="refine, spectrum synthesis">yes</note>
|
55
|
+
<note type="input" label="refine, maximum valid expectation value">0.1</note>
|
56
|
+
<note type="input" label="refine, potential N-terminus modifications"></note>
|
57
|
+
<note type="input" label="refine, potential C-terminus modifications"></note>
|
58
|
+
<note type="input" label="refine, unanticipated cleavage">no</note>
|
59
|
+
<note type="input" label="refine, potential modification mass"></note>
|
60
|
+
<note type="input" label="refine, point mutations">no</note>
|
61
|
+
<note type="input" label="refine, use potential modifications for full refinement">no</note>
|
62
|
+
<note type="input" label="refine, point mutations">no</note>
|
63
|
+
<note type="input" label="refine, potential modification motif"></note>
|
64
|
+
|
65
|
+
<note>scoring parameters</note>
|
66
|
+
|
67
|
+
<note type="input" label="scoring, algorithm">k-score</note>
|
68
|
+
<note type="input" label="scoring, minimum ion count">1</note>
|
69
|
+
<note type="input" label="scoring, maximum missed cleavage sites">2</note>
|
70
|
+
<note type="input" label="scoring, x ions">no</note>
|
71
|
+
<note type="input" label="scoring, y ions">yes</note>
|
72
|
+
<note type="input" label="scoring, z ions">no</note>
|
73
|
+
<note type="input" label="scoring, a ions">no</note>
|
74
|
+
<note type="input" label="scoring, b ions">yes</note>
|
75
|
+
<note type="input" label="scoring, c ions">no</note>
|
76
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
77
|
+
<note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
|
78
|
+
<note type="input" label="scoring, include reverse">no</note>
|
79
|
+
<note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
|
80
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
81
|
+
<note type="input" label="scoring, include reverse">no</note>
|
82
|
+
|
83
|
+
<note>output parameters</note>
|
84
|
+
<note type="input" label="output, log path"></note>
|
85
|
+
<note type="input" label="output, message">1234567890</note>
|
86
|
+
<note type="input" label="output, sequence path"></note>
|
87
|
+
<note type="input" label="output, path">output.xml</note>
|
88
|
+
<note type="input" label="output, sort results by">spectrum</note>
|
89
|
+
<note>values = protein|spectrum (spectrum is the default)</note>
|
90
|
+
<note type="input" label="output, path hashing">no</note>
|
91
|
+
<note>values = yes|no</note>
|
92
|
+
<note type="input" label="output, xsl path">tandem-style.xsl</note>
|
93
|
+
<note type="input" label="output, parameters">yes</note>
|
94
|
+
<note>values = yes|no</note>
|
95
|
+
<note type="input" label="output, performance">yes</note>
|
96
|
+
<note>values = yes|no</note>
|
97
|
+
<note type="input" label="output, spectra">no</note>
|
98
|
+
<note>values = yes|no</note>
|
99
|
+
<note type="input" label="output, histograms">no</note>
|
100
|
+
<note>values = yes|no</note>
|
101
|
+
<note type="input" label="output, proteins">yes</note>
|
102
|
+
<note>values = yes|no</note>
|
103
|
+
<note type="input" label="output, sequences">no</note>
|
104
|
+
<note>values = yes|no</note>
|
105
|
+
<note type="input" label="output, one sequence copy">no</note>
|
106
|
+
<note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
|
107
|
+
<note type="input" label="output, results">all</note>
|
108
|
+
<note>values = all|valid|stochastic</note>
|
109
|
+
<note type="input" label="output, maximum valid expectation value">0.1</note>
|
110
|
+
<note>value is used in the valid|stochastic setting of output, results</note>
|
111
|
+
<note type="input" label="output, histogram column width">30</note>
|
112
|
+
<note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms into spread sheet programs easier.</note>
|
113
|
+
|
114
|
+
<note type="description">ADDITIONAL EXPLANATIONS</note>
|
115
|
+
<note type="description">Each one of the parameters for X! tandem is entered as a labeled note node. In the current version of X!, keep those note nodes on a single line.</note>
|
116
|
+
<note type="description">The presence of the type 'input' is necessary if a note is to be considered an input parameter. </note>
|
117
|
+
<note type="description">Any of the parameters that are paths to files may require alteration for a particular installation. Full path names usually cause the least trouble, but there is no reason not to use relative path names, if that is the most convenient.</note>
|
118
|
+
<note type="description">Any parameter values set in the 'list path, default parameters' file are reset by entries in the normal input file, if they are present. Otherwise, the default set is used. </note>
|
119
|
+
<note type="description">The 'list path, taxonomy information' file must exist.</note>
|
120
|
+
<note type="description">The directory containing the 'output, path' file must exist: it will not be created.</note>
|
121
|
+
<note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.</note>
|
122
|
+
|
123
|
+
</bioml>
|