protk 1.2.6.pre5 → 1.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
@@ -24,7 +24,9 @@
|
|
24
24
|
#define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
|
25
25
|
|
26
26
|
|
27
|
-
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
27
|
+
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
|
28
|
+
VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
|
29
|
+
|
28
30
|
char *input_file = RSTRING_PTR(input_file_in);
|
29
31
|
long sequences_to_generate = NUM2INT(db_length_in);
|
30
32
|
char * output_file = RSTRING_PTR(output_file_in);
|
@@ -50,10 +52,26 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
50
52
|
/* scanning sequence database */
|
51
53
|
|
52
54
|
strcpy(infile,input_file);
|
53
|
-
|
54
|
-
|
55
|
+
|
56
|
+
if ((inp = fopen(infile, "r"))==NULL) {
|
57
|
+
printf("error opening sequence database %s\n",infile);return -1;
|
58
|
+
}
|
59
|
+
|
60
|
+
printf("scanning sequence database \n%s\n",infile);
|
61
|
+
fflush(stdout);
|
62
|
+
|
55
63
|
i=0;n=0;k=0;
|
56
|
-
|
64
|
+
|
65
|
+
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
|
66
|
+
i++;
|
67
|
+
if(line[0]=='>') {
|
68
|
+
if (!(n%1000)) {
|
69
|
+
printf(".");
|
70
|
+
fflush(stdout);
|
71
|
+
n++;
|
72
|
+
}
|
73
|
+
}
|
74
|
+
}
|
57
75
|
|
58
76
|
n_sequences=n;
|
59
77
|
|
@@ -65,11 +83,17 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
65
83
|
index=(char**)malloc(sizeof(char*)*n_sequences);
|
66
84
|
index[0]=sequence; /* set first index pointer to beginning of first database sequence */
|
67
85
|
|
68
|
-
if ((inp = fopen(infile, "r"))==NULL) {
|
86
|
+
if ((inp = fopen(infile, "r"))==NULL) {
|
87
|
+
printf("error opening sequence database %s\n",infile);
|
88
|
+
return -1;
|
89
|
+
}
|
69
90
|
|
70
|
-
printf("done\nreading sequence database %s",infile);
|
91
|
+
printf("done\nreading sequence database \n%s\n",infile);
|
92
|
+
fflush(stdout);
|
93
|
+
|
71
94
|
n=-1;
|
72
95
|
strcpy(temp_sequence,"\0");
|
96
|
+
|
73
97
|
while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
|
74
98
|
{
|
75
99
|
if (strcmp(line,"\n")==0) {
|
@@ -98,18 +122,21 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
98
122
|
}
|
99
123
|
|
100
124
|
strcpy(index[n],temp_sequence);
|
125
|
+
|
101
126
|
fclose(inp);
|
102
127
|
|
103
128
|
n_sequences=n+1;
|
104
129
|
|
105
130
|
printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
|
131
|
+
|
106
132
|
measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
|
107
133
|
|
108
134
|
|
109
135
|
|
110
136
|
/* generating Markov probabilities */
|
111
137
|
|
112
|
-
printf("generating Markov probability matrix...");
|
138
|
+
printf("generating Markov probability matrix...");
|
139
|
+
fflush(stdout);
|
113
140
|
|
114
141
|
srand(time(0)); /* replace with constant to re-generate identical random databases */
|
115
142
|
|
@@ -124,7 +151,11 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
124
151
|
|
125
152
|
for(protein=0;protein<n_sequences;protein++)
|
126
153
|
{
|
127
|
-
if (!(protein%1000)) {
|
154
|
+
if (!(protein%1000)) {
|
155
|
+
printf(".");
|
156
|
+
fflush(stdout);
|
157
|
+
}
|
158
|
+
|
128
159
|
if (protein<(n_sequences-1))
|
129
160
|
{
|
130
161
|
strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
|
@@ -142,35 +173,56 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
142
173
|
{
|
143
174
|
printf("Unknown amino acid %c",one_sequence[i]);
|
144
175
|
} else {
|
145
|
-
|
146
|
-
|
147
|
-
|
176
|
+
a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
|
177
|
+
MP[a][i]++;
|
178
|
+
measured_aa_freq[a]++;
|
148
179
|
}
|
149
180
|
}
|
150
|
-
else {
|
181
|
+
else {
|
182
|
+
a=floor(20*(float)rand()/RAND_MAX);
|
183
|
+
MP[a][i]++;
|
184
|
+
measured_aa_freq[a]++;
|
185
|
+
} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
151
186
|
}
|
152
187
|
MP[20][pl]++;
|
153
188
|
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
154
189
|
}
|
155
|
-
|
190
|
+
|
191
|
+
printf("done\n");
|
192
|
+
fflush(stdout);
|
156
193
|
|
157
194
|
|
158
195
|
|
159
|
-
for(i=0;i<MAX_SEQUENCE_LENGTH;i++)
|
160
|
-
|
196
|
+
for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
|
197
|
+
row_sum[i]=0;
|
198
|
+
}
|
199
|
+
|
200
|
+
for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
|
201
|
+
for(j=0;j<=20;j++){
|
202
|
+
row_sum[i]+=MP[j][i];
|
203
|
+
}
|
204
|
+
}
|
161
205
|
|
162
206
|
|
163
207
|
/* generate random protein sequences through Markov chain */
|
164
208
|
|
165
|
-
|
166
|
-
|
209
|
+
strcpy(outfile,output_file);
|
210
|
+
|
211
|
+
if ((outp = fopen(outfile, "w"))==NULL) {
|
212
|
+
printf("error opening output file %s\n",outfile);
|
213
|
+
return -1;
|
214
|
+
}
|
215
|
+
|
167
216
|
printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
|
168
217
|
|
169
218
|
strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
|
170
219
|
|
171
220
|
for(protein=0;protein<sequences_to_generate;protein++)
|
172
221
|
{
|
173
|
-
if (!(protein%1000)) {
|
222
|
+
if (!(protein%1000)) {
|
223
|
+
printf(".");fflush(stdout);
|
224
|
+
}
|
225
|
+
|
174
226
|
i=0; j=0;
|
175
227
|
while (1)
|
176
228
|
{
|
@@ -213,9 +265,10 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
|
|
213
265
|
|
214
266
|
k=0;l=0;
|
215
267
|
for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
|
216
|
-
printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
|
217
|
-
for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
|
218
|
-
|
268
|
+
// printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
|
269
|
+
// for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
|
270
|
+
|
271
|
+
printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
|
219
272
|
|
220
273
|
return 0;
|
221
274
|
|
data/lib/protk/constants.rb
CHANGED
@@ -7,7 +7,6 @@
|
|
7
7
|
require 'yaml'
|
8
8
|
require 'logger'
|
9
9
|
require 'pathname'
|
10
|
-
require 'ftools'
|
11
10
|
|
12
11
|
class Constants
|
13
12
|
|
@@ -77,6 +76,21 @@ class Constants
|
|
77
76
|
"#{@protk_dir}/tools/tandem"
|
78
77
|
end
|
79
78
|
|
79
|
+
def makeblastdb
|
80
|
+
makeblastdbpath=%x[which makeblastdb]
|
81
|
+
makeblastdbpath.chomp
|
82
|
+
end
|
83
|
+
|
84
|
+
def blastdbcmd
|
85
|
+
path=%x[which blastdbcmd]
|
86
|
+
path.chomp
|
87
|
+
end
|
88
|
+
|
89
|
+
def mascot2xml
|
90
|
+
path=%x[which Mascot2XML]
|
91
|
+
path.chomp
|
92
|
+
end
|
93
|
+
|
80
94
|
def protein_database_root
|
81
95
|
path=@env['protein_database_root']
|
82
96
|
if ( path =~ /^\// )
|
@@ -154,7 +168,7 @@ class Constants
|
|
154
168
|
|
155
169
|
ENV['PATH']=protk_paths.join(":")
|
156
170
|
|
157
|
-
|
171
|
+
# puts "Path #{ENV['PATH']}"
|
158
172
|
throw "No data found in config file" unless @env!=nil
|
159
173
|
@info_level=default_config_yml['message_level']
|
160
174
|
|
@@ -0,0 +1,175 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<?xml-stylesheet type="text/xsl" href="tandem-input-style.xsl"?>
|
3
|
+
<bioml>
|
4
|
+
|
5
|
+
<note>spectrum parameters</note>
|
6
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
|
7
|
+
<note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
|
8
|
+
<note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
|
9
|
+
<note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
|
10
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
|
11
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
12
|
+
<note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
|
13
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
14
|
+
<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
|
15
|
+
<note>values are monoisotopic|average </note>
|
16
|
+
|
17
|
+
<note>spectrum conditioning parameters</note>
|
18
|
+
<note type="input" label="spectrum, dynamic range">100.0</note>
|
19
|
+
<note>The peaks read in are normalized so that the most intense peak
|
20
|
+
is set to the dynamic range value. All peaks with values of less than
|
21
|
+
1, using this normalization, are not used. This normalization has the
|
22
|
+
overall effect of setting a threshold value for peak intensities.</note>
|
23
|
+
<note type="input" label="spectrum, total peaks">50</note>
|
24
|
+
<note>If this value is 0, it is ignored. If it is greater than zero (let's say 50),
|
25
|
+
then the number of peaks in the spectrum will be limited to the 50 most intense
|
26
|
+
peaks in the spectrum. X! tandem does not do any peak finding: it only
|
27
|
+
limits the peaks used by this parameter, and the dynamic range parameter.</note>
|
28
|
+
<note type="input" label="spectrum, maximum parent charge">4</note>
|
29
|
+
<note type="input" label="spectrum, use noise suppression">yes</note>
|
30
|
+
<note type="input" label="spectrum, minimum parent m+h">500.0</note>
|
31
|
+
<note type="input" label="spectrum, minimum fragment mz">150.0</note>
|
32
|
+
<note type="input" label="spectrum, minimum peaks">15</note>
|
33
|
+
<note type="input" label="spectrum, threads">1</note>
|
34
|
+
<note type="input" label="spectrum, sequence batch size">1000</note>
|
35
|
+
|
36
|
+
<note>residue modification parameters</note>
|
37
|
+
<note type="input" label="residue, modification mass">57.022@C</note>
|
38
|
+
<note>The format of this parameter is m@X, where m is the modification
|
39
|
+
mass in Daltons and X is the appropriate residue to modify. Lists of
|
40
|
+
modifications are separated by commas. For example, to modify M and C
|
41
|
+
with the addition of 16.0 Daltons, the parameter line would be
|
42
|
+
+16.0@M,+16.0@C
|
43
|
+
Positive and negative values are allowed.
|
44
|
+
</note>
|
45
|
+
<note type="input" label="residue, potential modification mass"></note>
|
46
|
+
<note>The format of this parameter is the same as the format
|
47
|
+
for residue, modification mass (see above).</note>
|
48
|
+
<note type="input" label="residue, potential modification motif"></note>
|
49
|
+
<note>The format of this parameter is similar to residue, modification mass,
|
50
|
+
with the addition of a modified PROSITE notation sequence motif specification.
|
51
|
+
For example, a value of 80@[ST!]PX[KR] indicates a modification
|
52
|
+
of either S or T when followed by P, any residue, and then a K or an R.
|
53
|
+
A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
|
54
|
+
is NOT followed by a P, then either an S or a T, NOT followed by a P.
|
55
|
+
Positive and negative values are allowed.
|
56
|
+
</note>
|
57
|
+
|
58
|
+
<note>protein parameters</note>
|
59
|
+
<note type="input" label="protein, taxon">other mammals</note>
|
60
|
+
<note>This value is interpreted using the information in taxonomy.xml.</note>
|
61
|
+
<note type="input" label="protein, cleavage site">[RK]|{P}</note>
|
62
|
+
<note>this setting corresponds to the enzyme trypsin. The first characters
|
63
|
+
in brackets represent residues N-terminal to the bond - the '|' pipe -
|
64
|
+
and the second set of characters represent residues C-terminal to the
|
65
|
+
bond. The characters must be in square brackets (denoting that only
|
66
|
+
these residues are allowed for a cleavage) or french brackets (denoting
|
67
|
+
that these residues cannot be in that position). Use UPPERCASE characters.
|
68
|
+
To denote cleavage at any residue, use [X]|[X] and reset the
|
69
|
+
scoring, maximum missed cleavage site parameter (see below) to something like 50.
|
70
|
+
</note>
|
71
|
+
<note type="input" label="protein, modified residue mass file"></note>
|
72
|
+
<note type="input" label="protein, cleavage C-terminal mass change">+17.002735</note>
|
73
|
+
<note type="input" label="protein, cleavage N-terminal mass change">+1.007825</note>
|
74
|
+
<note type="input" label="protein, N-terminal residue modification mass">0.0</note>
|
75
|
+
<note type="input" label="protein, C-terminal residue modification mass">0.0</note>
|
76
|
+
<note type="input" label="protein, homolog management">no</note>
|
77
|
+
<note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
|
78
|
+
|
79
|
+
<note>model refinement parameters</note>
|
80
|
+
<note type="input" label="refine">yes</note>
|
81
|
+
<note type="input" label="refine, modification mass"></note>
|
82
|
+
<note type="input" label="refine, sequence path"></note>
|
83
|
+
<note type="input" label="refine, tic percent">20</note>
|
84
|
+
<note type="input" label="refine, spectrum synthesis">yes</note>
|
85
|
+
<note type="input" label="refine, maximum valid expectation value">0.1</note>
|
86
|
+
<note type="input" label="refine, potential N-terminus modifications">+42.010565@[</note>
|
87
|
+
<note type="input" label="refine, potential C-terminus modifications"></note>
|
88
|
+
<note type="input" label="refine, unanticipated cleavage">yes</note>
|
89
|
+
<note type="input" label="refine, potential modification mass"></note>
|
90
|
+
<note type="input" label="refine, point mutations">no</note>
|
91
|
+
<note type="input" label="refine, use potential modifications for full refinement">no</note>
|
92
|
+
<note type="input" label="refine, point mutations">no</note>
|
93
|
+
<note type="input" label="refine, potential modification motif"></note>
|
94
|
+
<note>The format of this parameter is similar to residue, modification mass,
|
95
|
+
with the addition of a modified PROSITE notation sequence motif specification.
|
96
|
+
For example, a value of 80@[ST!]PX[KR] indicates a modification
|
97
|
+
of either S or T when followed by P, any residue, and then a K or an R.
|
98
|
+
A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
|
99
|
+
is NOT followed by a P, then either an S or a T, NOT followed by a P.
|
100
|
+
Positive and negative values are allowed.
|
101
|
+
</note>
|
102
|
+
|
103
|
+
<note>scoring parameters</note>
|
104
|
+
<note type="input" label="scoring, minimum ion count">4</note>
|
105
|
+
<note type="input" label="scoring, maximum missed cleavage sites">1</note>
|
106
|
+
<note type="input" label="scoring, x ions">no</note>
|
107
|
+
<note type="input" label="scoring, y ions">yes</note>
|
108
|
+
<note type="input" label="scoring, z ions">no</note>
|
109
|
+
<note type="input" label="scoring, a ions">no</note>
|
110
|
+
<note type="input" label="scoring, b ions">yes</note>
|
111
|
+
<note type="input" label="scoring, c ions">no</note>
|
112
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
113
|
+
<note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
|
114
|
+
<note type="input" label="scoring, include reverse">no</note>
|
115
|
+
<note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
|
116
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
117
|
+
<note type="input" label="scoring, include reverse">no</note>
|
118
|
+
|
119
|
+
<note>output parameters</note>
|
120
|
+
<note type="input" label="output, log path"></note>
|
121
|
+
<note type="input" label="output, message">testing 1 2 3</note>
|
122
|
+
<note type="input" label="output, one sequence copy">no</note>
|
123
|
+
<note type="input" label="output, sequence path"></note>
|
124
|
+
<note type="input" label="output, path">output.xml</note>
|
125
|
+
<note type="input" label="output, sort results by">protein</note>
|
126
|
+
<note>values = protein|spectrum (spectrum is the default)</note>
|
127
|
+
<note type="input" label="output, path hashing">yes</note>
|
128
|
+
<note>values = yes|no</note>
|
129
|
+
<note type="input" label="output, xsl path">tandem-style.xsl</note>
|
130
|
+
<note type="input" label="output, parameters">yes</note>
|
131
|
+
<note>values = yes|no</note>
|
132
|
+
<note type="input" label="output, performance">yes</note>
|
133
|
+
<note>values = yes|no</note>
|
134
|
+
<note type="input" label="output, spectra">yes</note>
|
135
|
+
<note>values = yes|no</note>
|
136
|
+
<note type="input" label="output, histograms">yes</note>
|
137
|
+
<note>values = yes|no</note>
|
138
|
+
<note type="input" label="output, proteins">yes</note>
|
139
|
+
<note>values = yes|no</note>
|
140
|
+
<note type="input" label="output, sequences">yes</note>
|
141
|
+
<note>values = yes|no</note>
|
142
|
+
<note type="input" label="output, one sequence copy">no</note>
|
143
|
+
<note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
|
144
|
+
<note type="input" label="output, results">valid</note>
|
145
|
+
<note>values = all|valid|stochastic</note>
|
146
|
+
<note type="input" label="output, maximum valid expectation value">0.1</note>
|
147
|
+
<note>value is used in the valid|stochastic setting of output, results</note>
|
148
|
+
<note type="input" label="output, histogram column width">30</note>
|
149
|
+
<note>values = any integer greater than 0. Setting this to '1' makes cutting and pasting histograms
|
150
|
+
into spread sheet programs easier.</note>
|
151
|
+
<note type="description">ADDITIONAL EXPLANATIONS</note>
|
152
|
+
<note type="description">Each one of the parameters for X! tandem is entered as a labeled note
|
153
|
+
node. In the current version of X!, keep those note nodes
|
154
|
+
on a single line.
|
155
|
+
</note>
|
156
|
+
<note type="description">The presence of the type 'input' is necessary if a note is to be considered
|
157
|
+
an input parameter.
|
158
|
+
</note>
|
159
|
+
<note type="description">Any of the parameters that are paths to files may require alteration for a
|
160
|
+
particular installation. Full path names usually cause the least trouble,
|
161
|
+
but there is no reason not to use relative path names, if that is the
|
162
|
+
most convenient.
|
163
|
+
</note>
|
164
|
+
<note type="description">Any parameter values set in the 'list path, default parameters' file are
|
165
|
+
reset by entries in the normal input file, if they are present. Otherwise,
|
166
|
+
the default set is used.
|
167
|
+
</note>
|
168
|
+
<note type="description">The 'list path, taxonomy information' file must exist.
|
169
|
+
</note>
|
170
|
+
<note type="description">The directory containing the 'output, path' file must exist: it will not be created.
|
171
|
+
</note>
|
172
|
+
<note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.
|
173
|
+
</note>
|
174
|
+
|
175
|
+
</bioml>
|
@@ -0,0 +1,123 @@
|
|
1
|
+
<bioml>
|
2
|
+
|
3
|
+
<note>spectrum parameters</note>
|
4
|
+
<note type="input" label="spectrum, parent monoisotopic mass error minus">2.0</note>
|
5
|
+
<note type="input" label="spectrum, parent monoisotopic mass error plus">4.0</note>
|
6
|
+
<note>PRECURSOR MASS TOLERANCE. This is monoisotopic mass, so for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly).</note>
|
7
|
+
<note type="input" label="spectrum, parent monoisotopic mass isotope error">no</note>
|
8
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
|
9
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
10
|
+
<note type="input" label="spectrum, parent monoisotopic mass error units">Daltons</note>
|
11
|
+
<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
|
12
|
+
<note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
|
13
|
+
<note>This parameter has no effect in k-score scoring.</note>
|
14
|
+
<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
|
15
|
+
<note>values are monoisotopic|average </note>
|
16
|
+
|
17
|
+
<note>spectrum conditioning parameters</note>
|
18
|
+
<note type="input" label="spectrum, use conditioning">no</note>
|
19
|
+
<note>For k-score scoring, it is recommended spectrum conditioning be turned OFF for best performance. All of the spectrum filtering and preprocessing options below in this section will be inactive.</note>
|
20
|
+
<note type="input" label="spectrum, dynamic range">10000.0</note>
|
21
|
+
<note type="input" label="spectrum, total peaks">400</note>
|
22
|
+
<note type="input" label="spectrum, maximum parent charge">5</note>
|
23
|
+
<note type="input" label="spectrum, use noise suppression">yes</note>
|
24
|
+
<note type="input" label="spectrum, minimum parent m+h">600.0</note>
|
25
|
+
<note type="input" label="spectrum, minimum fragment mz">125.0</note>
|
26
|
+
<note type="input" label="spectrum, minimum peaks">10</note>
|
27
|
+
<note type="input" label="spectrum, threads">1</note>
|
28
|
+
<note type="input" label="spectrum, sequence batch size">1000</note>
|
29
|
+
|
30
|
+
<note>residue modification parameters</note>
|
31
|
+
<note type="input" label="residue, modification mass"></note>
|
32
|
+
<note>STATIC MODIFICATION. The format of this parameter is m@X, where m is the modification mass in Daltons and X is the appropriate residue to modify. Lists of modifications are separated by commas. For example, to modify M and C with the addition of 16.0 Daltons, the parameter line would be +16.0@M,+16.0@C. Positive and negative values are allowed.</note>
|
33
|
+
<note type="input" label="residue, potential modification mass"></note>
|
34
|
+
<note>VARIABLE MODIFICATION. The format of this parameter is the same as the format for residue, modification mass (see above).</note>
|
35
|
+
<note type="input" label="residue, potential modification motif"></note>
|
36
|
+
<note>VARIABLE MODIFICATION IN A MOTIF. The format of this parameter is similar to residue, modification mass, with the addition of a modified PROSITE notation sequence motif specification. For example, a value of 80@[ST!]PX[KR] indicates a modification of either S or T when followed by P, any residue, and then a K or an R. A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it is NOT followed by a P, then either an S or a T, NOT followed by a P. Positive and negative values are allowed. </note>
|
37
|
+
|
38
|
+
<note>protein parameters</note>
|
39
|
+
<note type="input" label="protein, taxon">no default</note>
|
40
|
+
<note>SEQUENCE DATABASE TO SEARCH. This refers to identifiers in taxonomy.xml.</note>
|
41
|
+
<note type="input" label="protein, cleavage site">[RK]|{P}</note>
|
42
|
+
<note>ENZYME SPECIFICITY. This setting corresponds to the enzyme trypsin. The first characters in brackets represent residues N-terminal to the bond - the '|' pipe - and the second set of characters represent residues C-terminal to the bond. The characters must be in square brackets (denoting that only these residues are allowed for a cleavage) or french brackets (denoting that these residues cannot be in that position). Use UPPERCASE characters. To denote cleavage at any residue, use [X]|[X] and reset the scoring, maximum missed cleavage site parameter (see below) to something like 50. </note>
|
43
|
+
<note type="input" label="protein, modified residue mass file"></note>
|
44
|
+
<note type="input" label="protein, N-terminal residue modification mass"></note>
|
45
|
+
<note type="input" label="protein, C-terminal residue modification mass"></note>
|
46
|
+
<note type="input" label="protein, homolog management">no</note>
|
47
|
+
<note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
|
48
|
+
|
49
|
+
<note>model refinement parameters</note>
|
50
|
+
<note type="input" label="refine">no</note>
|
51
|
+
<note type="input" label="refine, modification mass"></note>
|
52
|
+
<note type="input" label="refine, sequence path"></note>
|
53
|
+
<note type="input" label="refine, tic percent">10</note>
|
54
|
+
<note type="input" label="refine, spectrum synthesis">yes</note>
|
55
|
+
<note type="input" label="refine, maximum valid expectation value">0.1</note>
|
56
|
+
<note type="input" label="refine, potential N-terminus modifications"></note>
|
57
|
+
<note type="input" label="refine, potential C-terminus modifications"></note>
|
58
|
+
<note type="input" label="refine, unanticipated cleavage">no</note>
|
59
|
+
<note type="input" label="refine, potential modification mass"></note>
|
60
|
+
<note type="input" label="refine, point mutations">no</note>
|
61
|
+
<note type="input" label="refine, use potential modifications for full refinement">no</note>
|
62
|
+
<note type="input" label="refine, point mutations">no</note>
|
63
|
+
<note type="input" label="refine, potential modification motif"></note>
|
64
|
+
|
65
|
+
<note>scoring parameters</note>
|
66
|
+
|
67
|
+
<note type="input" label="scoring, algorithm">k-score</note>
|
68
|
+
<note type="input" label="scoring, minimum ion count">1</note>
|
69
|
+
<note type="input" label="scoring, maximum missed cleavage sites">2</note>
|
70
|
+
<note type="input" label="scoring, x ions">no</note>
|
71
|
+
<note type="input" label="scoring, y ions">yes</note>
|
72
|
+
<note type="input" label="scoring, z ions">no</note>
|
73
|
+
<note type="input" label="scoring, a ions">no</note>
|
74
|
+
<note type="input" label="scoring, b ions">yes</note>
|
75
|
+
<note type="input" label="scoring, c ions">no</note>
|
76
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
77
|
+
<note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
|
78
|
+
<note type="input" label="scoring, include reverse">no</note>
|
79
|
+
<note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
|
80
|
+
<note type="input" label="scoring, cyclic permutation">no</note>
|
81
|
+
<note type="input" label="scoring, include reverse">no</note>
|
82
|
+
|
83
|
+
<note>output parameters</note>
|
84
|
+
<note type="input" label="output, log path"></note>
|
85
|
+
<note type="input" label="output, message">1234567890</note>
|
86
|
+
<note type="input" label="output, sequence path"></note>
|
87
|
+
<note type="input" label="output, path">output.xml</note>
|
88
|
+
<note type="input" label="output, sort results by">spectrum</note>
|
89
|
+
<note>values = protein|spectrum (spectrum is the default)</note>
|
90
|
+
<note type="input" label="output, path hashing">no</note>
|
91
|
+
<note>values = yes|no</note>
|
92
|
+
<note type="input" label="output, xsl path">tandem-style.xsl</note>
|
93
|
+
<note type="input" label="output, parameters">yes</note>
|
94
|
+
<note>values = yes|no</note>
|
95
|
+
<note type="input" label="output, performance">yes</note>
|
96
|
+
<note>values = yes|no</note>
|
97
|
+
<note type="input" label="output, spectra">no</note>
|
98
|
+
<note>values = yes|no</note>
|
99
|
+
<note type="input" label="output, histograms">no</note>
|
100
|
+
<note>values = yes|no</note>
|
101
|
+
<note type="input" label="output, proteins">yes</note>
|
102
|
+
<note>values = yes|no</note>
|
103
|
+
<note type="input" label="output, sequences">no</note>
|
104
|
+
<note>values = yes|no</note>
|
105
|
+
<note type="input" label="output, one sequence copy">no</note>
|
106
|
+
<note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
|
107
|
+
<note type="input" label="output, results">all</note>
|
108
|
+
<note>values = all|valid|stochastic</note>
|
109
|
+
<note type="input" label="output, maximum valid expectation value">0.1</note>
|
110
|
+
<note>value is used in the valid|stochastic setting of output, results</note>
|
111
|
+
<note type="input" label="output, histogram column width">30</note>
|
112
|
+
<note>values = any integer greater than 0. Setting this to '1' makes cutting and pasting histograms into spread sheet programs easier.</note>
|
113
|
+
|
114
|
+
<note type="description">ADDITIONAL EXPLANATIONS</note>
|
115
|
+
<note type="description">Each one of the parameters for X! tandem is entered as a labeled note node. In the current version of X!, keep those note nodes on a single line.</note>
|
116
|
+
<note type="description">The presence of the type 'input' is necessary if a note is to be considered an input parameter. </note>
|
117
|
+
<note type="description">Any of the parameters that are paths to files may require alteration for a particular installation. Full path names usually cause the least trouble, but there is no reason not to use relative path names, if that is the most convenient.</note>
|
118
|
+
<note type="description">Any parameter values set in the 'list path, default parameters' file are reset by entries in the normal input file, if they are present. Otherwise, the default set is used. </note>
|
119
|
+
<note type="description">The 'list path, taxonomy information' file must exist.</note>
|
120
|
+
<note type="description">The directory containing the 'output, path' file must exist: it will not be created.</note>
|
121
|
+
<note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.</note>
|
122
|
+
|
123
|
+
</bioml>
|