protk 1.2.6.pre5 → 1.3.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -24,7 +24,9 @@
24
24
  #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
25
25
 
26
26
 
27
- static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
27
+ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
28
+ VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
+
28
30
  char *input_file = RSTRING_PTR(input_file_in);
29
31
  long sequences_to_generate = NUM2INT(db_length_in);
30
32
  char * output_file = RSTRING_PTR(output_file_in);
@@ -50,10 +52,26 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
50
52
  /* scanning sequence database */
51
53
 
52
54
  strcpy(infile,input_file);
53
- if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
54
- printf("scanning sequence database %s",infile);fflush(stdout);
55
+
56
+ if ((inp = fopen(infile, "r"))==NULL) {
57
+ printf("error opening sequence database %s\n",infile);return -1;
58
+ }
59
+
60
+ printf("scanning sequence database \n%s\n",infile);
61
+ fflush(stdout);
62
+
55
63
  i=0;n=0;k=0;
56
- while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {i++; if(line[0]=='>') {if (!(n%1000)) printf(".");fflush(stdout); n++;} }
64
+
65
+ while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
66
+ i++;
67
+ if(line[0]=='>') {
68
+ if (!(n%1000)) {
69
+ printf(".");
70
+ fflush(stdout);
71
+ }
72
+ n++;
73
+ }
74
+ }
57
75
 
58
76
  n_sequences=n;
59
77
 
@@ -65,11 +83,17 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
65
83
  index=(char**)malloc(sizeof(char*)*n_sequences);
66
84
  index[0]=sequence; /* set first index pointer to beginning of first database sequence */
67
85
 
68
- if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
86
+ if ((inp = fopen(infile, "r"))==NULL) {
87
+ printf("error opening sequence database %s\n",infile);
88
+ return -1;
89
+ }
69
90
 
70
- printf("done\nreading sequence database %s",infile);fflush(stdout);
91
+ printf("done\nreading sequence database \n%s\n",infile);
92
+ fflush(stdout);
93
+
71
94
  n=-1;
72
95
  strcpy(temp_sequence,"\0");
96
+
73
97
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
74
98
  {
75
99
  if (strcmp(line,"\n")==0) {
@@ -98,18 +122,21 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
98
122
  }
99
123
 
100
124
  strcpy(index[n],temp_sequence);
125
+
101
126
  fclose(inp);
102
127
 
103
128
  n_sequences=n+1;
104
129
 
105
130
  printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
131
+
106
132
  measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
107
133
 
108
134
 
109
135
 
110
136
  /* generating Markov probabilities */
111
137
 
112
- printf("generating Markov probability matrix...");fflush(stdout);
138
+ printf("generating Markov probability matrix...");
139
+ fflush(stdout);
113
140
 
114
141
  srand(time(0)); /* replace with constant to re-generate identical random databases */
115
142
 
@@ -124,7 +151,11 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
124
151
 
125
152
  for(protein=0;protein<n_sequences;protein++)
126
153
  {
127
- if (!(protein%1000)) {printf(".");fflush(stdout);}
154
+ if (!(protein%1000)) {
155
+ printf(".");
156
+ fflush(stdout);
157
+ }
158
+
128
159
  if (protein<(n_sequences-1))
129
160
  {
130
161
  strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
@@ -142,35 +173,56 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
142
173
  {
143
174
  printf("Unknown amino acid %c",one_sequence[i]);
144
175
  } else {
145
- a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
146
- MP[a][i]++;
147
- measured_aa_freq[a]++;
176
+ a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
177
+ MP[a][i]++;
178
+ measured_aa_freq[a]++;
148
179
  }
149
180
  }
150
- else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
181
+ else {
182
+ a=floor(20*(float)rand()/RAND_MAX);
183
+ MP[a][i]++;
184
+ measured_aa_freq[a]++;
185
+ } // replace B, X, Z etc. with random amino acid to preserve size distribution
151
186
  }
152
187
  MP[20][pl]++;
153
188
  measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
154
189
  }
155
- printf("done\n"); fflush(stdout);
190
+
191
+ printf("done\n");
192
+ fflush(stdout);
156
193
 
157
194
 
158
195
 
159
- for(i=0;i<MAX_SEQUENCE_LENGTH;i++) row_sum[i]=0;
160
- for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];
196
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
197
+ row_sum[i]=0;
198
+ }
199
+
200
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
201
+ for(j=0;j<=20;j++){
202
+ row_sum[i]+=MP[j][i];
203
+ }
204
+ }
161
205
 
162
206
 
163
207
  /* generate random protein sequences through Markov chain */
164
208
 
165
- strcpy(outfile,output_file);
166
- if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); return -1;}
209
+ strcpy(outfile,output_file);
210
+
211
+ if ((outp = fopen(outfile, "w"))==NULL) {
212
+ printf("error opening output file %s\n",outfile);
213
+ return -1;
214
+ }
215
+
167
216
  printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
168
217
 
169
218
  strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
170
219
 
171
220
  for(protein=0;protein<sequences_to_generate;protein++)
172
221
  {
173
- if (!(protein%1000)) {printf(".");fflush(stdout);}
222
+ if (!(protein%1000)) {
223
+ printf(".");fflush(stdout);
224
+ }
225
+
174
226
  i=0; j=0;
175
227
  while (1)
176
228
  {
@@ -213,9 +265,10 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
213
265
 
214
266
  k=0;l=0;
215
267
  for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
216
- printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
217
- for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
218
- printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
268
+ // printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
269
+ // for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
270
+
271
+ printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
219
272
 
220
273
  return 0;
221
274
 
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('decoymaker')
@@ -7,7 +7,6 @@
7
7
  require 'yaml'
8
8
  require 'logger'
9
9
  require 'pathname'
10
- require 'ftools'
11
10
 
12
11
  class Constants
13
12
 
@@ -77,6 +76,21 @@ class Constants
77
76
  "#{@protk_dir}/tools/tandem"
78
77
  end
79
78
 
79
+ def makeblastdb
80
+ makeblastdbpath=%x[which makeblastdb]
81
+ makeblastdbpath.chomp
82
+ end
83
+
84
+ def blastdbcmd
85
+ path=%x[which blastdbcmd]
86
+ path.chomp
87
+ end
88
+
89
+ def mascot2xml
90
+ path=%x[which Mascot2XML]
91
+ path.chomp
92
+ end
93
+
80
94
  def protein_database_root
81
95
  path=@env['protein_database_root']
82
96
  if ( path =~ /^\// )
@@ -154,7 +168,7 @@ class Constants
154
168
 
155
169
  ENV['PATH']=protk_paths.join(":")
156
170
 
157
-
171
+ # puts "Path #{ENV['PATH']}"
158
172
  throw "No data found in config file" unless @env!=nil
159
173
  @info_level=default_config_yml['message_level']
160
174
 
@@ -12,4 +12,5 @@ uniprot_sprot_annotation_database: swissprot_annotation
12
12
  uniprot_trembl_annotation_database: trembl_annotation
13
13
  galaxy_root: galaxy
14
14
  default_mascot_server: www.matrixscience.com
15
- log_file: Logs/protk.log
15
+ log_file: Logs/protk.log
16
+
@@ -0,0 +1,175 @@
1
+ <?xml version="1.0"?>
2
+ <?xml-stylesheet type="text/xsl" href="tandem-input-style.xsl"?>
3
+ <bioml>
4
+
5
+ <note>spectrum parameters</note>
6
+ <note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
7
+ <note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
8
+ <note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
9
+ <note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
10
+ <note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
11
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
12
+ <note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
13
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
14
+ <note type="input" label="spectrum, fragment mass type">monoisotopic</note>
15
+ <note>values are monoisotopic|average </note>
16
+
17
+ <note>spectrum conditioning parameters</note>
18
+ <note type="input" label="spectrum, dynamic range">100.0</note>
19
+ <note>The peaks read in are normalized so that the most intense peak
20
+ is set to the dynamic range value. All peaks with values of less than
21
+ 1, using this normalization, are not used. This normalization has the
22
+ overall effect of setting a threshold value for peak intensities.</note>
23
+ <note type="input" label="spectrum, total peaks">50</note>
24
+ <note>If this value is 0, it is ignored. If it is greater than zero (let's say 50),
25
+ then the number of peaks in the spectrum will be limited to the 50 most intense
26
+ peaks in the spectrum. X! tandem does not do any peak finding: it only
27
+ limits the peaks used by this parameter, and the dynamic range parameter.</note>
28
+ <note type="input" label="spectrum, maximum parent charge">4</note>
29
+ <note type="input" label="spectrum, use noise suppression">yes</note>
30
+ <note type="input" label="spectrum, minimum parent m+h">500.0</note>
31
+ <note type="input" label="spectrum, minimum fragment mz">150.0</note>
32
+ <note type="input" label="spectrum, minimum peaks">15</note>
33
+ <note type="input" label="spectrum, threads">1</note>
34
+ <note type="input" label="spectrum, sequence batch size">1000</note>
35
+
36
+ <note>residue modification parameters</note>
37
+ <note type="input" label="residue, modification mass">57.022@C</note>
38
+ <note>The format of this parameter is m@X, where m is the modification
39
+ mass in Daltons and X is the appropriate residue to modify. Lists of
40
+ modifications are separated by commas. For example, to modify M and C
41
+ with the addition of 16.0 Daltons, the parameter line would be
42
+ +16.0@M,+16.0@C
43
+ Positive and negative values are allowed.
44
+ </note>
45
+ <note type="input" label="residue, potential modification mass"></note>
46
+ <note>The format of this parameter is the same as the format
47
+ for residue, modification mass (see above).</note>
48
+ <note type="input" label="residue, potential modification motif"></note>
49
+ <note>The format of this parameter is similar to residue, modification mass,
50
+ with the addition of a modified PROSITE notation sequence motif specification.
51
+ For example, a value of 80@[ST!]PX[KR] indicates a modification
52
+ of either S or T when followed by P, any residue, and then a K or an R.
53
+ A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
54
+ is NOT followed by a P, then either an S or a T, NOT followed by a P.
55
+ Positive and negative values are allowed.
56
+ </note>
57
+
58
+ <note>protein parameters</note>
59
+ <note type="input" label="protein, taxon">other mammals</note>
60
+ <note>This value is interpreted using the information in taxonomy.xml.</note>
61
+ <note type="input" label="protein, cleavage site">[RK]|{P}</note>
62
+ <note>this setting corresponds to the enzyme trypsin. The first characters
63
+ in brackets represent residues N-terminal to the bond - the '|' pipe -
64
+ and the second set of characters represent residues C-terminal to the
65
+ bond. The characters must be in square brackets (denoting that only
66
+ these residues are allowed for a cleavage) or french brackets (denoting
67
+ that these residues cannot be in that position). Use UPPERCASE characters.
68
+ To denote cleavage at any residue, use [X]|[X] and reset the
69
+ scoring, maximum missed cleavage site parameter (see below) to something like 50.
70
+ </note>
71
+ <note type="input" label="protein, modified residue mass file"></note>
72
+ <note type="input" label="protein, cleavage C-terminal mass change">+17.002735</note>
73
+ <note type="input" label="protein, cleavage N-terminal mass change">+1.007825</note>
74
+ <note type="input" label="protein, N-terminal residue modification mass">0.0</note>
75
+ <note type="input" label="protein, C-terminal residue modification mass">0.0</note>
76
+ <note type="input" label="protein, homolog management">no</note>
77
+ <note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
78
+
79
+ <note>model refinement parameters</note>
80
+ <note type="input" label="refine">yes</note>
81
+ <note type="input" label="refine, modification mass"></note>
82
+ <note type="input" label="refine, sequence path"></note>
83
+ <note type="input" label="refine, tic percent">20</note>
84
+ <note type="input" label="refine, spectrum synthesis">yes</note>
85
+ <note type="input" label="refine, maximum valid expectation value">0.1</note>
86
+ <note type="input" label="refine, potential N-terminus modifications">+42.010565@[</note>
87
+ <note type="input" label="refine, potential C-terminus modifications"></note>
88
+ <note type="input" label="refine, unanticipated cleavage">yes</note>
89
+ <note type="input" label="refine, potential modification mass"></note>
90
+ <note type="input" label="refine, point mutations">no</note>
91
+ <note type="input" label="refine, use potential modifications for full refinement">no</note>
92
+ <note type="input" label="refine, point mutations">no</note>
93
+ <note type="input" label="refine, potential modification motif"></note>
94
+ <note>The format of this parameter is similar to residue, modification mass,
95
+ with the addition of a modified PROSITE notation sequence motif specification.
96
+ For example, a value of 80@[ST!]PX[KR] indicates a modification
97
+ of either S or T when followed by P, any residue, and then a K or an R.
98
+ A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
99
+ is NOT followed by a P, then either an S or a T, NOT followed by a P.
100
+ Positive and negative values are allowed.
101
+ </note>
102
+
103
+ <note>scoring parameters</note>
104
+ <note type="input" label="scoring, minimum ion count">4</note>
105
+ <note type="input" label="scoring, maximum missed cleavage sites">1</note>
106
+ <note type="input" label="scoring, x ions">no</note>
107
+ <note type="input" label="scoring, y ions">yes</note>
108
+ <note type="input" label="scoring, z ions">no</note>
109
+ <note type="input" label="scoring, a ions">no</note>
110
+ <note type="input" label="scoring, b ions">yes</note>
111
+ <note type="input" label="scoring, c ions">no</note>
112
+ <note type="input" label="scoring, cyclic permutation">no</note>
113
+ <note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
114
+ <note type="input" label="scoring, include reverse">no</note>
115
+ <note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
116
+ <note type="input" label="scoring, cyclic permutation">no</note>
117
+ <note type="input" label="scoring, include reverse">no</note>
118
+
119
+ <note>output parameters</note>
120
+ <note type="input" label="output, log path"></note>
121
+ <note type="input" label="output, message">testing 1 2 3</note>
122
+ <note type="input" label="output, one sequence copy">no</note>
123
+ <note type="input" label="output, sequence path"></note>
124
+ <note type="input" label="output, path">output.xml</note>
125
+ <note type="input" label="output, sort results by">protein</note>
126
+ <note>values = protein|spectrum (spectrum is the default)</note>
127
+ <note type="input" label="output, path hashing">yes</note>
128
+ <note>values = yes|no</note>
129
+ <note type="input" label="output, xsl path">tandem-style.xsl</note>
130
+ <note type="input" label="output, parameters">yes</note>
131
+ <note>values = yes|no</note>
132
+ <note type="input" label="output, performance">yes</note>
133
+ <note>values = yes|no</note>
134
+ <note type="input" label="output, spectra">yes</note>
135
+ <note>values = yes|no</note>
136
+ <note type="input" label="output, histograms">yes</note>
137
+ <note>values = yes|no</note>
138
+ <note type="input" label="output, proteins">yes</note>
139
+ <note>values = yes|no</note>
140
+ <note type="input" label="output, sequences">yes</note>
141
+ <note>values = yes|no</note>
142
+ <note type="input" label="output, one sequence copy">no</note>
143
+ <note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
144
+ <note type="input" label="output, results">valid</note>
145
+ <note>values = all|valid|stochastic</note>
146
+ <note type="input" label="output, maximum valid expectation value">0.1</note>
147
+ <note>value is used in the valid|stochastic setting of output, results</note>
148
+ <note type="input" label="output, histogram column width">30</note>
149
+ <note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms
150
+ into spread sheet programs easier.</note>
151
+ <note type="description">ADDITIONAL EXPLANATIONS</note>
152
+ <note type="description">Each one of the parameters for X! tandem is entered as a labeled note
153
+ node. In the current version of X!, keep those note nodes
154
+ on a single line.
155
+ </note>
156
+ <note type="description">The presence of the type 'input' is necessary if a note is to be considered
157
+ an input parameter.
158
+ </note>
159
+ <note type="description">Any of the parameters that are paths to files may require alteration for a
160
+ particular installation. Full path names usually cause the least trouble,
161
+ but there is no reason not to use relative path names, if that is the
162
+ most convenient.
163
+ </note>
164
+ <note type="description">Any parameter values set in the 'list path, default parameters' file are
165
+ reset by entries in the normal input file, if they are present. Otherwise,
166
+ the default set is used.
167
+ </note>
168
+ <note type="description">The 'list path, taxonomy information' file must exist.
169
+ </note>
170
+ <note type="description">The directory containing the 'output, path' file must exist: it will not be created.
171
+ </note>
172
+ <note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.
173
+ </note>
174
+
175
+ </bioml>
@@ -0,0 +1,123 @@
1
+ <bioml>
2
+
3
+ <note>spectrum parameters</note>
4
+ <note type="input" label="spectrum, parent monoisotopic mass error minus">2.0</note>
5
+ <note type="input" label="spectrum, parent monoisotopic mass error plus">4.0</note>
6
+ <note>PRECURSOR MASS TOLERANCE. This is monoisotopic mass, so for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly).</note>
7
+ <note type="input" label="spectrum, parent monoisotopic mass isotope error">no</note>
8
+ <note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
9
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
10
+ <note type="input" label="spectrum, parent monoisotopic mass error units">Daltons</note>
11
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
12
+ <note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
13
+ <note>This parameter has no effect in k-score scoring.</note>
14
+ <note type="input" label="spectrum, fragment mass type">monoisotopic</note>
15
+ <note>values are monoisotopic|average </note>
16
+
17
+ <note>spectrum conditioning parameters</note>
18
+ <note type="input" label="spectrum, use conditioning">no</note>
19
+ <note>For k-score scoring, it is recommended spectrum conditioning be turned OFF for best performance. All of the spectrum filtering and preprocessing options below in this section will be inactive.</note>
20
+ <note type="input" label="spectrum, dynamic range">10000.0</note>
21
+ <note type="input" label="spectrum, total peaks">400</note>
22
+ <note type="input" label="spectrum, maximum parent charge">5</note>
23
+ <note type="input" label="spectrum, use noise suppression">yes</note>
24
+ <note type="input" label="spectrum, minimum parent m+h">600.0</note>
25
+ <note type="input" label="spectrum, minimum fragment mz">125.0</note>
26
+ <note type="input" label="spectrum, minimum peaks">10</note>
27
+ <note type="input" label="spectrum, threads">1</note>
28
+ <note type="input" label="spectrum, sequence batch size">1000</note>
29
+
30
+ <note>residue modification parameters</note>
31
+ <note type="input" label="residue, modification mass"></note>
32
+ <note>STATIC MODIFICATION. The format of this parameter is m@X, where m is the modification mass in Daltons and X is the appropriate residue to modify. Lists of modifications are separated by commas. For example, to modify M and C with the addition of 16.0 Daltons, the parameter line would be +16.0@M,+16.0@C. Positive and negative values are allowed.</note>
33
+ <note type="input" label="residue, potential modification mass"></note>
34
+ <note>VARIABLE MODIFICATION. The format of this parameter is the same as the format for residue, modification mass (see above).</note>
35
+ <note type="input" label="residue, potential modification motif"></note>
36
+ <note>VARIABLE MODIFICATION IN A MOTIF. The format of this parameter is similar to residue, modification mass, with the addition of a modified PROSITE notation sequence motif specification. For example, a value of 80@[ST!]PX[KR] indicates a modification of either S or T when followed by P, any residue, and then a K or an R. A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it is NOT followed by a P, then either an S or a T, NOT followed by a P. Positive and negative values are allowed. </note>
37
+
38
+ <note>protein parameters</note>
39
+ <note type="input" label="protein, taxon">no default</note>
40
+ <note>SEQUENCE DATABASE TO SEARCH. This refers to identifiers in taxonomy.xml.</note>
41
+ <note type="input" label="protein, cleavage site">[RK]|{P}</note>
42
+ <note>ENZYME SPECIFICITY. This setting corresponds to the enzyme trypsin. The first characters in brackets represent residues N-terminal to the bond - the '|' pipe - and the second set of characters represent residues C-terminal to the bond. The characters must be in square brackets (denoting that only these residues are allowed for a cleavage) or french brackets (denoting that these residues cannot be in that position). Use UPPERCASE characters. To denote cleavage at any residue, use [X]|[X] and reset the scoring, maximum missed cleavage site parameter (see below) to something like 50. </note>
43
+ <note type="input" label="protein, modified residue mass file"></note>
44
+ <note type="input" label="protein, N-terminal residue modification mass"></note>
45
+ <note type="input" label="protein, C-terminal residue modification mass"></note>
46
+ <note type="input" label="protein, homolog management">no</note>
47
+ <note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
48
+
49
+ <note>model refinement parameters</note>
50
+ <note type="input" label="refine">no</note>
51
+ <note type="input" label="refine, modification mass"></note>
52
+ <note type="input" label="refine, sequence path"></note>
53
+ <note type="input" label="refine, tic percent">10</note>
54
+ <note type="input" label="refine, spectrum synthesis">yes</note>
55
+ <note type="input" label="refine, maximum valid expectation value">0.1</note>
56
+ <note type="input" label="refine, potential N-terminus modifications"></note>
57
+ <note type="input" label="refine, potential C-terminus modifications"></note>
58
+ <note type="input" label="refine, unanticipated cleavage">no</note>
59
+ <note type="input" label="refine, potential modification mass"></note>
60
+ <note type="input" label="refine, point mutations">no</note>
61
+ <note type="input" label="refine, use potential modifications for full refinement">no</note>
62
+ <note type="input" label="refine, point mutations">no</note>
63
+ <note type="input" label="refine, potential modification motif"></note>
64
+
65
+ <note>scoring parameters</note>
66
+
67
+ <note type="input" label="scoring, algorithm">k-score</note>
68
+ <note type="input" label="scoring, minimum ion count">1</note>
69
+ <note type="input" label="scoring, maximum missed cleavage sites">2</note>
70
+ <note type="input" label="scoring, x ions">no</note>
71
+ <note type="input" label="scoring, y ions">yes</note>
72
+ <note type="input" label="scoring, z ions">no</note>
73
+ <note type="input" label="scoring, a ions">no</note>
74
+ <note type="input" label="scoring, b ions">yes</note>
75
+ <note type="input" label="scoring, c ions">no</note>
76
+ <note type="input" label="scoring, cyclic permutation">no</note>
77
+ <note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
78
+ <note type="input" label="scoring, include reverse">no</note>
79
+ <note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
80
+ <note type="input" label="scoring, cyclic permutation">no</note>
81
+ <note type="input" label="scoring, include reverse">no</note>
82
+
83
+ <note>output parameters</note>
84
+ <note type="input" label="output, log path"></note>
85
+ <note type="input" label="output, message">1234567890</note>
86
+ <note type="input" label="output, sequence path"></note>
87
+ <note type="input" label="output, path">output.xml</note>
88
+ <note type="input" label="output, sort results by">spectrum</note>
89
+ <note>values = protein|spectrum (spectrum is the default)</note>
90
+ <note type="input" label="output, path hashing">no</note>
91
+ <note>values = yes|no</note>
92
+ <note type="input" label="output, xsl path">tandem-style.xsl</note>
93
+ <note type="input" label="output, parameters">yes</note>
94
+ <note>values = yes|no</note>
95
+ <note type="input" label="output, performance">yes</note>
96
+ <note>values = yes|no</note>
97
+ <note type="input" label="output, spectra">no</note>
98
+ <note>values = yes|no</note>
99
+ <note type="input" label="output, histograms">no</note>
100
+ <note>values = yes|no</note>
101
+ <note type="input" label="output, proteins">yes</note>
102
+ <note>values = yes|no</note>
103
+ <note type="input" label="output, sequences">no</note>
104
+ <note>values = yes|no</note>
105
+ <note type="input" label="output, one sequence copy">no</note>
106
+ <note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
107
+ <note type="input" label="output, results">all</note>
108
+ <note>values = all|valid|stochastic</note>
109
+ <note type="input" label="output, maximum valid expectation value">0.1</note>
110
+ <note>value is used in the valid|stochastic setting of output, results</note>
111
+ <note type="input" label="output, histogram column width">30</note>
112
+ <note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms into spread sheet programs easier.</note>
113
+
114
+ <note type="description">ADDITIONAL EXPLANATIONS</note>
115
+ <note type="description">Each one of the parameters for X! tandem is entered as a labeled note node. In the current version of X!, keep those note nodes on a single line.</note>
116
+ <note type="description">The presence of the type 'input' is necessary if a note is to be considered an input parameter. </note>
117
+ <note type="description">Any of the parameters that are paths to files may require alteration for a particular installation. Full path names usually cause the least trouble, but there is no reason not to use relative path names, if that is the most convenient.</note>
118
+ <note type="description">Any parameter values set in the 'list path, default parameters' file are reset by entries in the normal input file, if they are present. Otherwise, the default set is used. </note>
119
+ <note type="description">The 'list path, taxonomy information' file must exist.</note>
120
+ <note type="description">The directory containing the 'output, path' file must exist: it will not be created.</note>
121
+ <note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.</note>
122
+
123
+ </bioml>