protk 1.2.6.pre5 → 1.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -24,7 +24,9 @@
24
24
  #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
25
25
 
26
26
 
27
- static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
27
+ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
28
+ VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
+
28
30
  char *input_file = RSTRING_PTR(input_file_in);
29
31
  long sequences_to_generate = NUM2INT(db_length_in);
30
32
  char * output_file = RSTRING_PTR(output_file_in);
@@ -50,10 +52,26 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
50
52
  /* scanning sequence database */
51
53
 
52
54
  strcpy(infile,input_file);
53
- if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
54
- printf("scanning sequence database %s",infile);fflush(stdout);
55
+
56
+ if ((inp = fopen(infile, "r"))==NULL) {
57
+ printf("error opening sequence database %s\n",infile);return -1;
58
+ }
59
+
60
+ printf("scanning sequence database \n%s\n",infile);
61
+ fflush(stdout);
62
+
55
63
  i=0;n=0;k=0;
56
- while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {i++; if(line[0]=='>') {if (!(n%1000)) printf(".");fflush(stdout); n++;} }
64
+
65
+ while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
66
+ i++;
67
+ if(line[0]=='>') {
68
+ if (!(n%1000)) {
69
+ printf(".");
70
+ fflush(stdout);
71
+ n++;
72
+ }
73
+ }
74
+ }
57
75
 
58
76
  n_sequences=n;
59
77
 
@@ -65,11 +83,17 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
65
83
  index=(char**)malloc(sizeof(char*)*n_sequences);
66
84
  index[0]=sequence; /* set first index pointer to beginning of first database sequence */
67
85
 
68
- if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
86
+ if ((inp = fopen(infile, "r"))==NULL) {
87
+ printf("error opening sequence database %s\n",infile);
88
+ return -1;
89
+ }
69
90
 
70
- printf("done\nreading sequence database %s",infile);fflush(stdout);
91
+ printf("done\nreading sequence database \n%s\n",infile);
92
+ fflush(stdout);
93
+
71
94
  n=-1;
72
95
  strcpy(temp_sequence,"\0");
96
+
73
97
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
74
98
  {
75
99
  if (strcmp(line,"\n")==0) {
@@ -98,18 +122,21 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
98
122
  }
99
123
 
100
124
  strcpy(index[n],temp_sequence);
125
+
101
126
  fclose(inp);
102
127
 
103
128
  n_sequences=n+1;
104
129
 
105
130
  printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
131
+
106
132
  measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
107
133
 
108
134
 
109
135
 
110
136
  /* generating Markov probabilities */
111
137
 
112
- printf("generating Markov probability matrix...");fflush(stdout);
138
+ printf("generating Markov probability matrix...");
139
+ fflush(stdout);
113
140
 
114
141
  srand(time(0)); /* replace with constant to re-generate identical random databases */
115
142
 
@@ -124,7 +151,11 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
124
151
 
125
152
  for(protein=0;protein<n_sequences;protein++)
126
153
  {
127
- if (!(protein%1000)) {printf(".");fflush(stdout);}
154
+ if (!(protein%1000)) {
155
+ printf(".");
156
+ fflush(stdout);
157
+ }
158
+
128
159
  if (protein<(n_sequences-1))
129
160
  {
130
161
  strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
@@ -142,35 +173,56 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
142
173
  {
143
174
  printf("Unknown amino acid %c",one_sequence[i]);
144
175
  } else {
145
- a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
146
- MP[a][i]++;
147
- measured_aa_freq[a]++;
176
+ a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
177
+ MP[a][i]++;
178
+ measured_aa_freq[a]++;
148
179
  }
149
180
  }
150
- else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
181
+ else {
182
+ a=floor(20*(float)rand()/RAND_MAX);
183
+ MP[a][i]++;
184
+ measured_aa_freq[a]++;
185
+ } // replace B, X, Z etc. with random amino acid to preserve size distribution
151
186
  }
152
187
  MP[20][pl]++;
153
188
  measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
154
189
  }
155
- printf("done\n"); fflush(stdout);
190
+
191
+ printf("done\n");
192
+ fflush(stdout);
156
193
 
157
194
 
158
195
 
159
- for(i=0;i<MAX_SEQUENCE_LENGTH;i++) row_sum[i]=0;
160
- for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];
196
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
197
+ row_sum[i]=0;
198
+ }
199
+
200
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
201
+ for(j=0;j<=20;j++){
202
+ row_sum[i]+=MP[j][i];
203
+ }
204
+ }
161
205
 
162
206
 
163
207
  /* generate random protein sequences through Markov chain */
164
208
 
165
- strcpy(outfile,output_file);
166
- if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); return -1;}
209
+ strcpy(outfile,output_file);
210
+
211
+ if ((outp = fopen(outfile, "w"))==NULL) {
212
+ printf("error opening output file %s\n",outfile);
213
+ return -1;
214
+ }
215
+
167
216
  printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
168
217
 
169
218
  strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
170
219
 
171
220
  for(protein=0;protein<sequences_to_generate;protein++)
172
221
  {
173
- if (!(protein%1000)) {printf(".");fflush(stdout);}
222
+ if (!(protein%1000)) {
223
+ printf(".");fflush(stdout);
224
+ }
225
+
174
226
  i=0; j=0;
175
227
  while (1)
176
228
  {
@@ -213,9 +265,10 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
213
265
 
214
266
  k=0;l=0;
215
267
  for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
216
- printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
217
- for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
218
- printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
268
+ // printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
269
+ // for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
270
+
271
+ printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
219
272
 
220
273
  return 0;
221
274
 
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('decoymaker')
@@ -7,7 +7,6 @@
7
7
  require 'yaml'
8
8
  require 'logger'
9
9
  require 'pathname'
10
- require 'ftools'
11
10
 
12
11
  class Constants
13
12
 
@@ -77,6 +76,21 @@ class Constants
77
76
  "#{@protk_dir}/tools/tandem"
78
77
  end
79
78
 
79
+ def makeblastdb
80
+ makeblastdbpath=%x[which makeblastdb]
81
+ makeblastdbpath.chomp
82
+ end
83
+
84
+ def blastdbcmd
85
+ path=%x[which blastdbcmd]
86
+ path.chomp
87
+ end
88
+
89
+ def mascot2xml
90
+ path=%x[which Mascot2XML]
91
+ path.chomp
92
+ end
93
+
80
94
  def protein_database_root
81
95
  path=@env['protein_database_root']
82
96
  if ( path =~ /^\// )
@@ -154,7 +168,7 @@ class Constants
154
168
 
155
169
  ENV['PATH']=protk_paths.join(":")
156
170
 
157
-
171
+ # puts "Path #{ENV['PATH']}"
158
172
  throw "No data found in config file" unless @env!=nil
159
173
  @info_level=default_config_yml['message_level']
160
174
 
@@ -12,4 +12,5 @@ uniprot_sprot_annotation_database: swissprot_annotation
12
12
  uniprot_trembl_annotation_database: trembl_annotation
13
13
  galaxy_root: galaxy
14
14
  default_mascot_server: www.matrixscience.com
15
- log_file: Logs/protk.log
15
+ log_file: Logs/protk.log
16
+
@@ -0,0 +1,175 @@
1
+ <?xml version="1.0"?>
2
+ <?xml-stylesheet type="text/xsl" href="tandem-input-style.xsl"?>
3
+ <bioml>
4
+
5
+ <note>spectrum parameters</note>
6
+ <note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
7
+ <note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
8
+ <note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
9
+ <note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
10
+ <note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
11
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
12
+ <note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
13
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
14
+ <note type="input" label="spectrum, fragment mass type">monoisotopic</note>
15
+ <note>values are monoisotopic|average </note>
16
+
17
+ <note>spectrum conditioning parameters</note>
18
+ <note type="input" label="spectrum, dynamic range">100.0</note>
19
+ <note>The peaks read in are normalized so that the most intense peak
20
+ is set to the dynamic range value. All peaks with values of less than
21
+ 1, using this normalization, are not used. This normalization has the
22
+ overall effect of setting a threshold value for peak intensities.</note>
23
+ <note type="input" label="spectrum, total peaks">50</note>
24
+ <note>If this value is 0, it is ignored. If it is greater than zero (let's say 50),
25
+ then the number of peaks in the spectrum will be limited to the 50 most intense
26
+ peaks in the spectrum. X! tandem does not do any peak finding: it only
27
+ limits the peaks used by this parameter, and the dynamic range parameter.</note>
28
+ <note type="input" label="spectrum, maximum parent charge">4</note>
29
+ <note type="input" label="spectrum, use noise suppression">yes</note>
30
+ <note type="input" label="spectrum, minimum parent m+h">500.0</note>
31
+ <note type="input" label="spectrum, minimum fragment mz">150.0</note>
32
+ <note type="input" label="spectrum, minimum peaks">15</note>
33
+ <note type="input" label="spectrum, threads">1</note>
34
+ <note type="input" label="spectrum, sequence batch size">1000</note>
35
+
36
+ <note>residue modification parameters</note>
37
+ <note type="input" label="residue, modification mass">57.022@C</note>
38
+ <note>The format of this parameter is m@X, where m is the modification
39
+ mass in Daltons and X is the appropriate residue to modify. Lists of
40
+ modifications are separated by commas. For example, to modify M and C
41
+ with the addition of 16.0 Daltons, the parameter line would be
42
+ +16.0@M,+16.0@C
43
+ Positive and negative values are allowed.
44
+ </note>
45
+ <note type="input" label="residue, potential modification mass"></note>
46
+ <note>The format of this parameter is the same as the format
47
+ for residue, modification mass (see above).</note>
48
+ <note type="input" label="residue, potential modification motif"></note>
49
+ <note>The format of this parameter is similar to residue, modification mass,
50
+ with the addition of a modified PROSITE notation sequence motif specification.
51
+ For example, a value of 80@[ST!]PX[KR] indicates a modification
52
+ of either S or T when followed by P, any residue, and then a K or an R.
53
+ A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
54
+ is NOT followed by a P, then either an S or a T, NOT followed by a P.
55
+ Positive and negative values are allowed.
56
+ </note>
57
+
58
+ <note>protein parameters</note>
59
+ <note type="input" label="protein, taxon">other mammals</note>
60
+ <note>This value is interpreted using the information in taxonomy.xml.</note>
61
+ <note type="input" label="protein, cleavage site">[RK]|{P}</note>
62
+ <note>this setting corresponds to the enzyme trypsin. The first characters
63
+ in brackets represent residues N-terminal to the bond - the '|' pipe -
64
+ and the second set of characters represent residues C-terminal to the
65
+ bond. The characters must be in square brackets (denoting that only
66
+ these residues are allowed for a cleavage) or french brackets (denoting
67
+ that these residues cannot be in that position). Use UPPERCASE characters.
68
+ To denote cleavage at any residue, use [X]|[X] and reset the
69
+ scoring, maximum missed cleavage site parameter (see below) to something like 50.
70
+ </note>
71
+ <note type="input" label="protein, modified residue mass file"></note>
72
+ <note type="input" label="protein, cleavage C-terminal mass change">+17.002735</note>
73
+ <note type="input" label="protein, cleavage N-terminal mass change">+1.007825</note>
74
+ <note type="input" label="protein, N-terminal residue modification mass">0.0</note>
75
+ <note type="input" label="protein, C-terminal residue modification mass">0.0</note>
76
+ <note type="input" label="protein, homolog management">no</note>
77
+ <note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
78
+
79
+ <note>model refinement parameters</note>
80
+ <note type="input" label="refine">yes</note>
81
+ <note type="input" label="refine, modification mass"></note>
82
+ <note type="input" label="refine, sequence path"></note>
83
+ <note type="input" label="refine, tic percent">20</note>
84
+ <note type="input" label="refine, spectrum synthesis">yes</note>
85
+ <note type="input" label="refine, maximum valid expectation value">0.1</note>
86
+ <note type="input" label="refine, potential N-terminus modifications">+42.010565@[</note>
87
+ <note type="input" label="refine, potential C-terminus modifications"></note>
88
+ <note type="input" label="refine, unanticipated cleavage">yes</note>
89
+ <note type="input" label="refine, potential modification mass"></note>
90
+ <note type="input" label="refine, point mutations">no</note>
91
+ <note type="input" label="refine, use potential modifications for full refinement">no</note>
92
+ <note type="input" label="refine, point mutations">no</note>
93
+ <note type="input" label="refine, potential modification motif"></note>
94
+ <note>The format of this parameter is similar to residue, modification mass,
95
+ with the addition of a modified PROSITE notation sequence motif specification.
96
+ For example, a value of 80@[ST!]PX[KR] indicates a modification
97
+ of either S or T when followed by P, any residue, and then a K or an R.
98
+ A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
99
+ is NOT followed by a P, then either an S or a T, NOT followed by a P.
100
+ Positive and negative values are allowed.
101
+ </note>
102
+
103
+ <note>scoring parameters</note>
104
+ <note type="input" label="scoring, minimum ion count">4</note>
105
+ <note type="input" label="scoring, maximum missed cleavage sites">1</note>
106
+ <note type="input" label="scoring, x ions">no</note>
107
+ <note type="input" label="scoring, y ions">yes</note>
108
+ <note type="input" label="scoring, z ions">no</note>
109
+ <note type="input" label="scoring, a ions">no</note>
110
+ <note type="input" label="scoring, b ions">yes</note>
111
+ <note type="input" label="scoring, c ions">no</note>
112
+ <note type="input" label="scoring, cyclic permutation">no</note>
113
+ <note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
114
+ <note type="input" label="scoring, include reverse">no</note>
115
+ <note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
116
+ <note type="input" label="scoring, cyclic permutation">no</note>
117
+ <note type="input" label="scoring, include reverse">no</note>
118
+
119
+ <note>output parameters</note>
120
+ <note type="input" label="output, log path"></note>
121
+ <note type="input" label="output, message">testing 1 2 3</note>
122
+ <note type="input" label="output, one sequence copy">no</note>
123
+ <note type="input" label="output, sequence path"></note>
124
+ <note type="input" label="output, path">output.xml</note>
125
+ <note type="input" label="output, sort results by">protein</note>
126
+ <note>values = protein|spectrum (spectrum is the default)</note>
127
+ <note type="input" label="output, path hashing">yes</note>
128
+ <note>values = yes|no</note>
129
+ <note type="input" label="output, xsl path">tandem-style.xsl</note>
130
+ <note type="input" label="output, parameters">yes</note>
131
+ <note>values = yes|no</note>
132
+ <note type="input" label="output, performance">yes</note>
133
+ <note>values = yes|no</note>
134
+ <note type="input" label="output, spectra">yes</note>
135
+ <note>values = yes|no</note>
136
+ <note type="input" label="output, histograms">yes</note>
137
+ <note>values = yes|no</note>
138
+ <note type="input" label="output, proteins">yes</note>
139
+ <note>values = yes|no</note>
140
+ <note type="input" label="output, sequences">yes</note>
141
+ <note>values = yes|no</note>
142
+ <note type="input" label="output, one sequence copy">no</note>
143
+ <note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
144
+ <note type="input" label="output, results">valid</note>
145
+ <note>values = all|valid|stochastic</note>
146
+ <note type="input" label="output, maximum valid expectation value">0.1</note>
147
+ <note>value is used in the valid|stochastic setting of output, results</note>
148
+ <note type="input" label="output, histogram column width">30</note>
149
+ <note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms
150
+ into spread sheet programs easier.</note>
151
+ <note type="description">ADDITIONAL EXPLANATIONS</note>
152
+ <note type="description">Each one of the parameters for X! tandem is entered as a labeled note
153
+ node. In the current version of X!, keep those note nodes
154
+ on a single line.
155
+ </note>
156
+ <note type="description">The presence of the type 'input' is necessary if a note is to be considered
157
+ an input parameter.
158
+ </note>
159
+ <note type="description">Any of the parameters that are paths to files may require alteration for a
160
+ particular installation. Full path names usually cause the least trouble,
161
+ but there is no reason not to use relative path names, if that is the
162
+ most convenient.
163
+ </note>
164
+ <note type="description">Any parameter values set in the 'list path, default parameters' file are
165
+ reset by entries in the normal input file, if they are present. Otherwise,
166
+ the default set is used.
167
+ </note>
168
+ <note type="description">The 'list path, taxonomy information' file must exist.
169
+ </note>
170
+ <note type="description">The directory containing the 'output, path' file must exist: it will not be created.
171
+ </note>
172
+ <note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.
173
+ </note>
174
+
175
+ </bioml>
@@ -0,0 +1,123 @@
1
+ <bioml>
2
+
3
+ <note>spectrum parameters</note>
4
+ <note type="input" label="spectrum, parent monoisotopic mass error minus">2.0</note>
5
+ <note type="input" label="spectrum, parent monoisotopic mass error plus">4.0</note>
6
+ <note>PRECURSOR MASS TOLERANCE. This is monoisotopic mass, so for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly).</note>
7
+ <note type="input" label="spectrum, parent monoisotopic mass isotope error">no</note>
8
+ <note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
9
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
10
+ <note type="input" label="spectrum, parent monoisotopic mass error units">Daltons</note>
11
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
12
+ <note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
13
+ <note>This parameter has no effect in k-score scoring.</note>
14
+ <note type="input" label="spectrum, fragment mass type">monoisotopic</note>
15
+ <note>values are monoisotopic|average </note>
16
+
17
+ <note>spectrum conditioning parameters</note>
18
+ <note type="input" label="spectrum, use conditioning">no</note>
19
+ <note>For k-score scoring, it is recommended spectrum conditioning be turned OFF for best performance. All of the spectrum filtering and preprocessing options below in this section will be inactive.</note>
20
+ <note type="input" label="spectrum, dynamic range">10000.0</note>
21
+ <note type="input" label="spectrum, total peaks">400</note>
22
+ <note type="input" label="spectrum, maximum parent charge">5</note>
23
+ <note type="input" label="spectrum, use noise suppression">yes</note>
24
+ <note type="input" label="spectrum, minimum parent m+h">600.0</note>
25
+ <note type="input" label="spectrum, minimum fragment mz">125.0</note>
26
+ <note type="input" label="spectrum, minimum peaks">10</note>
27
+ <note type="input" label="spectrum, threads">1</note>
28
+ <note type="input" label="spectrum, sequence batch size">1000</note>
29
+
30
+ <note>residue modification parameters</note>
31
+ <note type="input" label="residue, modification mass"></note>
32
+ <note>STATIC MODIFICATION. The format of this parameter is m@X, where m is the modification mass in Daltons and X is the appropriate residue to modify. Lists of modifications are separated by commas. For example, to modify M and C with the addition of 16.0 Daltons, the parameter line would be +16.0@M,+16.0@C. Positive and negative values are allowed.</note>
33
+ <note type="input" label="residue, potential modification mass"></note>
34
+ <note>VARIABLE MODIFICATION. The format of this parameter is the same as the format for residue, modification mass (see above).</note>
35
+ <note type="input" label="residue, potential modification motif"></note>
36
+ <note>VARIABLE MODIFICATION IN A MOTIF. The format of this parameter is similar to residue, modification mass, with the addition of a modified PROSITE notation sequence motif specification. For example, a value of 80@[ST!]PX[KR] indicates a modification of either S or T when followed by P, any residue, and then a K or an R. A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it is NOT followed by a P, then either an S or a T, NOT followed by a P. Positive and negative values are allowed. </note>
37
+
38
+ <note>protein parameters</note>
39
+ <note type="input" label="protein, taxon">no default</note>
40
+ <note>SEQUENCE DATABASE TO SEARCH. This refers to identifiers in taxonomy.xml.</note>
41
+ <note type="input" label="protein, cleavage site">[RK]|{P}</note>
42
+ <note>ENZYME SPECIFICITY. This setting corresponds to the enzyme trypsin. The first characters in brackets represent residues N-terminal to the bond - the '|' pipe - and the second set of characters represent residues C-terminal to the bond. The characters must be in square brackets (denoting that only these residues are allowed for a cleavage) or french brackets (denoting that these residues cannot be in that position). Use UPPERCASE characters. To denote cleavage at any residue, use [X]|[X] and reset the scoring, maximum missed cleavage site parameter (see below) to something like 50. </note>
43
+ <note type="input" label="protein, modified residue mass file"></note>
44
+ <note type="input" label="protein, N-terminal residue modification mass"></note>
45
+ <note type="input" label="protein, C-terminal residue modification mass"></note>
46
+ <note type="input" label="protein, homolog management">no</note>
47
+ <note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
48
+
49
+ <note>model refinement parameters</note>
50
+ <note type="input" label="refine">no</note>
51
+ <note type="input" label="refine, modification mass"></note>
52
+ <note type="input" label="refine, sequence path"></note>
53
+ <note type="input" label="refine, tic percent">10</note>
54
+ <note type="input" label="refine, spectrum synthesis">yes</note>
55
+ <note type="input" label="refine, maximum valid expectation value">0.1</note>
56
+ <note type="input" label="refine, potential N-terminus modifications"></note>
57
+ <note type="input" label="refine, potential C-terminus modifications"></note>
58
+ <note type="input" label="refine, unanticipated cleavage">no</note>
59
+ <note type="input" label="refine, potential modification mass"></note>
60
+ <note type="input" label="refine, point mutations">no</note>
61
+ <note type="input" label="refine, use potential modifications for full refinement">no</note>
62
+ <note type="input" label="refine, point mutations">no</note>
63
+ <note type="input" label="refine, potential modification motif"></note>
64
+
65
+ <note>scoring parameters</note>
66
+
67
+ <note type="input" label="scoring, algorithm">k-score</note>
68
+ <note type="input" label="scoring, minimum ion count">1</note>
69
+ <note type="input" label="scoring, maximum missed cleavage sites">2</note>
70
+ <note type="input" label="scoring, x ions">no</note>
71
+ <note type="input" label="scoring, y ions">yes</note>
72
+ <note type="input" label="scoring, z ions">no</note>
73
+ <note type="input" label="scoring, a ions">no</note>
74
+ <note type="input" label="scoring, b ions">yes</note>
75
+ <note type="input" label="scoring, c ions">no</note>
76
+ <note type="input" label="scoring, cyclic permutation">no</note>
77
+ <note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
78
+ <note type="input" label="scoring, include reverse">no</note>
79
+ <note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
80
+ <note type="input" label="scoring, cyclic permutation">no</note>
81
+ <note type="input" label="scoring, include reverse">no</note>
82
+
83
+ <note>output parameters</note>
84
+ <note type="input" label="output, log path"></note>
85
+ <note type="input" label="output, message">1234567890</note>
86
+ <note type="input" label="output, sequence path"></note>
87
+ <note type="input" label="output, path">output.xml</note>
88
+ <note type="input" label="output, sort results by">spectrum</note>
89
+ <note>values = protein|spectrum (spectrum is the default)</note>
90
+ <note type="input" label="output, path hashing">no</note>
91
+ <note>values = yes|no</note>
92
+ <note type="input" label="output, xsl path">tandem-style.xsl</note>
93
+ <note type="input" label="output, parameters">yes</note>
94
+ <note>values = yes|no</note>
95
+ <note type="input" label="output, performance">yes</note>
96
+ <note>values = yes|no</note>
97
+ <note type="input" label="output, spectra">no</note>
98
+ <note>values = yes|no</note>
99
+ <note type="input" label="output, histograms">no</note>
100
+ <note>values = yes|no</note>
101
+ <note type="input" label="output, proteins">yes</note>
102
+ <note>values = yes|no</note>
103
+ <note type="input" label="output, sequences">no</note>
104
+ <note>values = yes|no</note>
105
+ <note type="input" label="output, one sequence copy">no</note>
106
+ <note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
107
+ <note type="input" label="output, results">all</note>
108
+ <note>values = all|valid|stochastic</note>
109
+ <note type="input" label="output, maximum valid expectation value">0.1</note>
110
+ <note>value is used in the valid|stochastic setting of output, results</note>
111
+ <note type="input" label="output, histogram column width">30</note>
112
+ <note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms into spread sheet programs easier.</note>
113
+
114
+ <note type="description">ADDITIONAL EXPLANATIONS</note>
115
+ <note type="description">Each one of the parameters for X! tandem is entered as a labeled note node. In the current version of X!, keep those note nodes on a single line.</note>
116
+ <note type="description">The presence of the type 'input' is necessary if a note is to be considered an input parameter. </note>
117
+ <note type="description">Any of the parameters that are paths to files may require alteration for a particular installation. Full path names usually cause the least trouble, but there is no reason not to use relative path names, if that is the most convenient.</note>
118
+ <note type="description">Any parameter values set in the 'list path, default parameters' file are reset by entries in the normal input file, if they are present. Otherwise, the default set is used. </note>
119
+ <note type="description">The 'list path, taxonomy information' file must exist.</note>
120
+ <note type="description">The directory containing the 'output, path' file must exist: it will not be created.</note>
121
+ <note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.</note>
122
+
123
+ </bioml>