RubyGems - protk - Versions diffs - 1.2.6.pre5 → 1.3.0.pre1 - Mend

protk 1.2.6.pre5 → 1.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

checksums.yaml +4 -4
data/README.md +84 -45
data/bin/add_retention_times.rb +9 -5
data/bin/augustus_to_proteindb.rb +7 -11
data/bin/interprophet.rb +28 -46
data/bin/make_decoy.rb +16 -48
data/bin/mascot_search.rb +57 -71
data/bin/mascot_to_pepxml.rb +13 -26
data/bin/msgfplus_search.rb +70 -107
data/bin/omssa_search.rb +52 -109
data/bin/peptide_prophet.rb +44 -119
data/bin/pepxml_to_table.rb +24 -27
data/bin/protein_prophet.rb +22 -82
data/bin/protxml_to_gff.rb +22 -519
data/bin/protxml_to_table.rb +2 -16
data/bin/sixframe.rb +10 -32
data/bin/tandem_search.rb +30 -403
data/bin/tandem_to_pepxml.rb +43 -0
data/bin/unimod_to_loc.rb +1 -1
data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
data/ext/decoymaker/extconf.rb +3 -0
data/lib/protk/constants.rb +16 -2
data/lib/protk/data/default_config.yml +2 -1
data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
data/lib/protk/data/tandem_params.xml +17 -54
data/lib/protk/fastadb.rb +2 -2
data/lib/protk/prophet_tool.rb +1 -1
data/lib/protk/protxml_to_gff_tool.rb +474 -0
data/lib/protk/search_tool.rb +58 -103
data/lib/protk/setup_rakefile.rake +9 -5
data/lib/protk/tandem_search_tool.rb +256 -0
data/lib/protk/tool.rb +85 -104
data/lib/protk.rb +1 -6
metadata +24 -103
data/bin/annotate_ids.rb +0 -59
data/bin/asapratio.rb +0 -27
data/bin/blastxml_to_table.rb +0 -119
data/bin/correct_omssa_retention_times.rb +0 -27
data/bin/feature_finder.rb +0 -95
data/bin/file_convert.rb +0 -164
data/bin/generate_omssa_loc.rb +0 -42
data/bin/gffmerge.rb +0 -208
data/bin/libra.rb +0 -70
data/bin/toppas_pipeline.rb +0 -84
data/bin/uniprot_annotation.rb +0 -141
data/bin/xls_to_table.rb +0 -52
data/bin/xpress.rb +0 -27
data/ext/protk/decoymaker/extconf.rb +0 -3
data/ext/protk/simplealign/extconf.rb +0 -3
data/lib/protk/biotools_excel_converter.rb +0 -60
data/lib/protk/eupathdb_gene_information_table.rb +0 -158
data/lib/protk/gapped_aligner.rb +0 -264
data/lib/protk/protein_annotator.rb +0 -646
data/lib/protk/spreadsheet_extensions.rb +0 -79
data/lib/protk/xtandem_defaults.rb +0 -11

data/ext/{protk/decoymaker → decoymaker}/decoymaker.c RENAMED Viewed

@@ -24,7 +24,9 @@
 #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
-static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
+static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
+  VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
   char *input_file = RSTRING_PTR(input_file_in);
   long sequences_to_generate = NUM2INT(db_length_in);
   char * output_file = RSTRING_PTR(output_file_in);
@@ -50,10 +52,26 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
   /* scanning sequence database */
   strcpy(infile,input_file);
-  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
-  printf("scanning sequence database %s",infile);fflush(stdout);
+  if ((inp = fopen(infile, "r"))==NULL) {
+    printf("error opening sequence database %s\n",infile);return -1;
+  }
+  printf("scanning sequence database \n%s\n",infile);
+  fflush(stdout);
   i=0;n=0;k=0;
-  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {i++; if(line[0]=='>') {if (!(n%1000)) printf(".");fflush(stdout); n++;} }
+  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
+    i++;
+    if(line[0]=='>') {
+      if (!(n%1000)) {
+        printf(".");
+        fflush(stdout);
+        n++;
+      }
+    }
+  }
   n_sequences=n;
@@ -65,11 +83,17 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
   index=(char**)malloc(sizeof(char*)*n_sequences);
   index[0]=sequence; /* set first index pointer to beginning of first database sequence */
-  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
+  if ((inp = fopen(infile, "r"))==NULL) {
+    printf("error opening sequence database %s\n",infile);
+    return -1;
+  }
-  printf("done\nreading sequence database %s",infile);fflush(stdout);
+  printf("done\nreading sequence database \n%s\n",infile);
+  fflush(stdout);
   n=-1;
   strcpy(temp_sequence,"\0");
   while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
   {
     if (strcmp(line,"\n")==0) {
@@ -98,18 +122,21 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
   }
   strcpy(index[n],temp_sequence);
   fclose(inp);
   n_sequences=n+1;
   printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
   measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
   /* generating Markov probabilities */
-  printf("generating Markov probability matrix...");fflush(stdout);
+  printf("generating Markov probability matrix...");
+  fflush(stdout);
   srand(time(0)); /* replace with constant to re-generate identical random databases */
@@ -124,7 +151,11 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
   for(protein=0;protein<n_sequences;protein++)
   {
-    if (!(protein%1000)) {printf(".");fflush(stdout);}
+    if (!(protein%1000)) {
+      printf(".");
+      fflush(stdout);
+    }
     if (protein<(n_sequences-1))
     {
      strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
@@ -142,35 +173,56 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
       {
         printf("Unknown amino acid %c",one_sequence[i]);
       } else {
-                a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
-                MP[a][i]++;
-                measured_aa_freq[a]++;
+        a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
+        MP[a][i]++;
+        measured_aa_freq[a]++;
       }
   }
-    else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
+    else {
+    a=floor(20*(float)rand()/RAND_MAX);
+    MP[a][i]++;
+    measured_aa_freq[a]++;
+    } // replace B, X, Z etc. with random amino acid to preserve size distribution
   }
   MP[20][pl]++;
       measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
     }
-    printf("done\n"); fflush(stdout);
+    printf("done\n");
+    fflush(stdout);
-  for(i=0;i<MAX_SEQUENCE_LENGTH;i++) row_sum[i]=0;
-    for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];
+  for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
+     row_sum[i]=0;
+  }
+  for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
+    for(j=0;j<=20;j++){
+      row_sum[i]+=MP[j][i];
+    }
+  }
   /* generate random protein sequences through Markov chain */
-      strcpy(outfile,output_file);
-    if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); return -1;}
+  strcpy(outfile,output_file);
+    if ((outp = fopen(outfile, "w"))==NULL) {
+      printf("error opening output file %s\n",outfile);
+      return -1;
+    }
     printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
     strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
     for(protein=0;protein<sequences_to_generate;protein++)
     {
-      if (!(protein%1000)) {printf(".");fflush(stdout);}
+      if (!(protein%1000)) {
+        printf(".");fflush(stdout);
+      }
       i=0; j=0;
       while (1)
       {
@@ -213,9 +265,10 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_leng
   k=0;l=0;
   for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
-    printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
-  for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
-    printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
+    // printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
+  // for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
+  printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
   return 0;

data/ext/decoymaker/extconf.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require 'mkmf'
+create_makefile('decoymaker')

data/lib/protk/constants.rb CHANGED Viewed

@@ -7,7 +7,6 @@
 require 'yaml'
 require 'logger'
 require 'pathname'
-require 'ftools'
 class Constants
@@ -77,6 +76,21 @@ class Constants
       "#{@protk_dir}/tools/tandem"
   end
+  def makeblastdb
+    makeblastdbpath=%x[which makeblastdb]
+    makeblastdbpath.chomp
+  end
+  def blastdbcmd
+    path=%x[which blastdbcmd]
+    path.chomp
+  end
+  def mascot2xml
+    path=%x[which Mascot2XML]
+    path.chomp
+  end
   def protein_database_root
     path=@env['protein_database_root']
     if ( path =~ /^\// )
@@ -154,7 +168,7 @@ class Constants
     ENV['PATH']=protk_paths.join(":")
+    # puts "Path #{ENV['PATH']}"
     throw "No data found in config file" unless @env!=nil
     @info_level=default_config_yml['message_level']

data/lib/protk/data/default_config.yml CHANGED Viewed

@@ -12,4 +12,5 @@ uniprot_sprot_annotation_database: swissprot_annotation
 uniprot_trembl_annotation_database: trembl_annotation
 galaxy_root: galaxy
 default_mascot_server: www.matrixscience.com
-log_file: Logs/protk.log
+log_file: Logs/protk.log

data/lib/protk/data/tandem_gpm_defaults.xml ADDED Viewed

@@ -0,0 +1,175 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="tandem-input-style.xsl"?>
+<bioml>
+<note>spectrum parameters</note>
+	<note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
+	<note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
+	<note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
+	<note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
+	<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
+	<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
+	<note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
+		<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
+	<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
+		<note>values are monoisotopic|average </note>
+<note>spectrum conditioning parameters</note>
+	<note type="input" label="spectrum, dynamic range">100.0</note>
+		<note>The peaks read in are normalized so that the most intense peak
+		is set to the dynamic range value. All peaks with values of less than
+		1, using this normalization, are not used. This normalization has the
+		overall effect of setting a threshold value for peak intensities.</note>
+	<note type="input" label="spectrum, total peaks">50</note>
+		<note>If this value is 0, it is ignored. If it is greater than zero (let's say 50),
+		then the number of peaks in the spectrum will be limited to the 50 most intense
+		peaks in the spectrum. X! tandem does not do any peak finding: it only
+		limits the peaks used by this parameter, and the dynamic range parameter.</note>
+	<note type="input" label="spectrum, maximum parent charge">4</note>
+	<note type="input" label="spectrum, use noise suppression">yes</note>
+	<note type="input" label="spectrum, minimum parent m+h">500.0</note>
+	<note type="input" label="spectrum, minimum fragment mz">150.0</note>
+	<note type="input" label="spectrum, minimum peaks">15</note>
+	<note type="input" label="spectrum, threads">1</note>
+	<note type="input" label="spectrum, sequence batch size">1000</note>
+<note>residue modification parameters</note>
+	<note type="input" label="residue, modification mass">57.022@C</note>
+		<note>The format of this parameter is m@X, where m is the modification
+		mass in Daltons and X is the appropriate residue to modify. Lists of
+		modifications are separated by commas. For example, to modify M and C
+		with the addition of 16.0 Daltons, the parameter line would be
+		+16.0@M,+16.0@C
+		Positive and negative values are allowed.
+		</note>
+	<note type="input" label="residue, potential modification mass"></note>
+		<note>The format of this parameter is the same as the format
+		for residue, modification mass (see above).</note>
+	<note type="input" label="residue, potential modification motif"></note>
+		<note>The format of this parameter is similar to residue, modification mass,
+		with the addition of a modified PROSITE notation sequence motif specification.
+		For example, a value of 80@[ST!]PX[KR] indicates a modification
+		of either S or T when followed by P, any residue, and then a K or an R.
+		A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
+		is NOT followed by a P, then either an S or a T, NOT followed by a P.
+		Positive and negative values are allowed.
+		</note>
+<note>protein parameters</note>
+	<note type="input" label="protein, taxon">other mammals</note>
+		<note>This value is interpreted using the information in taxonomy.xml.</note>
+	<note type="input" label="protein, cleavage site">[RK]|{P}</note>
+		<note>this setting corresponds to the enzyme trypsin. The first characters
+		in brackets represent residues N-terminal to the bond - the '|' pipe -
+		and the second set of characters represent residues C-terminal to the
+		bond. The characters must be in square brackets (denoting that only
+		these residues are allowed for a cleavage) or french brackets (denoting
+		that these residues cannot be in that position). Use UPPERCASE characters.
+		To denote cleavage at any residue, use [X]|[X] and reset the
+		scoring, maximum missed cleavage site parameter (see below) to something like 50.
+		</note>
+	<note type="input" label="protein, modified residue mass file"></note>
+	<note type="input" label="protein, cleavage C-terminal mass change">+17.002735</note>
+	<note type="input" label="protein, cleavage N-terminal mass change">+1.007825</note>
+	<note type="input" label="protein, N-terminal residue modification mass">0.0</note>
+	<note type="input" label="protein, C-terminal residue modification mass">0.0</note>
+	<note type="input" label="protein, homolog management">no</note>
+		<note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
+<note>model refinement parameters</note>
+	<note type="input" label="refine">yes</note>
+	<note type="input" label="refine, modification mass"></note>
+	<note type="input" label="refine, sequence path"></note>
+	<note type="input" label="refine, tic percent">20</note>
+	<note type="input" label="refine, spectrum synthesis">yes</note>
+	<note type="input" label="refine, maximum valid expectation value">0.1</note>
+	<note type="input" label="refine, potential N-terminus modifications">+42.010565@[</note>
+	<note type="input" label="refine, potential C-terminus modifications"></note>
+	<note type="input" label="refine, unanticipated cleavage">yes</note>
+	<note type="input" label="refine, potential modification mass"></note>
+	<note type="input" label="refine, point mutations">no</note>
+	<note type="input" label="refine, use potential modifications for full refinement">no</note>
+	<note type="input" label="refine, point mutations">no</note>
+	<note type="input" label="refine, potential modification motif"></note>
+	<note>The format of this parameter is similar to residue, modification mass,
+		with the addition of a modified PROSITE notation sequence motif specification.
+		For example, a value of 80@[ST!]PX[KR] indicates a modification
+		of either S or T when followed by P, any residue, and then a K or an R.
+		A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it
+		is NOT followed by a P, then either an S or a T, NOT followed by a P.
+		Positive and negative values are allowed.
+		</note>
+<note>scoring parameters</note>
+	<note type="input" label="scoring, minimum ion count">4</note>
+	<note type="input" label="scoring, maximum missed cleavage sites">1</note>
+	<note type="input" label="scoring, x ions">no</note>
+	<note type="input" label="scoring, y ions">yes</note>
+	<note type="input" label="scoring, z ions">no</note>
+	<note type="input" label="scoring, a ions">no</note>
+	<note type="input" label="scoring, b ions">yes</note>
+	<note type="input" label="scoring, c ions">no</note>
+	<note type="input" label="scoring, cyclic permutation">no</note>
+		<note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
+	<note type="input" label="scoring, include reverse">no</note>
+		<note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
+	<note type="input" label="scoring, cyclic permutation">no</note>
+	<note type="input" label="scoring, include reverse">no</note>
+<note>output parameters</note>
+	<note type="input" label="output, log path"></note>
+	<note type="input" label="output, message">testing 1 2 3</note>
+	<note type="input" label="output, one sequence copy">no</note>
+	<note type="input" label="output, sequence path"></note>
+	<note type="input" label="output, path">output.xml</note>
+	<note type="input" label="output, sort results by">protein</note>
+		<note>values = protein|spectrum (spectrum is the default)</note>
+	<note type="input" label="output, path hashing">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, xsl path">tandem-style.xsl</note>
+	<note type="input" label="output, parameters">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, performance">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, spectra">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, histograms">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, proteins">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, sequences">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, one sequence copy">no</note>
+		<note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
+	<note type="input" label="output, results">valid</note>
+		<note>values = all|valid|stochastic</note>
+	<note type="input" label="output, maximum valid expectation value">0.1</note>
+		<note>value is used in the valid|stochastic setting of output, results</note>
+	<note type="input" label="output, histogram column width">30</note>
+		<note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms
+		into spread sheet programs easier.</note>
+<note type="description">ADDITIONAL EXPLANATIONS</note>
+	<note type="description">Each one of the parameters for X! tandem is entered as a labeled note
+			node. In the current version of X!, keep those note nodes
+			on a single line.
+	</note>
+	<note type="description">The presence of the type 'input' is necessary if a note is to be considered
+			an input parameter.
+	</note>
+	<note type="description">Any of the parameters that are paths to files may require alteration for a
+			particular installation. Full path names usually cause the least trouble,
+			but there is no reason not to use relative path names, if that is the
+			most convenient.
+	</note>
+	<note type="description">Any parameter values set in the 'list path, default parameters' file are
+			reset by entries in the normal input file, if they are present. Otherwise,
+			the default set is used.
+	</note>
+	<note type="description">The 'list path, taxonomy information' file must exist.
+		</note>
+	<note type="description">The directory containing the 'output, path' file must exist: it will not be created.
+		</note>
+	<note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.
+		</note>
+</bioml>

data/lib/protk/data/tandem_isb_kscore_defaults.xml ADDED Viewed

@@ -0,0 +1,123 @@
+<bioml>
+<note>spectrum parameters</note>
+	<note type="input" label="spectrum, parent monoisotopic mass error minus">2.0</note>
+	<note type="input" label="spectrum, parent monoisotopic mass error plus">4.0</note>
+	        <note>PRECURSOR MASS TOLERANCE. This is monoisotopic mass, so for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly).</note>
+	<note type="input" label="spectrum, parent monoisotopic mass isotope error">no</note>
+	<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
+		<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
+	<note type="input" label="spectrum, parent monoisotopic mass error units">Daltons</note>
+		<note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
+	<note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
+	        <note>This parameter has no effect in k-score scoring.</note>
+	<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
+		<note>values are monoisotopic|average </note>
+<note>spectrum conditioning parameters</note>
+        <note type="input" label="spectrum, use conditioning">no</note>
+		<note>For k-score scoring, it is recommended spectrum conditioning be turned OFF for best performance. All of the spectrum filtering and preprocessing options below in this section will be inactive.</note>
+	<note type="input" label="spectrum, dynamic range">10000.0</note>
+	<note type="input" label="spectrum, total peaks">400</note>
+	<note type="input" label="spectrum, maximum parent charge">5</note>
+	<note type="input" label="spectrum, use noise suppression">yes</note>
+	<note type="input" label="spectrum, minimum parent m+h">600.0</note>
+	<note type="input" label="spectrum, minimum fragment mz">125.0</note>
+	<note type="input" label="spectrum, minimum peaks">10</note>
+	<note type="input" label="spectrum, threads">1</note>
+	<note type="input" label="spectrum, sequence batch size">1000</note>
+<note>residue modification parameters</note>
+	<note type="input" label="residue, modification mass"></note>
+		<note>STATIC MODIFICATION. The format of this parameter is m@X, where m is the modification mass in Daltons and X is the appropriate residue to modify. Lists of modifications are separated by commas. For example, to modify M and C with the addition of 16.0 Daltons, the parameter line would be +16.0@M,+16.0@C. Positive and negative values are allowed.</note>
+	<note type="input" label="residue, potential modification mass"></note>
+		<note>VARIABLE MODIFICATION. The format of this parameter is the same as the format for residue, modification mass (see above).</note>
+	<note type="input" label="residue, potential modification motif"></note>
+		<note>VARIABLE MODIFICATION IN A MOTIF. The format of this parameter is similar to residue, modification mass, with the addition of a modified PROSITE notation sequence motif specification. For example, a value of 80@[ST!]PX[KR] indicates a modification of either S or T when followed by P, any residue, and then a K or an R. A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it is NOT followed by a P, then either an S or a T, NOT followed by a P. Positive and negative values are allowed. </note>
+<note>protein parameters</note>
+	<note type="input" label="protein, taxon">no default</note>
+		<note>SEQUENCE DATABASE TO SEARCH. This refers to identifiers in taxonomy.xml.</note>
+	<note type="input" label="protein, cleavage site">[RK]|{P}</note>
+		<note>ENZYME SPECIFICITY. This setting corresponds to the enzyme trypsin. The first characters in brackets represent residues N-terminal to the bond - the '|' pipe - and the second set of characters represent residues C-terminal to the bond. The characters must be in square brackets (denoting that only these residues are allowed for a cleavage) or french brackets (denoting that these residues cannot be in that position). Use UPPERCASE characters. To denote cleavage at any residue, use [X]|[X] and reset the scoring, maximum missed cleavage site parameter (see below) to something like 50. </note>
+	<note type="input" label="protein, modified residue mass file"></note>
+	<note type="input" label="protein, N-terminal residue modification mass"></note>
+	<note type="input" label="protein, C-terminal residue modification mass"></note>
+	<note type="input" label="protein, homolog management">no</note>
+		<note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
+<note>model refinement parameters</note>
+	<note type="input" label="refine">no</note>
+	<note type="input" label="refine, modification mass"></note>
+	<note type="input" label="refine, sequence path"></note>
+	<note type="input" label="refine, tic percent">10</note>
+	<note type="input" label="refine, spectrum synthesis">yes</note>
+	<note type="input" label="refine, maximum valid expectation value">0.1</note>
+	<note type="input" label="refine, potential N-terminus modifications"></note>
+	<note type="input" label="refine, potential C-terminus modifications"></note>
+	<note type="input" label="refine, unanticipated cleavage">no</note>
+	<note type="input" label="refine, potential modification mass"></note>
+	<note type="input" label="refine, point mutations">no</note>
+	<note type="input" label="refine, use potential modifications for full refinement">no</note>
+	<note type="input" label="refine, point mutations">no</note>
+	<note type="input" label="refine, potential modification motif"></note>
+<note>scoring parameters</note>
+        <note type="input" label="scoring, algorithm">k-score</note>
+ 	<note type="input" label="scoring, minimum ion count">1</note>
+	<note type="input" label="scoring, maximum missed cleavage sites">2</note>
+	<note type="input" label="scoring, x ions">no</note>
+	<note type="input" label="scoring, y ions">yes</note>
+	<note type="input" label="scoring, z ions">no</note>
+	<note type="input" label="scoring, a ions">no</note>
+	<note type="input" label="scoring, b ions">yes</note>
+	<note type="input" label="scoring, c ions">no</note>
+	<note type="input" label="scoring, cyclic permutation">no</note>
+		<note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
+	<note type="input" label="scoring, include reverse">no</note>
+		<note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
+	<note type="input" label="scoring, cyclic permutation">no</note>
+	<note type="input" label="scoring, include reverse">no</note>
+<note>output parameters</note>
+	<note type="input" label="output, log path"></note>
+	<note type="input" label="output, message">1234567890</note>
+	<note type="input" label="output, sequence path"></note>
+	<note type="input" label="output, path">output.xml</note>
+	<note type="input" label="output, sort results by">spectrum</note>
+		<note>values = protein|spectrum (spectrum is the default)</note>
+	<note type="input" label="output, path hashing">no</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, xsl path">tandem-style.xsl</note>
+	<note type="input" label="output, parameters">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, performance">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, spectra">no</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, histograms">no</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, proteins">yes</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, sequences">no</note>
+		<note>values = yes|no</note>
+	<note type="input" label="output, one sequence copy">no</note>
+		<note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
+	<note type="input" label="output, results">all</note>
+		<note>values = all|valid|stochastic</note>
+	<note type="input" label="output, maximum valid expectation value">0.1</note>
+		<note>value is used in the valid|stochastic setting of output, results</note>
+	<note type="input" label="output, histogram column width">30</note>
+		<note>values any integer greater than 0. Setting this to '1' makes cutting and pasting histograms into spread sheet programs easier.</note>
+<note type="description">ADDITIONAL EXPLANATIONS</note>
+	<note type="description">Each one of the parameters for X! tandem is entered as a labeled note node. In the current version of X!, keep those note nodes on a single line.</note>
+	<note type="description">The presence of the type 'input' is necessary if a note is to be considered an input parameter. </note>
+	<note type="description">Any of the parameters that are paths to files may require alteration for a particular installation. Full path names usually cause the least trouble, but there is no reason not to use relative path names, if that is the most convenient.</note>
+	<note type="description">Any parameter values set in the 'list path, default parameters' file are reset by entries in the normal input file, if they are present. Otherwise, the default set is used. </note>
+	<note type="description">The 'list path, taxonomy information' file must exist.</note>
+	<note type="description">The directory containing the 'output, path' file must exist: it will not be created.</note>
+	<note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.</note>
+</bioml>