protk 1.2.6.pre5 → 1.3.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -0,0 +1,123 @@
1
+ <?xml version="1.0"?>
2
+ <?xml-stylesheet type="text/xsl" href="tandem-input-style.xsl"?>
3
+ <bioml>
4
+
5
+ <note>spectrum parameters</note>
6
+ <note type="input" label="spectrum, parent monoisotopic mass isotope error">no</note>
7
+ <note type="input" label="spectrum, parent monoisotopic mass error units">Daltons</note>
8
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
9
+
10
+ <note type="input" label="spectrum, fragment monoisotopic mass error">0.4</note>
11
+ <note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
12
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
13
+ <note type="input" label="spectrum, fragment mass type">monoisotopic</note>
14
+ <note>values are monoisotopic|average </note>
15
+
16
+ <note>spectrum conditioning parameters</note>
17
+ <note type="input" label="spectrum, use conditioning">yes</note>
18
+ <note>For native scoring, conditioning should be turned on.</note>
19
+ <note type="input" label="spectrum, dynamic range">100.0</note>
20
+ <note>The peaks read in are normalized so that the most intense peak is set to the dynamic range value. All peaks with values of less than 1, using this normalization, are not used. This normalization has the overall effect of setting a threshold value for peak intensities.</note>
21
+ <note type="input" label="spectrum, total peaks">50</note>
22
+ <note>If this value is 0, it is ignored. If it is greater than zero (let's say 50), then the number of peaks in the spectrum will be limited to the 50 most intense peaks in the spectrum. X! tandem does not do any peak finding: it only limits the peaks used by this parameter, and the dynamic range parameter.</note>
23
+ <note type="input" label="spectrum, maximum parent charge">5</note>
24
+ <note type="input" label="spectrum, use noise suppression">no</note>
25
+ <note type="input" label="spectrum, minimum parent m+h">600.0</note>
26
+ <note type="input" label="spectrum, maximum parent m+h">4000.0</note>
27
+ <note type="input" label="spectrum, minimum fragment mz">50.0</note>
28
+ <note type="input" label="spectrum, minimum peaks">6</note>
29
+ <note type="input" label="spectrum, threads">1</note>
30
+ <note type="input" label="spectrum, sequence batch size">1000</note>
31
+
32
+ <note>residue modification parameters</note>
33
+ <note type="input" label="residue, modification mass"></note>
34
+ <note>STATIC MODIFICATION. The format of this parameter is m@X, where m is the modification mass in Daltons and X is the appropriate residue to modify. Lists of modifications are separated by commas. For example, to modify M and C with the addition of 16.0 Daltons, the parameter line would be +16.0@M,+16.0@C. Positive and negative values are allowed.</note>
35
+ <note type="input" label="residue, potential modification mass"></note>
36
+ <note>VARIABLE MODIFICATION. The format of this parameter is the same as the format for residue, modification mass (see above).</note>
37
+ <note type="input" label="residue, potential modification motif"></note>
38
+ <note>VARIABLE MODIFICATION IN A MOTIF. The format of this parameter is similar to residue, modification mass, with the addition of a modified PROSITE notation sequence motif specification. For example, a value of 80@[ST!]PX[KR] indicates a modification of either S or T when followed by P, then any residue, and then a K or an R. A value of 204@N!{P}[ST]{P} indicates a modification of N by 204, if it is NOT followed by a P, then either an S or a T, NOT followed by a P. Positive and negative values are allowed. </note>
39
+
40
+ <note>protein parameters</note>
41
+ <note type="input" label="protein, taxon">no default</note>
42
+ <note>SEQUENCE DATABASE TO SEARCH. This refers to identifiers in taxonomy.xml.</note>
43
+ <note type="input" label="protein, cleavage site">[RK]|{P}</note>
44
+ <note>ENZYME SPECIFICITY. This setting corresponds to the enzyme trypsin. The first characters in brackets represent residues N-terminal to the bond - the '|' pipe - and the second set of characters represent residues C-terminal to the bond. The characters must be in square brackets (denoting that only these residues are allowed for a cleavage) or french brackets (denoting that these residues cannot be in that position). Use UPPERCASE characters. To denote cleavage at any residue, use [X]|[X] and reset the scoring, maximum missed cleavage site parameter (see below) to something like 50. </note>
45
+ <note type="input" label="protein, modified residue mass file"></note>
46
+ <note type="input" label="protein, N-terminal residue modification mass"></note>
47
+ <note type="input" label="protein, C-terminal residue modification mass"></note>
48
+ <note type="input" label="protein, homolog management">no</note>
49
+ <note>if yes, an upper limit is set on the number of homologues kept for a particular spectrum</note>
50
+
51
+ <note>model refinement parameters</note>
52
+ <note type="input" label="refine">no</note>
53
+ <note type="input" label="refine, modification mass"></note>
54
+ <note type="input" label="refine, sequence path"></note>
55
+ <note type="input" label="refine, tic percent">10</note>
56
+ <note type="input" label="refine, spectrum synthesis">yes</note>
57
+ <note type="input" label="refine, maximum valid expectation value">0.1</note>
58
+ <note type="input" label="refine, potential N-terminus modifications"></note>
59
+ <note type="input" label="refine, potential C-terminus modifications"></note>
60
+ <note type="input" label="refine, unanticipated cleavage">no</note>
61
+ <note type="input" label="refine, potential modification mass"></note>
62
+ <note type="input" label="refine, point mutations">no</note>
63
+ <note type="input" label="refine, use potential modifications for full refinement">no</note>
64
+ <note type="input" label="refine, point mutations">no</note>
65
+ <note type="input" label="refine, potential modification motif"></note>
66
+
67
+ <note>scoring parameters</note>
68
+ <note type="input" label="scoring, minimum ion count">4</note>
69
+ <note type="input" label="scoring, maximum missed cleavage sites">2</note>
70
+ <note type="input" label="scoring, x ions">no</note>
71
+ <note type="input" label="scoring, y ions">yes</note>
72
+ <note type="input" label="scoring, z ions">no</note>
73
+ <note type="input" label="scoring, a ions">no</note>
74
+ <note type="input" label="scoring, b ions">yes</note>
75
+ <note type="input" label="scoring, c ions">no</note>
76
+ <note type="input" label="scoring, cyclic permutation">no</note>
77
+ <note>if yes, cyclic peptide sequence permutation is used to pad the scoring histograms</note>
78
+ <note type="input" label="scoring, include reverse">no</note>
79
+ <note>if yes, then reversed sequences are searched at the same time as forward sequences</note>
80
+ <note type="input" label="scoring, cyclic permutation">no</note>
81
+ <note type="input" label="scoring, include reverse">no</note>
82
+
83
+ <note>output parameters</note>
84
+ <note type="input" label="output, log path"></note>
85
+ <note type="input" label="output, message">1234567890</note>
86
+ <note type="input" label="output, sequence path"></note>
87
+ <note type="input" label="output, path">output.xml</note>
88
+ <note type="input" label="output, sort results by">spectrum</note>
89
+ <note>values = protein|spectrum (spectrum is the default)</note>
90
+ <note type="input" label="output, path hashing">no</note>
91
+ <note>values = yes|no</note>
92
+ <note type="input" label="output, xsl path">tandem-style.xsl</note>
93
+ <note type="input" label="output, parameters">yes</note>
94
+ <note>values = yes|no</note>
95
+ <note type="input" label="output, performance">yes</note>
96
+ <note>values = yes|no</note>
97
+ <note type="input" label="output, spectra">no</note>
98
+ <note>values = yes|no</note>
99
+ <note type="input" label="output, histograms">no</note>
100
+ <note>values = yes|no</note>
101
+ <note type="input" label="output, proteins">yes</note>
102
+ <note>values = yes|no</note>
103
+ <note type="input" label="output, sequences">no</note>
104
+ <note>values = yes|no</note>
105
+ <note type="input" label="output, one sequence copy">no</note>
106
+ <note>values = yes|no, set to yes to produce only one copy of each protein sequence in the output xml</note>
107
+ <note type="input" label="output, results">all</note>
108
+ <note>values = all|valid|stochastic</note>
109
+ <note type="input" label="output, maximum valid expectation value">0.1</note>
110
+ <note>value is used in the valid|stochastic setting of output, results</note>
111
+ <note type="input" label="output, histogram column width">30</note>
112
+ <note>values = any integer greater than 0. Setting this to '1' makes cutting and pasting histograms into spread sheet programs easier.</note>
113
+
114
+ <note type="description">ADDITIONAL EXPLANATIONS</note>
115
+ <note type="description">Each one of the parameters for X! tandem is entered as a labeled note node. In the current version of X!, keep those note nodes on a single line.</note>
116
+ <note type="description">The presence of the type 'input' is necessary if a note is to be considered an input parameter. </note>
117
+ <note type="description">Any of the parameters that are paths to files may require alteration for a particular installation. Full path names usually cause the least trouble, but there is no reason not to use relative path names, if that is the most convenient.</note>
118
+ <note type="description">Any parameter values set in the 'list path, default parameters' file are reset by entries in the normal input file, if they are present. Otherwise, the default set is used. </note>
119
+ <note type="description">The 'list path, taxonomy information' file must exist.</note>
120
+ <note type="description">The directory containing the 'output, path' file must exist: it will not be created.</note>
121
+ <note type="description">The 'output, xsl path' is optional: it is only of use if a good XSLT style sheet exists.</note>
122
+
123
+ </bioml>
@@ -1,70 +1,33 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
2
  <bioml>
3
+ <note>This file is for all tandem parameters that we expose via command-line flags to the tandem_search.rb tool. The command-line flags and their defaults always override the values in this file.</note>
3
4
 
4
- <note> TAXONOMY FILE. This is a file containing references to the sequence databases. Point it to your own taxonomy.xml if needed.</note>
5
- <note type="input" label="list path, taxonomy information">Temporary tandem taxonomy file generated for each run</note>
6
-
7
- <note> PROTEIN SEQUENCE DATABASE. This refers to identifiers in the taxonomy.xml, not the .fasta files themselves! Make sure the database you want is present as an entry in the taxonomy.xml referenced above. This is REQUIRED. </note>
8
- <note type="input" label="protein, taxon">sphuman</note>
9
-
10
- <note> FILE LOCATIONS. Replace them with your input (.mzXML) file and output file -- these are REQUIRED. Optionally a log file and a sequence output file of all protein sequences identified in the first-pass can be specified. Use of FULL path (not relative) paths is recommended. </note>
5
+ <note>File Paths</note>
6
+ <note type="input" label="list path, taxonomy information">no default</note>
7
+ <note type="input" label="protein, taxon">no default</note>
11
8
  <note type="input" label="spectrum, path">no default</note>
12
9
  <note type="input" label="output, path">no default</note>
13
10
 
14
- <note> DEFAULT PARAMETERS. The value of "isb_default_input_kscore.xml" is recommended. Change to "isb_default_input_native.xml" for native X!Tandem scoring.</note>
11
+ <note> DEFAULT PARAMETERS. One of the following:
12
+ User supplied
13
+ isb_default_input_native.xml
14
+ gpm_default_input.xml
15
+ </note>
15
16
  <note type="input" label="list path, default parameters">no default</note>
16
17
 
17
- <note> FRAGMENT MASS TOLERANCES </note>
18
- <note type="input" label="spectrum, fragment monoisotopic mass error">0.65</note>
19
-
20
- <note> PRECURSOR MASS TOLERANCES. In the example below, a -2.0 Da to 4.0 Da (monoisotopic mass) window is searched for peptide candidates. Since this is monoisotopic mass, so for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly)</note>
21
- <note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
22
- <note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
23
- <note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
24
- <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
25
- <note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
26
- <note>This allows peptide candidates in windows around -1 Da and -2 Da from the acquired mass to be considered. Only applicable when the minus/plus window above is set to less than 0.5 Da. Good for accurate-mass instruments for which the reported precursor mass is not corrected to the monoisotopic mass. </note>
27
-
28
-
29
- <note> MODIFICATIONS. In the example below, there is a static (carbamidomethyl) modification on C, and variable modifications on M (oxidation). Multiple modifications can be separated by commas, as in "80.0@S,80.0@T". Peptide terminal modifications can be specified with the symbol '[' for N-terminus and ']' for C-terminus, such as 42.0@[ . </note>
30
- <note id="carbamidomethyl-fixed" type="input" label="residue, modification mass">57.021464@C</note>
31
- <note id="methionine-oxidation-variable" type="input" label="residue, potential modification mass">15.994915@M</note>
32
- <note id="glyco-variable" type="input" label="residue, potential modification motif">0.998@N!{P}[ST]</note>
33
- <note> You can specify a variable modification when present in a motif. For instance, 0.998@N!{P}[ST] is a deamidation modification on N only if it is present in an N[any but P][S or T] motif (N-glycosite). </note>
18
+ <note> MODIFICATIONS. No Default</note>
34
19
 
20
+ <note type="input" label="protein, cleavage C-terminal mass change"></note>
21
+ <note type="input" label="protein, cleavage N-terminal mass change"></note>
35
22
  <note type="input" label="protein, N-terminal residue modification mass"></note>
36
- <note type="input" label="protein, C-terminal residue modification mass"></note>
23
+ <note type="input" label="protein, C-terminal residue modification mass"></note>
37
24
  <note> These are *static* modifications on the PROTEINS' N or C-termini. </note>
38
25
 
39
- <note> SEMI-TRYPTICS AND MISSED CLEAVAGES. In the example below, semitryptic peptides are allowed, and up to 2 missed cleavages are allowed. </note>
40
- <note type="input" label="protein, cleavage semi">yes</note>
41
- <note type="input" label="scoring, maximum missed cleavage sites">2</note>
42
26
 
43
- <note> REFINEMENT. Do not use unless you know what you are doing. Set "refine" to "yes" and specify what you want to search in the refinement. For non-confusing results, repeat the same modifications you set above for the first-pass here.</note>
27
+ <note>
28
+ REFINEMENT. Not supported via tandem_search.rb.
29
+ Put refinement params directly in a user defined defaults.xml
30
+ </note>
44
31
  <note type="input" label="refine">no</note>
45
- <note type="input" label="refine, maximum valid expectation value">0.1</note>
46
- <note type="input" label="refine, modification mass">57.012@C</note>
47
- <note type="input" label="refine, potential modification mass">15.994915@M</note>
48
- <note type="input" label="refine, potential modification motif"></note>
49
- <note type="input" label="refine, cleavage semi">yes</note>
50
- <note type="input" label="refine, unanticipated cleavage">no</note>
51
- <note type="input" label="refine, potential N-terminus modifications"></note>
52
- <note type="input" label="refine, potential C-terminus modifications"></note>
53
- <note type="input" label="refine, point mutations">no</note>
54
- <note type="input" label="refine, use potential modifications for full refinement">no</note>
55
-
56
-
57
-
58
- <note type="input" label="output, spectra">no</note>
59
-
60
- <!-- Controlled by thresholds_type parameter. -->
61
- <note type="input" label="spectrum, dynamic range">10000.0</note>
62
- <note type="input" label="spectrum, use noise suppression">yes</note>
63
- <note type="input" label="spectrum, minimum parent m+h">600.0</note>
64
- <note type="input" label="spectrum, minimum fragment mz">125.0</note>
65
- <note type="input" label="spectrum, minimum peaks">10</note>
66
- <note type="input" label="scoring, minimum ion count">1</note>
67
- <note type="input" label="output, maximum valid expectation value">0.1</note>
68
-
69
32
 
70
33
  </bioml>
data/lib/protk/fastadb.rb CHANGED
@@ -10,8 +10,8 @@ class FastaDB
10
10
  def initialize(blast_database_file_path)
11
11
  env = Constants.new
12
12
  @database = blast_database_file_path
13
- @makedbcmd = "makeblastdb"
14
- @searchdbcmd = "blastdbcmd"
13
+ @makedbcmd = env.makeblastdb
14
+ @searchdbcmd = env.blastdbcmd
15
15
  end
16
16
 
17
17
  def self.create(blast_database_file_path,input_fasta_filepath,type='nucl')
@@ -17,7 +17,7 @@ class ProphetTool < SearchTool
17
17
 
18
18
 
19
19
  # Initializes the commandline options
20
- def initialize(option_support=[:prefix_suffix,:over_write])
20
+ def initialize(option_support=[:prefix,:over_write])
21
21
 
22
22
  super(option_support)
23
23