RubyGems - protk - Versions diffs - 1.3.0.pre3 → 1.3.0 - Mend

protk 1.3.0.pre3 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/bin/make_decoy.rb +1 -1
data/bin/msgfplus_search.rb +5 -5
data/bin/omssa_search.rb +0 -18
data/bin/peptide_prophet.rb +10 -3
data/bin/protein_prophet.rb +14 -11
data/bin/protxml_to_psql.rb +399 -0
data/ext/decoymaker/decoymaker.c +120 -114
data/lib/protk/galaxy_stager.rb +4 -5
data/lib/protk/galaxy_util.rb +1 -1
data/lib/protk/mzml_parser.rb +67 -0
data/lib/protk/tool.rb +3 -1
metadata +19 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 704282a21d38fd8d3a536fbbba9ab90eabd54355
-  data.tar.gz: d325658a001939d222bfb7c942836df25ae9790b
+  metadata.gz: 5e8f8a571cb43ed61984a34b6e1fb51caf979593
+  data.tar.gz: b53857f75c1ff6ca850859c3985aee36533e437f
 SHA512:
-  metadata.gz: f9902d7d48b5171470b073ab94089481f0a9125d0e65e4a33b600ed86cf622bafa774335a0c33520e955d49dd0f195ed7ae82fcab6ad18bf35a42dafe030aea3
-  data.tar.gz: 253cabdf8bfcf009516cc2675ebe12bb9b43ec15515625eabd3701c0761be9f3fc78dcc58dc731c7f89f642f8c11cf1bd8889c5f68a565cda0d04de06fb273c3
+  metadata.gz: 9450fccc4a5ce59f064927d62fbc6a4342a1710c3b82707e0908dea52af7d0b50f215e64073bb067a506d204701acea11b6d28f302447494b8a30b1e7af2df2d
+  data.tar.gz: 1b8bc78fc09b4c81eee72fad169a6aee7145a16312c01bc95a5dd590f08cb98194b26115a166a759b9c52c7c67204a767747642e5e9331de4562d52f31eb1e11

data/bin/make_decoy.rb CHANGED

@@ -36,7 +36,6 @@ input_file=ARGV[0]
 db_length=tool.db_length
 if ( db_length==0) #If no db length was specified use the number of entries in the input file
   db_length=Bio::FastaFormat.open(input_file).count
-  puts "Found #{db_length} entries in input file"
 end
 output_file = tool.explicit_output if tool.explicit_output!=nil
@@ -65,6 +64,7 @@ end
 if ( tool.append )
 	cmd ="awk 'FNR==1{print \"\"}1' #{input_file} #{decoys_tmp_file} > #{output_file};"
+	cmd << "sed -i.bak '/^$/d' #{output_file};"
 	cmd << "rm #{decoys_tmp_file}"
 else
 	cmd = "mv #{decoys_tmp_file} #{output_file}"

data/bin/msgfplus_search.rb CHANGED

@@ -83,16 +83,17 @@ database_path=db_info.path
 # Database must have fasta extension
 if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
-  make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
+  File.symlink(database_path,"#{database_path}.fasta") unless File.exists?("#{database_path}.fasta")
+  # make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
   database_path="#{database_path}.fasta"
-  db_info.path=database_path
+  database_path
 end
 # Database must be indexed
 unless FileTest.exists?("#{database_path}.canno")
-  dbdir = Pathname.new(database_path).dirname.realpath.to_s
+  dbdir = Pathname.new(database_path).dirname.to_s
   tdavalue=search_tool.decoy_search ? 1 : 0;
-  make_msgfdb_cmd << "cd #{dbdir}; java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
+  make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
 end
@@ -214,7 +215,6 @@ ARGV.each do |filename|
     else
       cmd << "; mv #{mzid_output_path} #{output_path}"
     end
     # Up to here we've formulated the command. The rest is cleanup
     p "Running:#{cmd}"

data/bin/omssa_search.rb CHANGED

@@ -12,8 +12,6 @@ require 'protk/command_runner'
 require 'protk/search_tool'
 require 'protk/galaxy_util'
-for_galaxy = GalaxyUtil.for_galaxy?
 # Setup specific command-line options for this tool. Other options are inherited from SearchTool
 #
 search_tool=SearchTool.new([
@@ -94,22 +92,6 @@ ARGV.each do |filename|
     #
     cmd << " -v #{search_tool.missed_cleavages}"
-    # If this is for Galaxy and a data directory has been specified
-    # look for a common unimod.xml file.
-    if for_galaxy
-      galaxy_index_dir = search_tool.galaxy_index_dir
-      if galaxy_index_dir
-        galaxy_mods = File.join(galaxy_index_dir, "mods.xml")
-        if( FileTest.exists?(galaxy_mods) )
-          cmd << " -mx #{galaxy_mods}"
-        end
-        galaxy_usermods = File.join(galaxy_index_dir, "usermods.xml")
-        if( FileTest.exists?(galaxy_usermods) )
-          cmd << " -mux #{galaxy_usermods}"
-        end
-      end
-    end
     if ( search_tool.omx_output )
       cmd << " -ox #{search_tool.omx_output} "
     end

data/bin/peptide_prophet.rb CHANGED

@@ -51,10 +51,11 @@ throw "When --output and -F options are set only one file at a time can be run"
 # Obtain a global environment object
 genv=Constants.new
+input_stagers=[]
 inputs=ARGV.collect { |file_name| file_name.chomp}
 if for_galaxy
-  inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
+  input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
+  inputs=input_stagers.collect { |sg| sg.staged_path }
 end
 # Interrogate all the input files to obtain the database and search engine from them
@@ -212,7 +213,13 @@ else
   cmd=generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
   run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
 end
+if (for_galaxy)
+  input_stagers.each do |sg|
+    sg.restore_references(output_file_name)
+    sg.restore_references(output_file_name,{:base_only => true})
+  end
+end

data/bin/protein_prophet.rb CHANGED

@@ -40,7 +40,13 @@ exit unless prophet_tool.check_options(true)
 # Obtain a global environment object
 genv=Constants.new
-inputs = ARGV.collect {|file_name| file_name.chomp }
+input_stagers=[]
+inputs=ARGV.collect { |file_name| file_name.chomp}
+if for_galaxy
+  input_stagers = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
+  inputs=input_stagers.collect { |sg| sg.staged_path }
+end
 if ( prophet_tool.explicit_output )
   output_file=prophet_tool.explicit_output
@@ -52,11 +58,6 @@ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
   cmd="ProteinProphet "
-  if for_galaxy
-    inputs = inputs.collect {|ip| GalaxyUtil.stage_pepxml(ip) }
-  end
   cmd << " #{inputs.join(" ")} #{output_file}"
   if ( prophet_tool.glyco )
@@ -71,11 +72,13 @@ else
   genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)
 end
-# if for_galaxy
-  # Restore references to peptide prophet xml so downstream tools like
-  # libra can find it.
-  # input_stager.restore_references("protein_prophet_results.prot.xml")
-# end
+if (for_galaxy)
+  input_stagers.each do |sg|
+    sg.restore_references(output_file)
+    sg.restore_references(output_file,{:base_only => true})
+  end
+end

data/bin/protxml_to_psql.rb ADDED

@@ -0,0 +1,399 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 18/1/2011
+#
+# Convert a protein/peptide xml file to sqlite database
+#
+#
+require 'libxml'
+require 'protk/constants'
+require 'protk/command_runner'
+require 'protk/tool'
+require 'protk/fastadb'
+require 'sqlite3'
+require 'protk/mzml_parser'
+include LibXML
+def prepare_fasta(database_path,type)
+  db_filename = nil
+  case
+  when Pathname.new(database_path).exist? # It's an explicitly named db
+    db_filename = Pathname.new(database_path).expand_path.to_s
+  else
+    db_filename=Constants.new.current_database_for_name(database_path)
+  end
+  db_indexfilename = "#{db_filename}.pin"
+  if File.exist?(db_indexfilename)
+    puts "Using existing indexed database"
+    orf_lookup = FastaDB.new(db_filename)
+  else
+    puts "Indexing database"
+    orf_lookup = FastaDB.create(db_filename,db_filename,type)
+  end
+  orf_lookup
+end
+def get_fasta_record(protein_name,fastadb)
+#  puts "Looking up #{protein_name}"
+  entry = fastadb.get_by_id protein_name
+  if ( entry == nil)
+    puts "Failed lookup for #{protein_name}"
+    raise KeyError
+  end
+  entry
+end
+def initialize_db()
+	result = $outputdb.execute <<-SQL
+	  CREATE TABLE ProteinGroups (
+	    ID INT,
+	    Probability REAL
+	  );
+	SQL
+	result = $outputdb.execute <<-SQL
+	  CREATE TABLE Proteins (
+	    ID INT,
+	    ProteinGroupID INT,
+	    Probability REAL,
+	    Name TEXT,
+	    Description TEXT,
+	    Coverage REAL,
+	    NumPeptides INT,
+	    Indistinguishables TEXT,
+	    Sequence TEXT
+	  );
+	SQL
+	result = $outputdb.execute <<-SQL
+	  CREATE TABLE Peptides (
+	    ID INT,
+	    ProteinID INT,
+	    Probability REAL,
+	    Sequence TEXT,
+	    Start INT,
+	    End INT,
+	    ModifiedSequence TEXT
+	  );
+	SQL
+	# This has the role of a join table for the Peptides <-> Spectra many to many relationship
+	result = $outputdb.execute <<-SQL
+		CREATE TABLE PeptideSpectrumMatches (
+	    PeptideSequence TEXT,
+	    PeptideModifiedSequence TEXT,
+	    SpectrumID INT,
+	    ScanNum INT,
+	    RetentionTime REAL,
+	    PrecursorNeutralMass REAL,
+	    MassDeviation REAL,
+	    PrevAA TEXT,
+	    NextAA TEXT
+	  );
+	SQL
+	result = $outputdb.execute <<-SQL
+		CREATE TABLE Spectra (
+			ID INTEGER PRIMARY KEY,
+			MZData TEXT,
+			IntensityData TEXT,
+			PrecursorMass REAL,
+			PrecursorCharge INT,
+			SpectrumType INT,
+			SpectrumTitle TEXT
+		);
+	SQL
+end
+def insert_protein_group(group_node)
+	group_number=group_node.attributes['group_number']
+	group_prob=group_node.attributes['probability']
+	$outputdb.execute <<-SQL
+		INSERT INTO ProteinGroups(ID,Probability) VALUES(
+			#{group_number},#{group_prob}
+		);
+	SQL
+	proteins=group_node.find("./#{$protxml_ns_prefix}protein", $protxml_ns)
+	proteins.each do |protein|
+		insert_protein(protein,group_number)
+	end
+end
+def protein_dbid_from_name(protein_name)
+	protein_name #TODO: Allow user defined regex to parse this
+end
+def insert_protein(protein,group_id)
+	indis_proteins=protein.find("./#{$protxml_ns_prefix}indistinguishable_protein", $protxml_ns)
+	indis_proteins_summary=""
+	indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
+	annot_descr=protein.find("./#{$protxml_ns_prefix}annotation[@protein_description]", $protxml_ns)
+	protein_prob=protein.attributes['probability']
+	protein_name=protein.attributes['protein_name']
+	begin
+		protein_description=annot_descr[0].attributes['protein_description'].chomp.gsub("'","")
+	rescue
+		puts "No protein_description"
+	end
+	protein_coverage=protein.attributes['percent_coverage']
+	protein_npep = protein.attributes['total_number_peptides']
+	protein_indis = indis_proteins_summary
+	protein_coverage="NULL" unless protein_coverage
+	protein_indis="NULL" unless protein_indis
+	protein_description="NULL" unless protein_description
+	if $fasta_lookup
+		begin
+			entry=get_fasta_record(protein_name,$fasta_lookup)
+			protein_seq=entry.aaseq
+		rescue
+			puts "Warning: No entry found for #{protein_name}"
+			protein_seq="NULL"
+		end
+	end
+	begin
+		$outputdb.execute <<-SQL
+		INSERT INTO Proteins(ID,ProteinGroupID,Probability,Name,Description,Coverage,NumPeptides,Indistinguishables,Sequence)
+		VALUES(#{$protein_id},#{group_id},#{protein_prob},\'#{protein_name}\',\'#{protein_description}\',#{protein_coverage},
+		#{protein_npep},\'#{protein_indis}\','#{protein_seq}');
+		SQL
+	rescue
+		throw "Unable to insert #{protein_description}\n"
+	end
+	peptides=protein.find("./#{$protxml_ns_prefix}peptide",$protxml_ns)
+	peptides.each do |peptide|
+		insert_peptide(peptide,$protein_id,protein_seq)
+	end
+	$protein_id+=1
+end
+def insert_peptide(peptide,protein_id,protein_seq)
+	nsp_adjusted_probability=peptide.attributes['nsp_adjusted_probability']
+	sequence=peptide.attributes['peptide_sequence']
+	start_pos="NULL"
+	end_pos="NULL"
+	begin
+		if protein_seq!="NULL"
+			start_pos = protein_seq.index(sequence)
+			end_pos = start_pos+sequence.length
+		end
+	rescue
+		puts "Unable to locate peptide #{sequence} in protein #{protein_seq} for #{$protein_id}\n"
+		start_pos="NULL"
+		end_pos="NULL"
+	end
+	mod_info=peptide.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
+	throw "More than one modification_info object for a peptide" unless mod_info.length<=1
+	mod_seq=format_modified_peptide(mod_info)
+	$outputdb.execute <<-SQL
+		INSERT INTO Peptides(ID,ProteinID,Probability,Sequence,Start,End,ModifiedSequence)
+		VALUES(#{$peptide_id},#{protein_id},#{nsp_adjusted_probability},\'#{sequence}\',
+		#{start_pos},#{end_pos},\'#{mod_seq}\')
+	SQL
+	$peptide_id+=1
+end
+def format_modified_peptide(mod_info)
+	mod_seq="NULL"
+	if mod_info.length==1
+		mod_seq=mod_info[0].attributes['modified_peptide']
+		mod_seq.gsub!(/\[/,"\{")
+		mod_seq.gsub!(/\]/,"\}")
+	end
+	mod_seq
+end
+def insert_psms_from_file(filepath)
+	$pepxml_ns_prefix="xmlns:"
+	$pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
+	pepxml_parser=XML::Parser.file("#{filepath}")
+	puts "Parsing #{filepath}"
+	pepxml_doc=pepxml_parser.parse
+	if not pepxml_doc.root.namespaces.default
+  		$pepxml_ns_prefix=""
+  		$pepxml_ns=nil
+	end
+	matched_spectra=[]
+	spectrum_queries=pepxml_doc.find("//#{$pepxml_ns_prefix}spectrum_query", $pepxml_ns)
+	spectrum_queries.each do |query|
+		spectrum_name = query.attributes['spectrum'].chomp.gsub("0","").sub(/\.\d+$/,"")
+		start_scan=query.attributes['start_scan'].to_i
+		end_scan=query.attributes['end_scan'].to_i
+		throw "Don't know how to deal with multi scan spectra" unless start_scan==end_scan
+		retention_time=query.attributes['retention_time_sec'].chomp.to_f
+		neutral_mass=query.attributes['precursor_neutral_mass'].to_f
+		assumed_charge=query.attributes['assumed_charge'].to_i
+		top_search_hit=query.find("./#{$pepxml_ns_prefix}search_result/#{$pepxml_ns_prefix}search_hit",$pepxml_ns)[0]
+		peptide=top_search_hit.attributes['peptide']
+		mod_info=top_search_hit.find("./#{$protxml_ns_prefix}modification_info",$protxml_ns)
+		throw "More than one modification_info object for a peptide" unless mod_info.length<=1
+		modified_peptide=format_modified_peptide(mod_info)
+		calc_neutral_pep_mass=top_search_hit.attributes['calc_neutral_pep_mass'].to_f
+		massdiff = top_search_hit.attributes['massdiff'].to_f
+		prevaa = top_search_hit.attributes['peptide_prev_aa']
+		nextaa = top_search_hit.attributes['peptide_next_aa']
+		spectrum_name="NULL" unless spectrum_name
+		retention_time="NULL" unless retention_time
+		assumed_charge="NULL" unless assumed_charge
+		calc_neutral_pep_mass="NULL" unless calc_neutral_pep_mass
+		massdiff = "NULL" unless massdiff
+		prevaa = "NULL" unless prevaa
+		nextaa = "NULL" unless nextaa
+		$outputdb.execute <<-SQL
+			INSERT INTO PeptideSpectrumMatches(PeptideSequence,PeptideModifiedSequence,SpectrumID,ScanNum,RetentionTime,PrecursorNeutralMass,MassDeviation,PrevAA,NextAA)
+			VALUES('#{peptide}','#{modified_peptide}','#{spectrum_name}','#{start_scan}','#{retention_time.to_f}'\
+			,'#{calc_neutral_pep_mass}','#{massdiff}','#{prevaa}','#{nextaa}')
+		SQL
+		matched_spectra<<{:name => spectrum_name, :scan_num => start_scan}
+	end
+	matched_spectra
+end
+def lookup_spectra_from_files(file_list,matched_spectra)
+	titles_to_match = matched_spectra.collect { |s| s[:name] }
+	# require 'debugger';debugger
+	queries_with_spectra=Array.new.replace(titles_to_match)
+	num_matched=0
+	total_spectra=0
+	file_list.each do |file|
+		mzml_parser = MzMLParser.new(file)
+		spec = mzml_parser.next_spectrum
+		while (spec) do
+			total_spectra+=1
+			if titles_to_match.include? spec[:title]
+				num_matched+=1
+				queries_with_spectra.delete(spec[:title])
+				$outputdb.execute <<-SQL
+					INSERT INTO Spectra(MZData,IntensityData,SpectrumTitle,PrecursorMass)
+					VALUES('#{spec[:mz]}','#{spec[:intensity]}','#{spec[:title]}','#{spec[:precursormz]}')
+				SQL
+			else
+			end
+			spec = mzml_parser.next_spectrum
+		end
+	end
+	puts "Found #{num_matched} matching spectra"
+	puts "Total in spectrum files #{total_spectra}"
+	puts "Total psms #{titles_to_match.length}"
+	puts "Unmatched psms #{queries_with_spectra.length}"
+end
+# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
+#
+tool=Tool.new([:explicit_output,:over_write])
+tool.option_parser.banner = "Convert a protXML file to a sqlite database.\n\nUsage: protxml_to_psql.rb [options] file1.protXML"
+tool.add_value_option(:database,nil,['-d','--database path','A Fasta file where full protein sequences can be looked up'])
+# require 'debugger';debugger
+exit unless tool.check_options(true,[:explicit_output])
+input_file=ARGV.shift
+if File.exists? tool.explicit_output
+	throw "Cant overwrite existing db #{tool.explicit_output}" unless tool.over_write
+	File.delete(tool.explicit_output)
+end
+$fasta_lookup=nil
+if tool.database
+	$fasta_lookup=prepare_fasta(tool.database,'prot')
+end
+$outputdb = SQLite3::Database.new tool.explicit_output
+initialize_db
+XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
+protxml_parser=XML::Parser.file("#{input_file}")
+$protxml_ns_prefix="xmlns:"
+$protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
+protxml_doc=protxml_parser.parse
+if not protxml_doc.root.namespaces.default
+  $protxml_ns_prefix=""
+  $protxml_ns=nil
+end
+$protein_id=0
+$peptide_id=0
+headers_with_inputs=protxml_doc.find("//#{$protxml_ns_prefix}protein_summary_header[@source_files]",$protxml_ns)
+matched_spectra=[]
+headers_with_inputs.each do |header|
+	pepxml_files = header.attributes['source_files'].split(",")
+	pepxml_files.each do |pepxml_file|
+		matched_spectra.concat insert_psms_from_file(pepxml_file)
+	end
+end
+lookup_spectra_from_files(ARGV.collect { |file| file.chomp },matched_spectra)
+protein_groups=protxml_doc.find("//#{$protxml_ns_prefix}protein_group", $protxml_ns)
+protein_groups.each do |g|
+	insert_protein_group(g)
+end

data/ext/decoymaker/decoymaker.c CHANGED

@@ -20,24 +20,40 @@
 #define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
 #define NOT_AMINO_ACIDS "BJOUXZ*"
-#define MAX_SEQUENCE_LENGTH 20000
-#define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
+#define MAX_SEQUENCE_LENGTH 2000
+#define MAX_LINE_LENGTH 200000 /* large enough to read in long header lines */
+void RemoveSpaces(char* source)
+{
+  char* i = source;
+  char* j = source;
+  while(*j != 0)
+  {
+    *i = *j++;
+    if(*i != ' ')
+      i++;
+  }
+  *i = 0;
+}
 static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
-  VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
-  char *input_file = RSTRING_PTR(input_file_in);
+  VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in)
+{
+  char *infile = StringValueCStr(input_file_in);
   long sequences_to_generate = NUM2INT(db_length_in);
-  char * output_file = RSTRING_PTR(output_file_in);
+  char * outfile = StringValueCStr(output_file_in);
+  char *prefix_string = StringValueCStr(prefix_string_in);
   char line[MAX_LINE_LENGTH];
-  char settings_line[60][70];
-  char infile[255], outfile[255]; /* for reading input and writing output */
-  char prefix_string[255];
+  // char settings_line[60][70];
   char *p,**index;
-  char *sequence;
-  char one_sequence[MAX_SEQUENCE_LENGTH],random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)],random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
+  char one_sequence[MAX_SEQUENCE_LENGTH];
+  char random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)];
+  char random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
   char *temp_sequence;
   int a;
   FILE *inp, *outp;
@@ -50,63 +66,57 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
   double x;
   /* scanning sequence database */
-  strcpy(infile,input_file);
   if ((inp = fopen(infile, "r"))==NULL) {
     printf("error opening sequence database %s\n",infile);return -1;
   }
-  printf("scanning sequence database \n%s\n",infile);
-  fflush(stdout);
-  i=0;n=0;k=0;
+  long total_sequence_len=0;
+  n=0;
   while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
-    i++;
-    if(line[0]=='>') {
-      if (!(n%1000)) {
-        printf(".");
-        fflush(stdout);
-        n++;
-      }
-    }
+    total_sequence_len+=strlen(line);
+    // printf("%ld\n",i);fflush(stdout);
+    if (line[0]=='>') { n++; }
   }
   n_sequences=n;
   /* reading sequence database */
   temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
-  sequence=(char*)malloc(sizeof(char)*(i*80)); /* allocate enough memory for 80 characters per line in FASTA database */
+  char *sequence_block=(char*)malloc(sizeof(char)*(total_sequence_len+2));
   index=(char**)malloc(sizeof(char*)*n_sequences);
-  index[0]=sequence; /* set first index pointer to beginning of first database sequence */
+  index[0]=sequence_block; /* set first index pointer to beginning of first database sequence */
   if ((inp = fopen(infile, "r"))==NULL) {
     printf("error opening sequence database %s\n",infile);
     return -1;
   }
-  printf("done\nreading sequence database \n%s\n",infile);
-  fflush(stdout);
   n=-1;
   strcpy(temp_sequence,"\0");
   while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
-  {
-    if (strcmp(line,"\n")==0) {
+  {
+    RemoveSpaces(line);
+    if (strcmp(line,"\n")==0) { // Skips blank lines
       continue;
     }
     if (line[0]=='>') {
       if (n>=0) {
-        if (!(n%1000)&&n>0) {
-          printf(".");fflush(stdout);
-        }
         strcpy(index[n],temp_sequence);
-        n++; index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
+        n++;
+        index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
         strcpy(temp_sequence,"\0");
       }
       else
       {
@@ -116,7 +126,9 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
     }
     else
     {
-      if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) continue;
+      if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) {
+        continue;
+      }
       strncat(temp_sequence,line,strlen(line)-1);
     }
   }
@@ -127,16 +139,18 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
   n_sequences=n+1;
-  printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
+  // printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
+  // measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
-  measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
   /* generating Markov probabilities */
-  printf("generating Markov probability matrix...");
-  fflush(stdout);
+  // printf("generating Markov probability matrix...");
+  // fflush(stdout);
   srand(time(0)); /* replace with constant to re-generate identical random databases */
@@ -146,52 +160,53 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
     }
   }
   for(j=0;j<=20;j++) {
-    measured_aa_freq[j]=0;generated_aa_freq[j]=0;
+    measured_aa_freq[j]=0;
+    generated_aa_freq[j]=0;
   }
   for(protein=0;protein<n_sequences;protein++)
   {
-    if (!(protein%1000)) {
-      printf(".");
-      fflush(stdout);
+    if (protein<(n_sequences-1))
+    {
+      long len_one_seq = (index[protein+1]-index[protein])/sizeof(char);
+      if ( len_one_seq > MAX_SEQUENCE_LENGTH ){
+        printf("Seq is longer than max len \n");fflush(stdout);
+        len_one_seq=MAX_SEQUENCE_LENGTH;
+      }
+      strncpy(one_sequence,index[protein],len_one_seq);
+      one_sequence[len_one_seq]='\0'; // NULL terminate the string
+    } else {
+      strcpy(one_sequence,index[protein]);
     }
-    if (protein<(n_sequences-1))
+    pl=strlen(one_sequence);
+    n=1;
+    one_index=0;
+    for(i=0;i<pl;i++)
     {
-     strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
-     one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
-   }
-   else strcpy(one_sequence,index[protein]);
-   pl=strlen(one_sequence);
-   n=1;one_index=0;
-   for(i=0;i<pl;i++)
-   {
-     if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
-     {
-      if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
+      if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
       {
-        printf("Unknown amino acid %c",one_sequence[i]);
+        if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
+        {
+          printf("Unknown amino acid %c",one_sequence[i]);
+        } else {
+          a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
+          MP[a][i]++;
+          measured_aa_freq[a]++;
+        }
       } else {
-        a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
-        MP[a][i]++;
+        a=floor(20*(float)rand()/RAND_MAX);
+        MP[a][i]++;
         measured_aa_freq[a]++;
-      }
-  }
-    else {
-    a=floor(20*(float)rand()/RAND_MAX);
-    MP[a][i]++;
-    measured_aa_freq[a]++;
-    } // replace B, X, Z etc. with random amino acid to preserve size distribution
-  }
-  MP[20][pl]++;
-      measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
+      } // replace B, X, Z etc. with random amino acid to preserve size distribution
     }
-    printf("done\n");
-    fflush(stdout);
+    MP[20][pl]++;
+    measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
+  }
   for(i=0;i<MAX_SEQUENCE_LENGTH;i++){
      row_sum[i]=0;
@@ -204,41 +219,38 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
   }
-  /* generate random protein sequences through Markov chain */
-  strcpy(outfile,output_file);
-    if ((outp = fopen(outfile, "w"))==NULL) {
-      printf("error opening output file %s\n",outfile);
-      return -1;
-    }
+  /* generate random protein sequences through Markov chain */
-    printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
-    strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
+  if ((outp = fopen(outfile, "w"))==NULL) {
+    printf("error opening output file %s\n",outfile);
+    return -1;
+  }
-    for(protein=0;protein<sequences_to_generate;protein++)
-    {
-      if (!(protein%1000)) {
-        printf(".");fflush(stdout);
-      }
+  for(protein=0;protein<sequences_to_generate;protein++)
+  {
-      i=0; j=0;
-      while (1)
+    i=0; j=0;
+    while (1)
+    {
+      x=(double)row_sum[j]*((double)rand()/RAND_MAX);
+      partial_sum=MP[0][j]; i=1;
+      while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
+      if (j>=MAX_SEQUENCE_LENGTH) { i=21; }/* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
+      if (i<21)
       {
-       x=(double)row_sum[j]*((double)rand()/RAND_MAX);
-       partial_sum=MP[0][j]; i=1;
-       while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
-    if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
-       if (i<21)
-       {
-         random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
-       }
-    else /* i==21, i.e. protein sequence terminated */
-       {
-         k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
-         for(l=0;l<j;l++)
-         {
+        random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
+      } else { /* i==21, i.e. protein sequence terminated */
+        k=0;
+        generated_aa_freq[20]++;
+        generated_pl_sum+=j;
+        for(l=0;l<j;l++)
+        {
           random_sequence_output[k]=random_sequence[l]; k++;
           if (!((k+1)%61))
           {
@@ -256,19 +268,13 @@ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,
   fclose(outp);
-  /* freeing some memory... */
-  free(index);
-  printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
+  // printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
   k=0;l=0;
   for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
-    // printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
-  // for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
-  printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
+  // printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
   return 0;

data/lib/protk/galaxy_stager.rb CHANGED

@@ -20,14 +20,13 @@ class GalaxyStager
     end
   end
-  def replace_references(in_file, options = {})
-    options = { :base_only => false }.merge(options)
-    replacement = options[:base_only] ? @staged_base : @staged_path
+  def replace_references(in_file)
     GalaxyStager.replace_references(in_file, @original_path, replacement)
   end
-  def restore_references(in_file)
-    GalaxyStager.replace_references(in_file, @staged_path, @original_path)
+  def restore_references(in_file, options = {})
+    path = options[:base_only] ? @staged_path.gsub(/#{@extension}/,"") : @staged_path
+    GalaxyStager.replace_references(in_file, path, @original_path)
   end
   def self.replace_references(in_file, from_path, to_path)

data/lib/protk/galaxy_util.rb CHANGED

@@ -14,7 +14,7 @@ class GalaxyUtil
   def self.stage_pepxml(input_pepxml_path)
     stager = GalaxyStager.new(input_pepxml_path, :extension => ".pep.xml")
-    stager.staged_path
+    stager
   end
   def self.stage_protxml(input_protxml_path)

data/lib/protk/mzml_parser.rb ADDED

@@ -0,0 +1,67 @@
+require 'libxml'
+include LibXML
+class MzMLParser < Object
+	def initialize(path)
+		@namespace=
+		@mzml_ns_prefix="xmlns:"
+		@mzml_ns="xmlns:http://psi.hupo.org/ms/mzml"
+		doc=XML::Document.file(path)
+		@file_reader=XML::Reader.document(doc)
+	end
+	def next_spectrum()
+		until @file_reader.name=="spectrum"
+			if !@file_reader.read()
+				return nil
+			end
+		end
+		this_spect=spectrum_as_hash(@file_reader.expand)
+		@file_reader.next_sibling
+		return this_spect
+	end
+	def spectrum_as_hash(spectrum)
+		index=spectrum.attributes['index']
+		sid = spectrum.attributes['id']
+		precursor_mz_param = spectrum.find(".//#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000744\"]",@mzml_ns)[0]
+		mslevel_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000511\"]",@mzml_ns)[0]
+		title_param = spectrum.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000796\"]",@mzml_ns)[0]
+		# prec_mz = spectrum.find(".//#{@mz}")
+		precursor_mz_mz = precursor_mz_param.attributes['value'] if precursor_mz_param
+		mslevel = mslevel_param.attributes['value'] if mslevel_param
+		spectrum_title = title_param['value'] if title_param
+		data_arrays = spectrum.find("./#{@mzml_ns_prefix}binaryDataArrayList/#{@mzml_ns_prefix}binaryDataArray",@mzml_ns)
+		data={}
+		data_arrays.each do |arr|
+			the_data = arr.find("./#{@mzml_ns_prefix}binary",@mzml_ns)[0].content
+			mzaccession = arr.find("./#{@mzml_ns_prefix}cvParam[@accession=\"MS:1000514\"]",@mzml_ns)
+			if ( mzaccession.length==1 )
+				data[:mz] = the_data
+			else
+				data[:intensity] = the_data
+			end
+		end
+		data[:title]=spectrum_title
+		data[:mzlevel]=mslevel
+		data[:index]=index
+		data[:precursormz]=precursor_mz_mz
+		data[:id]=sid
+		data
+	end
+end

data/lib/protk/tool.rb CHANGED

@@ -53,7 +53,7 @@ class Tool
   end
-  def add_value_option(symbol,default_value,opts)
+  def add_value_option(symbol,default_value,opts)
     @options[symbol]=default_value
     @option_parser.on(*opts) do |val|
       @options[symbol]=val
@@ -108,6 +108,8 @@ class Tool
       add_value_option(:threads,1,['-n','--threads num','Number of processing threads to use. Set to 0 to autodetect an appropriate value'])
     end
   end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: protk
 version: !ruby/object:Gem::Version
-  version: 1.3.0.pre3
+  version: 1.3.0
 platform: ruby
 authors:
 - Ira Cooke
@@ -152,6 +152,20 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0'
 description: A bunch of tools for proteomics
 email: iracooke@gmail.com
 executables:
@@ -195,6 +209,7 @@ files:
 - bin/protein_prophet.rb
 - bin/protk_setup.rb
 - bin/protxml_to_gff.rb
+- bin/protxml_to_psql.rb
 - bin/protxml_to_table.rb
 - bin/repair_run_summary.rb
 - bin/sixframe.rb
@@ -237,6 +252,7 @@ files:
 - lib/protk/manage_db_rakefile.rake
 - lib/protk/manage_db_tool.rb
 - lib/protk/mascot_util.rb
+- lib/protk/mzml_parser.rb
 - lib/protk/omssa_util.rb
 - lib/protk/openms_defaults.rb
 - lib/protk/pepxml.rb
@@ -266,9 +282,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>'
+  - - '>='
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
 rubyforge_project:
 rubygems_version: 2.2.1