protk 1.2.6.pre5 → 1.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/libra.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# Created by John Chilton
|
4
|
-
#
|
5
|
-
# Run libra quantification against protein prophet results.
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/protxml'
|
11
|
-
require 'protk/galaxy_util'
|
12
|
-
require 'optparse'
|
13
|
-
|
14
|
-
for_galaxy = GalaxyUtil.for_galaxy?
|
15
|
-
|
16
|
-
protxml_path = ARGV.shift
|
17
|
-
|
18
|
-
if for_galaxy
|
19
|
-
protxml_path = GalaxyUtil.stage_protxml(protxml_path)
|
20
|
-
end
|
21
|
-
|
22
|
-
protxml = ProtXML.new(protxml_path)
|
23
|
-
pepxml_path = protxml.find_pep_xml()
|
24
|
-
|
25
|
-
genv=Constants.new
|
26
|
-
|
27
|
-
option_parser=OptionParser.new()
|
28
|
-
|
29
|
-
reagents = []
|
30
|
-
mass_tolerance = "0.2"
|
31
|
-
option_parser.on( '--mass-tolerance TOL',"Specifies the mass tolerance (window libra will search for the most intense m/z value in)." ) do |tol|
|
32
|
-
mass_tolerance = tol
|
33
|
-
end
|
34
|
-
|
35
|
-
option_parser.on( '--reagent MZ', "Specify a reagent (via m/z values).") do |reagent|
|
36
|
-
reagents << reagent
|
37
|
-
end
|
38
|
-
|
39
|
-
minimum_threshold_string = ""
|
40
|
-
option_parser.on( '--minimum-threshold THRESH', "Minimum threshhold intensity (not required).") do |thresh|
|
41
|
-
minimum_threshold_string = "<minimumThreshhold value=\"#{thresh}\"/>"
|
42
|
-
end
|
43
|
-
|
44
|
-
option_parser.parse!
|
45
|
-
|
46
|
-
|
47
|
-
reagent_strings = reagents.map do |reagent|
|
48
|
-
"<reagent mz=\"#{reagent}\" />"
|
49
|
-
end
|
50
|
-
reagents_string = reagent_strings.join(" ")
|
51
|
-
|
52
|
-
isotopic_contributions = ""
|
53
|
-
|
54
|
-
condition_contents = "<SUMmOnCondition description=\"libra_galaxy_run\">
|
55
|
-
<fragmentMasses>
|
56
|
-
#{reagents_string}
|
57
|
-
</fragmentMasses>
|
58
|
-
#{isotopic_contributions}
|
59
|
-
<massTolerance value=\"#{mass_tolerance}\"/>
|
60
|
-
<centroiding type=\"2\" iterations=\"1\"/>
|
61
|
-
<normalization type=\"4\"/>
|
62
|
-
<targetMs level=\"2\"/>
|
63
|
-
<output type=\"1\"/>
|
64
|
-
<quantitationFile name=\"quantitation.tsv\"/>
|
65
|
-
#{minimum_threshold_string}
|
66
|
-
</SUMmOnCondition>"
|
67
|
-
File.open("condition.xml", "w") { |f| f.write(condition_contents) }
|
68
|
-
print condition_contents
|
69
|
-
command="#{genv.librapeptideparser} '#{pepxml_path}' -ccondition.xml; #{genv.libraproteinratioparser} '#{protxml_path}' -c#{condition_file}"
|
70
|
-
%x[#{command}]
|
data/bin/toppas_pipeline.rb
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 30/01/13
|
5
|
-
#
|
6
|
-
# A wrapper for the OpenMS tool ExecutePipeline.
|
7
|
-
# Executes simple toppas pipelines, automatically creating the trf file.
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/command_runner'
|
11
|
-
require 'protk/tool'
|
12
|
-
require 'protk/openms_defaults'
|
13
|
-
require 'tempfile'
|
14
|
-
require 'libxml'
|
15
|
-
|
16
|
-
include LibXML
|
17
|
-
|
18
|
-
tool=Tool.new([:background,:over_write])
|
19
|
-
tool.option_parser.banner = "Execute a toppas pipeline with a single inputs node\n\nUsage: toppas_pipeline.rb [options] input1 input2 ..."
|
20
|
-
|
21
|
-
tool.options.outdir = ""
|
22
|
-
tool.option_parser.on( '--outdir dir',"save outputs to dir" ) do |dir|
|
23
|
-
tool.options.outdir = dir
|
24
|
-
end
|
25
|
-
|
26
|
-
tool.options.toppas_file = ""
|
27
|
-
tool.option_parser.on( '--toppas-file f',"the toppas file to run" ) do |file|
|
28
|
-
tool.options.toppas_file = file
|
29
|
-
end
|
30
|
-
|
31
|
-
tool.options.threads = "1"
|
32
|
-
tool.option_parser.on( '--threads t',"Number of threads to use" ) do |tr|
|
33
|
-
tool.options.threads=tr
|
34
|
-
end
|
35
|
-
|
36
|
-
exit unless tool.check_options
|
37
|
-
|
38
|
-
if ( ARGV[0].nil? )
|
39
|
-
puts "You must supply an input file"
|
40
|
-
puts tool.option_parser
|
41
|
-
exit
|
42
|
-
end
|
43
|
-
|
44
|
-
# Obtain a global environment object
|
45
|
-
genv=Constants.new
|
46
|
-
|
47
|
-
def run_pipeline(genv,tool,cmd,output_path,jobid)
|
48
|
-
jobscript_path="#{output_path}.pbs.sh"
|
49
|
-
job_params={:jobid=>jobid, :vmem=>"14Gb", :queue => "sixteen"}
|
50
|
-
code=tool.run(cmd,genv,job_params,jobscript_path)
|
51
|
-
throw "Command failed with exit code #{code}" unless code==0
|
52
|
-
end
|
53
|
-
|
54
|
-
def generate_trf(input_files,out_path)
|
55
|
-
p OpenMSDefaults.new.trf_path
|
56
|
-
parser=XML::Parser.file(OpenMSDefaults.new.trf_path)
|
57
|
-
doc=parser.parse
|
58
|
-
itemlist_node=doc.find('/PARAMETERS/NODE/ITEMLIST')[0]
|
59
|
-
|
60
|
-
input_files.each do |f|
|
61
|
-
|
62
|
-
mnode=XML::Node.new('LISTITEM')
|
63
|
-
mnode["value"]="file://#{Pathname.new(f).realpath.to_s}"
|
64
|
-
|
65
|
-
itemlist_node << mnode
|
66
|
-
end
|
67
|
-
p out_path
|
68
|
-
doc.save(out_path)
|
69
|
-
end
|
70
|
-
|
71
|
-
throw "outdir is a required parameter" if tool.outdir==""
|
72
|
-
throw "toppas-file is a required parameter" if tool.toppas_file==""
|
73
|
-
throw "outdir must exist" unless Dir.exist?(tool.outdir)
|
74
|
-
|
75
|
-
trf_path = "#{Pathname.new(Tempfile.new(tool.toppas_file).path).basename.to_s}.trf"
|
76
|
-
|
77
|
-
generate_trf(ARGV,trf_path)
|
78
|
-
|
79
|
-
cmd=""
|
80
|
-
cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
|
81
|
-
#{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s} -threads #{tool.threads}"
|
82
|
-
|
83
|
-
run_pipeline(genv,tool,cmd,tool.outdir,tool.jobid_from_filename(tool.toppas_file))
|
84
|
-
|
data/bin/uniprot_annotation.rb
DELETED
@@ -1,141 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of Protk
|
4
|
-
# Created by Ira Cooke 24/3/2013
|
5
|
-
#
|
6
|
-
# Retrieve annotation information for proteins from the Uniprot Swissprot database
|
7
|
-
#
|
8
|
-
#
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/command_runner'
|
11
|
-
require 'protk/tool'
|
12
|
-
require 'protk/swissprot_database'
|
13
|
-
require 'protk/bio_sptr_extensions'
|
14
|
-
|
15
|
-
|
16
|
-
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
17
|
-
#
|
18
|
-
tool=Tool.new([:explicit_output])
|
19
|
-
tool.option_parser.banner = "Retrieve information from the Uniprot database given a list of ID's.\n\n\
|
20
|
-
Usage: uniprot_annotation.rb [options] input.tsv"
|
21
|
-
|
22
|
-
tool.options.id_column=1
|
23
|
-
tool.option_parser.on( '--id-column num', 'Specify a column for ids (default is column 1)' ) do |col|
|
24
|
-
tool.options.id_column=col.to_i
|
25
|
-
end
|
26
|
-
|
27
|
-
tool.options.flatfiledb="swissprot"
|
28
|
-
tool.option_parser.on( '--flatfiledb dbname', 'Specify path to a Uniprot flatfile' ) do |dbname|
|
29
|
-
tool.options.flatfiledb=dbname
|
30
|
-
end
|
31
|
-
|
32
|
-
tool.options.fields=nil
|
33
|
-
tool.option_parser.on( '--fields flds', 'A comma separated list of fields to extract' ) do |flds|
|
34
|
-
tool.options.fields=flds
|
35
|
-
end
|
36
|
-
|
37
|
-
exit unless tool.check_options
|
38
|
-
|
39
|
-
if ( ARGV[0].nil? )
|
40
|
-
puts "You must supply an input file"
|
41
|
-
puts tool.option_parser
|
42
|
-
exit
|
43
|
-
end
|
44
|
-
|
45
|
-
# Obtain a global environment object
|
46
|
-
genv=Constants.new
|
47
|
-
|
48
|
-
input_file=ARGV[0]
|
49
|
-
|
50
|
-
swissprotdb=SwissprotDatabase.new(genv,tool.flatfiledb)
|
51
|
-
|
52
|
-
output_file=nil
|
53
|
-
|
54
|
-
if ( tool.explicit_output==nil)
|
55
|
-
output_file=$stdout
|
56
|
-
else
|
57
|
-
output_file=File.open(tool.explicit_output,'w+')
|
58
|
-
end
|
59
|
-
|
60
|
-
ac_column = tool.id_column-1
|
61
|
-
|
62
|
-
db_fields = {
|
63
|
-
'recname'=>"Primary Name",
|
64
|
-
'cd'=>"CD Antigen Name",
|
65
|
-
'altnames'=>"Alternate Names",
|
66
|
-
'location' => "Subcellular Location",
|
67
|
-
'function' => "Known Function",
|
68
|
-
'similarity' => "Similarity",
|
69
|
-
'tissues' => "Tissue Specificity",
|
70
|
-
'disease' => "Disease Association",
|
71
|
-
'domain' => "Domain",
|
72
|
-
'subunit' => "Sub Unit",
|
73
|
-
'nextbio' => "NextBio",
|
74
|
-
'ipi' => "IPI",
|
75
|
-
'intact' => "Interactions",
|
76
|
-
'pride' => 'Pride',
|
77
|
-
'ensembl'=> 'Ensembl',
|
78
|
-
'num_transmem'=>"Transmembrane Regions",
|
79
|
-
'signalp'=>'Signal Peptide',
|
80
|
-
'ref_dump'=>'References',
|
81
|
-
'tax_dump'=>'Taxonomy Cross Ref',
|
82
|
-
'species_dump'=>'Species',
|
83
|
-
'feature_dump'=>'Feature Table',
|
84
|
-
'seq_dump' => 'AA Sequence'
|
85
|
-
}
|
86
|
-
|
87
|
-
hyperlink_fields = {
|
88
|
-
'uniprot_link'=>"Uniprot Link",
|
89
|
-
'nextbio_link'=>'NextBio Link',
|
90
|
-
'intact_link'=>"Interactions Link",
|
91
|
-
'pride_link'=>"Pride Link",
|
92
|
-
'ensembl_link'=>"Ensembl Link"
|
93
|
-
}
|
94
|
-
|
95
|
-
if tool.fields !=nil
|
96
|
-
fields = tool.fields.split(",").collect { |f| f.lstrip.rstrip }.reject {|e| e.empty? }
|
97
|
-
db_fields = db_fields.select { |k| fields.include? k }
|
98
|
-
hyperlink_fields = hyperlink_fields.select { |k| fields.include? k}
|
99
|
-
end
|
100
|
-
|
101
|
-
output_file.write db_fields.values.join("\t")
|
102
|
-
if ( hyperlink_fields.count > 0 )
|
103
|
-
output_file.write("\t")
|
104
|
-
output_file.write hyperlink_fields.values.join("\t")
|
105
|
-
end
|
106
|
-
output_file.write("\n")
|
107
|
-
|
108
|
-
line_num=0
|
109
|
-
File.foreach(input_file) { |line|
|
110
|
-
input_cols=line.split("\t")
|
111
|
-
throw "Not enough columns in line #{line_num}" unless input_cols.count > ac_column
|
112
|
-
accession=input_cols[ac_column].chomp
|
113
|
-
|
114
|
-
sptr_entry=swissprotdb.get_entry_for_name(accession)
|
115
|
-
|
116
|
-
if ( sptr_entry==nil)
|
117
|
-
genv.log("No entry for #{accession} in uniprot database",:warn)
|
118
|
-
else
|
119
|
-
|
120
|
-
db_values = db_fields.collect { |key,value|
|
121
|
-
sptr_entry.send(key)
|
122
|
-
}
|
123
|
-
|
124
|
-
hyperlink_values = hyperlink_fields.collect { |key,value|
|
125
|
-
sptr_entry.send(key)
|
126
|
-
}
|
127
|
-
|
128
|
-
output_file.write db_values.join("\t")
|
129
|
-
if ( hyperlink_fields.count > 0 )
|
130
|
-
output_file.write("\t")
|
131
|
-
output_file.write hyperlink_values.join("\t")
|
132
|
-
end
|
133
|
-
output_file.write "\n"
|
134
|
-
end
|
135
|
-
|
136
|
-
line_num+=1
|
137
|
-
|
138
|
-
}
|
139
|
-
|
140
|
-
|
141
|
-
|
data/bin/xls_to_table.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 18/1/2011
|
5
|
-
#
|
6
|
-
# Converts an Excel Spreadsheet to a tab delimited table
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
require 'protk/constants'
|
11
|
-
require 'protk/command_runner'
|
12
|
-
require 'protk/tool'
|
13
|
-
require 'spreadsheet'
|
14
|
-
|
15
|
-
# Setup command-line options for this tool.
|
16
|
-
#
|
17
|
-
tool=Tool.new([:explicit_output])
|
18
|
-
tool.option_parser.banner = "Convert an xls file to a tab delimited table.\n\nUsage: xls_to_table.rb [options] file1.xls"
|
19
|
-
|
20
|
-
exit unless tool.check_options
|
21
|
-
|
22
|
-
if ( ARGV[0].nil? )
|
23
|
-
puts "You must supply an input file"
|
24
|
-
puts tool.option_parser
|
25
|
-
exit
|
26
|
-
end
|
27
|
-
|
28
|
-
input_file=ARGV[0]
|
29
|
-
|
30
|
-
output_file=tool.explicit_output
|
31
|
-
output_file="#{input_file}.csv" unless ( output_file != nil )
|
32
|
-
|
33
|
-
output_fh = File.new(output_file,'w')
|
34
|
-
|
35
|
-
|
36
|
-
# Open the original excel workbook for reading
|
37
|
-
Spreadsheet.client_encoding = 'UTF-8'
|
38
|
-
inputBook = Spreadsheet.open "#{input_file}"
|
39
|
-
inputSheet = inputBook.worksheet 0
|
40
|
-
|
41
|
-
inputSheet.each do |row|
|
42
|
-
line=""
|
43
|
-
row.each do |colv|
|
44
|
-
line << "#{colv}\t"
|
45
|
-
end
|
46
|
-
line.chop!
|
47
|
-
output_fh.write "#{line}\n"
|
48
|
-
end
|
49
|
-
|
50
|
-
output_fh.close
|
51
|
-
|
52
|
-
|
data/bin/xpress.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# Created by John Chilton
|
4
|
-
#
|
5
|
-
# Run XPRESS against protein prophet results.
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/protxml'
|
11
|
-
require 'protk/galaxy_util'
|
12
|
-
|
13
|
-
for_galaxy = GalaxyUtil.for_galaxy?
|
14
|
-
|
15
|
-
protxml_path = ARGV.shift
|
16
|
-
|
17
|
-
if for_galaxy
|
18
|
-
protxml_path = GalaxyUtil.stage_protxml(protxml_path)
|
19
|
-
end
|
20
|
-
|
21
|
-
protxml = ProtXML.new(protxml_path)
|
22
|
-
pepxml_path = protxml.find_pep_xml()
|
23
|
-
|
24
|
-
genv=Constants.new
|
25
|
-
|
26
|
-
command="#{genv.xpresspeptideparser} '#{pepxml_path}' #{ARGV.join(" ")} ; #{genv.xpressproteinratioparser} '#{protxml_path}'"
|
27
|
-
%x[#{command}]
|
@@ -1,60 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'spreadsheet'
|
3
|
-
|
4
|
-
|
5
|
-
class BioToolsExcelConverter
|
6
|
-
|
7
|
-
def initialize(filename)
|
8
|
-
@inputBook = Spreadsheet.open File.new("#{filename}")
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.isBiotools(filename)
|
12
|
-
testBook = Spreadsheet.open File.new("#{filename}")
|
13
|
-
testSheet = testBook.worksheet 0
|
14
|
-
|
15
|
-
isbiotools=FALSE
|
16
|
-
testSheet.each do |row|
|
17
|
-
if (row[0].class==String) && row[0].match(/Digest Matches.*?Score:\s(.*)\)/)
|
18
|
-
isbiotools=TRUE
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
isbiotools
|
24
|
-
end
|
25
|
-
|
26
|
-
def get_rows
|
27
|
-
|
28
|
-
sheet=@inputBook.worksheet 0
|
29
|
-
|
30
|
-
protein_rows=[]
|
31
|
-
|
32
|
-
n_rows=sheet.dimensions[1]
|
33
|
-
|
34
|
-
protein_rows=(0...n_rows).collect do |row_i|
|
35
|
-
new_row=nil
|
36
|
-
|
37
|
-
row=sheet.row row_i
|
38
|
-
if ( row[0]!=nil)
|
39
|
-
digmatch=row[0].match(/Digest Matches.*?Score:\s(.*)\)/)
|
40
|
-
if ( digmatch!=nil )
|
41
|
-
new_row=[]
|
42
|
-
text= sheet.row(row_i-1)[0]
|
43
|
-
m=text.match(/\s(\S*)\s*$/)
|
44
|
-
throw "Badly formed protein line in biotools file ... could not parse protein name from #{text}" unless m!=nil
|
45
|
-
new_row[0]=m[1]
|
46
|
-
new_row[1]=digmatch[1]
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
new_row
|
51
|
-
end
|
52
|
-
|
53
|
-
protein_rows.compact!
|
54
|
-
protein_rows.insert(0,["Accession","Ion Scores"])
|
55
|
-
|
56
|
-
protein_rows
|
57
|
-
|
58
|
-
end
|
59
|
-
|
60
|
-
end
|
@@ -1,158 +0,0 @@
|
|
1
|
-
# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
2
|
-
# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
|
3
|
-
|
4
|
-
require 'tempfile'
|
5
|
-
|
6
|
-
# A class for extracting gene info from a particular gene from the information file
|
7
|
-
class EuPathDBGeneInformationFileExtractor
|
8
|
-
# A filename path to the gene information file
|
9
|
-
attr_accessor :filename
|
10
|
-
|
11
|
-
def initialize(filename = nil)
|
12
|
-
@filename = filename
|
13
|
-
end
|
14
|
-
|
15
|
-
# Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
|
16
|
-
# there are multiple in the file, only the first is returned. If none are found, nil is returned.
|
17
|
-
#
|
18
|
-
# If grep_hack_lines is defined (as an integer), then a shortcut is applied to speed things up. Before parsing the gene info file, grep some lines after the "Gene Id: .." line. Then feed that into the parser.
|
19
|
-
def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
|
20
|
-
inside_iterator = lambda do |gene|
|
21
|
-
return gene if wanted_gene_id == gene.info['Gene Id']
|
22
|
-
end
|
23
|
-
|
24
|
-
filename = @filename
|
25
|
-
p @filename
|
26
|
-
if grep_hack_lines and grep_hack_lines.to_i != 0
|
27
|
-
tempfile=Tempfile.new('reubypathdb_grep_hack')
|
28
|
-
# grep however many lines from past the point. Rather dodgy, but faster.
|
29
|
-
raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)
|
30
|
-
`grep -A #{grep_hack_lines} 'Gene Id: #{wanted_gene_id}' '#{@filename}' >#{tempfile.path}`
|
31
|
-
EuPathDBGeneInformationTable.new(File.open(tempfile.path)).each do |gene|
|
32
|
-
return inside_iterator.call(gene)
|
33
|
-
end
|
34
|
-
else
|
35
|
-
# no grep hack. Parse the whole gene information file
|
36
|
-
EuPathDBGeneInformationTable.new(File.open(@filename)).each do |gene|
|
37
|
-
return inside_iterator.call(gene)
|
38
|
-
end
|
39
|
-
end
|
40
|
-
return nil
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
# A class for parsing the 'gene information table' files from EuPathDB, such
|
45
|
-
# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
46
|
-
#
|
47
|
-
# The usual way of interacting with these is the use of the each method,
|
48
|
-
# which returns a EuPathDBGeneInformation object with all of the recorded
|
49
|
-
# information in it.
|
50
|
-
class EuPathDBGeneInformationTable
|
51
|
-
include Enumerable
|
52
|
-
|
53
|
-
# Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
|
54
|
-
def initialize(io)
|
55
|
-
@io = io
|
56
|
-
end
|
57
|
-
|
58
|
-
# Return a EuPathDBGeneInformation object with
|
59
|
-
# the contained info in it, one at a time
|
60
|
-
def each
|
61
|
-
while g = next_gene
|
62
|
-
yield g
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
# Returns a EuPathDBGeneInformation object with all the data you could
|
67
|
-
# possibly want.
|
68
|
-
def next_gene
|
69
|
-
info = EuPathDBGeneInformation.new
|
70
|
-
|
71
|
-
# first, read the table, which should start with the ID column
|
72
|
-
line = @io.readline.strip
|
73
|
-
while line == ''
|
74
|
-
return nil if @io.eof?
|
75
|
-
line = @io.readline.strip
|
76
|
-
end
|
77
|
-
|
78
|
-
while line != ''
|
79
|
-
if matches = line.match(/^(.*?)\: (.*)$/)
|
80
|
-
info.add_information(matches[1], matches[2])
|
81
|
-
else
|
82
|
-
raise Exception, "EuPathDBGeneInformationTable Couldn't parse this line: #{line}"
|
83
|
-
end
|
84
|
-
|
85
|
-
line = @io.readline.strip
|
86
|
-
end
|
87
|
-
|
88
|
-
# now read each of the tables, which should start with the
|
89
|
-
# 'TABLE: <name>' entry
|
90
|
-
line = @io.readline.strip
|
91
|
-
table_name = nil
|
92
|
-
headers = nil
|
93
|
-
data = []
|
94
|
-
while line != '------------------------------------------------------------'
|
95
|
-
if line == ''
|
96
|
-
# add it to the stack unless we are just starting out
|
97
|
-
info.add_table(table_name, headers, data) unless table_name.nil?
|
98
|
-
|
99
|
-
# reset things
|
100
|
-
table_name = nil
|
101
|
-
headers = nil
|
102
|
-
data = []
|
103
|
-
elsif matches = line.match(/^TABLE\: (.*)$/)
|
104
|
-
# name of a table
|
105
|
-
table_name = matches[1]
|
106
|
-
elsif line.match(/^\[.*\]/)
|
107
|
-
# headings of the table
|
108
|
-
headers = line.split("\t").collect do |header|
|
109
|
-
header.gsub(/^\[/,'').gsub(/\]$/,'')
|
110
|
-
end
|
111
|
-
else
|
112
|
-
# a proper data row
|
113
|
-
data.push line.split("\t")
|
114
|
-
end
|
115
|
-
line = @io.readline.strip
|
116
|
-
end
|
117
|
-
|
118
|
-
# return the object that has been created
|
119
|
-
return info
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# Each gene in the gene information table is represented
|
124
|
-
# by 2 types of information - info and tables.
|
125
|
-
# info are 1 line data, whereas tables are tables of
|
126
|
-
# data with possibly multiple rows
|
127
|
-
class EuPathDBGeneInformation
|
128
|
-
def info
|
129
|
-
@info
|
130
|
-
end
|
131
|
-
|
132
|
-
def get_info(key)
|
133
|
-
@info[key]
|
134
|
-
end
|
135
|
-
alias_method :[], :get_info
|
136
|
-
|
137
|
-
def get_table(table_name)
|
138
|
-
@tables[table_name]
|
139
|
-
end
|
140
|
-
|
141
|
-
def add_information(key, value)
|
142
|
-
@info ||= {}
|
143
|
-
@info[key] = value
|
144
|
-
"Added info #{key}, now is #{@info[key]}"
|
145
|
-
end
|
146
|
-
|
147
|
-
def add_table(name, headers, data)
|
148
|
-
@tables ||= {}
|
149
|
-
@tables[name] = []
|
150
|
-
data.each do |row|
|
151
|
-
final = {}
|
152
|
-
row.each_with_index do |cell, i|
|
153
|
-
final[headers[i]] = cell
|
154
|
-
end
|
155
|
-
@tables[name].push final
|
156
|
-
end
|
157
|
-
end
|
158
|
-
end
|