protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/libra.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
#
# Created by John Chilton
#
# Run libra quantification against protein prophet results.
#
# The first command-line argument is the path to a protXML file; the remaining
# arguments are parsed as options (--mass-tolerance, --reagent,
# --minimum-threshold). A Libra condition file is written to the current
# directory and both Libra parsers are then invoked through the shell.
#

require 'protk/constants'
require 'protk/protxml'
require 'protk/galaxy_util'
require 'optparse'

# Name of the condition file written below and handed to both Libra parsers.
# The original referenced an undefined `condition_file` local when building the
# second command, which raised NameError before anything was executed.
condition_file = "condition.xml"

protxml_path = ARGV.shift
protxml_path = GalaxyUtil.stage_protxml(protxml_path) if GalaxyUtil.for_galaxy?

protxml = ProtXML.new(protxml_path)
pepxml_path = protxml.find_pep_xml()

genv = Constants.new

option_parser = OptionParser.new()

reagents = []
mass_tolerance = "0.2"
option_parser.on('--mass-tolerance TOL', "Specifies the mass tolerance (window libra will search for the most intense m/z value in).") do |tol|
  mass_tolerance = tol
end

# May be given several times; every value becomes one <reagent> element.
option_parser.on('--reagent MZ', "Specify a reagent (via m/z values).") do |reagent|
  reagents << reagent
end

# Stays empty (element omitted) unless the option is supplied.
minimum_threshold_string = ""
option_parser.on('--minimum-threshold THRESH', "Minimum threshhold intensity (not required).") do |thresh|
  minimum_threshold_string = "<minimumThreshhold value=\"#{thresh}\"/>"
end

option_parser.parse!

reagents_string = reagents.map { |reagent| "<reagent mz=\"#{reagent}\" />" }.join(" ")

# Placeholder: no isotopic contribution block is generated.
isotopic_contributions = ""

condition_contents = "<SUMmOnCondition description=\"libra_galaxy_run\">
<fragmentMasses>
#{reagents_string}
</fragmentMasses>
#{isotopic_contributions}
<massTolerance value=\"#{mass_tolerance}\"/>
<centroiding type=\"2\" iterations=\"1\"/>
<normalization type=\"4\"/>
<targetMs level=\"2\"/>
<output type=\"1\"/>
<quantitationFile name=\"quantitation.tsv\"/>
#{minimum_threshold_string}
</SUMmOnCondition>"

File.open(condition_file, "w") { |f| f.write(condition_contents) }
print condition_contents

# Peptide-level parser first, then the protein-level ratio parser, both using
# the same condition file.
command = "#{genv.librapeptideparser} '#{pepxml_path}' -c#{condition_file}; #{genv.libraproteinratioparser} '#{protxml_path}' -c#{condition_file}"
%x[#{command}]
|
data/bin/toppas_pipeline.rb
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
#
# This file is part of protk
# Created by Ira Cooke 30/01/13
#
# A wrapper for the OpenMS tool ExecutePipeline.
# Executes simple toppas pipelines, automatically creating the trf file.

require 'protk/constants'
require 'protk/command_runner'
require 'protk/tool'
require 'protk/openms_defaults'
require 'tempfile'
require 'pathname' # Pathname is used below; do not rely on another library loading it
require 'libxml'

include LibXML

tool = Tool.new([:background, :over_write])
tool.option_parser.banner = "Execute a toppas pipeline with a single inputs node\n\nUsage: toppas_pipeline.rb [options] input1 input2 ..."

tool.options.outdir = ""
tool.option_parser.on('--outdir dir', "save outputs to dir") do |dir|
  tool.options.outdir = dir
end

tool.options.toppas_file = ""
tool.option_parser.on('--toppas-file f', "the toppas file to run") do |file|
  tool.options.toppas_file = file
end

tool.options.threads = "1"
tool.option_parser.on('--threads t', "Number of threads to use") do |tr|
  tool.options.threads = tr
end

exit unless tool.check_options

if ARGV[0].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

# Obtain a global environment object
genv = Constants.new

# Runs cmd through tool.run, passing a job script path derived from
# output_path, and fails loudly on a non-zero exit code.
#
# Raises RuntimeError when the command's exit code is not 0. (The original
# used `throw` with a String, which is not an exception and only surfaced as
# an "uncaught throw" error.)
def run_pipeline(genv, tool, cmd, output_path, jobid)
  jobscript_path = "#{output_path}.pbs.sh"
  job_params = { :jobid => jobid, :vmem => "14Gb", :queue => "sixteen" }
  code = tool.run(cmd, genv, job_params, jobscript_path)
  raise "Command failed with exit code #{code}" unless code == 0
end

# Loads the template resource file given by OpenMSDefaults#trf_path, appends
# one LISTITEM (a file:// URL of the real path) per input file to the first
# /PARAMETERS/NODE/ITEMLIST node, and saves the document to out_path.
def generate_trf(input_files, out_path)
  template_path = OpenMSDefaults.new.trf_path
  p template_path
  doc = XML::Parser.file(template_path).parse
  itemlist_node = doc.find('/PARAMETERS/NODE/ITEMLIST')[0]

  input_files.each do |f|
    mnode = XML::Node.new('LISTITEM')
    mnode["value"] = "file://#{Pathname.new(f).realpath.to_s}"
    itemlist_node << mnode
  end

  p out_path
  doc.save(out_path)
end

raise "outdir is a required parameter" if tool.outdir == ""
raise "toppas-file is a required parameter" if tool.toppas_file == ""
raise "outdir must exist" unless Dir.exist?(tool.outdir)

# A Tempfile is created only to borrow its unique basename; the .trf file
# itself is written relative to the current directory.
trf_path = "#{Pathname.new(Tempfile.new(tool.toppas_file).path).basename.to_s}.trf"

generate_trf(ARGV, trf_path)

# Two shell statements separated by ";" and a newline: extend the library
# search path, then run ExecutePipeline with absolute paths.
cmd = "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
#{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s} -threads #{tool.threads}"

run_pipeline(genv, tool, cmd, tool.outdir, tool.jobid_from_filename(tool.toppas_file))
|
84
|
-
|
data/bin/uniprot_annotation.rb
DELETED
@@ -1,141 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
#
# This file is part of Protk
# Created by Ira Cooke 24/3/2013
#
# Retrieve annotation information for proteins from the Uniprot Swissprot database
#
# Reads a tab separated input file, looks up the accession found in the chosen
# column of every line and writes one tab separated row of annotation values
# per entry that was found, preceded by a header row.
#
require 'protk/constants'
require 'protk/command_runner'
require 'protk/tool'
require 'protk/swissprot_database'
require 'protk/bio_sptr_extensions'

# Setup specific command-line options for this tool. Other options are inherited from Tool
#
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Retrieve information from the Uniprot database given a list of ID's.\n\n\
Usage: uniprot_annotation.rb [options] input.tsv"

tool.options.id_column = 1
tool.option_parser.on('--id-column num', 'Specify a column for ids (default is column 1)') do |col|
  tool.options.id_column = col.to_i
end

tool.options.flatfiledb = "swissprot"
tool.option_parser.on('--flatfiledb dbname', 'Specify path to a Uniprot flatfile') do |dbname|
  tool.options.flatfiledb = dbname
end

tool.options.fields = nil
tool.option_parser.on('--fields flds', 'A comma separated list of fields to extract') do |flds|
  tool.options.fields = flds
end

exit unless tool.check_options

if ARGV[0].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

# Obtain a global environment object
genv = Constants.new

input_file = ARGV[0]

swissprotdb = SwissprotDatabase.new(genv, tool.flatfiledb)

# Write to stdout unless an explicit output path was given.
writes_to_stdout = tool.explicit_output.nil?
output_file = writes_to_stdout ? $stdout : File.open(tool.explicit_output, 'w+')

# The command line option is 1-based; array indexing is 0-based.
ac_column = tool.id_column - 1

# Method name sent to each database entry => column heading.
db_fields = {
  'recname' => "Primary Name",
  'cd' => "CD Antigen Name",
  'altnames' => "Alternate Names",
  'location' => "Subcellular Location",
  'function' => "Known Function",
  'similarity' => "Similarity",
  'tissues' => "Tissue Specificity",
  'disease' => "Disease Association",
  'domain' => "Domain",
  'subunit' => "Sub Unit",
  'nextbio' => "NextBio",
  'ipi' => "IPI",
  'intact' => "Interactions",
  'pride' => 'Pride',
  'ensembl' => 'Ensembl',
  'num_transmem' => "Transmembrane Regions",
  'signalp' => 'Signal Peptide',
  'ref_dump' => 'References',
  'tax_dump' => 'Taxonomy Cross Ref',
  'species_dump' => 'Species',
  'feature_dump' => 'Feature Table',
  'seq_dump' => 'AA Sequence'
}

hyperlink_fields = {
  'uniprot_link' => "Uniprot Link",
  'nextbio_link' => 'NextBio Link',
  'intact_link' => "Interactions Link",
  'pride_link' => "Pride Link",
  'ensembl_link' => "Ensembl Link"
}

# Restrict both tables to the requested fields. Because only keys already
# present in the tables survive, the `send` calls below can never be handed
# an arbitrary method name from the command line.
unless tool.fields.nil?
  fields = tool.fields.split(",").map { |f| f.strip }.reject { |e| e.empty? }
  db_fields = db_fields.select { |k, _heading| fields.include? k }
  hyperlink_fields = hyperlink_fields.select { |k, _heading| fields.include? k }
end

begin
  # Header row
  output_file.write db_fields.values.join("\t")
  if hyperlink_fields.count > 0
    output_file.write("\t")
    output_file.write hyperlink_fields.values.join("\t")
  end
  output_file.write("\n")

  line_num = 0
  File.foreach(input_file) do |line|
    input_cols = line.split("\t")
    # `raise` rather than the original `throw`: a String passed to throw is not
    # an exception and only surfaced as an "uncaught throw" error.
    raise "Not enough columns in line #{line_num}" unless input_cols.count > ac_column
    accession = input_cols[ac_column].chomp

    sptr_entry = swissprotdb.get_entry_for_name(accession)

    if sptr_entry.nil?
      # Missing entries are logged and produce no output row.
      genv.log("No entry for #{accession} in uniprot database", :warn)
    else
      db_values = db_fields.map { |key, _heading| sptr_entry.send(key) }
      hyperlink_values = hyperlink_fields.map { |key, _heading| sptr_entry.send(key) }

      output_file.write db_values.join("\t")
      if hyperlink_fields.count > 0
        output_file.write("\t")
        output_file.write hyperlink_values.join("\t")
      end
      output_file.write "\n"
    end

    line_num += 1
  end
ensure
  # Flush and release an explicitly opened file; never close stdout.
  output_file.close unless writes_to_stdout
end
|
139
|
-
|
140
|
-
|
141
|
-
|
data/bin/xls_to_table.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
#
# This file is part of protk
# Created by Ira Cooke 18/1/2011
#
# Converts an Excel Spreadsheet to a tab delimited table
#
# Only the first worksheet is converted. Without an explicit output path the
# result is written next to the input as "<input>.csv".
#

require 'protk/constants'
require 'protk/command_runner'
require 'protk/tool'
require 'spreadsheet'

# Setup command-line options for this tool.
#
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Convert an xls file to a tab delimited table.\n\nUsage: xls_to_table.rb [options] file1.xls"

exit unless tool.check_options

if ARGV[0].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

input_file = ARGV[0]

output_file = tool.explicit_output
output_file = "#{input_file}.csv" if output_file.nil?

# The block form guarantees the output handle is closed even when opening or
# reading the workbook raises.
File.open(output_file, 'w') do |output_fh|
  # Open the original excel workbook for reading
  Spreadsheet.client_encoding = 'UTF-8'
  input_book = Spreadsheet.open "#{input_file}"
  input_sheet = input_book.worksheet 0

  input_sheet.each do |row|
    # One line per row: cell values (nil becomes "") separated by tabs.
    cells = []
    row.each { |colv| cells << "#{colv}" }
    output_fh.write "#{cells.join("\t")}\n"
  end
end
|
51
|
-
|
52
|
-
|
data/bin/xpress.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
#
# Created by John Chilton
#
# Run XPRESS against protein prophet results.
#
# The first argument is the protXML path; every remaining argument is passed
# through unchanged to the XPRESS peptide parser.
#

require 'protk/constants'
require 'protk/protxml'
require 'protk/galaxy_util'

stage_for_galaxy = GalaxyUtil.for_galaxy?

protxml_path = ARGV.shift
protxml_path = GalaxyUtil.stage_protxml(protxml_path) if stage_for_galaxy

# The pepXML path is obtained from the protXML document itself.
pepxml_path = ProtXML.new(protxml_path).find_pep_xml()

genv = Constants.new

# Peptide-level parser first, then the protein-level ratio parser.
peptide_step = "#{genv.xpresspeptideparser} '#{pepxml_path}' #{ARGV.join(" ")}"
protein_step = "#{genv.xpressproteinratioparser} '#{protxml_path}'"
command = "#{peptide_step} ; #{protein_step}"
`#{command}`
|
@@ -1,60 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'spreadsheet'
|
3
|
-
|
4
|
-
|
5
|
-
# Extracts protein accessions and scores from the first worksheet of an Excel
# workbook in which a "Digest Matches ... Score: <score>)" line follows the
# line that names the protein.
class BioToolsExcelConverter
  # Marks a digest summary cell; capture 1 is the score text.
  DIGEST_MATCHES_PATTERN = /Digest Matches.*?Score:\s(.*)\)/

  # filename - path of the workbook to read.
  def initialize(filename)
    @inputBook = Spreadsheet.open File.new("#{filename}")
  end

  # Returns true when any row of the first worksheet has a String first cell
  # matching the digest pattern, false otherwise.
  def self.isBiotools(filename)
    testBook = Spreadsheet.open File.new("#{filename}")
    testSheet = testBook.worksheet 0

    # Literal true/false: the TRUE/FALSE constants used originally no longer
    # exist in current Ruby.
    isbiotools = false
    testSheet.each do |row|
      isbiotools = true if row[0].is_a?(String) && row[0].match(DIGEST_MATCHES_PATTERN)
    end

    isbiotools
  end

  # Returns an Array of rows. The first is the header ["Accession", "Ion Scores"];
  # each following row is [accession, score] for one digest line, where the
  # accession is the last whitespace separated token of the preceding row's
  # first cell.
  #
  # Raises ArgumentError when no accession can be parsed from the preceding row.
  def get_rows
    sheet = @inputBook.worksheet 0
    n_rows = sheet.dimensions[1]

    protein_rows = (0...n_rows).map do |row_i|
      first_cell = sheet.row(row_i)[0]
      # Non-String cells (numbers, nil) cannot be digest lines and have no #match.
      next unless first_cell.is_a?(String)

      digmatch = first_cell.match(DIGEST_MATCHES_PATTERN)
      next if digmatch.nil?

      # The protein name sits on the row above; the very first row has none
      # (an index of -1 must not be looked up).
      text = row_i.zero? ? nil : sheet.row(row_i - 1)[0]
      m = text.is_a?(String) ? text.match(/\s(\S*)\s*$/) : nil
      # `raise` rather than the original `throw`, which is not an error mechanism.
      raise ArgumentError, "Badly formed protein line in biotools file ... could not parse protein name from #{text}" if m.nil?

      [m[1], digmatch[1]]
    end.compact

    protein_rows.insert(0, ["Accession", "Ion Scores"])
  end
end
|
@@ -1,158 +0,0 @@
|
|
1
|
-
# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
|
2
|
-
# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
|
3
|
-
|
4
|
-
require 'tempfile'
|
5
|
-
|
6
|
-
# A class for extracting gene info from a particular gene from the information file
class EuPathDBGeneInformationFileExtractor
  # A filename path to the gene information file
  attr_accessor :filename

  def initialize(filename = nil)
    @filename = filename
  end

  # Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
  # there are multiple in the file, only the first is returned. If none are found, nil is returned.
  #
  # If grep_hack_lines is defined (as a non-zero integer), then a shortcut is
  # applied to speed things up: grep extracts that many lines following the
  # "Gene Id: .." line into a temporary file and only the first record of that
  # excerpt is parsed and compared.
  #
  # Raises ArgumentError when grep_hack_lines is non-zero but not an Integer.
  def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
    is_wanted = lambda { |gene| wanted_gene_id == gene.info['Gene Id'] }

    if grep_hack_lines and grep_hack_lines.to_i != 0
      raise ArgumentError, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)

      tempfile = Tempfile.new('reubypathdb_grep_hack')
      begin
        # grep however many lines from past the point. Rather dodgy, but faster.
        # Arguments are passed as a list so no shell ever interprets the gene
        # id or the file name.
        system('grep', '-A', grep_hack_lines.to_s, "Gene Id: #{wanted_gene_id}", @filename, :out => tempfile.path)
        with_gene_table(tempfile.path) do |table|
          candidate = table.first
          candidate if candidate && is_wanted.call(candidate)
        end
      ensure
        tempfile.close!
      end
    else
      # no grep hack. Parse the whole gene information file, looking at every
      # record (the original returned after inspecting only the first one).
      with_gene_table(@filename) { |table| table.find(&is_wanted) }
    end
  end

  private

  # Opens path, yields a EuPathDBGeneInformationTable reading from it and
  # returns the block's value; the file is closed afterwards. Running off the
  # end of the input (EOFError) is treated as "not found" and yields nil.
  def with_gene_table(path)
    File.open(path) { |io| yield EuPathDBGeneInformationTable.new(io) }
  rescue EOFError
    nil
  end
end
|
43
|
-
|
44
|
-
# A class for parsing the 'gene information table' files from EuPathDB, such
# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
#
# The usual way of interacting with these is the use of the each method,
# which returns a EuPathDBGeneInformation object with all of the recorded
# information in it.
class EuPathDBGeneInformationTable
  include Enumerable

  # Line that terminates one gene record.
  RECORD_SEPARATOR = '------------------------------------------------------------'

  # Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
  def initialize(io)
    @io = io
  end

  # Yields a EuPathDBGeneInformation object for each record, one at a time.
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    while (gene = next_gene)
      yield gene
    end
  end

  # Reads the next record from the IO and returns it as a
  # EuPathDBGeneInformation object, or nil when only blank lines (or nothing)
  # remain.
  #
  # Raises RuntimeError on a "key: value" section line that cannot be parsed,
  # and EOFError when the input ends in the middle of a record.
  def next_gene
    # Skip blank lines. eof? is checked before every read so that an input
    # ending directly after a record separator finishes the iteration cleanly
    # instead of raising EOFError.
    line = ''
    while line == ''
      return nil if @io.eof?
      line = @io.readline.strip
    end

    info = EuPathDBGeneInformation.new

    # first, read the "key: value" lines, which run until the first blank line
    while line != ''
      matches = line.match(/^(.*?)\: (.*)$/)
      raise "EuPathDBGeneInformationTable Couldn't parse this line: #{line}" if matches.nil?

      info.add_information(matches[1], matches[2])
      line = @io.readline.strip
    end

    # now read each of the tables, which should start with the
    # 'TABLE: <name>' entry
    line = @io.readline.strip
    table_name = nil
    headers = nil
    data = []
    while line != RECORD_SEPARATOR
      if line == ''
        # a blank line closes the current table; store it unless we are just starting out
        info.add_table(table_name, headers, data) unless table_name.nil?

        # reset things
        table_name = nil
        headers = nil
        data = []
      elsif (matches = line.match(/^TABLE\: (.*)$/))
        # name of a table
        table_name = matches[1]
      elsif line.match(/^\[.*\]/)
        # headings of the table, each wrapped in square brackets
        headers = line.split("\t").map do |header|
          header.gsub(/^\[/, '').gsub(/\]$/, '')
        end
      else
        # a proper data row
        data.push line.split("\t")
      end
      line = @io.readline.strip
    end

    # return the object that has been created
    info
  end
end
|
122
|
-
|
123
|
-
# Each gene in the gene information table is represented
# by 2 types of information - info and tables.
# info are 1 line data, whereas tables are tables of
# data with possibly multiple rows
class EuPathDBGeneInformation
  # Both stores start out empty so the readers below are safe to call on an
  # object to which nothing has been added yet.
  def initialize
    @info = {}
    @tables = {}
  end

  # Hash of every single-line "key => value" item added so far.
  def info
    @info
  end

  # Value stored under key, or nil when there is none. Also available as #[].
  def get_info(key)
    @info[key]
  end
  alias_method :[], :get_info

  # Array of row Hashes (header => cell) stored under table_name, or nil when
  # no such table was added.
  def get_table(table_name)
    @tables[table_name]
  end

  # Stores value under key, replacing any earlier value, and returns a short
  # description of what was stored.
  def add_information(key, value)
    @info[key] = value
    "Added info #{key}, now is #{@info[key]}"
  end

  # Stores a table under name, replacing any earlier table of that name.
  # Every row of data (an Array of cells) becomes a Hash keyed by the header at
  # the same position. Returns data.
  def add_table(name, headers, data)
    rows = @tables[name] = []
    data.each do |row|
      rows.push(row.each_with_index.each_with_object({}) { |(cell, i), final| final[headers[i]] = cell })
    end
  end
end
|