rbbt-sources 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,57 +8,115 @@ end
8
8
 
9
9
  if defined? Entity
10
10
 
11
- module NCINaturePathways
11
+ module NCINaturePathway
12
12
  extend Entity
13
13
  self.format = "NCI Nature Pathway ID"
14
14
 
15
+ self.annotation :organism
16
+
17
+ def self.name_index
18
+ @name_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :single)
19
+ end
20
+
21
+ def self.gene_index
22
+ @gene_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
23
+ end
24
+
25
+ def self.filter(query, field = nil, options = nil, entity = nil)
26
+ return true if query == entity
27
+
28
+ return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
29
+
30
+ false
31
+ end
15
32
  property :name => :array2single do
16
- @name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
33
+ @name ||= NCINaturePathway.name_index.values_at *self
17
34
  end
18
35
 
19
36
  property :genes => :array2single do
20
- @genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
37
+ @genes ||= NCINaturePathway.gene_index.values_at *self
21
38
  end
22
39
  end
23
40
 
24
- module NCIReactomePathways
41
+ module NCIReactomePathway
25
42
  extend Entity
26
43
  self.format = "NCI Reactome Pathway ID"
44
+
45
+ self.annotation :organism
46
+
47
+ def self.name_index
48
+ @name_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
49
+ end
50
+
51
+ def self.gene_index
52
+ @gene_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
53
+ end
54
+
55
+ def self.filter(query, field = nil, options = nil, entity = nil)
56
+ return true if query == entity
57
+
58
+ return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
59
+
60
+ false
61
+ end
27
62
 
28
63
  property :name => :array2single do
29
- @name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
64
+ @name ||= NCIReactomePathway.name_index.values_at *self
30
65
  end
31
66
 
32
67
  property :genes => :array2single do
33
- @genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
68
+ @genes ||= NCIReactomePathway.gene_index.values_at *self
34
69
  end
35
70
  end
36
71
 
37
- module NCIBioCartaPathways
72
+ module NCIBioCartaPathway
38
73
  extend Entity
39
74
  self.format = "NCI BioCarta Pathway ID"
40
75
 
76
+ self.annotation :organism
77
+
78
+ def self.name_index
79
+ @name_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :single)
80
+ end
81
+
82
+ def self.gene_index
83
+ @gene_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true)
84
+ end
85
+
86
+ def self.filter(query, field = nil, options = nil, entity = nil)
87
+ return true if query == entity
88
+
89
+ return true if self.setup(entity.dup, options.merge(:format => field)).name.index query
90
+
91
+ false
92
+ end
93
+
41
94
  property :name => :array2single do
42
- @name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
95
+ @name ||= NCIBioCartaPathway.name_index.values_at *self
43
96
  end
44
97
 
45
98
  property :genes => :array2single do
46
- @genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
99
+ @genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
100
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }
47
101
  end
48
102
  end
49
103
 
50
104
  if defined? Gene and Entity === Gene
51
105
  module Gene
52
106
  property :nature_pathways => :array2single do
53
- @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
107
+ @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
108
+ values_at(*self.to("UniProt/SwissProt Accession")).
109
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
54
110
  end
55
111
 
56
112
  property :reactome_pathways => :array2single do
57
- @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
113
+ @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
114
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
58
115
  end
59
116
 
60
117
  property :biocarta_pathways => :array2single do
61
- @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
118
+ @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at(*self.entrez).
119
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIBioCartaPathway.setup(o, organism)}
62
120
  end
63
121
  end
64
122
  end
@@ -0,0 +1,142 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ module Cath
4
+ extend Resource
5
+
6
# Produces the CathNames resource: a list-type TSV keyed by "CATH Code" whose
# values are the lower-cased four-character PDB id, the three-character
# domain suffix and the description text that follows the ':' separator.
Rbbt.claim Rbbt.share.databases.CATH.CathNames, :proc do
  tsv = TSV.setup({}, :key_field => "CATH Code", :type => :list, :fields => ["PDB ID", "CATH Domain", "CATH Description"])

  Open.read("http://release.cathdb.info/v3.4.0/CathNames").split(/\n/).each do |line|
    next if line =~ /^#/

    # Blank or malformed lines do not match the record layout; skip them
    # instead of calling values_at on nil.
    record = line.match(/([\d\.]+)\s+(\w\w\w\w)(\w\w\w)\s+:(.*)/)
    next if record.nil?

    code, pdb, domain, name = record.values_at 1, 2, 3, 4
    tsv[code] = [pdb.downcase, domain, name]
  end

  tsv.to_s
end
16
+
17
# Produces the CathUnclassifiedList resource: the first whitespace-delimited
# token of every non-comment line of the upstream list, one token per line.
Rbbt.claim Rbbt.share.databases.CATH.CathUnclassifiedList , :proc do
  lines = Open.read("http://release.cathdb.info/v3.4.0/CathUnclassifiedList").split(/\n/)

  # Filter comments before mapping: using `next` inside collect leaves a nil
  # per skipped line, and nils are joined as empty lines in the output.
  tokens = lines.reject{|line| line =~ /^#/ }.collect{|line| line.split(/\s/).first }

  # Blank lines yield nil and lines with leading whitespace yield "".
  tokens.compact.reject{|token| token.empty? } * "\n"
end
23
+
24
+
25
# Produces the CathDomainSeqs resource: a single-type TSV mapping the header
# text that follows ">pdb|" in the upstream FASTA file to its sequence.
Rbbt.claim Rbbt.share.databases.CATH.CathDomainSeqs, :proc do
  tsv = TSV.setup({}, :key_field => "CATH Domain", :type => :single, :fields => ["Cath Domain Sequence"])

  Open.read("http://release.cathdb.info/v3.4.0/CathDomainSeqs.ATOM").split(/>pdb\|/).each do |chunk|
    # First line is the record header, everything after it is sequence.
    header, *sequence_lines = chunk.strip.split(/\n/)

    # Skips the empty chunk before the first record as well as records
    # that carry no sequence line (these used to fail on a nil match).
    next if header.nil? or sequence_lines.empty?

    # Join every sequence line so a wrapped record is not truncated to its
    # first line.
    tsv[header] = sequence_lines.collect{|part| part.strip } * ""
  end

  tsv.to_s
end
36
+
37
+
38
# Produces the CathRegions resource: a double-type TSV keyed by domain name
# holding two parallel lists per domain, the segment starts and segment ends
# parsed from the upstream CathDomall file.
Rbbt.claim Rbbt.share.databases.CATH.CathRegions, :proc do
  regions = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["Start", "End"])

  Open.read("http://release.cathdb.info/v3.4.0/CathDomall").split(/\n/).each do |line|
    next if line =~ /^#/

    # Five-character chain name, D<n>, F<n>, then the still unparsed tail.
    # `rest` is consumed piece by piece by the two loops below.
    chain, ndomains, _nfragments, rest = line.match(/(\w\w\w\w\w)\s+D(\d+)\s+F(\d+)\s+(.*)/).values_at 1,2,3,4

    (0...ndomains.to_i).each do |domain_number|
      nsegments, rest = rest.match(/^\s*(\d+)\s+(.*)/).values_at 1, 2

      boundaries = (0...nsegments.to_i).collect do
        first, last, rest = rest.match(/\w\s+(-?\d+)\s+.\s+\w\s+(-?\d+)\s+.(.*)/).values_at 1, 2, 3
        [first, last]
      end

      # NOTE(review): the suffix is the zero-based loop index, so the first
      # domain of every chain is named <chain>00 -- confirm this agrees with
      # the domain names used by the other CATH resources.
      name = chain + ("%02d" % domain_number)

      # Turn [[start, end], ...] into [[starts...], [ends...]].
      regions[name] = boundaries.first.zip(*boundaries.drop(1))
    end
  end

  regions.to_s
end
60
+
61
# Produces the CathDomainList resource: a list-type TSV keyed by the domain
# name (first column of the upstream file) with the remaining
# whitespace-separated columns as values.
Rbbt.claim Rbbt.share.databases.CATH.CathDomainList, :proc do
  # The domain-name column is shifted off each row and used as the key, so
  # it must not also appear in the field list; keeping it there labelled
  # every value with the name of the column before it.
  fields = [
    "Class number", "Architecture number", "Topology number", "Homologous superfamily number",
    "S35 sequence cluster number", "S60 sequence cluster number", "S95 sequence cluster number",
    "S100 sequence cluster number", "S100 sequence count number",
    "Domain length", "Structure resolution (Angstroms)"
  ]

  # :type was given twice (:double, then :list); the later value is the one
  # a Ruby hash literal keeps, so only :list is passed now.
  domains = TSV.setup({}, :key_field => "Cath Domain", :type => :list, :fields => fields)

  Open.read("http://release.cathdb.info/v3.4.0/CathDomainList").split(/\n/).each do |line|
    next if line =~ /^#/

    parts = line.chomp.split(/\s+/)
    domain = parts.shift
    next if domain.nil? # blank line

    domains[domain] = parts
  end

  domains.to_s
end
76
+
77
+
78
# Returns the CathNames resource opened as a persisted, case-insensitive TSV.
# The table is opened on first use and cached in @@cath afterwards.
def self.cath_index
  unless defined?(@@cath) and @@cath
    @@cath = Rbbt.share.databases.CATH.CathNames.tsv(:persist => true, :case_insensitive => true)
  end
  @@cath
end
81
+
82
# Returns a hash from the first four characters of each CathDomainSeqs key
# to the list of keys sharing that prefix. Built once and cached in @@pdb.
def self.pdb_index
  if not defined? @@pdb or @@pdb.nil?
    # Build into a local first: assigning @@pdb before reading meant a failed
    # read left an empty or partial index cached for every later call.
    index = {}

    Rbbt.share.databases.CATH.CathDomainSeqs.read.split("\n").each do |line|
      # The resource is written with TSV#to_s; presumably its header lines
      # start with '#'. Skipping them (and blank lines) keeps non-domain
      # text out of the index and is harmless if there are none.
      next if line.empty? or line =~ /^#/

      domain = line.split(/\t/).first
      next if domain.nil?

      (index[domain[0..3]] ||= []) << domain
    end

    @@pdb = index
  end
  @@pdb
end
94
+
95
# Returns a hash from the first four characters of each entry of the
# CathUnclassifiedList resource to the list of entries sharing that prefix.
# Built once and cached in @@unclassified, mirroring pdb_index; before, the
# file was re-read and re-parsed on every call.
def self.unclassified
  if not defined? @@unclassified or @@unclassified.nil?
    index = {}

    Rbbt.share.databases.CATH.CathUnclassifiedList.read.split("\n").each do |domain|
      # Blank lines would otherwise be indexed under the key "".
      next if domain.empty?

      (index[domain[0..3]] ||= []) << domain
    end

    @@unclassified = index
  end
  @@unclassified
end
104
+
105
# Returns the CathDomainSeqs resource opened as a persisted TSV. The table is
# opened on first use and cached in @@domain_sequences afterwards.
def self.domain_sequences
  unless defined?(@@domain_sequences) and @@domain_sequences
    @@domain_sequences = Rbbt.share.databases.CATH.CathDomainSeqs.tsv(:persist => true)
  end
  @@domain_sequences
end
108
+
109
# Returns the "PDB ID" value stored for cath_code in cath_index, or nil when
# the index has no entry for that code.
def self.pdbs(cath_code)
  index = cath_index
  return nil unless index.include? cath_code

  index[cath_code]["PDB ID"]
end
117
+
118
# Returns every domain recorded for pdb: the entries from pdb_index followed
# by the entries from unclassified. Missing keys contribute an empty list, so
# the result is always an array.
def self.domains_for_pdb(pdb)
  with_sequence = pdb_index[pdb] || []
  without_class = unclassified[pdb] || []

  with_sequence + without_class
end
122
+
123
# Aligns sequence against the stored sequence of domain by running the
# external `fasta35` command on two temporary FASTA files.
#
# Returns nil when domain has no stored sequence. Otherwise the blocks yield
# {:identity => Float, :range => Range} built from the first
# "<n>% identity ... overlap (<a>-<b>:" line of the command output, or false
# when the output contains no such line.
def self.align(domain, sequence)
  # Kept from the original; nothing in this method references Bio directly.
  require 'bio'

  return nil if not domain_sequences.include? domain

  # Interpolate instead of appending with << so the literals are not mutated.
  target_fasta = ">target\n#{ sequence }"
  domain_fasta = ">domain\n#{ domain_sequences[domain] }"

  # Block parameters are named *_file so they no longer shadow `domain`.
  TmpFile.with_file(target_fasta) do |target_file|
    TmpFile.with_file(domain_fasta) do |domain_file|

      result = CMD.cmd("fasta35 #{ target_file } #{ domain_file }").read

      # The /s flag was dropped: in Ruby it selects the Windows-31J encoding,
      # it is not Perl's dot-matches-newline. The pattern still has to match
      # within a single line, exactly as before.
      summary = result.match(/([\d\.]+)% identity.*overlap \((\d+)-(\d+):/)

      if summary
        {:identity => summary[1].to_f, :range => (summary[2].to_i..summary[3].to_i)}
      else
        false
      end
    end
  end
end
141
+ end
142
+
@@ -91,22 +91,33 @@ if defined? Entity
91
91
  extend Entity
92
92
  self.format = "GO ID"
93
93
 
94
+ self.annotation :organism
95
+
94
96
  property :name => :array2single do
95
97
  @name ||= GO.id2name(self)
96
98
  end
97
99
 
98
100
  property :genes => :array2single do |organism|
101
+ organism ||= self.organism
99
102
  @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
100
103
  end
104
+
105
# Definition text of the GO term taken from GO.info[self]['def'], with double
# quotes and bracketed sections removed; nil when the entry has no 'def'.
property :description => :single2array do
  definition = GO.info[self]['def']

  # Non-destructive gsub: gsub! edited the string handed back by GO.info in
  # place, which would also change what GO.info holds if it returns its
  # stored object.
  definition.nil? ? nil : definition.gsub(/"|\[.*\]/, '')
end
111
+
101
112
  end
102
113
 
103
114
  if defined? Gene and Entity === Gene
104
115
  module Gene
105
- property :go_terms => :array2single do |organism|
116
+ property :go_terms => :array2single do
106
117
  @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
107
118
  end
108
-
109
- property :go_bp_terms => :array2single do |organism|
119
+
120
+ property :go_bp_terms => :array2single do
110
121
  @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
111
122
  end
112
123
  end
@@ -7,7 +7,7 @@ module Organism
7
7
  self.pkgdir = "rbbt"
8
8
  self.subdir = "share/organisms"
9
9
 
10
- ["Hsa", "Rno", "Sce"].each do |organism|
10
+ ["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
11
11
  claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
12
12
 
13
13
  module_eval "#{ organism } = with_key '#{organism}'"
@@ -0,0 +1,35 @@
1
+ require 'rbbt'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/resource'
4
+
5
# Pfam resource: claims the Pfam-A clans table and resolves domain
# identifiers to names through the InterPro pfam_names file.
module Pfam
  extend Resource
  self.subdir = "share/databases/Pfam"

  # Downloads the current Pfam-A clans table and re-serializes it keyed by
  # "Pfam Domain ID".
  Pfam.claim Pfam.domains, :proc do
    url = "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
    tsv = TSV.open(Open.open(url), :key_field => "Pfam Domain ID", :fields => ["Pfam Clan ID", "Code Name", "Name", "Description"])
    tsv.to_s
  end

  NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find

  # Single-type TSV opened from NAMES_FILE; opened once and cached.
  def self.name_index
    @name_index ||= TSV.open NAMES_FILE, :single
  end

  # Returns the value stored for id in name_index.
  #
  # This method replaces Module#name on Pfam. With a required argument a
  # plain `Pfam.name` raised ArgumentError, so the argument is now optional
  # and the no-argument call falls through to the inherited implementation.
  def self.name(id = nil)
    return super() if id.nil?

    name_index[id]
  end
end
25
+
26
if defined? Entity
  # Entity wrapper for values in the "Pfam Domain" format. Only defined when
  # Entity has already been loaded.
  module PfamDomain
    extend Entity
    self.format = "Pfam Domain"

    # One Pfam.name lookup per identifier in the array, in the same order.
    property :name => :array2single do
      collect do |domain_id|
        Pfam.name(domain_id)
      end
    end
  end
end
@@ -13,11 +13,13 @@ module PubMed
13
13
 
14
14
  pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
15
15
 
16
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
16
17
  articles = []
17
- Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
18
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
19
-
20
- xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
18
+ Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
19
+ postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
20
+ xml = TmpFile.with_file(postdata) do |postfile|
21
+ Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
22
+ end
21
23
 
22
24
  articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
23
25
  end
@@ -202,14 +204,8 @@ module PubMed
202
204
  }
203
205
 
204
206
  return list unless missing.any?
205
- chunk_size = [100, missing.length].min
206
- chunks = (missing.length.to_f / chunk_size).ceil
207
207
 
208
- articles = {}
209
- chunks.times do |chunk|
210
- pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
211
- articles.merge!(get_online(pmids))
212
- end
208
+ articles = get_online(missing)
213
209
 
214
210
  articles.each{|p, xml|
215
211
  filename = p + '.xml'
@@ -45,7 +45,6 @@ module TFacts
45
45
  end
46
46
  end
47
47
 
48
-
49
48
  if defined? Entity and defined? Gene and Entity === Gene
50
49
 
51
50
  module Gene
@@ -0,0 +1,125 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/sources/cath'
4
+ require 'rbbt/sources/uniprot'
5
+
6
+ module Uniprot
7
+ extend Resource
8
+ self.subdir = "share/databases/Uniprot"
9
+
10
# Produces the annotated_variants resource from humsavar.txt: the listing is
# trimmed with tail/head/grep, each line is rebuilt as tab-separated columns,
# three-letter amino acid changes are rewritten in one-letter form, and the
# table is re-keyed by a "Mutated Isoform" column of the form
# "<accession>:<mutation>".
Uniprot.claim Uniprot.annotated_variants, :proc do
  url = "http://www.uniprot.org/docs/humsavar.txt"

  # Keeps the first six whitespace-separated tokens as separate columns and
  # folds everything after them into a single space-joined last column.
  split_columns = Proc.new do |line|
    parts = line.split(/\s+/)
    (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"
  end

  listing = CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true)

  # NOTE(review): rows are keyed by gene name with :type => :list; confirm
  # how TSV.open treats several variants of the same gene.
  tsv = TSV.open(listing,
                 :fix => split_columns, :type => :list, :key_field => "Associated Gene Name",
                 :fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])

  tsv.unnamed = true

  # p.Xxx123Yyy -> X123Y through Misc::THREE_TO_ONE_AA_CODE; values that do
  # not match the pattern are kept unchanged.
  tsv.process "Amino Acid Mutation" do |mutation|
    change = mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
    if change
      wt = Misc::THREE_TO_ONE_AA_CODE[change[1].downcase]
      mut = Misc::THREE_TO_ONE_AA_CODE[change[3].downcase]
      [wt, change[2], mut] * ""
    else
      mutation
    end
  end

  accession_index = tsv.identify_field "Uniprot/SwissProt Accession"
  change_index = tsv.identify_field "Amino Acid Mutation"

  tsv.add_field "Mutated Isoform" do |key, values|
    [values[accession_index], values[change_index]] * ":"
  end

  tsv.reorder("Mutated Isoform").to_s
end
35
+
36
+
37
+ UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
38
# Fetches the UniProt text entry for protein and returns its PDB
# cross-references as a hash keyed by lower-cased PDB id:
#   {:method, :resolution, :region => (start..end), :chains}
# Cross-references whose last field has no "<chains>=<start>-<end>" part are
# left out.
def self.pdbs(protein)
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
  text = Open.read(url)

  pdb = {}

  text.split(/\n/).each do |line|
    xref = line.match(/^DR\s+PDB; (.*)\./)
    next if xref.nil?

    id, method, resolution, region = xref[1].split(";").collect{|v| v.strip }

    # The region can be missing or a bare "-"; skip the entry instead of
    # calling values_at on a nil match. '/' is accepted in the chain list so
    # "B/E=25-147" yields "B/E" rather than only the last chain. Only the
    # first "<chains>=<start>-<end>" group is read.
    coverage = region.to_s.match(/([\w\/]+)=(\d+)-(\d+)/)
    next if coverage.nil?

    chains, start, eend = coverage.values_at(1, 2, 3)
    pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
  end

  pdb
end
52
+
53
# Fetches the UniProt text entry for protein and returns one hash per VARIANT
# feature that could be parsed:
#   {:start, :end, :ref, :mut, :desc, :id}
# :start and :end are the position strings; :ref and :mut are single letters
# for "X -> Y" substitutions and nil for any other kind of variant.
def self.variants(protein)
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
  text = Open.read(url)

  feature_table = text.split(/\n/).select{|line| line =~ /^FT/ } * "\n"

  # Splitting on the captured feature key gives [prefix, key, body, key,
  # body, ...]. Exactly three spaces are required after "FT", which is how a
  # line that opens a feature differs from a continuation line (those pad
  # with more spaces before the text).
  parts = feature_table.split(/^(FT {3}\w+)/)
  parts.shift

  variants = []

  parts.each_slice(2) do |type, body|
    next if body.nil? or type !~ /VARIANT/

    value = body.gsub("\nFT", '').gsub(/\s+/, ' ')

    # 291 291 K -> E (in sporadic cancers; somatic mutation). /FTId=VAR_045413.
    # The id capture stops at whitespace or '.', so the period that closes
    # the feature is no longer part of :id.
    if (substitution = value.match(/(\d+) (\d+) ([A-Z])\s*\-\>\s*([A-Z]) (.*)\. \/FTId=([^\s.]+)/))
      start, eend, ref, mut, desc, id = substitution.values_at(1, 2, 3, 4, 5, 6)
    elsif (other = value.match(/(\d+) (\d+) (.*)\. \/FTId=([^\s.]+)/))
      start, eend, desc, id = other.values_at(1, 2, 3, 4)
      ref = mut = nil
    else
      # Skip unparsed features; they used to be appended as all-nil records.
      Log.debug "Value not understood: #{ value }"
      next
    end

    variants << {
      :start => start,
      :end => eend,
      :ref => ref,
      :mut => mut,
      :desc => desc,
      :id => id,
    }
  end

  variants
end
98
+
99
+
100
# Fetches the UniProt text entry for protein and returns its Gene3D
# cross-references as a hash keyed by the identifier after "G3DSA:", each
# mapped to {:description, :cuantity} taken from the second and third
# ';'-separated fields of the line.
def self.cath(protein)
  text = Open.read(UNIPROT_TEXT.sub("[PROTEIN]", protein))

  text.split(/\n/).inject({}) do |cath, line|
    if line =~ /^DR\s+Gene3D; G3DSA:(.*)\./
      id, description, cuantity = $1.split(";").collect{|v| v.strip}
      cath[id] = {:description => description, :cuantity => cuantity}
    end
    cath
  end
end
112
+
113
# Returns the flattened list of Cath.domains_for_pdb results for every PDB id
# that pdbs reports for protein.
def self.cath_domains(protein)
  pdb_ids = pdbs(protein).keys.uniq
  per_structure = pdb_ids.collect{|pdb_id| Cath.domains_for_pdb(pdb_id) }

  per_structure.flatten.compact
end
119
+
120
# Selects, from Uniprot.pdbs(protein), the entries whose :region range
# includes aa_position.
def self.pdbs_covering_aa_position(protein, aa_position)
  structures = Uniprot.pdbs(protein)

  structures.select{|pdb, info| info[:region].include?(aa_position) }
end
125
+ end
@@ -5,6 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
6
  $taxs = [9606]
7
7
  $scientific_name = "Homo sapiens"
8
+ $ortholog_key = "human_ensembl_gene"
8
9
 
9
10
  $biomart_db = 'hsapiens_gene_ensembl'
10
11
  $biomart_db_germline_variation = 'hsapiens_snp'
@@ -97,9 +98,5 @@ $biomart_go_2009= [
97
98
  ["GO CC ID", 'go_cellular_component_id'],
98
99
  ]
99
100
 
100
- $biomart_pfam= [
101
- ["Pfam Domain", 'pfam'],
102
- ]
103
-
104
101
  $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
105
102
  load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -0,0 +1,57 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
+
6
+ $taxs = [10090]
7
+ $scientific_name = "Mus musculus"
8
+ $ortholog_key = "mouse_ensembl_gene"
9
+
10
+ $biomart_db = 'mmusculus_gene_ensembl'
11
+ $biomart_db_germline_variation = 'mmusculus_snp'
12
+ $biomart_db_somatic_variation = 'mmusculus_snp_som'
13
+
14
+ $biomart_lexicon = [
15
+ [ 'Associated Gene Name' , "external_gene_id"],
16
+ [ 'HGNC symbol', "hgnc_symbol" ],
17
+ [ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
18
+ [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
19
+ ]
20
+
21
+ $biomart_protein_identifiers = [
22
+ [ 'Protein ID', "protein_id" ],
23
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
24
+ [ 'Unigene ID', "unigene" ],
25
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
26
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
27
+ ]
28
+
29
+ $biomart_probe_identifiers = [
30
+ ]
31
+
32
+ $biomart_identifiers = [
33
+ [ 'Entrez Gene ID', "entrezgene"],
34
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
35
+ [ 'Associated Gene Name', "external_gene_id" ],
36
+ [ 'CCDS ID', "ccds" ],
37
+ [ 'Protein ID', "protein_id" ],
38
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
39
+ [ 'Unigene ID', "unigene" ],
40
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
41
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
42
+ [ 'EMBL (Genbank) ID' , "embl"] ,
43
+ ]
44
+
45
+ $biomart_go= [
46
+ ["GO ID", 'go_id'],
47
+ ["GO Namespace", 'namespace_1003'],
48
+ ]
49
+
50
+ $biomart_go_2009= [
51
+ ["GO BP ID", 'go_biological_process_id'],
52
+ ["GO MF ID", 'go_molecular_function_id'],
53
+ ["GO CC ID", 'go_cellular_component_id'],
54
+ ]
55
+
56
+ $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
57
+ load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -9,6 +9,7 @@ $scientific_name = "Rattus norvegicus"
9
9
  $biomart_db = 'rnorvegicus_gene_ensembl'
10
10
  $biomart_db_germline_variation = 'rnorvegicus_snp'
11
11
  $biomart_db_somatic_variation = 'rnorvegicus_snp_som'
12
+ $ortholog_key = "rat_ensembl_gene"
12
13
 
13
14
  $biomart_lexicon = [
14
15
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -8,6 +8,7 @@ $native = "SGD ID"
8
8
  $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
9
9
  $biomart_db = 'scerevisiae_gene_ensembl'
10
10
  $biomart_main = ['Entrez Gene ID', 'entrezgene']
11
+ $ortholog_key = "yeast_ensembl_gene"
11
12
 
12
13
 
13
14
  file 'scientific_name' do |t|
@@ -1,3 +1,5 @@
1
+ require 'net/ftp'
2
+
1
3
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
2
4
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
3
5
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -56,7 +58,9 @@ $biomart_exon_phase = [
56
58
  ['Phase','phase'],
57
59
  ]
58
60
 
59
-
61
+ $biomart_pfam= [
62
+ ["Pfam Domain", 'pfam'],
63
+ ]
60
64
 
61
65
  $biomart_exons = [
62
66
  $biomart_ensembl_gene,
@@ -71,6 +75,12 @@ file 'scientific_name' do |t|
71
75
  File.open(t.name, 'w') do |f| f.write $scientific_name end
72
76
  end
73
77
 
78
# Writes the value of $ortholog_key to the 'ortholog_key' file; fails with an
# explanatory message when the organism Rakefile has not set the global.
file 'ortholog_key' do |t|
  # A global that was never assigned reads as nil, so one nil check covers
  # both the "not defined" and the "set to nil" case.
  if $ortholog_key.nil?
    raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'"
  end

  File.open(t.name, 'w'){|f| f.write $ortholog_key }
end
83
+
74
84
  file 'identifiers' do |t|
75
85
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
76
86
  identifiers.unnamed = true
@@ -456,6 +466,49 @@ file 'chromosomes' do |t|
456
466
  File.open(t.name, 'w') do |f| f.puts goterms end
457
467
  end
458
468
 
469
# Builds chromosome_<name>: downloads the matching chromosome FASTA from the
# Ensembl FTP site and writes the sequence with the header line and all
# whitespace removed.
#
# The Ensembl release is picked from the working directory name when it looks
# like an archive (three letters plus four digits); otherwise it is read from
# the Ensembl FTP index page.
rule /^chromosome_.*/ do |t|
  chr = t.name.match(/chromosome_(.*)/)[1]

  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil

  release = case archive
            when "may2009"
              "release-54"
            when "jun2011"
              "release-64"
            when nil
              Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
            else
              # Fail here rather than chdir into "pub//fasta/" with a nil release.
              raise "No Ensembl release is known for archive: #{ archive }"
            end

  ftp = Net::FTP.new("ftp.ensembl.org")
  begin
    ftp.login
    ftp.chdir("pub/#{ release }/fasta/")
    # gsub: scientific names can have more than two words.
    ftp.chdir($scientific_name.downcase.gsub(" ", '_'))
    ftp.chdir('dna')

    # Escape the chromosome name so it is matched literally.
    file = ftp.nlst.select{|name| name =~ /chromosome\.#{ Regexp.escape(chr) }\.fa/ }.first

    raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?

    Log.debug("Downloading chromosome sequence: #{ file }")
    TmpFile.with_file do |tmpfile|
      ftp.getbinaryfile(file, tmpfile)
      Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
    end
  ensure
    # Close the connection on every path, including the failures above.
    ftp.close
  end
end
500
+
501
# possible_ortholog_<organism>: flat table from Ensembl Gene ID to the BioMart
# attribute named "inter_paralog_" plus the other organism's ortholog key.
rule /^possible_ortholog_(.*)/ do |t|
  target_organism = t.name.match(/ortholog_(.*)/)[1]
  attribute = "inter_paralog_" + Organism.ortholog_key(target_organism).produce.read

  BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", attribute]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
end

# ortholog_<organism>: flat table from Ensembl Gene ID to the BioMart
# attribute named by the other organism's ortholog key.
rule /^ortholog_(.*)/ do |t|
  target_organism = t.name.match(/ortholog_(.*)/)[1]
  attribute = Organism.ortholog_key(target_organism).produce.read

  BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", attribute]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
end
459
512
 
460
513
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
461
514
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 1.1.0
10
+ version: 1.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-11-17 00:00:00 +01:00
18
+ date: 2012-01-13 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -108,20 +108,23 @@ files:
108
108
  - lib/rbbt/sources/barcode.rb
109
109
  - lib/rbbt/sources/bibtex.rb
110
110
  - lib/rbbt/sources/biomart.rb
111
+ - lib/rbbt/sources/cath.rb
111
112
  - lib/rbbt/sources/entrez.rb
112
113
  - lib/rbbt/sources/go.rb
113
114
  - lib/rbbt/sources/gscholar.rb
114
115
  - lib/rbbt/sources/jochem.rb
115
116
  - lib/rbbt/sources/organism.rb
116
- - lib/rbbt/sources/organism/sequence.rb
117
+ - lib/rbbt/sources/pfam.rb
117
118
  - lib/rbbt/sources/polysearch.rb
118
119
  - lib/rbbt/sources/pubmed.rb
119
120
  - lib/rbbt/sources/tfacts.rb
121
+ - lib/rbbt/sources/uniprot.rb
120
122
  - lib/rbbt/sources/wgEncodeBroadHmm.rb
121
123
  - share/install/InterPro/Rakefile
122
124
  - share/install/JoChem/Rakefile
123
125
  - share/install/NCI/Rakefile
124
126
  - share/install/Organism/Hsa/Rakefile
127
+ - share/install/Organism/Mmu/Rakefile
125
128
  - share/install/Organism/Rno/Rakefile
126
129
  - share/install/Organism/Sce/Rakefile
127
130
  - share/install/Organism/organism_helpers.rb
@@ -1,612 +0,0 @@
1
- require 'rbbt/sources/organism'
2
- require 'rbbt/util/workflow'
3
- require 'bio'
4
- # Sequence analyses
5
- module Organism
6
- extend WorkFlow
7
- relative_to Rbbt, "share/organisms"
8
- self.jobdir = Rbbt.var.organism.find
9
-
10
- def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
11
- exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
12
- transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
13
-
14
- transcripts = begin
15
- exon_transcripts[exon].first
16
- rescue
17
- []
18
- end
19
-
20
- transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
21
- end
22
-
23
- def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
24
- transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
25
- transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
26
-
27
- utr5 = transcript_5utr[transcript]
28
-
29
- raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
30
-
31
- return nil if utr5 > offset
32
-
33
- sequence = transcript_sequence[transcript]
34
- raise "Sequence for transcript #{ transcript } was missing" if sequence.nil? if sequence.nil?
35
-
36
- ccds_offset = offset - utr5
37
- return nil if ccds_offset > sequence.length
38
-
39
- range = (utr5..-1)
40
- sequence = sequence[range]
41
-
42
- codon = ccds_offset / 3
43
- codon_offset = ccds_offset % 3
44
-
45
- [sequence[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
46
- end
47
-
48
- def self.codon_change(allele, codon, offset)
49
- original = Bio::Sequence::NA .new(codon).translate
50
- codon = codon.dup
51
- codon[offset] = allele
52
- new = Bio::Sequence::NA .new(codon).translate
53
- [original, new]
54
- end
55
-
56
- def self.genes_at_chromosome_positions(org, chromosome, positions)
57
- chromosome = chromosome.to_s
58
- chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
59
- tsv = file.tsv(:persistence => false, :type => :list)
60
- tsv.select("Chromosome Name" => chromosome).collect do |gene, values|
61
- [gene, values.values_at("Gene Start", "Gene End").collect{|p| p.to_i}]
62
- end
63
- end
64
-
65
- if Array === positions
66
- positions.collect{|position| pos = chromosome_bed[position]; pos.nil? ? nil : pos.first}
67
- else
68
- pos = chromosome_bed[positions];
69
- pos.nil? ? nil : pos.first
70
- end
71
- end
72
-
73
- def self.genes_at_genomic_positions(org, positions)
74
- positions = [positions] unless Array === positions.first
75
- genes = []
76
- chromosomes = {}
77
- indices = {}
78
- positions.each_with_index do |info,i|
79
- chr, pos = info
80
- chromosomes[chr] ||= []
81
- indices[chr] ||= []
82
- chromosomes[chr] << pos
83
- indices[chr] << i
84
- end
85
-
86
- chromosomes.each do |chr, pos_list|
87
- chr_genes = genes_at_chromosome_positions(org, chr, pos_list)
88
- chr_genes.zip(indices[chr]).each do |gene, index| genes[index] = gene end
89
- end
90
-
91
- genes
92
- end
93
-
94
- def self.exons_at_chromosome_positions(org, chromosome, positions)
95
- chromosome = chromosome.to_s
96
- chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
97
- tsv = file.tsv(:persistence => true, :type => :list)
98
- tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
99
- [exon, values.values_at("Exon Chr Start", "Exon Chr End").collect{|p| p.to_i}]
100
- end
101
- end
102
-
103
- if Array === positions
104
- positions.collect{|position|
105
- chromosome_bed[position];
106
- }
107
- else
108
- chromosome_bed[positions];
109
- end
110
- end
111
-
112
-
113
- def self.exons_at_genomic_positions(org, positions)
114
- positions = [positions] unless Array === positions.first
115
-
116
- exons = []
117
- chromosomes = {}
118
- indices = {}
119
- positions.each_with_index do |info,i|
120
- chr, pos = info
121
- chromosomes[chr] ||= []
122
- indices[chr] ||= []
123
- chromosomes[chr] << pos
124
- indices[chr] << i
125
- end
126
-
127
- chromosomes.each do |chr, pos_list|
128
- chr_exons = exons_at_chromosome_positions(org, chr, pos_list)
129
- chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
130
- end
131
-
132
- exons
133
- end
134
-
135
- def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
136
- exons ||= Organism.exons(org).tsv(:persistence => true)
137
- transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)
138
-
139
- sizes = [0]
140
- rank = nil
141
- transcript_exons[transcript].zip_fields.each do |_exon, _rank|
142
- _rank = _rank.to_i
143
- s, e = exons[_exon].values_at("Start", "End")
144
- size = e.to_i - s.to_i + 1
145
- sizes[_rank] = size
146
- rank = _rank if _exon == exon
147
- end
148
-
149
- if not rank.nil?
150
- sizes[0..rank - 1].inject(0){|e,acc| acc += e}
151
- else
152
- nil
153
- end
154
- end
155
-
156
- def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
157
- exon_info ||= Organism.exons(org).tsv(:persistence => true)
158
- exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
159
-
160
- exons = [exons] unless Array === exons
161
- transcript_offsets = {}
162
- exons.each do |exon|
163
- transcript_offsets[exon] ||= {}
164
- offsets = nil
165
- next unless exon_offsets.include? exon
166
- offsets = exon_offsets[exon].zip_fields
167
-
168
- offsets.collect do |transcript, offset|
169
- next if transcript.empty?
170
- transcript_offsets[exon][transcript] = offset.to_i
171
- end
172
- end
173
-
174
- transcript_offsets
175
- end
176
-
177
- def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
178
- exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
179
- exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
180
- exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
181
- exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
182
-
183
- exons = exons_at_genomic_positions(org, positions)
184
- offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
185
-
186
- position_exons = {}
187
- positions.zip(exons).each do |position,pos_exons| position_exons[position] = pos_exons end
188
-
189
- position_offsets = {}
190
- position_exons.each do |position,pos_exons|
191
- chr, pos = position
192
- next if pos_exons.nil? or pos_exons.empty?
193
- pos_exons.each do |exon|
194
- if offsets.include? exon
195
- if exon_strand[exon] == 1
196
- offset_in_exon = (pos.to_i - exon_start[exon].to_i)
197
- else
198
- offset_in_exon = (exon_end[exon] - pos.to_i)
199
- end
200
- position_offsets[position] ||= {}
201
- offsets[exon].each do |transcript, offset|
202
- if not offset.nil?
203
- position_offsets[position][transcript] = [offset + offset_in_exon, exon_strand[exon]]
204
- end
205
- end
206
- end
207
- end
208
- end
209
-
210
- position_offsets
211
- end
212
-
213
- def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
214
- chromosome = chromosome.to_s
215
- chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
216
- tsv = file.tsv(:persistence => true, :type => :list)
217
- tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
218
- [exon, values["Exon Chr Start"].to_i]
219
- end
220
- end
221
- chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
222
- tsv = file.tsv(:persistence => true, :type => :list)
223
- tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
224
- [exon, values["Exon Chr End"].to_i]
225
- end
226
- end
227
-
228
- if Array === positions
229
- positions.collect{|position|
230
- position = position.to_i
231
- chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
232
- }
233
- else
234
- position = positions.to_i
235
- chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
236
- end
237
-
238
- end
239
-
240
- def self.exon_junctures_at_genomic_positions(org, positions)
241
- positions = [positions] unless Array === positions.first
242
-
243
- exons = []
244
- chromosomes = {}
245
- indices = {}
246
- positions.each_with_index do |info,i|
247
- chr, pos = info
248
- chromosomes[chr] ||= []
249
- indices[chr] ||= []
250
- chromosomes[chr] << pos
251
- indices[chr] << i
252
- end
253
-
254
- chromosomes.each do |chr, pos_list|
255
- chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
256
- chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
257
- end
258
-
259
- exons
260
- end
261
-
262
- def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
263
- chromosome = chromosome.to_s
264
-
265
- chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
266
- rows = []
267
- chromosome = options[:chromosome]
268
- f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
269
- while not f.eof?
270
- line = f.gets.chomp
271
- id, chr, pos = line.split "\t"
272
- rows << [id, pos.to_i]
273
- end
274
-
275
- rows
276
- end
277
-
278
- if Array === positions
279
- positions.collect{|position|
280
- chromosome_bed[position];
281
- }
282
- else
283
- chromosome_bed[positions];
284
- end
285
- end
286
-
287
-
288
- def self.identify_variations_at_genomic_positions(org, positions, variations_file)
289
- positions = [positions] unless Array === positions.first
290
-
291
- variations = []
292
- chromosomes = {}
293
- indices = {}
294
- positions.each_with_index do |info,i|
295
- chr, pos = info
296
- chromosomes[chr] ||= []
297
- indices[chr] ||= []
298
- chromosomes[chr] << pos
299
- indices[chr] << i
300
- end
301
-
302
- chromosomes.each do |chr, pos_list|
303
- chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
304
- chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
305
- end
306
-
307
- variations
308
- end
309
-
310
- task_option :organism, "Organism", :string, "Hsa"
311
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
312
- task_dependencies nil
313
- task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
314
- genomic_mutations = case
315
- when TSV === genomic_mutations
316
- genomic_mutations
317
- else
318
- TSV.new StringIO.new(genomic_mutations), :list
319
- end
320
- genomic_mutations.key_field ||= "Position"
321
- genomic_mutations.fields ||= ["Mutation"]
322
-
323
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
324
-
325
- step(:resources, "Load Resources")
326
-
327
- exon_junctures = {}
328
- genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
329
- exon_junctures[position] = exons
330
- end
331
-
332
- genomic_mutations.add_field "Exon Junctions" do |position, values|
333
- exon_junctures[position] * "|"
334
- end
335
-
336
- genomic_mutations.to_s :sort, true
337
- end
338
-
339
-
340
- task_option :organism, "Organism", :string, "Hsa"
341
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
342
- task_dependencies nil
343
- task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
344
- genomic_mutations = case
345
- when TSV === genomic_mutations
346
- genomic_mutations
347
- else
348
- TSV.new StringIO.new(genomic_mutations), :list
349
- end
350
- genomic_mutations.key_field ||= "Position"
351
- genomic_mutations.fields ||= ["Mutation"]
352
-
353
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
354
-
355
- step(:resources, "Load Resources")
356
- genes_at_positions = Hash[*genomic_mutations.keys.zip(Organism.genes_at_genomic_positions(org, positions)).flatten]
357
-
358
- genomic_mutations.add_field "#{org.sub(/\/.*/,'')}:Ensembl Gene ID" do |position, values|
359
- genes_at_positions[position]
360
- end
361
-
362
- genomic_mutations
363
- end
364
-
365
-
366
- task_description <<-EOF
367
- Translates a collection of mutations in genomic coordinates into mutations in aminoacids for the
368
- protein products of transcripts including those positions.
369
- EOF
370
- task_option :organism, "Organism", :string, "Hsa"
371
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
372
- task_dependencies nil
373
- task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
374
- genomic_mutations = case
375
- when TSV === genomic_mutations
376
- genomic_mutations
377
- else
378
- TSV.new StringIO.new(genomic_mutations), :list
379
- end
380
-
381
- genomic_mutations.key_field ||= "Position"
382
- genomic_mutations.fields ||= ["Mutation"]
383
-
384
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
385
-
386
- step(:prepare, "Prepare Results")
387
- results = TSV.new({})
388
- results.key_field = "Position"
389
- results.fields = ["#{org.sub(/\/.*/,'')}:Ensembl Transcript ID", "Protein Mutation"]
390
- results.type = :double
391
- results.filename = path
392
-
393
- step(:resources, "Load Resources")
394
- transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
395
- transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
396
- exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
397
- exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
398
- exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
399
- exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
400
- transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
401
-
402
- step(:offsets, "Find transcripts and offsets for mutations")
403
- offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
404
-
405
- step(:aminoacid, "Translate mutation to amino acid substitutions")
406
- offsets.each do |position, transcripts|
407
- if genomic_mutations.type === :double
408
- alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
409
- else
410
- alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
411
- end
412
-
413
- transcripts.each do |transcript, offset_info|
414
- offset, strand = offset_info
415
- codon = begin
416
- Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
417
- rescue
418
- Log.medium $!.message
419
- next
420
- end
421
-
422
- if not codon.nil? and not codon.empty?
423
- alleles.each do |allele|
424
- allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
425
- change = Organism.codon_change(allele, *codon.values_at(0,1))
426
- pos_code = position * ":"
427
- mutation = [change.first, codon.last + 1, change.last] * ""
428
- if results.include? pos_code
429
- results[pos_code] = results[pos_code].merge [transcript, mutation]
430
- else
431
- results[pos_code] = [[transcript], [mutation]]
432
- end
433
- end
434
- end
435
- end
436
-
437
- end
438
-
439
- step(:identify_proteins, "Identify Proteins for Transcripts")
440
- transcript_field = results.identify_field "Ensembl Transcript ID"
441
- results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
442
- values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
443
- end
444
-
445
-
446
- results
447
- end
448
-
449
-
450
- task_option :organism, "Organism", :string, "Hsa"
451
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
452
- task_dependencies nil
453
- task :identify_germline_variations => :tsv do |org,genomic_mutations|
454
- genomic_mutations = case
455
- when TSV === genomic_mutations
456
- genomic_mutations
457
- else
458
- TSV.new StringIO.new(genomic_mutations), :list
459
- end
460
-
461
- genomic_mutations.key_field ||= "Position"
462
- genomic_mutations.fields ||= ["Mutation"]
463
-
464
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
465
-
466
-
467
- step(:prepare, "Prepare Results")
468
- results = TSV.new({})
469
- results.key_field = "Position"
470
- results.fields = ["SNP Id"]
471
- results.type = :double
472
- results.filename = path
473
-
474
-
475
- step(:resources, "Load Resources")
476
-
477
- snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
478
- snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
479
-
480
- genomic_mutations.add_field "Germline SNP Id" do |position, values|
481
- snps_for_positions[position]
482
- end
483
-
484
- genomic_mutations
485
- end
486
-
487
-
488
- task_option :organism, "Organism", :string, "Hsa"
489
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
490
- task_dependencies nil
491
- task :identify_somatic_variations => :tsv do |org,genomic_mutations|
492
- genomic_mutations = case
493
- when TSV === genomic_mutations
494
- genomic_mutations
495
- else
496
- TSV.new StringIO.new(genomic_mutations), :list
497
- end
498
-
499
- genomic_mutations.key_field ||= "Position"
500
- genomic_mutations.fields ||= ["Mutation"]
501
-
502
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
503
-
504
-
505
- step(:prepare, "Prepare Results")
506
- results = TSV.new({})
507
- results.key_field = "Position"
508
- results.fields = ["SNP Id"]
509
- results.type = :double
510
- results.filename = path
511
-
512
-
513
- step(:resources, "Load Resources")
514
-
515
- snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
516
- snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
517
-
518
- genomic_mutations.add_field "Germline SNP Id" do |position, values|
519
- snps_for_positions[position]
520
- end
521
-
522
- genomic_mutations
523
- end
524
-
525
-
526
- end
527
-
528
- if __FILE__ == $0
529
- require 'rbbt/util/log'
530
- require 'benchmark'
531
-
532
- select = <<-EOF
533
- 3:64581875
534
- EOF
535
- select = select.split("\n").collect{|l| l.split(":")}
536
-
537
- picmi_test = <<-EOF
538
- #Chromosome Name Position Reference Tumor
539
- 1 100382265 C G
540
- 1 100380997 A G
541
- 22 30163533 A C
542
- X 10094215 G A
543
- X 10085674 C T
544
- 20 50071099 G T
545
- 21 19638426 G T
546
- 2 230633386 C T
547
- 2 230312220 C T
548
- 1 100624830 T A
549
- 4 30723053 G T
550
- EOF
551
-
552
- # Build 37
553
- picmi_test = <<-EOF
554
- #Chromosome Name Position Reference Tumor
555
- 1 100624830 T A
556
- 21 19638426 G T
557
- EOF
558
-
559
- exon_juncture_test = <<-EOF
560
- #Position Mutation
561
- 7:150753996 T
562
- EOF
563
-
564
-
565
- job = Organism.job :genomic_mutations_in_exon_junctures, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
566
- job.run
567
- job.clean if job.error?
568
- puts job.messages
569
- puts job.read
570
-
571
- # # Build 36
572
- # picmi_test = <<-EOF
573
- ##Chromosome Name Position Reference Tumor
574
- #3 81780820 T C
575
- #2 43881517 A T
576
- #2 43857514 T C
577
- #6 88375602 G A
578
- #16 69875502 G T
579
- #16 69876078 T C
580
- #16 69877147 G A
581
- #17 8101874 C T
582
- # EOF
583
-
584
-
585
- Log.severity = 2
586
- org = 'Hsa/may2009'
587
- file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')
588
-
589
- #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
590
- positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
591
- positions.key_field = "Position"
592
- positions.fields = %w(Reference Control Tumor)
593
- #positions.fields = %w(Reference Tumor)
594
-
595
- #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"
596
-
597
-
598
- #positions = positions.select ["10:98099540"]
599
-
600
- Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
601
- job = Organism.job :genomic_mutations_to_protein_mutations, "Metastasis", org, positions.slice("Tumor")
602
- job.run
603
-
604
- while not job.done?
605
- puts job.step
606
- sleep 2
607
- end
608
-
609
- raise job.messages.last if job.error?
610
- mutations = job.load
611
-
612
- end