rbbt-sources 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,57 +8,115 @@ end
8
8
 
9
9
  if defined? Entity
10
10
 
11
- module NCINaturePathways
11
+ module NCINaturePathway
12
12
  extend Entity
13
13
  self.format = "NCI Nature Pathway ID"
14
14
 
15
+ self.annotation :organism
16
+
17
+ def self.name_index
18
+ @name_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :single)
19
+ end
20
+
21
+ def self.gene_index
22
+ @gene_index ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
23
+ end
24
+
25
# Tells whether +entity+ matches +query+.
#
# Returns true when +query+ equals +entity+, or when +query+ is found (via
# #index) in the name of the entity once it has been set up with +options+
# plus :format => +field+. Returns false otherwise.
#
# query   - value compared against the entity and searched for in its name.
# field   - passed to setup as the :format option.
# options - Hash of extra setup options; nil is treated as an empty Hash.
# entity  - the pathway identifier to test; nil never matches a non-nil query.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity

  # Without an entity there is no name to look up.
  return false if entity.nil?

  # The signature defaults options to nil, so it cannot be merged directly.
  setup_options = (options || {}).merge(:format => field)
  name = self.setup(entity.dup, setup_options).name

  # The name lookup may come back empty; treat that as "no match".
  return true if name and name.index query

  false
end
15
32
  property :name => :array2single do
16
- @name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
33
+ @name ||= NCINaturePathway.name_index.values_at *self
17
34
  end
18
35
 
19
36
  property :genes => :array2single do
20
- @genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
37
+ @genes ||= NCINaturePathway.gene_index.values_at *self
21
38
  end
22
39
  end
23
40
 
24
- module NCIReactomePathways
41
+ module NCIReactomePathway
25
42
  extend Entity
26
43
  self.format = "NCI Reactome Pathway ID"
44
+
45
+ self.annotation :organism
46
+
47
+ def self.name_index
48
+ @name_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :single)
49
+ end
50
+
51
+ def self.gene_index
52
+ @gene_index ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true)
53
+ end
54
+
55
# Tells whether +entity+ matches +query+.
#
# Returns true when +query+ equals +entity+, or when +query+ is found (via
# #index) in the name of the entity once it has been set up with +options+
# plus :format => +field+. Returns false otherwise.
#
# query   - value compared against the entity and searched for in its name.
# field   - passed to setup as the :format option.
# options - Hash of extra setup options; nil is treated as an empty Hash.
# entity  - the pathway identifier to test; nil never matches a non-nil query.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity

  # Without an entity there is no name to look up.
  return false if entity.nil?

  # The signature defaults options to nil, so it cannot be merged directly.
  setup_options = (options || {}).merge(:format => field)
  name = self.setup(entity.dup, setup_options).name

  # The name lookup may come back empty; treat that as "no match".
  return true if name and name.index query

  false
end
27
62
 
28
63
  property :name => :array2single do
29
- @name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
64
+ @name ||= NCIReactomePathway.name_index.values_at *self
30
65
  end
31
66
 
32
67
  property :genes => :array2single do
33
- @genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
68
+ @genes ||= NCIReactomePathway.gene_index.values_at *self
34
69
  end
35
70
  end
36
71
 
37
- module NCIBioCartaPathways
72
+ module NCIBioCartaPathway
38
73
  extend Entity
39
74
  self.format = "NCI BioCarta Pathway ID"
40
75
 
76
+ self.annotation :organism
77
+
78
+ def self.name_index
79
+ @name_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :single)
80
+ end
81
+
82
+ def self.gene_index
83
+ @gene_index ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true)
84
+ end
85
+
86
# Tells whether +entity+ matches +query+.
#
# Returns true when +query+ equals +entity+, or when +query+ is found (via
# #index) in the name of the entity once it has been set up with +options+
# plus :format => +field+. Returns false otherwise.
#
# query   - value compared against the entity and searched for in its name.
# field   - passed to setup as the :format option.
# options - Hash of extra setup options; nil is treated as an empty Hash.
# entity  - the pathway identifier to test; nil never matches a non-nil query.
def self.filter(query, field = nil, options = nil, entity = nil)
  return true if query == entity

  # Without an entity there is no name to look up.
  return false if entity.nil?

  # The signature defaults options to nil, so it cannot be merged directly.
  setup_options = (options || {}).merge(:format => field)
  name = self.setup(entity.dup, setup_options).name

  # The name lookup may come back empty; treat that as "no match".
  return true if name and name.index query

  false
end
93
+
41
94
  property :name => :array2single do
42
- @name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
95
+ @name ||= NCIBioCartaPathway.name_index.values_at *self
43
96
  end
44
97
 
45
98
  property :genes => :array2single do
46
- @genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
99
+ @genes ||= NCIBioCartaPathway.gene_index.values_at(*self).
100
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }
47
101
  end
48
102
  end
49
103
 
50
104
  if defined? Gene and Entity === Gene
51
105
  module Gene
52
106
  property :nature_pathways => :array2single do
53
- @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
107
+ @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).
108
+ values_at(*self.to("UniProt/SwissProt Accession")).
109
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCINaturePathway.setup(o, organism)}
54
110
  end
55
111
 
56
112
  property :reactome_pathways => :array2single do
57
- @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
113
+ @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at(*self.to("UniProt/SwissProt Accession")).
114
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIReactomePathway.setup(o, organism)}
58
115
  end
59
116
 
60
117
  property :biocarta_pathways => :array2single do
61
- @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
118
+ @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at(*self.entrez).
119
+ each{|pth| pth.organism = organism if pth.respond_to? :organism }.tap{|o| NCIBioCartaPathway.setup(o, organism)}
62
120
  end
63
121
  end
64
122
  end
@@ -0,0 +1,142 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ module Cath
4
+ extend Resource
5
+
6
+ Rbbt.claim Rbbt.share.databases.CATH.CathNames, :proc do
7
+ tsv = TSV.setup({}, :key_field => "CATH Code", :type => :list, :fields => ["PDB ID", "CATH Domain", "CATH Description"])
8
+ Open.read("http://release.cathdb.info/v3.4.0/CathNames").split(/\n/).each do |line|
9
+ next if line =~ /^#/
10
+ code, pdb, domain, name = line.match(/([\d\.]+)\s+(\w\w\w\w)(\w\w\w)\s+:(.*)/).values_at 1,2,3,4
11
+ tsv[code] = [pdb.downcase, domain, name]
12
+ end
13
+
14
+ tsv.to_s
15
+ end
16
+
17
# Producer for the CathUnclassifiedList resource: downloads the CATH v3.4.0
# list and keeps the first whitespace-separated token of every non-comment
# line, one per output line.
Rbbt.claim Rbbt.share.databases.CATH.CathUnclassifiedList, :proc do
  lines = Open.read("http://release.cathdb.info/v3.4.0/CathUnclassifiedList").split(/\n/)

  # Filter comments out before mapping: `collect` with `next` would leave a
  # nil per comment line, which turns into an empty line once joined.
  tokens = lines.reject { |line| line =~ /^#/ }.collect { |line| line.split(/\s/).first }

  # A blank input line has no first token; drop those as well.
  tokens.compact * "\n"
end
23
+
24
+
25
+ Rbbt.claim Rbbt.share.databases.CATH.CathDomainSeqs, :proc do
26
+ tsv = TSV.setup({}, :key_field => "CATH Domain", :type => :single, :fields => ["Cath Domain Sequence"])
27
+
28
+ Open.read("http://release.cathdb.info/v3.4.0/CathDomainSeqs.ATOM").split(/>pdb\|/).each do |chunk|
29
+ next if chunk.empty?
30
+ domain, sequence = chunk.strip.match(/(.*)\n(.*)/).values_at 1, 2
31
+ tsv[domain] = sequence
32
+ end
33
+
34
+ tsv.to_s
35
+ end
36
+
37
+
38
+ Rbbt.claim Rbbt.share.databases.CATH.CathRegions, :proc do
39
+ domains = TSV.setup({}, :key_field => "Cath Domain", :type => :double, :fields => ["Start", "End"])
40
+ Open.read("http://release.cathdb.info/v3.4.0/CathDomall").split(/\n/).each do |line|
41
+ next if line =~ /^#/
42
+ chain, ndomains, nfragments, rest = line.match(/(\w\w\w\w\w)\s+D(\d+)\s+F(\d+)\s+(.*)/).values_at 1,2,3,4
43
+
44
+ ndomains.to_i.times do |dn|
45
+ nsegments, rest = rest.match(/^\s*(\d+)\s+(.*)/).values_at 1, 2
46
+ segments = []
47
+ nsegments.to_i.times do |sn|
48
+ start, eend, rest = rest.match(/\w\s+(-?\d+)\s+.\s+\w\s+(-?\d+)\s+.(.*)/).values_at 1, 2, 3
49
+ segments << [start, eend]
50
+ end
51
+
52
+ domain = chain + "%02d" % dn.to_i
53
+ segments = segments[0].zip(*segments[1..-1])
54
+ domains[domain] = segments
55
+ end
56
+ end
57
+
58
+ domains.to_s
59
+ end
60
+
61
# Producer for the CathDomainList resource: downloads the CATH v3.4.0 domain
# list and stores each non-comment line as a :list TSV row keyed by the
# domain name (first column).
Rbbt.claim Rbbt.share.databases.CATH.CathDomainList, :proc do
  fields = ["CATH domain name (seven characters)",
    "Class number", "Architecture number", "Topology number", "Homologous superfamily number", "S35 sequence cluster number",
    "S60 sequence cluster number", "S95 sequence cluster number", "S100 sequence cluster number", "S100 sequence count number",
    "Domain length", "Structure resolution (Angstroms)"]

  # The original passed :type twice (:double, then :list); the later value is
  # the one Ruby keeps, so only :list is given here.
  domains = TSV.setup({}, :key_field => "Cath Domain", :type => :list, :fields => fields)

  Open.read("http://release.cathdb.info/v3.4.0/CathDomainList").split(/\n/).each do |line|
    next if line =~ /^#/
    parts = line.chomp.split(/\s+/)
    next if parts.empty?

    # Keep the domain name as the first value so the values line up with the
    # declared fields; shifting it off left every column labelled one
    # position too early.
    domains[parts.first] = parts
  end

  domains.to_s
end
76
+
77
+
78
+ def self.cath_index
79
+ @@cath ||= Rbbt.share.databases.CATH.CathNames.tsv :persist => true, :case_insensitive => true
80
+ end
81
+
82
+ def self.pdb_index
83
+ if not defined? @@pdb or @@pdb.nil?
84
+ @@pdb = {}
85
+ Rbbt.share.databases.CATH.CathDomainSeqs.read.split("\n").each do |line|
86
+ domain = line.split(/\t/).first
87
+ pdb = domain[0..3]
88
+ @@pdb[pdb] ||= []
89
+ @@pdb[pdb] << domain
90
+ end
91
+ end
92
+ @@pdb
93
+ end
94
+
95
# Index of unclassified CATH domains grouped by the first four characters of
# each domain name (used by callers as the PDB id).
#
# The CathUnclassifiedList resource is read and parsed only on the first
# call; later calls return the same Hash, so callers should not modify it.
#
# Returns a Hash of four-character prefix => Array of domain names.
def self.unclassified
  @unclassified ||= begin
    index = {}
    Rbbt.share.databases.CATH.CathUnclassifiedList.read.split("\n").each do |domain|
      # Blank lines would otherwise be indexed under the key "".
      next if domain.empty?
      (index[domain[0..3]] ||= []) << domain
    end
    index
  end
end
104
+
105
+ def self.domain_sequences
106
+ @@domain_sequences ||= Rbbt.share.databases.CATH.CathDomainSeqs.tsv(:persist => true)
107
+ end
108
+
109
# Looks up the "PDB ID" field recorded for +cath_code+ in the CATH names
# index.
#
# Returns the stored value, or nil when the code is not in the index.
def self.pdbs(cath_code)
  index = cath_index
  return nil unless index.include? cath_code

  index[cath_code]["PDB ID"]
end
117
+
118
# Collects every domain known for +pdb+: the entries from pdb_index followed
# by the entries from the unclassified list.
#
# Returns an Array, empty when neither index has the key.
def self.domains_for_pdb(pdb)
  classified = pdb_index[pdb] || []
  classified + (unclassified[pdb] || [])
end
122
+
123
# Aligns +sequence+ against the stored sequence of the CATH +domain+ by
# running the external `fasta35` command on two temporary FASTA files.
#
# Returns nil when the domain has no stored sequence, false when the command
# output contains no recognisable alignment line, and otherwise a Hash with
# :identity (Float, the percentage printed by the tool) and :range (Range
# built from the first pair of coordinates in the "overlap (a-b:" text).
def self.align(domain, sequence)
  require 'bio'

  return nil if not domain_sequences.include? domain

  # `+` builds new strings; the original appended to the literals with `<<`,
  # which fails when string literals are frozen.
  target_fasta = ">target\n" + sequence
  domain_fasta = ">domain\n" + domain_sequences[domain]

  TmpFile.with_file(target_fasta) do |target_file|
    # Named domain_file so the block parameter no longer shadows +domain+.
    TmpFile.with_file(domain_fasta) do |domain_file|

      result = CMD.cmd("fasta35 #{ target_file } #{ domain_file }").read

      # The former /s flag selects the Windows-31J encoding in Ruby (it is
      # not "dot matches newline"), so it only made matching non-ASCII
      # output raise; the match itself is unchanged without it.
      if result.match(/([\d\.]+)% identity.*overlap \((\d+)-(\d+):/)
        {:identity => $1.to_f, :range => ($2.to_i..$3.to_i)}
      else
        false
      end
    end
  end
end
141
+ end
142
+
@@ -91,22 +91,33 @@ if defined? Entity
91
91
  extend Entity
92
92
  self.format = "GO ID"
93
93
 
94
+ self.annotation :organism
95
+
94
96
  property :name => :array2single do
95
97
  @name ||= GO.id2name(self)
96
98
  end
97
99
 
98
100
  property :genes => :array2single do |organism|
101
+ organism ||= self.organism
99
102
  @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
100
103
  end
104
+
105
+ property :description => :single2array do
106
+ description = GO.info[self]['def']
107
+ description.gsub!(/"|\[.*\]/,'') if description
108
+
109
+ description
110
+ end
111
+
101
112
  end
102
113
 
103
114
  if defined? Gene and Entity === Gene
104
115
  module Gene
105
- property :go_terms => :array2single do |organism|
116
+ property :go_terms => :array2single do
106
117
  @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
107
118
  end
108
-
109
- property :go_bp_terms => :array2single do |organism|
119
+
120
+ property :go_bp_terms => :array2single do
110
121
  @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
111
122
  end
112
123
  end
@@ -7,7 +7,7 @@ module Organism
7
7
  self.pkgdir = "rbbt"
8
8
  self.subdir = "share/organisms"
9
9
 
10
- ["Hsa", "Rno", "Sce"].each do |organism|
10
+ ["Hsa", "Mmu", "Rno", "Sce"].each do |organism|
11
11
  claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
12
12
 
13
13
  module_eval "#{ organism } = with_key '#{organism}'"
@@ -0,0 +1,35 @@
1
+ require 'rbbt'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/resource'
4
+
5
+ module Pfam
6
+ extend Resource
7
+ self.subdir = "share/databases/Pfam"
8
+
9
+ Pfam.claim Pfam.domains, :proc do
10
+ url = "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
11
+ tsv = TSV.open(Open.open(url), :key_field => "Pfam Domain ID", :fields => ["Pfam Clan ID", "Code Name", "Name", "Description"])
12
+ tsv.to_s
13
+ end
14
+
15
+ NAMES_FILE = Rbbt.share.databases.InterPro.pfam_names.find
16
+
17
+ def self.name_index
18
+ @name_index ||= TSV.open NAMES_FILE, :single
19
+ end
20
+
21
# Returns the name stored in name_index for the Pfam +id+.
#
# Because this singleton method shadows Module#name, a call without
# arguments falls back to the inherited behaviour instead of raising
# ArgumentError. An explicit nil id is still looked up in the index, as
# before.
def self.name(id = (no_id_given = true; nil))
  return super() if no_id_given

  name_index[id]
end
24
+ end
25
+
26
+ if defined? Entity
27
+ module PfamDomain
28
+ extend Entity
29
+ self.format = "Pfam Domain"
30
+
31
+ property :name => :array2single do
32
+ self.collect{|id| Pfam.name(id)}
33
+ end
34
+ end
35
+ end
@@ -13,11 +13,13 @@ module PubMed
13
13
 
14
14
  pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
15
15
 
16
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
16
17
  articles = []
17
- Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
18
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
19
-
20
- xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
18
+ Misc.divide(pmids.sort, (pmids.length / 1000) + 1) do |pmid_list|
19
+ postdata = "db=pubmed&retmode=xml&id=#{pmid_list* ","}"
20
+ xml = TmpFile.with_file(postdata) do |postfile|
21
+ Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
22
+ end
21
23
 
22
24
  articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
23
25
  end
@@ -202,14 +204,8 @@ module PubMed
202
204
  }
203
205
 
204
206
  return list unless missing.any?
205
- chunk_size = [100, missing.length].min
206
- chunks = (missing.length.to_f / chunk_size).ceil
207
207
 
208
- articles = {}
209
- chunks.times do |chunk|
210
- pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
211
- articles.merge!(get_online(pmids))
212
- end
208
+ articles = get_online(missing)
213
209
 
214
210
  articles.each{|p, xml|
215
211
  filename = p + '.xml'
@@ -45,7 +45,6 @@ module TFacts
45
45
  end
46
46
  end
47
47
 
48
-
49
48
  if defined? Entity and defined? Gene and Entity === Gene
50
49
 
51
50
  module Gene
@@ -0,0 +1,125 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/resource'
3
+ require 'rbbt/sources/cath'
4
+ require 'rbbt/sources/uniprot'
5
+
6
+ module Uniprot
7
+ extend Resource
8
+ self.subdir = "share/databases/Uniprot"
9
+
10
+ Uniprot.claim Uniprot.annotated_variants, :proc do
11
+ url = "http://www.uniprot.org/docs/humsavar.txt"
12
+ tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
13
+ :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
14
+ :fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
15
+
16
+ tsv.unnamed = true
17
+ tsv.process "Amino Acid Mutation" do |mutation|
18
+ if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
19
+ wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
20
+ mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
21
+ [wt, $2, mut] * ""
22
+ else
23
+ mutation
24
+ end
25
+ end
26
+
27
+ uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
28
+ mutation_pos = tsv.identify_field "Amino Acid Mutation"
29
+ tsv.add_field "Mutated Isoform" do |key, values|
30
+ [values[uniprot_pos], values[mutation_pos]] * ":"
31
+ end
32
+
33
+ tsv.reorder("Mutated Isoform").to_s
34
+ end
35
+
36
+
37
+ UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
38
# Downloads the UniProt text entry for +protein+ and collects its PDB
# cross-references (lines matching "DR ... PDB; ...").
#
# Returns a Hash keyed by lower-cased PDB id. Each value is a Hash with
# :method, :resolution, :region (Range built from the first
# "<chains>=<start>-<end>" found in the last field) and :chains.
# Cross-references whose region field is missing or has no such coordinates
# (for instance "-") are skipped, because no Range can be built for them.
def self.pdbs(protein)
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
  text = Open.read(url)

  pdb = {}

  text.split(/\n/).each do |l|
    next unless l =~ /^DR\s+PDB; (.*)\./
    id, method, resolution, region = $1.split(";").collect{|v| v.strip}

    # region is nil when the line has fewer than four fields, and match
    # returns nil when it carries no coordinates; both used to raise
    # NoMethodError.
    coverage = region && region.match(/(\w+)=(\d+)-(\d+)/)
    next if coverage.nil?

    chains, start, eend = coverage.values_at(1,2,3)
    pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
  end

  pdb
end
52
+
53
# Downloads the UniProt text entry for +protein+ and extracts its VARIANT
# features from the FT lines.
#
# Returns an Array of Hashes with :start, :end, :ref, :mut, :desc and :id,
# all taken verbatim from the regexp captures (Strings; :ref and :mut are nil
# for variants that are not a single residue substitution). Features whose
# text matches neither pattern are logged and left out.
def self.variants(protein)
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
  text = Open.read(url)

  text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"

  # Splitting on a capturing pattern yields [leading text, type, body, type,
  # body, ...]; the leading text is discarded.
  parts = text.split(/^(FT \w+)/)
  parts.shift

  variants = []

  parts.each_slice(2) do |type, part|
    # A trailing feature key without body has nothing to parse.
    next if part.nil? or type !~ /VARIANT/

    # Join continuation lines and squeeze whitespace so one pattern fits.
    value = part.gsub("\nFT", '').gsub(/\s+/, ' ')
    # 291 291 K -> E (in sporadic cancers; somatic mutation). /FTId=VAR_045413.
    case
    when value.match(/(\d+) (\d+) ([A-Z])\s*\-\>\s*([A-Z]) (.*)\. \/FTId=(.*)/)
      start, eend, ref, mut, desc, id = $1, $2, $3, $4, $5, $6
    when value.match(/(\d+) (\d+) (.*)\. \/FTId=(.*)/)
      start, eend, ref, mut, desc, id = $1, $2, nil, nil, $3, $4
    else
      Log.debug "Value not understood: #{ value }"
      # Previously execution fell through and appended an all-nil entry.
      next
    end

    variants << {
      :start => start,
      :end => eend,
      :ref => ref,
      :mut => mut,
      :desc => desc,
      :id => id,
    }
  end

  variants
end
98
+
99
+
100
# Downloads the UniProt text entry for +protein+ and collects its Gene3D
# cross-references (lines matching "DR ... Gene3D; G3DSA:...").
#
# Returns a Hash keyed by the first ";"-separated field after "G3DSA:", with
# the second and third fields stored under :description and :cuantity.
def self.cath(protein)
  entry = Open.read(UNIPROT_TEXT.sub("[PROTEIN]", protein))

  domains = {}
  entry.split(/\n/).each do |line|
    next unless line =~ /^DR\s+Gene3D; G3DSA:(.*)\./

    id, description, cuantity = $1.split(";").collect{|field| field.strip}
    domains[id] = {:description => description, :cuantity => cuantity}
  end

  domains
end
112
+
113
# Lists the CATH domains of every PDB entry cross-referenced by +protein+.
#
# Returns a flat Array with the results of Cath.domains_for_pdb for each
# distinct PDB id, nils removed.
def self.cath_domains(protein)
  pdb_ids = pdbs(protein).keys.uniq
  per_pdb = pdb_ids.collect { |pdb| Cath.domains_for_pdb(pdb) }
  per_pdb.flatten.compact
end
119
+
120
# Selects the PDB cross-references of +protein+ whose :region includes
# +aa_position+.
#
# Returns whatever #select yields on the result of Uniprot.pdbs.
def self.pdbs_covering_aa_position(protein, aa_position)
  structures = Uniprot.pdbs(protein)
  structures.select { |_pdb, details| details[:region].include? aa_position }
end
125
+ end
@@ -5,6 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
6
  $taxs = [9606]
7
7
  $scientific_name = "Homo sapiens"
8
+ $ortholog_key = "human_ensembl_gene"
8
9
 
9
10
  $biomart_db = 'hsapiens_gene_ensembl'
10
11
  $biomart_db_germline_variation = 'hsapiens_snp'
@@ -97,9 +98,5 @@ $biomart_go_2009= [
97
98
  ["GO CC ID", 'go_cellular_component_id'],
98
99
  ]
99
100
 
100
- $biomart_pfam= [
101
- ["Pfam Domain", 'pfam'],
102
- ]
103
-
104
101
  $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
105
102
  load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -0,0 +1,57 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
+
6
+ $taxs = [10090]
7
+ $scientific_name = "Mus musculus"
8
+ $ortholog_key = "mouse_ensembl_gene"
9
+
10
+ $biomart_db = 'mmusculus_gene_ensembl'
11
+ $biomart_db_germline_variation = 'mmusculus_snp'
12
+ $biomart_db_somatic_variation = 'mmusculus_snp_som'
13
+
14
+ $biomart_lexicon = [
15
+ [ 'Associated Gene Name' , "external_gene_id"],
16
+ [ 'HGNC symbol', "hgnc_symbol" ],
17
+ [ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
18
+ [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
19
+ ]
20
+
21
+ $biomart_protein_identifiers = [
22
+ [ 'Protein ID', "protein_id" ],
23
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
24
+ [ 'Unigene ID', "unigene" ],
25
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
26
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
27
+ ]
28
+
29
+ $biomart_probe_identifiers = [
30
+ ]
31
+
32
+ $biomart_identifiers = [
33
+ [ 'Entrez Gene ID', "entrezgene"],
34
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
35
+ [ 'Associated Gene Name', "external_gene_id" ],
36
+ [ 'CCDS ID', "ccds" ],
37
+ [ 'Protein ID', "protein_id" ],
38
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
39
+ [ 'Unigene ID', "unigene" ],
40
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
41
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
42
+ [ 'EMBL (Genbank) ID' , "embl"] ,
43
+ ]
44
+
45
+ $biomart_go= [
46
+ ["GO ID", 'go_id'],
47
+ ["GO Namespace", 'namespace_1003'],
48
+ ]
49
+
50
+ $biomart_go_2009= [
51
+ ["GO BP ID", 'go_biological_process_id'],
52
+ ["GO MF ID", 'go_molecular_function_id'],
53
+ ["GO CC ID", 'go_cellular_component_id'],
54
+ ]
55
+
56
+ $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
57
+ load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -9,6 +9,7 @@ $scientific_name = "Rattus norvegicus"
9
9
  $biomart_db = 'rnorvegicus_gene_ensembl'
10
10
  $biomart_db_germline_variation = 'rnorvegicus_snp'
11
11
  $biomart_db_somatic_variation = 'rnorvegicus_snp_som'
12
+ $ortholog_key = "rat_ensembl_gene"
12
13
 
13
14
  $biomart_lexicon = [
14
15
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -8,6 +8,7 @@ $native = "SGD ID"
8
8
  $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
9
9
  $biomart_db = 'scerevisiae_gene_ensembl'
10
10
  $biomart_main = ['Entrez Gene ID', 'entrezgene']
11
+ $ortholog_key = "yeast_ensembl_gene"
11
12
 
12
13
 
13
14
  file 'scientific_name' do |t|
@@ -1,3 +1,5 @@
1
+ require 'net/ftp'
2
+
1
3
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
2
4
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
3
5
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -56,7 +58,9 @@ $biomart_exon_phase = [
56
58
  ['Phase','phase'],
57
59
  ]
58
60
 
59
-
61
+ $biomart_pfam= [
62
+ ["Pfam Domain", 'pfam'],
63
+ ]
60
64
 
61
65
  $biomart_exons = [
62
66
  $biomart_ensembl_gene,
@@ -71,6 +75,12 @@ file 'scientific_name' do |t|
71
75
  File.open(t.name, 'w') do |f| f.write $scientific_name end
72
76
  end
73
77
 
78
+ file 'ortholog_key' do |t|
79
+ raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
80
+
81
+ File.open(t.name, 'w') do |f| f.write $ortholog_key end
82
+ end
83
+
74
84
  file 'identifiers' do |t|
75
85
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
76
86
  identifiers.unnamed = true
@@ -456,6 +466,49 @@ file 'chromosomes' do |t|
456
466
  File.open(t.name, 'w') do |f| f.puts goterms end
457
467
  end
458
468
 
469
# Builds chromosome_<name>: downloads the chromosome FASTA from the Ensembl
# FTP site and writes the bare sequence (header line and all whitespace
# removed) to the target file.
#
# The Ensembl release is chosen from the working directory name when it looks
# like an archive (three letters plus four digits); otherwise the current
# release is scraped from the Ensembl FTP index page.
rule(/^chromosome_.*/) do |t|
  chr = t.name.match(/chromosome_(.*)/)[1]

  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil

  release = case archive
            when "may2009"
              "release-54"
            when "jun2011"
              "release-64"
            when nil
              Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
            else
              # Fail with a clear message instead of producing "pub//fasta/".
              raise "No Ensembl release known for archive: #{ archive }"
            end

  ftp = Net::FTP.new("ftp.ensembl.org")
  begin
    ftp.login
    ftp.chdir("pub/#{ release }/fasta/")
    # gsub so names with more than one space become fully underscored.
    ftp.chdir($scientific_name.downcase.gsub(" ",'_'))
    ftp.chdir('dna')

    # Escape the chromosome name so it is matched literally.
    fasta = ftp.nlst.find{|name| name =~ /chromosome\.#{ Regexp.escape(chr) }\.fa/}

    raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if fasta.nil?

    Log.debug("Downloading chromosome sequence: #{ fasta }")
    TmpFile.with_file do |tmpfile|
      ftp.getbinaryfile(fasta, tmpfile)
      Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
    end
  ensure
    # Close the connection on failure as well as on success.
    ftp.close
  end
end
500
+
501
+ rule /^possible_ortholog_(.*)/ do |t|
502
+ other = t.name.match(/ortholog_(.*)/)[1]
503
+ other_key = Organism.ortholog_key(other).produce.read
504
+ BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
505
+ end
506
+
507
+ rule /^ortholog_(.*)/ do |t|
508
+ other = t.name.match(/ortholog_(.*)/)[1]
509
+ other_key = Organism.ortholog_key(other).produce.read
510
+ BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
511
+ end
459
512
 
460
513
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
461
514
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 1.1.0
10
+ version: 1.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-11-17 00:00:00 +01:00
18
+ date: 2012-01-13 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -108,20 +108,23 @@ files:
108
108
  - lib/rbbt/sources/barcode.rb
109
109
  - lib/rbbt/sources/bibtex.rb
110
110
  - lib/rbbt/sources/biomart.rb
111
+ - lib/rbbt/sources/cath.rb
111
112
  - lib/rbbt/sources/entrez.rb
112
113
  - lib/rbbt/sources/go.rb
113
114
  - lib/rbbt/sources/gscholar.rb
114
115
  - lib/rbbt/sources/jochem.rb
115
116
  - lib/rbbt/sources/organism.rb
116
- - lib/rbbt/sources/organism/sequence.rb
117
+ - lib/rbbt/sources/pfam.rb
117
118
  - lib/rbbt/sources/polysearch.rb
118
119
  - lib/rbbt/sources/pubmed.rb
119
120
  - lib/rbbt/sources/tfacts.rb
121
+ - lib/rbbt/sources/uniprot.rb
120
122
  - lib/rbbt/sources/wgEncodeBroadHmm.rb
121
123
  - share/install/InterPro/Rakefile
122
124
  - share/install/JoChem/Rakefile
123
125
  - share/install/NCI/Rakefile
124
126
  - share/install/Organism/Hsa/Rakefile
127
+ - share/install/Organism/Mmu/Rakefile
125
128
  - share/install/Organism/Rno/Rakefile
126
129
  - share/install/Organism/Sce/Rakefile
127
130
  - share/install/Organism/organism_helpers.rb
@@ -1,612 +0,0 @@
1
- require 'rbbt/sources/organism'
2
- require 'rbbt/util/workflow'
3
- require 'bio'
4
- # Sequence analyses
5
- module Organism
6
- extend WorkFlow
7
- relative_to Rbbt, "share/organisms"
8
- self.jobdir = Rbbt.var.organism.find
9
-
10
- def self.coding_transcripts_for_exon(org, exon, exon_transcripts, transcript_info)
11
- exon_transcripts ||= Organism.transcript_exons(org).tsv(:double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
12
- transcript_info ||= Organism.transcripts.tsv(org).tsv(:list, :persistence => true )
13
-
14
- transcripts = begin
15
- exon_transcripts[exon].first
16
- rescue
17
- []
18
- end
19
-
20
- transcripts.select{|transcript| transcript_info[transcript]["Ensembl Protein ID"].any?}
21
- end
22
-
23
- def self.codon_at_transcript_position(org, transcript, offset, transcript_sequence = nil, transcript_5utr = nil)
24
- transcript_sequence ||= Organism.transcript_sequence(org).tsv(:single, :persistence => true)
25
- transcript_5utr ||= Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
26
-
27
- utr5 = transcript_5utr[transcript]
28
-
29
- raise "UTR5 for transcript #{ transcript } was missing" if utr5.nil?
30
-
31
- return nil if utr5 > offset
32
-
33
- sequence = transcript_sequence[transcript]
34
- raise "Sequence for transcript #{ transcript } was missing" if sequence.nil? if sequence.nil?
35
-
36
- ccds_offset = offset - utr5
37
- return nil if ccds_offset > sequence.length
38
-
39
- range = (utr5..-1)
40
- sequence = sequence[range]
41
-
42
- codon = ccds_offset / 3
43
- codon_offset = ccds_offset % 3
44
-
45
- [sequence[(codon * 3)..((codon + 1) * 3 - 1)], codon_offset, codon]
46
- end
47
-
48
- def self.codon_change(allele, codon, offset)
49
- original = Bio::Sequence::NA .new(codon).translate
50
- codon = codon.dup
51
- codon[offset] = allele
52
- new = Bio::Sequence::NA .new(codon).translate
53
- [original, new]
54
- end
55
-
56
- def self.genes_at_chromosome_positions(org, chromosome, positions)
57
- chromosome = chromosome.to_s
58
- chromosome_bed = Persistence.persist(Organism.gene_positions(org), "Gene_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
59
- tsv = file.tsv(:persistence => false, :type => :list)
60
- tsv.select("Chromosome Name" => chromosome).collect do |gene, values|
61
- [gene, values.values_at("Gene Start", "Gene End").collect{|p| p.to_i}]
62
- end
63
- end
64
-
65
- if Array === positions
66
- positions.collect{|position| pos = chromosome_bed[position]; pos.nil? ? nil : pos.first}
67
- else
68
- pos = chromosome_bed[positions];
69
- pos.nil? ? nil : pos.first
70
- end
71
- end
72
-
73
- def self.genes_at_genomic_positions(org, positions)
74
- positions = [positions] unless Array === positions.first
75
- genes = []
76
- chromosomes = {}
77
- indices = {}
78
- positions.each_with_index do |info,i|
79
- chr, pos = info
80
- chromosomes[chr] ||= []
81
- indices[chr] ||= []
82
- chromosomes[chr] << pos
83
- indices[chr] << i
84
- end
85
-
86
- chromosomes.each do |chr, pos_list|
87
- chr_genes = genes_at_chromosome_positions(org, chr, pos_list)
88
- chr_genes.zip(indices[chr]).each do |gene, index| genes[index] = gene end
89
- end
90
-
91
- genes
92
- end
93
-
94
- def self.exons_at_chromosome_positions(org, chromosome, positions)
95
- chromosome = chromosome.to_s
96
- chromosome_bed = Persistence.persist(Organism.exons(org), "Exon_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => true) do |file, options|
97
- tsv = file.tsv(:persistence => true, :type => :list)
98
- tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
99
- [exon, values.values_at("Exon Chr Start", "Exon Chr End").collect{|p| p.to_i}]
100
- end
101
- end
102
-
103
- if Array === positions
104
- positions.collect{|position|
105
- chromosome_bed[position];
106
- }
107
- else
108
- chromosome_bed[positions];
109
- end
110
- end
111
-
112
-
113
- def self.exons_at_genomic_positions(org, positions)
114
- positions = [positions] unless Array === positions.first
115
-
116
- exons = []
117
- chromosomes = {}
118
- indices = {}
119
- positions.each_with_index do |info,i|
120
- chr, pos = info
121
- chromosomes[chr] ||= []
122
- indices[chr] ||= []
123
- chromosomes[chr] << pos
124
- indices[chr] << i
125
- end
126
-
127
- chromosomes.each do |chr, pos_list|
128
- chr_exons = exons_at_chromosome_positions(org, chr, pos_list)
129
- chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
130
- end
131
-
132
- exons
133
- end
134
-
135
- def self.exon_offset_in_transcript(org, exon, transcript, exons = nil, transcript_exons = nil)
136
- exons ||= Organism.exons(org).tsv(:persistence => true)
137
- transcript_exons ||= Organism.transcript_exons(org).tsv(:double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true)
138
-
139
- sizes = [0]
140
- rank = nil
141
- transcript_exons[transcript].zip_fields.each do |_exon, _rank|
142
- _rank = _rank.to_i
143
- s, e = exons[_exon].values_at("Start", "End")
144
- size = e.to_i - s.to_i + 1
145
- sizes[_rank] = size
146
- rank = _rank if _exon == exon
147
- end
148
-
149
- if not rank.nil?
150
- sizes[0..rank - 1].inject(0){|e,acc| acc += e}
151
- else
152
- nil
153
- end
154
- end
155
-
156
- def self.exon_transcript_offsets(org, exons, exon_offsets = nil, exon_info = nil)
157
- exon_info ||= Organism.exons(org).tsv(:persistence => true)
158
- exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
159
-
160
- exons = [exons] unless Array === exons
161
- transcript_offsets = {}
162
- exons.each do |exon|
163
- transcript_offsets[exon] ||= {}
164
- offsets = nil
165
- next unless exon_offsets.include? exon
166
- offsets = exon_offsets[exon].zip_fields
167
-
168
- offsets.collect do |transcript, offset|
169
- next if transcript.empty?
170
- transcript_offsets[exon][transcript] = offset.to_i
171
- end
172
- end
173
-
174
- transcript_offsets
175
- end
176
-
177
- def self.genomic_position_transcript_offsets(org, positions, exon_offsets = nil, exon_start = nil, exon_end = nil, exon_strand = nil)
178
- exon_offsets ||= Organism.exon_offsets(org).tsv(:double, :persistence => true)
179
- exon_start ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
180
- exon_end ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
181
- exon_strand ||= Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
182
-
183
- exons = exons_at_genomic_positions(org, positions)
184
- offsets = Organism.exon_transcript_offsets(org, exons.flatten.uniq, exon_offsets, exon_info)
185
-
186
- position_exons = {}
187
- positions.zip(exons).each do |position,pos_exons| position_exons[position] = pos_exons end
188
-
189
- position_offsets = {}
190
- position_exons.each do |position,pos_exons|
191
- chr, pos = position
192
- next if pos_exons.nil? or pos_exons.empty?
193
- pos_exons.each do |exon|
194
- if offsets.include? exon
195
- if exon_strand[exon] == 1
196
- offset_in_exon = (pos.to_i - exon_start[exon].to_i)
197
- else
198
- offset_in_exon = (exon_end[exon] - pos.to_i)
199
- end
200
- position_offsets[position] ||= {}
201
- offsets[exon].each do |transcript, offset|
202
- if not offset.nil?
203
- position_offsets[position][transcript] = [offset + offset_in_exon, exon_strand[exon]]
204
- end
205
- end
206
- end
207
- end
208
- end
209
-
210
- position_offsets
211
- end
212
-
213
- def self.exon_junctures_at_chromosome_positions(org, chromosome, positions)
214
- chromosome = chromosome.to_s
215
- chromosome_start = Persistence.persist(Organism.exons(org), "Exon_start[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
216
- tsv = file.tsv(:persistence => true, :type => :list)
217
- tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
218
- [exon, values["Exon Chr Start"].to_i]
219
- end
220
- end
221
- chromosome_end = Persistence.persist(Organism.exons(org), "Exon_end[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
222
- tsv = file.tsv(:persistence => true, :type => :list)
223
- tsv.select("Chromosome Name" => chromosome).collect do |exon, values|
224
- [exon, values["Exon Chr End"].to_i]
225
- end
226
- end
227
-
228
- if Array === positions
229
- positions.collect{|position|
230
- position = position.to_i
231
- chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
232
- }
233
- else
234
- position = positions.to_i
235
- chromosome_start[(position - 2)..(position + 2)] + chromosome_end[(position - 2)..(position + 2)];
236
- end
237
-
238
- end
239
-
240
- def self.exon_junctures_at_genomic_positions(org, positions)
241
- positions = [positions] unless Array === positions.first
242
-
243
- exons = []
244
- chromosomes = {}
245
- indices = {}
246
- positions.each_with_index do |info,i|
247
- chr, pos = info
248
- chromosomes[chr] ||= []
249
- indices[chr] ||= []
250
- chromosomes[chr] << pos
251
- indices[chr] << i
252
- end
253
-
254
- chromosomes.each do |chr, pos_list|
255
- chr_exons = exon_junctures_at_chromosome_positions(org, chr, pos_list)
256
- chr_exons.zip(indices[chr]).each do |exon, index| exons[index] = exon end
257
- end
258
-
259
- exons
260
- end
261
-
262
- def self.identify_variations_at_chromosome_positions(org, chromosome, positions, variations)
263
- chromosome = chromosome.to_s
264
-
265
- chromosome_bed = Persistence.persist(variations, "Variation_positions[#{chromosome}]", :fwt, :chromosome => chromosome, :range => false) do |file, options|
266
- rows = []
267
- chromosome = options[:chromosome]
268
- f = CMD.cmd("grep '[[:space:]]#{chromosome}[[:space:]]' #{ file }", :pipe => true)
269
- while not f.eof?
270
- line = f.gets.chomp
271
- id, chr, pos = line.split "\t"
272
- rows << [id, pos.to_i]
273
- end
274
-
275
- rows
276
- end
277
-
278
- if Array === positions
279
- positions.collect{|position|
280
- chromosome_bed[position];
281
- }
282
- else
283
- chromosome_bed[positions];
284
- end
285
- end
286
-
287
-
288
- def self.identify_variations_at_genomic_positions(org, positions, variations_file)
289
- positions = [positions] unless Array === positions.first
290
-
291
- variations = []
292
- chromosomes = {}
293
- indices = {}
294
- positions.each_with_index do |info,i|
295
- chr, pos = info
296
- chromosomes[chr] ||= []
297
- indices[chr] ||= []
298
- chromosomes[chr] << pos
299
- indices[chr] << i
300
- end
301
-
302
- chromosomes.each do |chr, pos_list|
303
- chr_variations = identify_variations_at_chromosome_positions(org, chr, pos_list, variations_file)
304
- chr_variations.zip(indices[chr]).each do |variation, index| variations[index] = variation end
305
- end
306
-
307
- variations
308
- end
309
-
310
- task_option :organism, "Organism", :string, "Hsa"
311
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
312
- task_dependencies nil
313
- task :genomic_mutations_in_exon_junctions => :tsv do |org,genomic_mutations|
314
- genomic_mutations = case
315
- when TSV === genomic_mutations
316
- genomic_mutations
317
- else
318
- TSV.new StringIO.new(genomic_mutations), :list
319
- end
320
- genomic_mutations.key_field ||= "Position"
321
- genomic_mutations.fields ||= ["Mutation"]
322
-
323
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
324
-
325
- step(:resources, "Load Resources")
326
-
327
- exon_junctures = {}
328
- genomic_mutations.keys.zip(Organism.exon_junctures_at_genomic_positions(org, positions)).each do |position, exons|
329
- exon_junctures[position] = exons
330
- end
331
-
332
- genomic_mutations.add_field "Exon Junctions" do |position, values|
333
- exon_junctures[position] * "|"
334
- end
335
-
336
- genomic_mutations.to_s :sort, true
337
- end
338
-
339
-
340
- task_option :organism, "Organism", :string, "Hsa"
341
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
342
- task_dependencies nil
343
- task :genomic_mutations_to_genes => :tsv do |org,genomic_mutations|
344
- genomic_mutations = case
345
- when TSV === genomic_mutations
346
- genomic_mutations
347
- else
348
- TSV.new StringIO.new(genomic_mutations), :list
349
- end
350
- genomic_mutations.key_field ||= "Position"
351
- genomic_mutations.fields ||= ["Mutation"]
352
-
353
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
354
-
355
- step(:resources, "Load Resources")
356
- genes_at_positions = Hash[*genomic_mutations.keys.zip(Organism.genes_at_genomic_positions(org, positions)).flatten]
357
-
358
- genomic_mutations.add_field "#{org.sub(/\/.*/,'')}:Ensembl Gene ID" do |position, values|
359
- genes_at_positions[position]
360
- end
361
-
362
- genomic_mutations
363
- end
364
-
365
-
366
- task_description <<-EOF
367
- Translates a collection of mutations in genomic coordinates into mutations in aminoacids for the
368
- protein products of transcripts including those positions.
369
- EOF
370
- task_option :organism, "Organism", :string, "Hsa"
371
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
372
- task_dependencies nil
373
- task :genomic_mutations_to_protein_mutations => :tsv do |org,genomic_mutations|
374
- genomic_mutations = case
375
- when TSV === genomic_mutations
376
- genomic_mutations
377
- else
378
- TSV.new StringIO.new(genomic_mutations), :list
379
- end
380
-
381
- genomic_mutations.key_field ||= "Position"
382
- genomic_mutations.fields ||= ["Mutation"]
383
-
384
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
385
-
386
- step(:prepare, "Prepare Results")
387
- results = TSV.new({})
388
- results.key_field = "Position"
389
- results.fields = ["#{org.sub(/\/.*/,'')}:Ensembl Transcript ID", "Protein Mutation"]
390
- results.type = :double
391
- results.filename = path
392
-
393
- step(:resources, "Load Resources")
394
- transcript_sequence = Organism.transcript_sequence(org).tsv(:single, :persistence => true)
395
- transcript_5utr = Organism.transcript_5utr(org).tsv(:single, :persistence => true, :cast => 'to_i')
396
- exon_offsets = Organism.exon_offsets(org).tsv(:double, :persistence => true)
397
- exon_start = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr Start"], :cast => :to_i)
398
- exon_end = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Chr End"], :cast => :to_i)
399
- exon_strand = Organism.exons(org).tsv(:single, :persistence => true, :fields => ["Exon Strand"], :cast => :to_i)
400
- transcript_to_protein = Organism.transcripts(org).tsv(:single, :fields => "Ensembl Protein ID", :persistence => true)
401
-
402
- step(:offsets, "Find transcripts and offsets for mutations")
403
- offsets = Organism.genomic_position_transcript_offsets(org, positions, exon_offsets, exon_start, exon_end, exon_strand)
404
-
405
- step(:aminoacid, "Translate mutation to amino acid substitutions")
406
- offsets.each do |position, transcripts|
407
- if genomic_mutations.type === :double
408
- alleles = genomic_mutations[position * ":"]["Mutation"].collect{|mutation| Misc.IUPAC_to_base(mutation)}.compact.flatten
409
- else
410
- alleles = Misc.IUPAC_to_base(genomic_mutations[position * ":"]["Mutation"]) || []
411
- end
412
-
413
- transcripts.each do |transcript, offset_info|
414
- offset, strand = offset_info
415
- codon = begin
416
- Organism.codon_at_transcript_position(org, transcript, offset, transcript_sequence, transcript_5utr)
417
- rescue
418
- Log.medium $!.message
419
- next
420
- end
421
-
422
- if not codon.nil? and not codon.empty?
423
- alleles.each do |allele|
424
- allele = Misc::BASE2COMPLEMENT[allele] if strand == "-1"
425
- change = Organism.codon_change(allele, *codon.values_at(0,1))
426
- pos_code = position * ":"
427
- mutation = [change.first, codon.last + 1, change.last] * ""
428
- if results.include? pos_code
429
- results[pos_code] = results[pos_code].merge [transcript, mutation]
430
- else
431
- results[pos_code] = [[transcript], [mutation]]
432
- end
433
- end
434
- end
435
- end
436
-
437
- end
438
-
439
- step(:identify_proteins, "Identify Proteins for Transcripts")
440
- transcript_field = results.identify_field "Ensembl Transcript ID"
441
- results.add_field "#{org.sub(/\/.*/,'')}:Ensembl Protein ID" do |key,values|
442
- values[transcript_field].collect do |transcript| transcript_to_protein[transcript] end
443
- end
444
-
445
-
446
- results
447
- end
448
-
449
-
450
- task_option :organism, "Organism", :string, "Hsa"
451
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
452
- task_dependencies nil
453
- task :identify_germline_variations => :tsv do |org,genomic_mutations|
454
- genomic_mutations = case
455
- when TSV === genomic_mutations
456
- genomic_mutations
457
- else
458
- TSV.new StringIO.new(genomic_mutations), :list
459
- end
460
-
461
- genomic_mutations.key_field ||= "Position"
462
- genomic_mutations.fields ||= ["Mutation"]
463
-
464
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
465
-
466
-
467
- step(:prepare, "Prepare Results")
468
- results = TSV.new({})
469
- results.key_field = "Position"
470
- results.fields = ["SNP Id"]
471
- results.type = :double
472
- results.filename = path
473
-
474
-
475
- step(:resources, "Load Resources")
476
-
477
- snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.germline_variations(org).produce).collect{|ids| ids * "|"}
478
- snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
479
-
480
- genomic_mutations.add_field "Germline SNP Id" do |position, values|
481
- snps_for_positions[position]
482
- end
483
-
484
- genomic_mutations
485
- end
486
-
487
-
488
- task_option :organism, "Organism", :string, "Hsa"
489
- task_option :genomic_mutations, "Position (chr:position)\tMutation", :tsv
490
- task_dependencies nil
491
- task :identify_somatic_variations => :tsv do |org,genomic_mutations|
492
- genomic_mutations = case
493
- when TSV === genomic_mutations
494
- genomic_mutations
495
- else
496
- TSV.new StringIO.new(genomic_mutations), :list
497
- end
498
-
499
- genomic_mutations.key_field ||= "Position"
500
- genomic_mutations.fields ||= ["Mutation"]
501
-
502
- positions = genomic_mutations.keys.collect{|l| l.split(":")}
503
-
504
-
505
- step(:prepare, "Prepare Results")
506
- results = TSV.new({})
507
- results.key_field = "Position"
508
- results.fields = ["SNP Id"]
509
- results.type = :double
510
- results.filename = path
511
-
512
-
513
- step(:resources, "Load Resources")
514
-
515
- snp_ids = Organism.identify_variations_at_genomic_positions(org, positions, Organism.somatic_variations(org).produce).collect{|ids| ids * "|"}
516
- snps_for_positions = Hash[*genomic_mutations.keys.zip(snp_ids).flatten]
517
-
518
- genomic_mutations.add_field "Germline SNP Id" do |position, values|
519
- snps_for_positions[position]
520
- end
521
-
522
- genomic_mutations
523
- end
524
-
525
-
526
- end
527
-
528
- if __FILE__ == $0
529
- require 'rbbt/util/log'
530
- require 'benchmark'
531
-
532
- select = <<-EOF
533
- 3:64581875
534
- EOF
535
- select = select.split("\n").collect{|l| l.split(":")}
536
-
537
- picmi_test = <<-EOF
538
- #Chromosome Name Position Reference Tumor
539
- 1 100382265 C G
540
- 1 100380997 A G
541
- 22 30163533 A C
542
- X 10094215 G A
543
- X 10085674 C T
544
- 20 50071099 G T
545
- 21 19638426 G T
546
- 2 230633386 C T
547
- 2 230312220 C T
548
- 1 100624830 T A
549
- 4 30723053 G T
550
- EOF
551
-
552
- # Build 37
553
- picmi_test = <<-EOF
554
- #Chromosome Name Position Reference Tumor
555
- 1 100624830 T A
556
- 21 19638426 G T
557
- EOF
558
-
559
- exon_juncture_test = <<-EOF
560
- #Position Mutation
561
- 7:150753996 T
562
- EOF
563
-
564
-
565
- job = Organism.job :genomic_mutations_in_exon_junctures, "Test1", TSV.new(StringIO.new(exon_juncture_test), :list, :sep => " "), :organism => "Hsa"
566
- job.run
567
- job.clean if job.error?
568
- puts job.messages
569
- puts job.read
570
-
571
- # # Build 36
572
- # picmi_test = <<-EOF
573
- ##Chromosome Name Position Reference Tumor
574
- #3 81780820 T C
575
- #2 43881517 A T
576
- #2 43857514 T C
577
- #6 88375602 G A
578
- #16 69875502 G T
579
- #16 69876078 T C
580
- #16 69877147 G A
581
- #17 8101874 C T
582
- # EOF
583
-
584
-
585
- Log.severity = 2
586
- org = 'Hsa/may2009'
587
- file = File.join(ENV["HOME"], 'git/rbbt-util/integration_test/data/Metastasis.tsv')
588
-
589
- #positions = TSV.new(StringIO.new(picmi_test), :list, :sep => /\s+/, :fix => Proc.new{|l| l.sub(/\s+/,':')})
590
- positions = TSV.new(file, :list, :fix => Proc.new{|l| l.sub(/\t/,':')})
591
- positions.key_field = "Position"
592
- positions.fields = %w(Reference Control Tumor)
593
- #positions.fields = %w(Reference Tumor)
594
-
595
- #puts positions.slice(["Reference", "Tumor"]).to_s.split(/\n/).collect{|line| next if line =~ /#/; parts = line.split(/\t|:/); parts[3] = Misc.IUPAC_to_base(parts[3]).first; parts * ","}.compact * "\n"
596
-
597
-
598
- #positions = positions.select ["10:98099540"]
599
-
600
- Organism.basedir = Rbbt.tmp.organism.sequence.jobs.find :user
601
- job = Organism.job :genomic_mutations_to_protein_mutations, "Metastasis", org, positions.slice("Tumor")
602
- job.run
603
-
604
- while not job.done?
605
- puts job.step
606
- sleep 2
607
- end
608
-
609
- raise job.messages.last if job.error?
610
- mutations = job.load
611
-
612
- end