rbbt-sources 0.4.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,10 +1,10 @@
1
1
  require 'rbbt'
2
2
 
3
3
  module Polysearch
4
- Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
5
- Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
6
- Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
7
- Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
8
- Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
4
+ Rbbt.claim Rbbt.share.databases.Polysearch.organ, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
5
+ Rbbt.claim Rbbt.share.databases.Polysearch.tissue, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
6
+ Rbbt.claim Rbbt.share.databases.Polysearch.location, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
7
+ Rbbt.claim Rbbt.share.databases.Polysearch.disease, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
8
+ Rbbt.claim Rbbt.share.databases.Polysearch.drug, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
9
9
  end
10
10
 
@@ -1,5 +1,6 @@
1
- require 'rbbt-util'
1
+ require 'rbbt'
2
2
  require 'libxml'
3
+ require 'rbbt/sources/gscholar'
3
4
 
4
5
  # This module offers an interface with PubMed, to perform queries, and
5
6
  # retrieve simple information from articles. It uses the caching
@@ -10,12 +11,16 @@ module PubMed
10
11
  @@pubmed_lag = 1
11
12
  def self.get_online(pmids)
12
13
 
13
- pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
14
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
14
+ pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
15
15
 
16
- xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
16
+ articles = []
17
+ Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
18
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
17
19
 
18
- articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
20
+ xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
21
+
22
+ articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
23
+ end
19
24
 
20
25
  if pmids.is_a? Array
21
26
  list = {}
@@ -0,0 +1,37 @@
1
+ require 'rbbt'
2
+
3
+ module EBChromatin
4
+ BASE_URL='http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/'
5
+
6
+ TISSUES= %w(Gm12878 H1hesc Hmec Hsmm Huvec Hepg2 K562 Nhek Nhlf)
7
+
8
+ TISSUES.each do |tissue|
9
+ file = "wgEncodeBroadHmm#{tissue}HMM.bed.gz"
10
+
11
+ Rbbt.claim Rbbt.share.databases.EBChromatin[file.match(/wgEncodeBroadHmm(.*)HMM.bed.gz/)[1]], :proc do
12
+ url = File.join(BASE_URL, file)
13
+
14
+ CMD.cmd('sed \'s/^chr\([[:alnum:]]\+\)\t\([[:digit:]]\+\)\t\([[:digit:]]\+\)/\1:\2:\3\t\1\t\2\t\3/\' | cut -f 1,2,3,4,5 | awk \'BEGIN {print "#Region ID\tChromosome Name\tStart\tEnd\tType"} /./ {print $0}\' ', :in => Open.read(url), :pipe => true).read
15
+ end
16
+ end
17
+
18
+ def self.chromosome(tissue, chr, positions)
19
+ list = Array === positions ? positions : [positions]
20
+
21
+ file = Rbbt.share.databases.EBChromatin[tissue]
22
+ chromosome_bed = Persistence.persist(file, "EBChromatin[#{tissue}][#{chr}]", :fwt, :chromosome => chr, :range => true) do |file, options|
23
+ chromosome = options[:chromosome]
24
+ tsv = file.tsv(:persist => false, :type => :list, :grep => "^#{chromosome}:\\|^#")
25
+ if tsv.size > 0
26
+ tsv.collect do |gene, values|
27
+ [gene, values.values_at("Start", "End").collect{|p| p.to_i}]
28
+ end
29
+ else
30
+ raise "No chromatin information for chromosome #{ chr } in tissue #{ tissue }"
31
+ end
32
+ end
33
+
34
+ list.collect do |pos| chromosome_bed[pos] end
35
+ end
36
+
37
+ end
@@ -0,0 +1,29 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+
5
+ $interpro_db = 'entry'
6
+
7
+ $interpro_id = ['InterPro Entry Accession','entry_id']
8
+
9
+ $interpro_pos = [
10
+ ["UniProt/SwissProt Accession", "protein_ac"],
11
+ ["Match Start Position", "pos_from"],
12
+ ["Match Stop Position ", "pos_to"]
13
+ ]
14
+
15
+ file 'interpro_positions' do |t|
16
+ Open.write(t.name, InterPro.tsv($interpro_db, $interpro_id, $interpro_pos, [], nil, :type => :double, :nocache => true).to_s)
17
+ end
18
+
19
+ file 'interpro_names' do |t|
20
+ Open.write(t.name, "#: :type=:list\n#InterPro Entry Accession\tName\n" + Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/names.dat"))
21
+ end
22
+
23
+
24
+ file 'interpro_short_names' do |t|
25
+ Open.write(t.name, "#: :type=:list\n#InterPro Entry Accession\tShort Name\n" + Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/short_names.dat"))
26
+ end
27
+
28
+
29
+
@@ -0,0 +1,67 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/util/misc'
3
+
4
+ def read_chunk(jochem)
5
+ chunk = ""
6
+ while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
7
+ chunk << line
8
+ end
9
+ return nil if chunk.empty?
10
+ chunk
11
+ end
12
+
13
+ def first(list)
14
+ return nil if list.nil? or list.empty?
15
+ list.first
16
+ end
17
+
18
+ def process_jochem
19
+ jochem = Open.open("http://www.biosemantics.org/uploads/file/Jochem/JochemV1_2.zip")
20
+ identifiers = File.open('identifiers', 'w')
21
+
22
+ identifiers.puts("#: :namespace=JoChem")
23
+ identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
24
+
25
+ lexicon = File.open('lexicon', 'w')
26
+ lexicon.puts("#: :namespace=JoChem")
27
+ lexicon.puts("#ID\tSynonyms")
28
+
29
+ inchi = File.open('inchi', 'w')
30
+ inchi.puts("#: :namespace=JoChem")
31
+ inchi.puts("#ID\tInChi")
32
+
33
+ definitions = File.open('definitions', 'w')
34
+ definitions.puts("#: :namespace=JoChem#:type=:list")
35
+ definitions.puts("#ID\tDefinition")
36
+
37
+ while chunk = read_chunk(jochem) do
38
+ next if chunk.empty? or chunk =~ /^#/ or chunk =~ /^NS /
39
+ info = {}
40
+ chunk.split(/\n/).each do |line|
41
+ line.sub!(/\t@match.*/,'')
42
+ code, value = line.match(/([A-Z]*) (.*)/).values_at 1, 2
43
+ info[code] ||= []
44
+ info[code] << value
45
+ end
46
+ id = first(info["ID"])
47
+ na = first(info["NA"])
48
+ df = first(info["DF"])
49
+ tm = info["TM"] || []
50
+ db = info["DB"] || []
51
+
52
+ pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
53
+ drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
54
+ inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
55
+
56
+ lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
57
+ identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
58
+ inchi.puts [id, inch * "|"] * "\t" if inch.any?
59
+ definitions.puts [id, df] * "\t" unless df.nil?
60
+ end
61
+ end
62
+
63
+ rule /identifiers|lexicon|inchi|definitions/ do |t|
64
+ Misc.in_dir(File.dirname(t.name)) do
65
+ process_jochem
66
+ end
67
+ end
@@ -0,0 +1,79 @@
1
+ require 'nokogiri'
2
+
3
+ module NCI
4
+ def self.get_pathways(xml, format = "UP")
5
+ doc = Nokogiri::XML(xml)
6
+ pathways = {}
7
+
8
+ molecules = {}
9
+ doc.xpath("//Molecule").each do |molecule|
10
+ id = molecule.attribute('id').value
11
+ type = molecule.attribute('molecule_type').value
12
+ next unless type == "protein"
13
+ names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
14
+ next if names.empty?
15
+ molecules[id] = {:xml => molecule, :uniprot => names.first}
16
+ end
17
+
18
+ interactions = {}
19
+ doc.xpath("//Interaction").each do |interaction|
20
+ id = interaction.attribute('id').value
21
+ molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}
22
+
23
+ interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
24
+ end
25
+
26
+ doc.xpath("//Pathway").each do |pathway|
27
+ id = pathway.attribute('id').value
28
+ subnet = pathway.attribute('subnet').value
29
+ name = pathway.xpath('LongName').first.content
30
+
31
+ interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
32
+
33
+ pathway_interactions = interaction_ids.collect{|id| interactions[id]}
34
+ pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
35
+
36
+ pathway_uniprot_ids = pathway_molecule_ids.collect do |id|
37
+ next unless molecules.include? id
38
+ molecules[id][:uniprot]
39
+ end
40
+ pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
41
+ end
42
+ pathways
43
+ end
44
+ end
45
+
46
+ file 'nature_pathways' do |t|
47
+
48
+ url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz"
49
+
50
+ xml = Open.read(url)
51
+
52
+ pathways = NCI.get_pathways(xml)
53
+
54
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
55
+ end
56
+
57
+ file 'biocarta_pathways' do |t|
58
+
59
+ url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz"
60
+
61
+ xml = Open.read(url)
62
+
63
+ pathways = NCI.get_pathways(xml, "LL")
64
+
65
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
66
+ end
67
+
68
+ file 'reactome_pathways' do |t|
69
+
70
+ url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
71
+
72
+ xml = Open.read(url)
73
+
74
+ pathways = NCI.get_pathways(xml, "UP")
75
+
76
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
77
+ end
78
+
79
+
@@ -7,6 +7,8 @@ $taxs = [9606]
7
7
  $scientific_name = "Homo sapiens"
8
8
 
9
9
  $biomart_db = 'hsapiens_gene_ensembl'
10
+ $biomart_db_germline_variation = 'hsapiens_snp'
11
+ $biomart_db_somatic_variation = 'hsapiens_snp_som'
10
12
 
11
13
  $biomart_lexicon = [
12
14
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -15,6 +17,14 @@ $biomart_lexicon = [
15
17
  [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
16
18
  ]
17
19
 
20
+ $biomart_protein_identifiers = [
21
+ [ 'Protein ID', "protein_id" ],
22
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
23
+ [ 'Unigene ID', "unigene" ],
24
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
25
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
26
+ ]
27
+
18
28
  $biomart_identifiers = [
19
29
  [ 'Entrez Gene ID', "entrezgene"],
20
30
  [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
@@ -42,7 +52,7 @@ $biomart_identifiers = [
42
52
  [ 'AFFY HG U95E', 'affy_hg_u95e' ],
43
53
  [ 'AFFY HG U95A', 'affy_hg_u95a' ],
44
54
  [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
45
- [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
55
+ [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
46
56
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
47
57
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
48
58
  [ 'Agilent WholeGenome',"agilent_wholegenome" ],
@@ -52,5 +62,14 @@ $biomart_identifiers = [
52
62
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
53
63
  ]
54
64
 
65
+ $biomart_go= [
66
+ ["GO ID", 'go_id'],
67
+ ["GO Namespace", 'namespace_1003'],
68
+ ]
69
+
70
+ $biomart_pfam= [
71
+ ["Pfam Domain", 'pfam'],
72
+ ]
73
+
55
74
  $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
56
75
  load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -7,6 +7,8 @@ $taxs = [10116]
7
7
  $scientific_name = "Rattus norvegicus"
8
8
 
9
9
  $biomart_db = 'rnorvegicus_gene_ensembl'
10
+ $biomart_db_germline_variation = 'rnorvegicus_snp'
11
+ $biomart_db_somatic_variation = 'rnorvegicus_snp_som'
10
12
 
11
13
  $biomart_lexicon = [
12
14
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -2,8 +2,6 @@ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
2
2
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
3
3
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
4
4
  $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
5
- $biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ]
6
- $biomart_germline_variation_id = ['Variation ID', "external_id" ]
7
5
 
8
6
  $biomart_gene_positions = [
9
7
  ['Chromosome Name','chromosome_name'],
@@ -60,42 +58,6 @@ $biomart_exons = [
60
58
  ['Exon Chr End','exon_chrom_end'],
61
59
  ]
62
60
 
63
- #{{{ Variations
64
-
65
- $biomart_germline_variation_positions = [
66
- ['Chromosome Location (bp)', "chromosome_location" ],
67
- ['SNP Chromosome Strand', "snp_chromosome_strand" ],
68
- ['Transcript location (bp)', "transcript_location" ],
69
- ['Allele', "allele" ],
70
- ['Protein Allele', "peptide_shift" ],
71
- ['CDS Start', "cds_start_2076" ],
72
- ['CDS End', "cds_end_2076" ],
73
- ]
74
-
75
- $biomart_germline_variations = [
76
- $biomart_ensembl_gene,
77
- ['Source', "source_name" ],
78
- ['Validated', "validated" ],
79
- ['Consequence Type', "synonymous_status" ],
80
- ]
81
-
82
- $biomart_somatic_variation_positions = [
83
- ['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
84
- ['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
85
- ['Transcript location (bp)' , "somatic_transcript_location" ] ,
86
- ['Allele' , "somatic_allele" ] ,
87
- ['Protein Allele' , "somatic_peptide_shift" ] ,
88
- ['CDS Start' , "somatic_cds_start_2076" ] ,
89
- ['CDS End' , "somatic_cds_end_2076" ] ,
90
- ]
91
-
92
- $biomart_somatic_variations = [
93
- $biomart_ensembl_gene,
94
- ['Source' , "somatic_source_name" ] ,
95
- ['Validated' , "somatic_validated" ] ,
96
- ['Consequence Type' , "somatic_synonymous_status" ] ,
97
- ]
98
-
99
61
  #{{{ Rules
100
62
 
101
63
  file 'scientific_name' do |t|
@@ -104,15 +66,69 @@ end
104
66
 
105
67
  file 'identifiers' do |t|
106
68
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
69
+
107
70
  $biomart_identifiers.each do |name, key, prefix|
108
71
  if prefix
109
72
  identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
110
73
  end
111
74
  end
112
75
 
76
+ name_pos = identifiers.identify_field "Associated Gene Name"
77
+ entrez2name = Entrez.entrez2name($taxs)
78
+ identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
79
+ names = values[name_pos]
80
+
81
+ matches = entrez.select do |e|
82
+ entrez2name.include? e and (names & entrez2name[e]).any?
83
+ end
84
+
85
+ if matches.any?
86
+ matches
87
+ else
88
+ entrez
89
+ end
90
+ end
91
+
92
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
93
+ entrez_synonyms.key_field = "Entrez Gene ID"
94
+ entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
95
+
96
+ identifiers.attach entrez_synonyms
97
+
98
+ identifiers.each do |key, values|
99
+ values.each do |list|
100
+ list.reject!{|v| v.nil? or v.empty?}
101
+ list.uniq!
102
+ end
103
+ end
104
+
105
+ File.open(t.name, 'w') do |f| f.puts identifiers end
106
+ end
107
+
108
+ file 'lexicon' => 'identifiers' do |t|
109
+ tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
110
+
111
+ entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
112
+ entrez_description.key_field = "Entrez Gene ID"
113
+ entrez_description.fields = ["Entrez Gene Description"]
114
+
115
+ tsv.attach entrez_description
116
+ Open.write(t.name, tsv.to_s)
117
+ end
118
+
119
+
120
+ file 'protein_identifiers' do |t|
121
+ identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_identifiers, [], nil, :namespace => $namespace)
122
+ $biomart_protein_identifiers.each do |name, key, prefix|
123
+ if prefix
124
+ identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
125
+ end
126
+ end
127
+
113
128
  File.open(t.name, 'w') do |f| f.puts identifiers end
114
129
  end
115
130
 
131
+
116
132
  file 'gene_transcripts' do |t|
117
133
  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat, :namespace => $namespace)
118
134
 
@@ -121,7 +137,7 @@ end
121
137
 
122
138
  file 'transcripts' => 'gene_positions' do |t|
123
139
  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
124
- transcripts.attach TSV.new('gene_positions'), "Chromosome Name"
140
+ transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
125
141
 
126
142
  File.open(t.name, 'w') do |f| f.puts transcripts end
127
143
  end
@@ -198,7 +214,7 @@ end
198
214
 
199
215
  file 'exons' => 'gene_positions' do |t|
200
216
  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
201
- exons.attach TSV.new('gene_positions'), "Chromosome Name"
217
+ exons.attach TSV.open('gene_positions'), "Chromosome Name"
202
218
 
203
219
  File.open(t.name, 'w') do |f| f.puts exons end
204
220
  end
@@ -227,28 +243,21 @@ file 'transcript_sequence' do |t|
227
243
  end
228
244
 
229
245
 
230
- $biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,COMPLEX_INDEL&NMD_TRANSCRIPT,COMPLEX_INDEL&SPLICE_SITE,ESSENTIAL_SPLICE_SITE&INTRONIC,ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,FRAMESHIFT_CODING,FRAMESHIFT_CODING&NMD_TRANSCRIPT,FRAMESHIFT_CODING&SPLICE_SITE,FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING&SPLICE_SITE,NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT,REGULATORY_REGION,SPLICE_SITE&3PRIME_UTR,SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&5PRIME_UTR,SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&INTRONIC,SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,SPLICE_SITE&SYNONYMOUS_CODING,SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT,STOP_GAINED,STOP_GAINED&FRAMESHIFT_CODING,STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT,STOP_GAINED&NMD_TRANSCRIPT,STOP_GAINED&SPLICE_SITE,STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT,STOP_LOST,STOP_LOST&NMD_TRANSCRIPT,STOP_LOST&SPLICE_SITE,STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT,SYNONYMOUS_CODING,SYNONYMOUS_CODING&NMD_TRANSCRIPT"]
231
- #$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
232
- $biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
246
+ #{{{ Variations
233
247
 
234
- file 'germline_variations' do |t|
235
- variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
236
- end
248
+ $biomart_variation_id = ["SNP ID", "refsnp_id"]
249
+ $biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
237
250
 
238
- file 'germline_variation_positions' do |t|
239
- variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
240
- File.open(t.name, 'w') do |f| f.puts variations.to_s end
251
+ file 'germline_variations' do |t|
252
+ BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
241
253
  end
242
254
 
243
255
  file 'somatic_variations' do |t|
244
- variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
245
- File.open(t.name, 'w') do |f| f.puts variations.to_s end
256
+ BioMart.tsv($biomart_db_somatic_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
246
257
  end
247
258
 
248
- file 'somatic_variation_positions' do |t|
249
- variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
250
- File.open(t.name, 'w') do |f| f.puts variations.to_s end
251
- end
259
+
260
+ # {{{ Other info
252
261
 
253
262
  file 'gene_pmids' do |t|
254
263
  tsv = Entrez.entrez2pubmed($taxs)
@@ -260,47 +269,95 @@ file 'gene_pmids' do |t|
260
269
  Open.write(t.name, text)
261
270
  end
262
271
 
263
- file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
264
- require 'rbbt/sources/organism/sequence'
272
+ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
273
+ transcripts = begin
274
+ exon_transcripts[exon].first
275
+ rescue
276
+ []
277
+ end
265
278
 
266
- exons = TSV.new('exons', :persistence => true)
267
- exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
268
- gene_transcripts = TSV.new('gene_transcripts', :flat, :persistence => true )
269
- transcript_info = TSV.new('transcripts', :list, :persistence => true )
270
- transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
279
+ transcripts.select{|transcript| transcript_info[transcript].first.any?}
280
+ end
271
281
 
282
+ def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
283
+ sizes = [0]
284
+ rank = nil
285
+ start_pos = exons.identify_field "Exon Chr Start"
286
+ end_pos = exons.identify_field "Exon Chr End"
287
+
288
+ Misc.zip_fields(transcript_exons[transcript]).each do |_exon, _rank|
289
+ _rank = _rank.to_i
290
+ s, e = exons[_exon].values_at(start_pos, end_pos)
291
+ size = e.to_i - s.to_i + 1
292
+ sizes[_rank] = size
293
+ rank = _rank if _exon == exon
294
+ end
295
+
296
+ if not rank.nil?
297
+ sizes[0..rank - 1].inject(0){|e,acc| acc += e}
298
+ else
299
+ nil
300
+ end
301
+ end
302
+
303
+ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
304
+ exons = TSV.open('exons')
305
+ exon_transcripts = nil
306
+ exon_transcripts = TSV.open('transcript_exons', :double, :key_field => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true)
307
+ gene_transcripts = TSV.open('gene_transcripts', :flat)
308
+ transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
309
+ transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
272
310
 
273
311
  string = "#: :namespace=#{$namespace}"
274
312
  string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
275
- exons.each do |exon, info|
276
- gene, start, finish, strand, chr = info
277
313
 
278
- transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
314
+ exons.unnamed = true
315
+ exon_transcripts.unnamed = true
316
+ gene_transcripts.unnamed = true
317
+ transcript_info.unnamed = true
318
+ transcript_exons.unnamed = true
319
+
320
+ exons.monitor = true
321
+ Misc.profile do
322
+ exons.through do |exon, info|
323
+ gene, start, finish, strand, chr = info
279
324
 
280
- transcript_offsets = {}
281
- transcripts.each do |transcript|
282
- offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
283
- transcript_offsets[transcript] = offset unless offset.nil?
325
+ transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
326
+
327
+ transcript_offsets = {}
328
+ transcripts.each do |transcript|
329
+ offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
330
+ transcript_offsets[transcript] = offset unless offset.nil?
331
+ end
332
+
333
+ string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
284
334
  end
285
-
286
- string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
287
335
  end
288
336
 
289
337
  Open.write(t.name, string)
290
338
  end
291
339
 
340
+ file 'gene_go' do |t|
341
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
342
+
343
+ File.open(t.name, 'w') do |f| f.puts goterms end
344
+ end
345
+
346
+
347
+ file 'gene_pfam' do |t|
348
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
349
+
350
+ File.open(t.name, 'w') do |f| f.puts goterms end
351
+ end
352
+
353
+
292
354
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
293
355
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
294
356
  archive = $1
295
357
  task = $2
296
- old_pwd = FileUtils.pwd
297
- begin
298
- FileUtils.mkdir archive unless File.exists? archive
299
- FileUtils.cd File.join(archive)
358
+ Misc.in_dir(archive) do
300
359
  BioMart.set_archive archive
301
360
  Rake::Task[task].invoke
302
361
  BioMart.unset_archive
303
- ensure
304
- FileUtils.cd old_pwd
305
362
  end
306
363
  end