rbbt-sources 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  require 'rbbt'
2
2
 
3
3
  module Polysearch
4
- Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
5
- Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
6
- Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
7
- Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
8
- Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
4
+ Rbbt.claim Rbbt.share.databases.Polysearch.organ, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
5
+ Rbbt.claim Rbbt.share.databases.Polysearch.tissue, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
6
+ Rbbt.claim Rbbt.share.databases.Polysearch.location, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
7
+ Rbbt.claim Rbbt.share.databases.Polysearch.disease, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
8
+ Rbbt.claim Rbbt.share.databases.Polysearch.drug, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
9
9
  end
10
10
 
@@ -1,5 +1,6 @@
1
- require 'rbbt-util'
1
+ require 'rbbt'
2
2
  require 'libxml'
3
+ require 'rbbt/sources/gscholar'
3
4
 
4
5
  # This module offers an interface with PubMed, to perform queries, and
5
6
  # retrieve simple information from articles. It uses the caching
@@ -10,12 +11,16 @@ module PubMed
10
11
  @@pubmed_lag = 1
11
12
  def self.get_online(pmids)
12
13
 
13
- pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
14
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
14
+ pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
15
15
 
16
- xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
16
+ articles = []
17
+ Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
18
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
17
19
 
18
- articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
20
+ xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
21
+
22
+ articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
23
+ end
19
24
 
20
25
  if pmids.is_a? Array
21
26
  list = {}
@@ -0,0 +1,37 @@
1
+ require 'rbbt'
2
+
3
# Chromatin-state (Broad HMM) segmentations published under BASE_URL.
#
# Loading this module claims one Rbbt resource per entry in TISSUES under
# Rbbt.share.databases.EBChromatin; each resource is produced by reading the
# tissue's .bed.gz file and reshaping it through a sed/cut/awk pipeline.
module EBChromatin
  BASE_URL='http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/'

  TISSUES= %w(Gm12878 H1hesc Hmec Hsmm Huvec Hepg2 K562 Nhek Nhlf)

  TISSUES.each do |tissue|
    file = "wgEncodeBroadHmm#{tissue}HMM.bed.gz"

    # The resource key is the tissue name itself; the original recovered it by
    # regex-matching it back out of the file name built just above.
    Rbbt.claim Rbbt.share.databases.EBChromatin[tissue], :proc do
      url = File.join(BASE_URL, file)

      # sed: turn "chr<C>\t<start>\t<end>" into "<C>:<start>:<end>\t<C>\t<start>\t<end>"
      # cut: keep the first five columns
      # awk: emit the header line, then every non-empty line
      CMD.cmd('sed \'s/^chr\([[:alnum:]]\+\)\t\([[:digit:]]\+\)\t\([[:digit:]]\+\)/\1:\2:\3\t\1\t\2\t\3/\' | cut -f 1,2,3,4,5 | awk \'BEGIN {print "#Region ID\tChromosome Name\tStart\tEnd\tType"} /./ {print $0}\' ', :in => Open.read(url), :pipe => true).read
    end
  end

  # Looks up one or more positions in the per-chromosome index of a tissue.
  #
  # tissue    - key into Rbbt.share.databases.EBChromatin (one of TISSUES)
  # chr       - chromosome name; used to grep the region ids ("<chr>:...")
  # positions - a single position or an Array of positions
  #
  # Returns an Array with the result of indexing the persisted structure by
  # each position (a single position is wrapped into a one-element Array).
  # NOTE(review): the persisted object is built with :fwt and :range => true;
  # presumably it answers range-overlap queries - confirm against Persistence.
  #
  # Raises RuntimeError when the tissue file has no rows for the chromosome.
  def self.chromosome(tissue, chr, positions)
    list = Array === positions ? positions : [positions]

    file = Rbbt.share.databases.EBChromatin[tissue]
    # Block parameter renamed from `file`: it shadowed the local above (and on
    # Ruby 1.8 a block parameter rebinds the outer variable).
    chromosome_bed = Persistence.persist(file, "EBChromatin[#{tissue}][#{chr}]", :fwt, :chromosome => chr, :range => true) do |bed_file, options|
      chromosome = options[:chromosome]
      tsv = bed_file.tsv(:persist => false, :type => :list, :grep => "^#{chromosome}:\\|^#")

      unless tsv.size > 0
        raise "No chromatin information for chromosome #{ chr } in tissue #{ tissue }"
      end

      tsv.collect do |region, values|
        [region, values.values_at("Start", "End").collect{|coord| coord.to_i}]
      end
    end

    list.collect do |pos| chromosome_bed[pos] end
  end

end
@@ -0,0 +1,29 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
+ require 'rbbt/sources/biomart'
3
+ require 'rbbt/sources/entrez'
4
+
5
# Dataset name handed to InterPro.tsv.
$interpro_db = 'entry'

# [field name, attribute] pair used as the key column.
$interpro_id = ['InterPro Entry Accession', 'entry_id']

# [field name, attribute] pairs fetched for every key.
# NOTE(review): "Match Stop Position " carries a trailing space in the field
# name; kept as-is because the generated file header depends on it - confirm
# whether that is intended.
$interpro_pos = [
  ["UniProt/SwissProt Accession", "protein_ac"],
  ["Match Start Position", "pos_from"],
  ["Match Stop Position ", "pos_to"]
]

# Writes the table returned by InterPro.tsv for the definitions above.
file 'interpro_positions' do |t|
  positions = InterPro.tsv($interpro_db, $interpro_id, $interpro_pos, [], nil, :type => :double, :nocache => true)
  Open.write(t.name, positions.to_s)
end

# Prepends a two-line TSV header to the upstream names.dat listing.
file 'interpro_names' do |t|
  header = "#: :type=:list\n#InterPro Entry Accession\tName\n"
  listing = Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/names.dat")
  Open.write(t.name, header + listing)
end


# Prepends a two-line TSV header to the upstream short_names.dat listing.
file 'interpro_short_names' do |t|
  header = "#: :type=:list\n#InterPro Entry Accession\tShort Name\n"
  listing = Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/short_names.dat")
  Open.write(t.name, header + listing)
end
27
+
28
+
29
+
@@ -0,0 +1,67 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/util/misc'
3
+
4
# Reads the next record from +jochem+: every line up to, but not including,
# the next line that starts with "--" (or up to end of input).
#
# jochem - an IO-like object responding to eof? and gets
#
# Returns the record text as a String. The String is empty when a separator
# line is met before any content (leading or doubled separators). Returns nil
# only when the stream is already exhausted, so nil can safely be used as the
# end-of-input signal in `while chunk = read_chunk(io)`.
def read_chunk(jochem)
  # nil is reserved for "nothing left to read"; previously an empty record in
  # the middle of the stream also produced nil and ended the caller's loop early.
  return nil if jochem.eof?

  chunk = ""
  until jochem.eof?
    line = jochem.gets
    break if line =~ /^--/
    chunk << line
  end
  chunk
end
12
+
13
# Returns the first element of +list+, or nil when +list+ is nil or empty.
def first(list)
  return nil if list.nil? || list.empty?

  list.first
end
17
+
18
# Reads the JoChem archive and writes four TSV files into the current working
# directory: 'identifiers', 'lexicon', 'inchi' and 'definitions'.
#
# The input is consumed record by record through read_chunk. Records that are
# empty, contain a line starting with "#", or contain a line starting with
# "NS " are skipped. Within a record each line is expected to look like
# "<CODE> <value>"; values are grouped by code.
#
# All streams are closed when the method finishes, including on error, so the
# output files are flushed before anything else reads them.
def process_jochem
  jochem = identifiers = lexicon = inchi = definitions = nil

  jochem = Open.open("http://www.biosemantics.org/uploads/file/Jochem/JochemV1_2.zip")

  identifiers = File.open('identifiers', 'w')
  identifiers.puts("#: :namespace=JoChem")
  identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")

  lexicon = File.open('lexicon', 'w')
  lexicon.puts("#: :namespace=JoChem")
  lexicon.puts("#ID\tSynonyms")

  inchi = File.open('inchi', 'w')
  inchi.puts("#: :namespace=JoChem")
  inchi.puts("#ID\tInChi")

  definitions = File.open('definitions', 'w')
  definitions.puts("#: :namespace=JoChem#:type=:list")
  definitions.puts("#ID\tDefinition")

  while chunk = read_chunk(jochem)
    next if chunk.empty? || chunk =~ /^#/ || chunk =~ /^NS /

    info = {}
    chunk.split(/\n/).each do |line|
      # Drop the trailing "\t@match..." annotation before parsing.
      fields = line.sub(/\t@match.*/, '').match(/([A-Z]*) (.*)/)
      # A line without the "<CODE> <value>" shape (e.g. a blank line) used to
      # raise NoMethodError on nil and abort the whole conversion.
      next if fields.nil?

      code, value = fields.values_at 1, 2
      (info[code] ||= []) << value
    end

    id = first(info["ID"])
    na = first(info["NA"])
    df = first(info["DF"])
    tm = info["TM"] || []
    db = info["DB"] || []

    pubc = db.collect{|entry| entry.match(/PUBC_(.*)/) ? $1 : nil}.compact
    drug = db.collect{|entry| entry.match(/DRUG_(.*)/) ? $1 : nil}.compact
    inch = db.collect{|entry| entry.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact

    # Preferred name first, then the synonyms (built without mutating tm).
    names = [na] + tm

    lexicon.puts([id, names * "|"] * "\t")
    identifiers.puts([id, na, pubc * "|", drug * "|"] * "\t")
    inchi.puts([id, inch * "|"] * "\t") if inch.any?
    definitions.puts([id, df] * "\t") unless df.nil?
  end
ensure
  # NOTE(review): Open.open is assumed to return an IO-like object; it is
  # closed only if it responds to close.
  [jochem, identifiers, lexicon, inchi, definitions].each do |io|
    next if io.nil? || !io.respond_to?(:close)
    next if io.respond_to?(:closed?) && io.closed?
    io.close
  end
end
62
+
63
# Any target whose name matches one of the four JoChem output names is built by
# running process_jochem, which writes all four files at once.
# NOTE(review): Misc.in_dir presumably runs the block with the target's
# directory as working directory - confirm against rbbt/util/misc.
rule(/identifiers|lexicon|inchi|definitions/) do |t|
  target_dir = File.dirname(t.name)
  Misc.in_dir(target_dir) { process_jochem }
end
@@ -0,0 +1,79 @@
1
+ require 'nokogiri'
2
+
3
# Helpers for the NCI Pathway Interaction Database XML exports.
module NCI
  # Extracts, for every <Pathway> element, its name and the identifiers of the
  # protein molecules taking part in its interactions.
  #
  # xml    - the XML document as a String (anything Nokogiri::XML accepts)
  # format - value of the Name/@name_type attribute to pick as the molecule
  #          identifier (defaults to "UP")
  #
  # Only <Molecule> elements with molecule_type "protein" that have at least
  # one <Name> of the requested type are considered; the first such name is
  # used. Interaction ids referenced by a pathway but absent from the document
  # are ignored.
  #
  # Returns a Hash: pathway id => [[long name], [[identifier, ...]]].
  # NOTE(review): the identifier list is wrapped in one extra Array, exactly as
  # before; callers serialising with Array#* get it flattened - confirm before
  # changing the shape.
  def self.get_pathways(xml, format = "UP")
    doc = Nokogiri::XML(xml)
    pathways = {}

    molecules = {}
    doc.xpath("//Molecule").each do |molecule|
      molecule_id = molecule.attribute('id').value
      type = molecule.attribute('molecule_type').value
      next unless type == "protein"
      names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
      next if names.empty?
      molecules[molecule_id] = {:xml => molecule, :uniprot => names.first}
    end

    interactions = {}
    doc.xpath("//Interaction").each do |interaction|
      interaction_id = interaction.attribute('id').value
      molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}

      interactions[interaction_id] = {:xml => interaction, :molecule_ids => molecule_ids}
    end

    doc.xpath("//Pathway").each do |pathway|
      # Distinct names throughout: the inner blocks used to take a parameter
      # called `id`, shadowing this one (and, on Ruby 1.8, overwriting it so
      # the pathway was stored under a molecule id).
      pathway_id = pathway.attribute('id').value
      name = pathway.xpath('LongName').first.content

      interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}

      # compact: skip references to interactions missing from the document
      # instead of failing on nil.
      pathway_interactions = interaction_ids.collect{|interaction_id| interactions[interaction_id]}.compact
      pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten

      pathway_identifiers = pathway_molecule_ids.collect do |molecule_id|
        next unless molecules.include? molecule_id
        molecules[molecule_id][:uniprot]
      end

      pathways[pathway_id] = [[name], [pathway_identifiers.compact.uniq]]
    end

    pathways
  end
end
45
+
46
# One Rake file task per NCI PID export. Each row lists: task name, source URL,
# the extra arguments passed to NCI.get_pathways after the XML (none means the
# method's default format), and the name of the identifier column.
[
  ['nature_pathways',   "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz", [],     "UniProt/SwissProt Accession"],
  ['biocarta_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz",           ["LL"], "Entrez Gene ID"],
  ['reactome_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz",           ["UP"], "UniProt/SwissProt Accession"]
].each do |task_name, url, format_args, identifier_field|
  file task_name do |t|
    xml = Open.read(url)
    pathways = NCI.get_pathways(xml, *format_args)

    tsv = TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name", identifier_field])
    Open.write(t.name, tsv.to_s)
  end
end
78
+
79
+
@@ -7,6 +7,8 @@ $taxs = [9606]
7
7
  $scientific_name = "Homo sapiens"
8
8
 
9
9
  $biomart_db = 'hsapiens_gene_ensembl'
10
+ $biomart_db_germline_variation = 'hsapiens_snp'
11
+ $biomart_db_somatic_variation = 'hsapiens_snp_som'
10
12
 
11
13
  $biomart_lexicon = [
12
14
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -15,6 +17,14 @@ $biomart_lexicon = [
15
17
  [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
16
18
  ]
17
19
 
20
+ $biomart_protein_identifiers = [
21
+ [ 'Protein ID', "protein_id" ],
22
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
23
+ [ 'Unigene ID', "unigene" ],
24
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
25
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
26
+ ]
27
+
18
28
  $biomart_identifiers = [
19
29
  [ 'Entrez Gene ID', "entrezgene"],
20
30
  [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
@@ -42,7 +52,7 @@ $biomart_identifiers = [
42
52
  [ 'AFFY HG U95E', 'affy_hg_u95e' ],
43
53
  [ 'AFFY HG U95A', 'affy_hg_u95a' ],
44
54
  [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
45
- [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
55
+ [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
46
56
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
47
57
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
48
58
  [ 'Agilent WholeGenome',"agilent_wholegenome" ],
@@ -52,5 +62,14 @@ $biomart_identifiers = [
52
62
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
53
63
  ]
54
64
 
65
+ $biomart_go= [
66
+ ["GO ID", 'go_id'],
67
+ ["GO Namespace", 'namespace_1003'],
68
+ ]
69
+
70
+ $biomart_pfam= [
71
+ ["Pfam Domain", 'pfam'],
72
+ ]
73
+
55
74
  $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
56
75
  load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -7,6 +7,8 @@ $taxs = [10116]
7
7
  $scientific_name = "Rattus norvegicus"
8
8
 
9
9
  $biomart_db = 'rnorvegicus_gene_ensembl'
10
+ $biomart_db_germline_variation = 'rnorvegicus_snp'
11
+ $biomart_db_somatic_variation = 'rnorvegicus_snp_som'
10
12
 
11
13
  $biomart_lexicon = [
12
14
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -2,8 +2,6 @@ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
2
2
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
3
3
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
4
4
  $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
5
- $biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ]
6
- $biomart_germline_variation_id = ['Variation ID', "external_id" ]
7
5
 
8
6
  $biomart_gene_positions = [
9
7
  ['Chromosome Name','chromosome_name'],
@@ -60,42 +58,6 @@ $biomart_exons = [
60
58
  ['Exon Chr End','exon_chrom_end'],
61
59
  ]
62
60
 
63
- #{{{ Variations
64
-
65
- $biomart_germline_variation_positions = [
66
- ['Chromosome Location (bp)', "chromosome_location" ],
67
- ['SNP Chromosome Strand', "snp_chromosome_strand" ],
68
- ['Transcript location (bp)', "transcript_location" ],
69
- ['Allele', "allele" ],
70
- ['Protein Allele', "peptide_shift" ],
71
- ['CDS Start', "cds_start_2076" ],
72
- ['CDS End', "cds_end_2076" ],
73
- ]
74
-
75
- $biomart_germline_variations = [
76
- $biomart_ensembl_gene,
77
- ['Source', "source_name" ],
78
- ['Validated', "validated" ],
79
- ['Consequence Type', "synonymous_status" ],
80
- ]
81
-
82
- $biomart_somatic_variation_positions = [
83
- ['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
84
- ['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
85
- ['Transcript location (bp)' , "somatic_transcript_location" ] ,
86
- ['Allele' , "somatic_allele" ] ,
87
- ['Protein Allele' , "somatic_peptide_shift" ] ,
88
- ['CDS Start' , "somatic_cds_start_2076" ] ,
89
- ['CDS End' , "somatic_cds_end_2076" ] ,
90
- ]
91
-
92
- $biomart_somatic_variations = [
93
- $biomart_ensembl_gene,
94
- ['Source' , "somatic_source_name" ] ,
95
- ['Validated' , "somatic_validated" ] ,
96
- ['Consequence Type' , "somatic_synonymous_status" ] ,
97
- ]
98
-
99
61
  #{{{ Rules
100
62
 
101
63
  file 'scientific_name' do |t|
@@ -104,15 +66,69 @@ end
104
66
 
105
67
  file 'identifiers' do |t|
106
68
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
69
+
107
70
  $biomart_identifiers.each do |name, key, prefix|
108
71
  if prefix
109
72
  identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
110
73
  end
111
74
  end
112
75
 
76
+ name_pos = identifiers.identify_field "Associated Gene Name"
77
+ entrez2name = Entrez.entrez2name($taxs)
78
+ identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
79
+ names = values[name_pos]
80
+
81
+ matches = entrez.select do |e|
82
+ entrez2name.include? e and (names & entrez2name[e]).any?
83
+ end
84
+
85
+ if matches.any?
86
+ matches
87
+ else
88
+ entrez
89
+ end
90
+ end
91
+
92
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
93
+ entrez_synonyms.key_field = "Entrez Gene ID"
94
+ entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
95
+
96
+ identifiers.attach entrez_synonyms
97
+
98
+ identifiers.each do |key, values|
99
+ values.each do |list|
100
+ list.reject!{|v| v.nil? or v.empty?}
101
+ list.uniq!
102
+ end
103
+ end
104
+
105
+ File.open(t.name, 'w') do |f| f.puts identifiers end
106
+ end
107
+
108
+ file 'lexicon' => 'identifiers' do |t|
109
+ tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
110
+
111
+ entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
112
+ entrez_description.key_field = "Entrez Gene ID"
113
+ entrez_description.fields = ["Entrez Gene Description"]
114
+
115
+ tsv.attach entrez_description
116
+ Open.write(t.name, tsv.to_s)
117
+ end
118
+
119
+
120
+ file 'protein_identifiers' do |t|
121
+ identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_identifiers, [], nil, :namespace => $namespace)
122
+ $biomart_protein_identifiers.each do |name, key, prefix|
123
+ if prefix
124
+ identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
125
+ end
126
+ end
127
+
113
128
  File.open(t.name, 'w') do |f| f.puts identifiers end
114
129
  end
115
130
 
131
+
116
132
  file 'gene_transcripts' do |t|
117
133
  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat, :namespace => $namespace)
118
134
 
@@ -121,7 +137,7 @@ end
121
137
 
122
138
  file 'transcripts' => 'gene_positions' do |t|
123
139
  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
124
- transcripts.attach TSV.new('gene_positions'), "Chromosome Name"
140
+ transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
125
141
 
126
142
  File.open(t.name, 'w') do |f| f.puts transcripts end
127
143
  end
@@ -198,7 +214,7 @@ end
198
214
 
199
215
  file 'exons' => 'gene_positions' do |t|
200
216
  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
201
- exons.attach TSV.new('gene_positions'), "Chromosome Name"
217
+ exons.attach TSV.open('gene_positions'), "Chromosome Name"
202
218
 
203
219
  File.open(t.name, 'w') do |f| f.puts exons end
204
220
  end
@@ -227,28 +243,21 @@ file 'transcript_sequence' do |t|
227
243
  end
228
244
 
229
245
 
230
- $biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,COMPLEX_INDEL&NMD_TRANSCRIPT,COMPLEX_INDEL&SPLICE_SITE,ESSENTIAL_SPLICE_SITE&INTRONIC,ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,FRAMESHIFT_CODING,FRAMESHIFT_CODING&NMD_TRANSCRIPT,FRAMESHIFT_CODING&SPLICE_SITE,FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING&SPLICE_SITE,NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT,REGULATORY_REGION,SPLICE_SITE&3PRIME_UTR,SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&5PRIME_UTR,SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&INTRONIC,SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,SPLICE_SITE&SYNONYMOUS_CODING,SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT,STOP_GAINED,STOP_GAINED&FRAMESHIFT_CODING,STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT,STOP_GAINED&NMD_TRANSCRIPT,STOP_GAINED&SPLICE_SITE,STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT,STOP_LOST,STOP_LOST&NMD_TRANSCRIPT,STOP_LOST&SPLICE_SITE,STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT,SYNONYMOUS_CODING,SYNONYMOUS_CODING&NMD_TRANSCRIPT"]
231
- #$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
232
- $biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
246
+ #{{{ Variations
233
247
 
234
- file 'germline_variations' do |t|
235
- variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
236
- end
248
+ $biomart_variation_id = ["SNP ID", "refsnp_id"]
249
+ $biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
237
250
 
238
- file 'germline_variation_positions' do |t|
239
- variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
240
- File.open(t.name, 'w') do |f| f.puts variations.to_s end
251
+ file 'germline_variations' do |t|
252
+ BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
241
253
  end
242
254
 
243
255
  file 'somatic_variations' do |t|
244
- variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
245
- File.open(t.name, 'w') do |f| f.puts variations.to_s end
256
+ BioMart.tsv($biomart_db_somatic_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
246
257
  end
247
258
 
248
- file 'somatic_variation_positions' do |t|
249
- variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
250
- File.open(t.name, 'w') do |f| f.puts variations.to_s end
251
- end
259
+
260
+ # {{{ Other info
252
261
 
253
262
  file 'gene_pmids' do |t|
254
263
  tsv = Entrez.entrez2pubmed($taxs)
@@ -260,47 +269,95 @@ file 'gene_pmids' do |t|
260
269
  Open.write(t.name, text)
261
270
  end
262
271
 
263
- file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
264
- require 'rbbt/sources/organism/sequence'
272
+ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
273
+ transcripts = begin
274
+ exon_transcripts[exon].first
275
+ rescue
276
+ []
277
+ end
265
278
 
266
- exons = TSV.new('exons', :persistence => true)
267
- exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
268
- gene_transcripts = TSV.new('gene_transcripts', :flat, :persistence => true )
269
- transcript_info = TSV.new('transcripts', :list, :persistence => true )
270
- transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
279
+ transcripts.select{|transcript| transcript_info[transcript].first.any?}
280
+ end
271
281
 
282
+ def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
283
+ sizes = [0]
284
+ rank = nil
285
+ start_pos = exons.identify_field "Exon Chr Start"
286
+ end_pos = exons.identify_field "Exon Chr End"
287
+
288
+ Misc.zip_fields(transcript_exons[transcript]).each do |_exon, _rank|
289
+ _rank = _rank.to_i
290
+ s, e = exons[_exon].values_at(start_pos, end_pos)
291
+ size = e.to_i - s.to_i + 1
292
+ sizes[_rank] = size
293
+ rank = _rank if _exon == exon
294
+ end
295
+
296
+ if not rank.nil?
297
+ sizes[0..rank - 1].inject(0){|e,acc| acc += e}
298
+ else
299
+ nil
300
+ end
301
+ end
302
+
303
+ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
304
+ exons = TSV.open('exons')
305
+ exon_transcripts = nil
306
+ exon_transcripts = TSV.open('transcript_exons', :double, :key_field => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true)
307
+ gene_transcripts = TSV.open('gene_transcripts', :flat)
308
+ transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
309
+ transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
272
310
 
273
311
  string = "#: :namespace=#{$namespace}"
274
312
  string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
275
- exons.each do |exon, info|
276
- gene, start, finish, strand, chr = info
277
313
 
278
- transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
314
+ exons.unnamed = true
315
+ exon_transcripts.unnamed = true
316
+ gene_transcripts.unnamed = true
317
+ transcript_info.unnamed = true
318
+ transcript_exons.unnamed = true
319
+
320
+ exons.monitor = true
321
+ Misc.profile do
322
+ exons.through do |exon, info|
323
+ gene, start, finish, strand, chr = info
279
324
 
280
- transcript_offsets = {}
281
- transcripts.each do |transcript|
282
- offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
283
- transcript_offsets[transcript] = offset unless offset.nil?
325
+ transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
326
+
327
+ transcript_offsets = {}
328
+ transcripts.each do |transcript|
329
+ offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
330
+ transcript_offsets[transcript] = offset unless offset.nil?
331
+ end
332
+
333
+ string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
284
334
  end
285
-
286
- string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
287
335
  end
288
336
 
289
337
  Open.write(t.name, string)
290
338
  end
291
339
 
340
+ file 'gene_go' do |t|
341
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
342
+
343
+ File.open(t.name, 'w') do |f| f.puts goterms end
344
+ end
345
+
346
+
347
+ file 'gene_pfam' do |t|
348
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
349
+
350
+ File.open(t.name, 'w') do |f| f.puts goterms end
351
+ end
352
+
353
+
292
354
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
293
355
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
294
356
  archive = $1
295
357
  task = $2
296
- old_pwd = FileUtils.pwd
297
- begin
298
- FileUtils.mkdir archive unless File.exists? archive
299
- FileUtils.cd File.join(archive)
358
+ Misc.in_dir(archive) do
300
359
  BioMart.set_archive archive
301
360
  Rake::Task[task].invoke
302
361
  BioMart.unset_archive
303
- ensure
304
- FileUtils.cd old_pwd
305
362
  end
306
363
  end