rbbt-sources 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,64 +1,65 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/resource'
3
- require 'nokogiri'
3
+ require 'spreadsheet'
4
4
 
5
5
  module TFacts
6
6
  extend Resource
7
7
  self.subdir = "share/databases/TF"
8
8
 
9
- def self.targets_for_gene_unsigned(gene_name)
10
- doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
9
+ TFacts.claim TFacts.source["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
11
10
 
12
- doc.css("td a").collect{|link| link.content.strip}
13
- end
11
+ TFacts.claim TFacts.targets, :proc do
12
+ book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
13
+ sheet = book.worksheet 0
14
14
 
15
- def self.targets_for_gene_signed(gene_name)
16
- doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
15
+ tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name"], :namespace => "Hsa", :type => :flat)
16
+ sheet.each do |row|
17
+ target, tf = row.values_at 0, 1
18
+ tsv[target] ||= []
19
+ tsv[target] << tf
20
+ end
17
21
 
18
- rows = doc.css("tr")
19
- rows.shift
20
- targets = {}
21
- rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
22
- targets
22
+ tsv.to_s
23
23
  end
24
24
 
25
- def self.known_transcription_factors_signed
26
- Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
27
- end
25
+ TFacts.claim TFacts.targets_signed, :proc do
26
+ book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
27
+ sheet = book.worksheet 0
28
28
 
29
- def self.known_transcription_factors_unsigned
30
- Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
31
- end
29
+ tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name", "Sign"], :namespace => "Hsa", :type => :double)
30
+ sheet.each do |row|
31
+ target, tf, sign = row.values_at 0, 1, 2
32
+ tsv[target] ||= [[],[]]
33
+ tsv[target][0] << tf
34
+ tsv[target][1] << sign
35
+ end
32
36
 
33
- TFacts.claim TFacts.targets, :proc do
34
- tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
35
- TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
36
37
  tsv.to_s
37
38
  end
38
39
 
39
- TFacts.claim TFacts.targets_signed, :proc do
40
- tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
41
- Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
42
- tsv[tf] = [targets.keys, targets.values]
43
- end
44
- tsv.to_s
40
+ TFacts.claim TFacts.regulators, :proc do
41
+ TFacts.targets.tsv.reorder("Transcription Factor Associated Gene Name").to_s
45
42
  end
43
+
46
44
  end
47
45
 
48
46
  if defined? Entity and defined? Gene and Entity === Gene
49
47
 
50
48
  module Gene
51
49
  property :is_transcription_factor? => :array2single do
52
- @is_trasncription_factor ||= begin
53
- tfs = TFacts.targets.keys
54
- self.name.collect{|gene| tfs.include? gene}
55
- end
50
+ tfs = TFacts.targets.keys
51
+ self.name.collect{|gene| tfs.include? gene}
52
+ end
53
+ persist :_ary_is_transcription_factor?
54
+
55
+ property :transcription_regulators => :array2single do
56
+ Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
56
57
  end
58
+ persist :_ary_transcription_regulators
57
59
 
58
- property :transcription_targets => :array2single do
59
- @transcription_targets ||= begin
60
- Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
61
- end
62
- end
60
+ property :transcription_targets => :array2single do
61
+ Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
62
+ end
63
+ persist :_ary_transcription_targets
63
64
  end
64
65
  end
@@ -3,34 +3,32 @@ require 'rbbt/resource'
3
3
  require 'rbbt/sources/cath'
4
4
  require 'rbbt/sources/uniprot'
5
5
 
6
- module Uniprot
6
+ module UniProt
7
7
  extend Resource
8
- self.subdir = "share/databases/Uniprot"
8
+ self.subdir = "share/databases/UniProt"
9
9
 
10
- Uniprot.claim Uniprot.annotated_variants, :proc do
10
+ UniProt.claim UniProt.annotated_variants, :proc do
11
11
  url = "http://www.uniprot.org/docs/humsavar.txt"
12
12
  tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
13
- :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
14
- :fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
13
+ :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[1..5] + [(parts[6..-1] || []) * " "]) * "\t"},
14
+ :type => :double,
15
+ :merge => true,
16
+ :key_field => "UniProt/SwissProt Accession",
17
+ :fields => ["UniProt Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
15
18
 
16
19
  tsv.unnamed = true
17
- tsv.process "Amino Acid Mutation" do |mutation|
18
- if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
19
- wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
20
- mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
21
- [wt, $2, mut] * ""
22
- else
23
- mutation
20
+ tsv.process "Amino Acid Mutation" do |mutations|
21
+ mutations.collect do |mutation|
22
+ if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
23
+ wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
24
+ mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
25
+ [wt, $2, mut] * ""
26
+ else
27
+ mutation
28
+ end
24
29
  end
25
30
  end
26
-
27
- uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
28
- mutation_pos = tsv.identify_field "Amino Acid Mutation"
29
- tsv.add_field "Mutated Isoform" do |key, values|
30
- [values[uniprot_pos], values[mutation_pos]] * ":"
31
- end
32
-
33
- tsv.reorder("Mutated Isoform").to_s
31
+ tsv.to_s
34
32
  end
35
33
 
36
34
 
@@ -44,7 +42,12 @@ module Uniprot
44
42
  text.split(/\n/).each{|l|
45
43
  next unless l =~ /^DR\s+PDB; (.*)\./
46
44
  id, method, resolution, region = $1.split(";").collect{|v| v.strip}
47
- chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
45
+ begin
46
+ chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
47
+ rescue
48
+ Log.warn("Error process Uniprot PDB line: #{line}")
49
+ next
50
+ end
48
51
  pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
49
52
  }
50
53
  pdb
@@ -96,7 +99,6 @@ module Uniprot
96
99
  variants
97
100
  end
98
101
 
99
-
100
102
  def self.cath(protein)
101
103
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
102
104
  text = Open.read(url)
@@ -118,7 +120,7 @@ module Uniprot
118
120
  end
119
121
 
120
122
  def self.pdbs_covering_aa_position(protein, aa_position)
121
- Uniprot.pdbs(protein).select do |pdb, info|
123
+ UniProt.pdbs(protein).select do |pdb, info|
122
124
  info[:region].include? aa_position
123
125
  end
124
126
  end
@@ -0,0 +1,18 @@
1
+ #: :type=:single
2
+ #Release build
3
+ current jul2012
4
+ release-68 jul2012
5
+ release-67 may2012
6
+ release-66 feb2012
7
+ release-65 dec2011
8
+ release-64 sep2011
9
+ release-63 jun2011
10
+ release-62 apr2011
11
+ release-61 feb2011
12
+ release-60 nov2010
13
+ release-59 aug2010
14
+ release-58 may2010
15
+ release-57 mar2010
16
+ release-56 sep2009
17
+ release-55 jul2009
18
+ release-54 may2009
@@ -0,0 +1,15 @@
1
+
2
+
3
+ rule /(.+)/ do |t|
4
+ require 'net/ftp'
5
+ chromosome = File.basename(t.name)
6
+
7
+ ftp = Genomes1000::URL
8
+ ftp = Net::FTP.new(Genomes1000::FTP_SERVER)
9
+ ftp.login
10
+ ftp.chdir(Genomes1000::FTP_PATH)
11
+ file = ftp.list("*.chr" + chromosome + ".*").collect{|l| l.split(" ").last}.last
12
+ ddd file
13
+ exit
14
+
15
+ end
@@ -3,7 +3,7 @@ require 'rbbt/util/misc'
3
3
 
4
4
  def read_chunk(jochem)
5
5
  chunk = ""
6
- while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
6
+ while (not jochem.eof? and not (line = Misc.fixutf8(jochem.gets)).match(/^--/))
7
7
  chunk << line
8
8
  end
9
9
  return nil if chunk.empty?
@@ -20,7 +20,7 @@ def process_jochem
20
20
  identifiers = File.open('identifiers', 'w')
21
21
 
22
22
  identifiers.puts("#: :namespace=JoChem")
23
- identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
23
+ identifiers.puts("#ID\tCompound Name\tPubChem:Substance ID\tPubChem:Coumpound ID\tDrugBank:ID\tChemIDplus:ID\tCAS:ID\tMeSH:Term\tChEBI:ID\tHMDB:ID\tKEGG:Coumpound ID\tKEGG:Drug ID")
24
24
 
25
25
  lexicon = File.open('lexicon', 'w')
26
26
  lexicon.puts("#: :namespace=JoChem")
@@ -49,12 +49,20 @@ def process_jochem
49
49
  tm = info["TM"] || []
50
50
  db = info["DB"] || []
51
51
 
52
+ cheb = db.collect{|code| code.match(/CHEB_(.*)/) ? $1 : nil}.compact
53
+ chid = db.collect{|code| code.match(/CHID_(.*)/) ? $1 : nil}.compact
52
54
  pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
55
+ pubs = db.collect{|code| code.match(/PUBS_(.*)/) ? $1 : nil}.compact
53
56
  drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
57
+ cas = db.collect{|code| code.match(/CAS_(.*)/) ? $1 : nil}.compact
58
+ mesh = db.collect{|code| code.match(/MESH_(.*)/) ? $1 : nil}.compact
59
+ hmdb = db.collect{|code| code.match(/HMDB_(.*)/) ? $1 : nil}.compact
60
+ kegg = db.collect{|code| code.match(/KEGG_(.*)/) ? $1 : nil}.compact
61
+ kegd = db.collect{|code| code.match(/KEGD_(.*)/) ? $1 : nil}.compact
54
62
  inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
55
63
 
56
64
  lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
57
- identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
65
+ identifiers.puts [id, na, pubs * "|", pubc * "|", drug * "|", chid * "|", cas * "|", mesh * "|", cheb * "|", hmdb * "|", kegg * "|", kegd * "|" ] * "\t"
58
66
  inchi.puts [id, inch * "|"] * "\t" if inch.any?
59
67
  definitions.puts [id, df] * "\t" unless df.nil?
60
68
  end
@@ -1,7 +1,8 @@
1
1
  require 'nokogiri'
2
+ require 'rbbt-util'
2
3
 
3
4
  module NCI
4
- def self.get_pathways(xml, format = "UP")
5
+ def self.get_pathways(xml, format = "UP", get_short_name = false)
5
6
  doc = Nokogiri::XML(xml)
6
7
  pathways = {}
7
8
 
@@ -9,36 +10,60 @@ module NCI
9
10
  doc.xpath("//Molecule").each do |molecule|
10
11
  id = molecule.attribute('id').value
11
12
  type = molecule.attribute('molecule_type').value
12
- next unless type == "protein"
13
13
  names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
14
- next if names.empty?
15
- molecules[id] = {:xml => molecule, :uniprot => names.first}
14
+ molecules[id] = {:xml => molecule, :proteins => names}
16
15
  end
17
16
 
17
+
18
18
  interactions = {}
19
19
  doc.xpath("//Interaction").each do |interaction|
20
20
  id = interaction.attribute('id').value
21
- molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}
21
+ type = interaction.attribute('interaction_type').value
22
+ molecule_ids = interaction.xpath('InteractionComponentList/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}.flatten.compact
23
+ pathway_ids = interaction.xpath('Abstraction').collect{|c| c.attribute('pathway_idref').value}.flatten.compact
22
24
 
23
- interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
25
+ interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids, :pathway_ids => pathway_ids}
24
26
  end
25
27
 
26
28
  doc.xpath("//Pathway").each do |pathway|
27
29
  id = pathway.attribute('id').value
28
30
  subnet = pathway.attribute('subnet').value
29
31
  name = pathway.xpath('LongName').first.content
32
+ short_name = pathway.xpath('ShortName').first.content if get_short_name
30
33
 
31
- interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
34
+ interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
32
35
 
33
- pathway_interactions = interaction_ids.collect{|id| interactions[id]}
34
- pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
36
+ pathway_interactions = interaction_ids.collect{|i| interactions[i]}
37
+ pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.compact.flatten
35
38
 
36
- pathway_uniprot_ids = pathway_molecule_ids.collect do |id|
37
- next unless molecules.include? id
38
- molecules[id][:uniprot]
39
+ pathway_uniprot_ids = pathway_molecule_ids.collect do |i|
40
+ next unless molecules.include? i
41
+ molecules[i][:proteins]
42
+ end
43
+ if get_short_name
44
+ pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq], [short_name]]
45
+ else
46
+ pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq]]
39
47
  end
40
- pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
41
48
  end
49
+
50
+ doc.xpath("//Pathway").each do |pathway|
51
+ id = pathway.attribute('id').value
52
+ subnet = pathway.attribute('subnet').value
53
+ name = pathway.xpath('LongName').first.content
54
+
55
+ interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
56
+
57
+ pathway_interactions = interaction_ids.collect{|i| interactions[i]}
58
+ pathway_subnet_ids = pathway_interactions.collect{|info| info[:pathway_ids]}.compact.flatten
59
+
60
+ pathway_subnet_ids.collect do |nid|
61
+ next unless pathways.include? nid
62
+ new_genes = pathways[id].last
63
+ pathways[nid][1] = (pathways[nid][1] + new_genes).uniq
64
+ end
65
+ end
66
+
42
67
  pathways
43
68
  end
44
69
  end
@@ -60,11 +85,25 @@ file 'biocarta_pathways' do |t|
60
85
 
61
86
  xml = Open.read(url)
62
87
 
63
- pathways = NCI.get_pathways(xml, "LL")
88
+ pathways = NCI.get_pathways(xml, "LL", true)
64
89
 
65
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
90
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID", "Pathway Short Name"]).to_s)
66
91
  end
67
92
 
93
+ file 'biocarta_pathways_fixed_ids' => 'biocarta_pathways' do |t|
94
+ orig = TSV.open(Open.open(t.prerequisites.first))
95
+ tsv = TSV.setup({}, :type => :double, :key_field => "BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID"])
96
+
97
+ orig.through do |key, values|
98
+ name, genes, short = values
99
+ code = "h_" + short.first
100
+ tsv[code] = [name, genes]
101
+ end
102
+
103
+ Open.write(t.name, tsv.to_s)
104
+ end
105
+
106
+
68
107
  file 'reactome_pathways' do |t|
69
108
 
70
109
  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
@@ -76,4 +115,3 @@ file 'reactome_pathways' do |t|
76
115
  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
77
116
  end
78
117
 
79
-
@@ -43,7 +43,7 @@ $biomart_probe_identifiers = [
43
43
  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
44
44
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
45
45
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
46
- [ 'Agilent WholeGenome',"agilent_wholegenome" ],
46
+ #[ 'Agilent WholeGenome',"agilent_wholegenome" ],
47
47
  [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
48
48
  [ 'Codelink ID', 'codelink' ],
49
49
  [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
@@ -62,6 +62,7 @@ $biomart_identifiers = [
62
62
  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
63
63
  [ 'HGNC ID', "hgnc_id", 'HGNC'],
64
64
  [ 'EMBL (Genbank) ID' , "embl"] ,
65
+ [ 'RefSeq mRNA' , "refseq_mrna"] ,
65
66
 
66
67
  # Probes
67
68
  [ 'AFFY HC G110', 'affy_hc_g110' ],
@@ -80,7 +81,7 @@ $biomart_identifiers = [
80
81
  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
81
82
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
82
83
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
83
- [ 'Agilent WholeGenome',"agilent_wholegenome" ],
84
+ #[ 'Agilent WholeGenome',"agilent_wholegenome" ],
84
85
  [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
85
86
  [ 'Codelink ID', 'codelink' ],
86
87
  [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
@@ -19,12 +19,12 @@ $biomart_lexicon = [
19
19
  ]
20
20
 
21
21
  $biomart_identifiers = [
22
+ ['Entrez Gene ID', "entrezgene"],
22
23
  ['Associated Gene Name' , "external_gene_id"],
23
24
  ['Protein ID' , "protein_id"] ,
24
25
  ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
25
26
  ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
26
27
  ['RefSeq Protein ID' , "refseq_peptide"] ,
27
- ['RefSeq DNA ID' , "refseq_dna"] ,
28
28
  ['EMBL (Genbank) ID' , "embl"] ,
29
29
  ['RGD ID' , "rgd"] ,
30
30
  ['RGD Symbol' , "rgd_symbol"] ,
@@ -39,7 +39,6 @@ $biomart_identifiers = [
39
39
  ['Affy rg u34c', "affy_rg_u34c"],
40
40
  ['Affy rn u34', "affy_rn_u34"],
41
41
  ['Affy rt u34', "affy_rt_u34"],
42
- ['Agilent WholeGenome',"agilent_wholegenome" ],
43
42
  ['Codelink ID ', "codelink"],
44
43
  ]
45
44
 
@@ -4,50 +4,48 @@ require 'rbbt/sources/entrez'
4
4
  require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
6
  $taxs = [559292,4932]
7
- $native = "SGD ID"
8
- $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
9
- $biomart_db = 'scerevisiae_gene_ensembl'
10
- $biomart_main = ['Entrez Gene ID', 'entrezgene']
11
- $ortholog_key = "yeast_ensembl_gene"
12
-
13
-
14
- file 'scientific_name' do |t|
15
- File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
16
- end
17
-
18
- file 'lexicon' do |t|
19
- lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
20
-
21
- lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
22
-
23
- lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
24
-
25
- lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
7
+ $scientific_name = "Saccharomyces cerevisiae"
8
+ #$ortholog_key = "yeast_ensembl_gene"
26
9
 
27
- File.open(t.name, 'w') do |f| f.puts lexicon end
28
- end
29
-
30
- file 'identifiers' do |t|
31
- identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
32
-
33
- identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
34
-
35
- identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
36
- [['Associated Gene Name' , "external_gene_id"],
37
- ['Ensembl Gene ID', "ensembl_gene_id" ],
38
- ['Ensembl Protein ID', "ensembl_peptide_id" ],
39
- ['RefSeq Protein ID' , "refseq_peptide"] ,
40
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
41
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
42
- ['Protein ID' , "protein_id"] ,
43
- ['EMBL (Genbank) ID' , "embl"] ,
44
- # Affymetrix
45
- ['Affy yeast 2',"affy_yeast_2"],
46
- ['Affy yg s98', "affy_yg_s98"]])
47
-
48
- File.open(t.name, 'w') do |f| f.puts identifiers end
49
- end
50
-
51
-
52
- task :default => ['name', 'lexicon', 'identifiers']
10
+ $biomart_db = 'scerevisiae_gene_ensembl'
53
11
 
12
+ $biomart_lexicon = [
13
+ [ 'Associated Gene Name' , "external_gene_id"],
14
+ ]
15
+
16
+ $biomart_protein_identifiers = [
17
+ [ 'Protein ID', "protein_id" ],
18
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
19
+ [ 'Unigene ID', "unigene" ],
20
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
21
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
22
+ ]
23
+
24
+ $biomart_probe_identifiers = [
25
+ ]
26
+
27
+ $biomart_identifiers = [
28
+ [ 'Entrez Gene ID', "entrezgene"],
29
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
30
+ [ 'Associated Gene Name', "external_gene_id" ],
31
+ [ 'Protein ID', "protein_id" ],
32
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
33
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
34
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
35
+ [ 'EMBL (Genbank) ID' , "embl"] ,
36
+ [ 'RefSeq mRNA' , "refseq_mrna"] ,
37
+ ]
38
+
39
+ $biomart_go= [
40
+ ["GO ID", 'go_id'],
41
+ ["GO Namespace", 'namespace_1003'],
42
+ ]
43
+
44
+ $biomart_go_2009= [
45
+ ["GO BP ID", 'go_biological_process_id'],
46
+ ["GO MF ID", 'go_molecular_function_id'],
47
+ ["GO CC ID", 'go_cellular_component_id'],
48
+ ]
49
+
50
+ $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
51
+ load File.join(File.dirname(__FILE__), '../organism_helpers.rb')