rbbt-sources 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,64 +1,65 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/resource'
3
- require 'nokogiri'
3
+ require 'spreadsheet'
4
4
 
5
5
  module TFacts
6
6
  extend Resource
7
7
  self.subdir = "share/databases/TF"
8
8
 
9
- def self.targets_for_gene_unsigned(gene_name)
10
- doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
9
+ TFacts.claim TFacts.source["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
11
10
 
12
- doc.css("td a").collect{|link| link.content.strip}
13
- end
11
+ TFacts.claim TFacts.targets, :proc do
12
+ book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
13
+ sheet = book.worksheet 0
14
14
 
15
- def self.targets_for_gene_signed(gene_name)
16
- doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
15
+ tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name"], :namespace => "Hsa", :type => :flat)
16
+ sheet.each do |row|
17
+ target, tf = row.values_at 0, 1
18
+ tsv[target] ||= []
19
+ tsv[target] << tf
20
+ end
17
21
 
18
- rows = doc.css("tr")
19
- rows.shift
20
- targets = {}
21
- rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
22
- targets
22
+ tsv.to_s
23
23
  end
24
24
 
25
- def self.known_transcription_factors_signed
26
- Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
27
- end
25
+ TFacts.claim TFacts.targets_signed, :proc do
26
+ book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
27
+ sheet = book.worksheet 0
28
28
 
29
- def self.known_transcription_factors_unsigned
30
- Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
31
- end
29
+ tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name", "Sign"], :namespace => "Hsa", :type => :double)
30
+ sheet.each do |row|
31
+ target, tf, sign = row.values_at 0, 1, 2
32
+ tsv[target] ||= [[],[]]
33
+ tsv[target][0] << tf
34
+ tsv[target][1] << sign
35
+ end
32
36
 
33
- TFacts.claim TFacts.targets, :proc do
34
- tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
35
- TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
36
37
  tsv.to_s
37
38
  end
38
39
 
39
- TFacts.claim TFacts.targets_signed, :proc do
40
- tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
41
- Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
42
- tsv[tf] = [targets.keys, targets.values]
43
- end
44
- tsv.to_s
40
+ TFacts.claim TFacts.regulators, :proc do
41
+ TFacts.targets.tsv.reorder("Transcription Factor Associated Gene Name").to_s
45
42
  end
43
+
46
44
  end
47
45
 
48
46
  if defined? Entity and defined? Gene and Entity === Gene
49
47
 
50
48
  module Gene
51
49
  property :is_transcription_factor? => :array2single do
52
- @is_trasncription_factor ||= begin
53
- tfs = TFacts.targets.keys
54
- self.name.collect{|gene| tfs.include? gene}
55
- end
50
+ tfs = TFacts.targets.keys
51
+ self.name.collect{|gene| tfs.include? gene}
52
+ end
53
+ persist :_ary_is_transcription_factor?
54
+
55
+ property :transcription_regulators => :array2single do
56
+ Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
56
57
  end
58
+ persist :_ary_transcription_regulators
57
59
 
58
- property :transcription_targets => :array2single do
59
- @transcription_targets ||= begin
60
- Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
61
- end
62
- end
60
+ property :transcription_targets => :array2single do
61
+ Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
62
+ end
63
+ persist :_ary_transcription_targets
63
64
  end
64
65
  end
@@ -3,34 +3,32 @@ require 'rbbt/resource'
3
3
  require 'rbbt/sources/cath'
4
4
  require 'rbbt/sources/uniprot'
5
5
 
6
- module Uniprot
6
+ module UniProt
7
7
  extend Resource
8
- self.subdir = "share/databases/Uniprot"
8
+ self.subdir = "share/databases/UniProt"
9
9
 
10
- Uniprot.claim Uniprot.annotated_variants, :proc do
10
+ UniProt.claim UniProt.annotated_variants, :proc do
11
11
  url = "http://www.uniprot.org/docs/humsavar.txt"
12
12
  tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
13
- :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[0..5] + [(parts[6..-1] || []) * " "]) * "\t"}, :type => :list,:key_field => "Associated Gene Name",
14
- :fields => ["Uniprot/SwissProt Accession", "Uniprot Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
13
+ :fix => Proc.new{|line| parts = line.split(/\s+/); (parts[1..5] + [(parts[6..-1] || []) * " "]) * "\t"},
14
+ :type => :double,
15
+ :merge => true,
16
+ :key_field => "UniProt/SwissProt Accession",
17
+ :fields => ["UniProt Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
15
18
 
16
19
  tsv.unnamed = true
17
- tsv.process "Amino Acid Mutation" do |mutation|
18
- if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
19
- wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
20
- mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
21
- [wt, $2, mut] * ""
22
- else
23
- mutation
20
+ tsv.process "Amino Acid Mutation" do |mutations|
21
+ mutations.collect do |mutation|
22
+ if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
23
+ wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
24
+ mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
25
+ [wt, $2, mut] * ""
26
+ else
27
+ mutation
28
+ end
24
29
  end
25
30
  end
26
-
27
- uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
28
- mutation_pos = tsv.identify_field "Amino Acid Mutation"
29
- tsv.add_field "Mutated Isoform" do |key, values|
30
- [values[uniprot_pos], values[mutation_pos]] * ":"
31
- end
32
-
33
- tsv.reorder("Mutated Isoform").to_s
31
+ tsv.to_s
34
32
  end
35
33
 
36
34
 
@@ -44,7 +42,12 @@ module Uniprot
44
42
  text.split(/\n/).each{|l|
45
43
  next unless l =~ /^DR\s+PDB; (.*)\./
46
44
  id, method, resolution, region = $1.split(";").collect{|v| v.strip}
47
- chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
45
+ begin
46
+ chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
47
+ rescue
48
+ Log.warn("Error process Uniprot PDB line: #{line}")
49
+ next
50
+ end
48
51
  pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
49
52
  }
50
53
  pdb
@@ -96,7 +99,6 @@ module Uniprot
96
99
  variants
97
100
  end
98
101
 
99
-
100
102
  def self.cath(protein)
101
103
  url = UNIPROT_TEXT.sub "[PROTEIN]", protein
102
104
  text = Open.read(url)
@@ -118,7 +120,7 @@ module Uniprot
118
120
  end
119
121
 
120
122
  def self.pdbs_covering_aa_position(protein, aa_position)
121
- Uniprot.pdbs(protein).select do |pdb, info|
123
+ UniProt.pdbs(protein).select do |pdb, info|
122
124
  info[:region].include? aa_position
123
125
  end
124
126
  end
@@ -0,0 +1,18 @@
1
+ #: :type=:single
2
+ #Release build
3
+ current jul2012
4
+ release-68 jul2012
5
+ release-67 may2012
6
+ release-66 feb2012
7
+ release-65 dec2011
8
+ release-64 sep2011
9
+ release-63 jun2011
10
+ release-62 apr2011
11
+ release-61 feb2011
12
+ release-60 nov2010
13
+ release-59 aug2010
14
+ release-58 may2010
15
+ release-57 mar2010
16
+ release-56 sep2009
17
+ release-55 jul2009
18
+ release-54 may2009
@@ -0,0 +1,15 @@
1
+
2
+
3
+ rule /(.+)/ do |t|
4
+ require 'net/ftp'
5
+ chromosome = File.basename(t.name)
6
+
7
+ ftp = Genomes1000::URL
8
+ ftp = Net::FTP.new(Genomes1000::FTP_SERVER)
9
+ ftp.login
10
+ ftp.chdir(Genomes1000::FTP_PATH)
11
+ file = ftp.list("*.chr" + chromosome + ".*").collect{|l| l.split(" ").last}.last
12
+ ddd file
13
+ exit
14
+
15
+ end
@@ -3,7 +3,7 @@ require 'rbbt/util/misc'
3
3
 
4
4
  def read_chunk(jochem)
5
5
  chunk = ""
6
- while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
6
+ while (not jochem.eof? and not (line = Misc.fixutf8(jochem.gets)).match(/^--/))
7
7
  chunk << line
8
8
  end
9
9
  return nil if chunk.empty?
@@ -20,7 +20,7 @@ def process_jochem
20
20
  identifiers = File.open('identifiers', 'w')
21
21
 
22
22
  identifiers.puts("#: :namespace=JoChem")
23
- identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
23
+ identifiers.puts("#ID\tCompound Name\tPubChem:Substance ID\tPubChem:Coumpound ID\tDrugBank:ID\tChemIDplus:ID\tCAS:ID\tMeSH:Term\tChEBI:ID\tHMDB:ID\tKEGG:Coumpound ID\tKEGG:Drug ID")
24
24
 
25
25
  lexicon = File.open('lexicon', 'w')
26
26
  lexicon.puts("#: :namespace=JoChem")
@@ -49,12 +49,20 @@ def process_jochem
49
49
  tm = info["TM"] || []
50
50
  db = info["DB"] || []
51
51
 
52
+ cheb = db.collect{|code| code.match(/CHEB_(.*)/) ? $1 : nil}.compact
53
+ chid = db.collect{|code| code.match(/CHID_(.*)/) ? $1 : nil}.compact
52
54
  pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
55
+ pubs = db.collect{|code| code.match(/PUBS_(.*)/) ? $1 : nil}.compact
53
56
  drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
57
+ cas = db.collect{|code| code.match(/CAS_(.*)/) ? $1 : nil}.compact
58
+ mesh = db.collect{|code| code.match(/MESH_(.*)/) ? $1 : nil}.compact
59
+ hmdb = db.collect{|code| code.match(/HMDB_(.*)/) ? $1 : nil}.compact
60
+ kegg = db.collect{|code| code.match(/KEGG_(.*)/) ? $1 : nil}.compact
61
+ kegd = db.collect{|code| code.match(/KEGD_(.*)/) ? $1 : nil}.compact
54
62
  inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
55
63
 
56
64
  lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
57
- identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
65
+ identifiers.puts [id, na, pubs * "|", pubc * "|", drug * "|", chid * "|", cas * "|", mesh * "|", cheb * "|", hmdb * "|", kegg * "|", kegd * "|" ] * "\t"
58
66
  inchi.puts [id, inch * "|"] * "\t" if inch.any?
59
67
  definitions.puts [id, df] * "\t" unless df.nil?
60
68
  end
@@ -1,7 +1,8 @@
1
1
  require 'nokogiri'
2
+ require 'rbbt-util'
2
3
 
3
4
  module NCI
4
- def self.get_pathways(xml, format = "UP")
5
+ def self.get_pathways(xml, format = "UP", get_short_name = false)
5
6
  doc = Nokogiri::XML(xml)
6
7
  pathways = {}
7
8
 
@@ -9,36 +10,60 @@ module NCI
9
10
  doc.xpath("//Molecule").each do |molecule|
10
11
  id = molecule.attribute('id').value
11
12
  type = molecule.attribute('molecule_type').value
12
- next unless type == "protein"
13
13
  names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
14
- next if names.empty?
15
- molecules[id] = {:xml => molecule, :uniprot => names.first}
14
+ molecules[id] = {:xml => molecule, :proteins => names}
16
15
  end
17
16
 
17
+
18
18
  interactions = {}
19
19
  doc.xpath("//Interaction").each do |interaction|
20
20
  id = interaction.attribute('id').value
21
- molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}
21
+ type = interaction.attribute('interaction_type').value
22
+ molecule_ids = interaction.xpath('InteractionComponentList/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}.flatten.compact
23
+ pathway_ids = interaction.xpath('Abstraction').collect{|c| c.attribute('pathway_idref').value}.flatten.compact
22
24
 
23
- interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
25
+ interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids, :pathway_ids => pathway_ids}
24
26
  end
25
27
 
26
28
  doc.xpath("//Pathway").each do |pathway|
27
29
  id = pathway.attribute('id').value
28
30
  subnet = pathway.attribute('subnet').value
29
31
  name = pathway.xpath('LongName').first.content
32
+ short_name = pathway.xpath('ShortName').first.content if get_short_name
30
33
 
31
- interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
34
+ interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
32
35
 
33
- pathway_interactions = interaction_ids.collect{|id| interactions[id]}
34
- pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
36
+ pathway_interactions = interaction_ids.collect{|i| interactions[i]}
37
+ pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.compact.flatten
35
38
 
36
- pathway_uniprot_ids = pathway_molecule_ids.collect do |id|
37
- next unless molecules.include? id
38
- molecules[id][:uniprot]
39
+ pathway_uniprot_ids = pathway_molecule_ids.collect do |i|
40
+ next unless molecules.include? i
41
+ molecules[i][:proteins]
42
+ end
43
+ if get_short_name
44
+ pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq], [short_name]]
45
+ else
46
+ pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq]]
39
47
  end
40
- pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
41
48
  end
49
+
50
+ doc.xpath("//Pathway").each do |pathway|
51
+ id = pathway.attribute('id').value
52
+ subnet = pathway.attribute('subnet').value
53
+ name = pathway.xpath('LongName').first.content
54
+
55
+ interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
56
+
57
+ pathway_interactions = interaction_ids.collect{|i| interactions[i]}
58
+ pathway_subnet_ids = pathway_interactions.collect{|info| info[:pathway_ids]}.compact.flatten
59
+
60
+ pathway_subnet_ids.collect do |nid|
61
+ next unless pathways.include? nid
62
+ new_genes = pathways[id].last
63
+ pathways[nid][1] = (pathways[nid][1] + new_genes).uniq
64
+ end
65
+ end
66
+
42
67
  pathways
43
68
  end
44
69
  end
@@ -60,11 +85,25 @@ file 'biocarta_pathways' do |t|
60
85
 
61
86
  xml = Open.read(url)
62
87
 
63
- pathways = NCI.get_pathways(xml, "LL")
88
+ pathways = NCI.get_pathways(xml, "LL", true)
64
89
 
65
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
90
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID", "Pathway Short Name"]).to_s)
66
91
  end
67
92
 
93
+ file 'biocarta_pathways_fixed_ids' => 'biocarta_pathways' do |t|
94
+ orig = TSV.open(Open.open(t.prerequisites.first))
95
+ tsv = TSV.setup({}, :type => :double, :key_field => "BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID"])
96
+
97
+ orig.through do |key, values|
98
+ name, genes, short = values
99
+ code = "h_" + short.first
100
+ tsv[code] = [name, genes]
101
+ end
102
+
103
+ Open.write(t.name, tsv.to_s)
104
+ end
105
+
106
+
68
107
  file 'reactome_pathways' do |t|
69
108
 
70
109
  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
@@ -76,4 +115,3 @@ file 'reactome_pathways' do |t|
76
115
  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
77
116
  end
78
117
 
79
-
@@ -43,7 +43,7 @@ $biomart_probe_identifiers = [
43
43
  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
44
44
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
45
45
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
46
- [ 'Agilent WholeGenome',"agilent_wholegenome" ],
46
+ #[ 'Agilent WholeGenome',"agilent_wholegenome" ],
47
47
  [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
48
48
  [ 'Codelink ID', 'codelink' ],
49
49
  [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
@@ -62,6 +62,7 @@ $biomart_identifiers = [
62
62
  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
63
63
  [ 'HGNC ID', "hgnc_id", 'HGNC'],
64
64
  [ 'EMBL (Genbank) ID' , "embl"] ,
65
+ [ 'RefSeq mRNA' , "refseq_mrna"] ,
65
66
 
66
67
  # Probes
67
68
  [ 'AFFY HC G110', 'affy_hc_g110' ],
@@ -80,7 +81,7 @@ $biomart_identifiers = [
80
81
  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
81
82
  [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
82
83
  [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
83
- [ 'Agilent WholeGenome',"agilent_wholegenome" ],
84
+ #[ 'Agilent WholeGenome',"agilent_wholegenome" ],
84
85
  [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
85
86
  [ 'Codelink ID', 'codelink' ],
86
87
  [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
@@ -19,12 +19,12 @@ $biomart_lexicon = [
19
19
  ]
20
20
 
21
21
  $biomart_identifiers = [
22
+ ['Entrez Gene ID', "entrezgene"],
22
23
  ['Associated Gene Name' , "external_gene_id"],
23
24
  ['Protein ID' , "protein_id"] ,
24
25
  ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
25
26
  ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
26
27
  ['RefSeq Protein ID' , "refseq_peptide"] ,
27
- ['RefSeq DNA ID' , "refseq_dna"] ,
28
28
  ['EMBL (Genbank) ID' , "embl"] ,
29
29
  ['RGD ID' , "rgd"] ,
30
30
  ['RGD Symbol' , "rgd_symbol"] ,
@@ -39,7 +39,6 @@ $biomart_identifiers = [
39
39
  ['Affy rg u34c', "affy_rg_u34c"],
40
40
  ['Affy rn u34', "affy_rn_u34"],
41
41
  ['Affy rt u34', "affy_rt_u34"],
42
- ['Agilent WholeGenome',"agilent_wholegenome" ],
43
42
  ['Codelink ID ', "codelink"],
44
43
  ]
45
44
 
@@ -4,50 +4,48 @@ require 'rbbt/sources/entrez'
4
4
  require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
6
  $taxs = [559292,4932]
7
- $native = "SGD ID"
8
- $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
9
- $biomart_db = 'scerevisiae_gene_ensembl'
10
- $biomart_main = ['Entrez Gene ID', 'entrezgene']
11
- $ortholog_key = "yeast_ensembl_gene"
12
-
13
-
14
- file 'scientific_name' do |t|
15
- File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
16
- end
17
-
18
- file 'lexicon' do |t|
19
- lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
20
-
21
- lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
22
-
23
- lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
24
-
25
- lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
7
+ $scientific_name = "Saccharomyces cerevisiae"
8
+ #$ortholog_key = "yeast_ensembl_gene"
26
9
 
27
- File.open(t.name, 'w') do |f| f.puts lexicon end
28
- end
29
-
30
- file 'identifiers' do |t|
31
- identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
32
-
33
- identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
34
-
35
- identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
36
- [['Associated Gene Name' , "external_gene_id"],
37
- ['Ensembl Gene ID', "ensembl_gene_id" ],
38
- ['Ensembl Protein ID', "ensembl_peptide_id" ],
39
- ['RefSeq Protein ID' , "refseq_peptide"] ,
40
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
41
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
42
- ['Protein ID' , "protein_id"] ,
43
- ['EMBL (Genbank) ID' , "embl"] ,
44
- # Affymetrix
45
- ['Affy yeast 2',"affy_yeast_2"],
46
- ['Affy yg s98', "affy_yg_s98"]])
47
-
48
- File.open(t.name, 'w') do |f| f.puts identifiers end
49
- end
50
-
51
-
52
- task :default => ['name', 'lexicon', 'identifiers']
10
+ $biomart_db = 'scerevisiae_gene_ensembl'
53
11
 
12
+ $biomart_lexicon = [
13
+ [ 'Associated Gene Name' , "external_gene_id"],
14
+ ]
15
+
16
+ $biomart_protein_identifiers = [
17
+ [ 'Protein ID', "protein_id" ],
18
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
19
+ [ 'Unigene ID', "unigene" ],
20
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
21
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
22
+ ]
23
+
24
+ $biomart_probe_identifiers = [
25
+ ]
26
+
27
+ $biomart_identifiers = [
28
+ [ 'Entrez Gene ID', "entrezgene"],
29
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
30
+ [ 'Associated Gene Name', "external_gene_id" ],
31
+ [ 'Protein ID', "protein_id" ],
32
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
33
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
34
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
35
+ [ 'EMBL (Genbank) ID' , "embl"] ,
36
+ [ 'RefSeq mRNA' , "refseq_mrna"] ,
37
+ ]
38
+
39
+ $biomart_go= [
40
+ ["GO ID", 'go_id'],
41
+ ["GO Namespace", 'namespace_1003'],
42
+ ]
43
+
44
+ $biomart_go_2009= [
45
+ ["GO BP ID", 'go_biological_process_id'],
46
+ ["GO MF ID", 'go_molecular_function_id'],
47
+ ["GO CC ID", 'go_cellular_component_id'],
48
+ ]
49
+
50
+ $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
51
+ load File.join(File.dirname(__FILE__), '../organism_helpers.rb')