rbbt-sources 1.2.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -1,64 +1,65 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/resource'
|
3
|
-
require '
|
3
|
+
require 'spreadsheet'
|
4
4
|
|
5
5
|
module TFacts
|
6
6
|
extend Resource
|
7
7
|
self.subdir = "share/databases/TF"
|
8
8
|
|
9
|
-
|
10
|
-
doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
|
9
|
+
TFacts.claim TFacts.source["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
|
11
10
|
|
12
|
-
|
13
|
-
|
11
|
+
TFacts.claim TFacts.targets, :proc do
|
12
|
+
book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
|
13
|
+
sheet = book.worksheet 0
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name"], :namespace => "Hsa", :type => :flat)
|
16
|
+
sheet.each do |row|
|
17
|
+
target, tf = row.values_at 0, 1
|
18
|
+
tsv[target] ||= []
|
19
|
+
tsv[target] << tf
|
20
|
+
end
|
17
21
|
|
18
|
-
|
19
|
-
rows.shift
|
20
|
-
targets = {}
|
21
|
-
rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
|
22
|
-
targets
|
22
|
+
tsv.to_s
|
23
23
|
end
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
TFacts.claim TFacts.targets_signed, :proc do
|
26
|
+
book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
|
27
|
+
sheet = book.worksheet 0
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name", "Sign"], :namespace => "Hsa", :type => :double)
|
30
|
+
sheet.each do |row|
|
31
|
+
target, tf, sign = row.values_at 0, 1, 2
|
32
|
+
tsv[target] ||= [[],[]]
|
33
|
+
tsv[target][0] << tf
|
34
|
+
tsv[target][1] << sign
|
35
|
+
end
|
32
36
|
|
33
|
-
TFacts.claim TFacts.targets, :proc do
|
34
|
-
tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
|
35
|
-
TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
|
36
37
|
tsv.to_s
|
37
38
|
end
|
38
39
|
|
39
|
-
TFacts.claim TFacts.
|
40
|
-
tsv
|
41
|
-
Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
|
42
|
-
tsv[tf] = [targets.keys, targets.values]
|
43
|
-
end
|
44
|
-
tsv.to_s
|
40
|
+
TFacts.claim TFacts.regulators, :proc do
|
41
|
+
TFacts.targets.tsv.reorder("Transcription Factor Associated Gene Name").to_s
|
45
42
|
end
|
43
|
+
|
46
44
|
end
|
47
45
|
|
48
46
|
if defined? Entity and defined? Gene and Entity === Gene
|
49
47
|
|
50
48
|
module Gene
|
51
49
|
property :is_transcription_factor? => :array2single do
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
50
|
+
tfs = TFacts.targets.keys
|
51
|
+
self.name.collect{|gene| tfs.include? gene}
|
52
|
+
end
|
53
|
+
persist :_ary_is_transcription_factor?
|
54
|
+
|
55
|
+
property :transcription_regulators => :array2single do
|
56
|
+
Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
56
57
|
end
|
58
|
+
persist :_ary_transcription_regulators
|
57
59
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|
60
|
+
property :transcription_targets => :array2single do
|
61
|
+
Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
62
|
+
end
|
63
|
+
persist :_ary_transcription_targets
|
63
64
|
end
|
64
65
|
end
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -3,34 +3,32 @@ require 'rbbt/resource'
|
|
3
3
|
require 'rbbt/sources/cath'
|
4
4
|
require 'rbbt/sources/uniprot'
|
5
5
|
|
6
|
-
module
|
6
|
+
module UniProt
|
7
7
|
extend Resource
|
8
|
-
self.subdir = "share/databases/
|
8
|
+
self.subdir = "share/databases/UniProt"
|
9
9
|
|
10
|
-
|
10
|
+
UniProt.claim UniProt.annotated_variants, :proc do
|
11
11
|
url = "http://www.uniprot.org/docs/humsavar.txt"
|
12
12
|
tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
|
13
|
-
:fix => Proc.new{|line| parts = line.split(/\s+/); (parts[
|
14
|
-
:
|
13
|
+
:fix => Proc.new{|line| parts = line.split(/\s+/); (parts[1..5] + [(parts[6..-1] || []) * " "]) * "\t"},
|
14
|
+
:type => :double,
|
15
|
+
:merge => true,
|
16
|
+
:key_field => "UniProt/SwissProt Accession",
|
17
|
+
:fields => ["UniProt Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
|
15
18
|
|
16
19
|
tsv.unnamed = true
|
17
|
-
tsv.process "Amino Acid Mutation" do |
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
tsv.process "Amino Acid Mutation" do |mutations|
|
21
|
+
mutations.collect do |mutation|
|
22
|
+
if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
|
23
|
+
wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
|
24
|
+
mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
|
25
|
+
[wt, $2, mut] * ""
|
26
|
+
else
|
27
|
+
mutation
|
28
|
+
end
|
24
29
|
end
|
25
30
|
end
|
26
|
-
|
27
|
-
uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
|
28
|
-
mutation_pos = tsv.identify_field "Amino Acid Mutation"
|
29
|
-
tsv.add_field "Mutated Isoform" do |key, values|
|
30
|
-
[values[uniprot_pos], values[mutation_pos]] * ":"
|
31
|
-
end
|
32
|
-
|
33
|
-
tsv.reorder("Mutated Isoform").to_s
|
31
|
+
tsv.to_s
|
34
32
|
end
|
35
33
|
|
36
34
|
|
@@ -44,7 +42,12 @@ module Uniprot
|
|
44
42
|
text.split(/\n/).each{|l|
|
45
43
|
next unless l =~ /^DR\s+PDB; (.*)\./
|
46
44
|
id, method, resolution, region = $1.split(";").collect{|v| v.strip}
|
47
|
-
|
45
|
+
begin
|
46
|
+
chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
|
47
|
+
rescue
|
48
|
+
Log.warn("Error process Uniprot PDB line: #{line}")
|
49
|
+
next
|
50
|
+
end
|
48
51
|
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
|
49
52
|
}
|
50
53
|
pdb
|
@@ -96,7 +99,6 @@ module Uniprot
|
|
96
99
|
variants
|
97
100
|
end
|
98
101
|
|
99
|
-
|
100
102
|
def self.cath(protein)
|
101
103
|
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
102
104
|
text = Open.read(url)
|
@@ -118,7 +120,7 @@ module Uniprot
|
|
118
120
|
end
|
119
121
|
|
120
122
|
def self.pdbs_covering_aa_position(protein, aa_position)
|
121
|
-
|
123
|
+
UniProt.pdbs(protein).select do |pdb, info|
|
122
124
|
info[:region].include? aa_position
|
123
125
|
end
|
124
126
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#: :type=:single
|
2
|
+
#Release build
|
3
|
+
current jul2012
|
4
|
+
release-68 jul2012
|
5
|
+
release-67 may2012
|
6
|
+
release-66 feb2012
|
7
|
+
release-65 dec2011
|
8
|
+
release-64 sep2011
|
9
|
+
release-63 jun2011
|
10
|
+
release-62 apr2011
|
11
|
+
release-61 feb2011
|
12
|
+
release-60 nov2010
|
13
|
+
release-59 aug2010
|
14
|
+
release-58 may2010
|
15
|
+
release-57 mar2010
|
16
|
+
release-56 sep2009
|
17
|
+
release-55 jul2009
|
18
|
+
release-54 may2009
|
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
rule /(.+)/ do |t|
|
4
|
+
require 'net/ftp'
|
5
|
+
chromosome = File.basename(t.name)
|
6
|
+
|
7
|
+
ftp = Genomes1000::URL
|
8
|
+
ftp = Net::FTP.new(Genomes1000::FTP_SERVER)
|
9
|
+
ftp.login
|
10
|
+
ftp.chdir(Genomes1000::FTP_PATH)
|
11
|
+
file = ftp.list("*.chr" + chromosome + ".*").collect{|l| l.split(" ").last}.last
|
12
|
+
ddd file
|
13
|
+
exit
|
14
|
+
|
15
|
+
end
|
@@ -3,7 +3,7 @@ require 'rbbt/util/misc'
|
|
3
3
|
|
4
4
|
def read_chunk(jochem)
|
5
5
|
chunk = ""
|
6
|
-
while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
|
6
|
+
while (not jochem.eof? and not (line = Misc.fixutf8(jochem.gets)).match(/^--/))
|
7
7
|
chunk << line
|
8
8
|
end
|
9
9
|
return nil if chunk.empty?
|
@@ -20,7 +20,7 @@ def process_jochem
|
|
20
20
|
identifiers = File.open('identifiers', 'w')
|
21
21
|
|
22
22
|
identifiers.puts("#: :namespace=JoChem")
|
23
|
-
identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
|
23
|
+
identifiers.puts("#ID\tCompound Name\tPubChem:Substance ID\tPubChem:Coumpound ID\tDrugBank:ID\tChemIDplus:ID\tCAS:ID\tMeSH:Term\tChEBI:ID\tHMDB:ID\tKEGG:Coumpound ID\tKEGG:Drug ID")
|
24
24
|
|
25
25
|
lexicon = File.open('lexicon', 'w')
|
26
26
|
lexicon.puts("#: :namespace=JoChem")
|
@@ -49,12 +49,20 @@ def process_jochem
|
|
49
49
|
tm = info["TM"] || []
|
50
50
|
db = info["DB"] || []
|
51
51
|
|
52
|
+
cheb = db.collect{|code| code.match(/CHEB_(.*)/) ? $1 : nil}.compact
|
53
|
+
chid = db.collect{|code| code.match(/CHID_(.*)/) ? $1 : nil}.compact
|
52
54
|
pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
|
55
|
+
pubs = db.collect{|code| code.match(/PUBS_(.*)/) ? $1 : nil}.compact
|
53
56
|
drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
|
57
|
+
cas = db.collect{|code| code.match(/CAS_(.*)/) ? $1 : nil}.compact
|
58
|
+
mesh = db.collect{|code| code.match(/MESH_(.*)/) ? $1 : nil}.compact
|
59
|
+
hmdb = db.collect{|code| code.match(/HMDB_(.*)/) ? $1 : nil}.compact
|
60
|
+
kegg = db.collect{|code| code.match(/KEGG_(.*)/) ? $1 : nil}.compact
|
61
|
+
kegd = db.collect{|code| code.match(/KEGD_(.*)/) ? $1 : nil}.compact
|
54
62
|
inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
|
55
63
|
|
56
64
|
lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
|
57
|
-
identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
|
65
|
+
identifiers.puts [id, na, pubs * "|", pubc * "|", drug * "|", chid * "|", cas * "|", mesh * "|", cheb * "|", hmdb * "|", kegg * "|", kegd * "|" ] * "\t"
|
58
66
|
inchi.puts [id, inch * "|"] * "\t" if inch.any?
|
59
67
|
definitions.puts [id, df] * "\t" unless df.nil?
|
60
68
|
end
|
data/share/install/NCI/Rakefile
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'rbbt-util'
|
2
3
|
|
3
4
|
module NCI
|
4
|
-
def self.get_pathways(xml, format = "UP")
|
5
|
+
def self.get_pathways(xml, format = "UP", get_short_name = false)
|
5
6
|
doc = Nokogiri::XML(xml)
|
6
7
|
pathways = {}
|
7
8
|
|
@@ -9,36 +10,60 @@ module NCI
|
|
9
10
|
doc.xpath("//Molecule").each do |molecule|
|
10
11
|
id = molecule.attribute('id').value
|
11
12
|
type = molecule.attribute('molecule_type').value
|
12
|
-
next unless type == "protein"
|
13
13
|
names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
|
14
|
-
|
15
|
-
molecules[id] = {:xml => molecule, :uniprot => names.first}
|
14
|
+
molecules[id] = {:xml => molecule, :proteins => names}
|
16
15
|
end
|
17
16
|
|
17
|
+
|
18
18
|
interactions = {}
|
19
19
|
doc.xpath("//Interaction").each do |interaction|
|
20
20
|
id = interaction.attribute('id').value
|
21
|
-
|
21
|
+
type = interaction.attribute('interaction_type').value
|
22
|
+
molecule_ids = interaction.xpath('InteractionComponentList/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}.flatten.compact
|
23
|
+
pathway_ids = interaction.xpath('Abstraction').collect{|c| c.attribute('pathway_idref').value}.flatten.compact
|
22
24
|
|
23
|
-
interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
|
25
|
+
interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids, :pathway_ids => pathway_ids}
|
24
26
|
end
|
25
27
|
|
26
28
|
doc.xpath("//Pathway").each do |pathway|
|
27
29
|
id = pathway.attribute('id').value
|
28
30
|
subnet = pathway.attribute('subnet').value
|
29
31
|
name = pathway.xpath('LongName').first.content
|
32
|
+
short_name = pathway.xpath('ShortName').first.content if get_short_name
|
30
33
|
|
31
|
-
interaction_ids = pathway.xpath("
|
34
|
+
interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
|
32
35
|
|
33
|
-
pathway_interactions = interaction_ids.collect{|
|
34
|
-
pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
|
36
|
+
pathway_interactions = interaction_ids.collect{|i| interactions[i]}
|
37
|
+
pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.compact.flatten
|
35
38
|
|
36
|
-
pathway_uniprot_ids = pathway_molecule_ids.collect do |
|
37
|
-
next unless molecules.include?
|
38
|
-
molecules[
|
39
|
+
pathway_uniprot_ids = pathway_molecule_ids.collect do |i|
|
40
|
+
next unless molecules.include? i
|
41
|
+
molecules[i][:proteins]
|
42
|
+
end
|
43
|
+
if get_short_name
|
44
|
+
pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq], [short_name]]
|
45
|
+
else
|
46
|
+
pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq]]
|
39
47
|
end
|
40
|
-
pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
|
41
48
|
end
|
49
|
+
|
50
|
+
doc.xpath("//Pathway").each do |pathway|
|
51
|
+
id = pathway.attribute('id').value
|
52
|
+
subnet = pathway.attribute('subnet').value
|
53
|
+
name = pathway.xpath('LongName').first.content
|
54
|
+
|
55
|
+
interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
|
56
|
+
|
57
|
+
pathway_interactions = interaction_ids.collect{|i| interactions[i]}
|
58
|
+
pathway_subnet_ids = pathway_interactions.collect{|info| info[:pathway_ids]}.compact.flatten
|
59
|
+
|
60
|
+
pathway_subnet_ids.collect do |nid|
|
61
|
+
next unless pathways.include? nid
|
62
|
+
new_genes = pathways[id].last
|
63
|
+
pathways[nid][1] = (pathways[nid][1] + new_genes).uniq
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
42
67
|
pathways
|
43
68
|
end
|
44
69
|
end
|
@@ -60,11 +85,25 @@ file 'biocarta_pathways' do |t|
|
|
60
85
|
|
61
86
|
xml = Open.read(url)
|
62
87
|
|
63
|
-
pathways = NCI.get_pathways(xml, "LL")
|
88
|
+
pathways = NCI.get_pathways(xml, "LL", true)
|
64
89
|
|
65
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
90
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID", "Pathway Short Name"]).to_s)
|
66
91
|
end
|
67
92
|
|
93
|
+
file 'biocarta_pathways_fixed_ids' => 'biocarta_pathways' do |t|
|
94
|
+
orig = TSV.open(Open.open(t.prerequisites.first))
|
95
|
+
tsv = TSV.setup({}, :type => :double, :key_field => "BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID"])
|
96
|
+
|
97
|
+
orig.through do |key, values|
|
98
|
+
name, genes, short = values
|
99
|
+
code = "h_" + short.first
|
100
|
+
tsv[code] = [name, genes]
|
101
|
+
end
|
102
|
+
|
103
|
+
Open.write(t.name, tsv.to_s)
|
104
|
+
end
|
105
|
+
|
106
|
+
|
68
107
|
file 'reactome_pathways' do |t|
|
69
108
|
|
70
109
|
url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
|
@@ -76,4 +115,3 @@ file 'reactome_pathways' do |t|
|
|
76
115
|
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
77
116
|
end
|
78
117
|
|
79
|
-
|
@@ -43,7 +43,7 @@ $biomart_probe_identifiers = [
|
|
43
43
|
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
|
44
44
|
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
45
45
|
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
46
|
-
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
46
|
+
#[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
47
47
|
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
48
48
|
[ 'Codelink ID', 'codelink' ],
|
49
49
|
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
@@ -62,6 +62,7 @@ $biomart_identifiers = [
|
|
62
62
|
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
63
63
|
[ 'HGNC ID', "hgnc_id", 'HGNC'],
|
64
64
|
[ 'EMBL (Genbank) ID' , "embl"] ,
|
65
|
+
[ 'RefSeq mRNA' , "refseq_mrna"] ,
|
65
66
|
|
66
67
|
# Probes
|
67
68
|
[ 'AFFY HC G110', 'affy_hc_g110' ],
|
@@ -80,7 +81,7 @@ $biomart_identifiers = [
|
|
80
81
|
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
|
81
82
|
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
82
83
|
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
83
|
-
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
84
|
+
#[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
84
85
|
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
85
86
|
[ 'Codelink ID', 'codelink' ],
|
86
87
|
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
@@ -19,12 +19,12 @@ $biomart_lexicon = [
|
|
19
19
|
]
|
20
20
|
|
21
21
|
$biomart_identifiers = [
|
22
|
+
['Entrez Gene ID', "entrezgene"],
|
22
23
|
['Associated Gene Name' , "external_gene_id"],
|
23
24
|
['Protein ID' , "protein_id"] ,
|
24
25
|
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
25
26
|
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
26
27
|
['RefSeq Protein ID' , "refseq_peptide"] ,
|
27
|
-
['RefSeq DNA ID' , "refseq_dna"] ,
|
28
28
|
['EMBL (Genbank) ID' , "embl"] ,
|
29
29
|
['RGD ID' , "rgd"] ,
|
30
30
|
['RGD Symbol' , "rgd_symbol"] ,
|
@@ -39,7 +39,6 @@ $biomart_identifiers = [
|
|
39
39
|
['Affy rg u34c', "affy_rg_u34c"],
|
40
40
|
['Affy rn u34', "affy_rn_u34"],
|
41
41
|
['Affy rt u34', "affy_rt_u34"],
|
42
|
-
['Agilent WholeGenome',"agilent_wholegenome" ],
|
43
42
|
['Codelink ID ', "codelink"],
|
44
43
|
]
|
45
44
|
|
@@ -4,50 +4,48 @@ require 'rbbt/sources/entrez'
|
|
4
4
|
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
5
|
|
6
6
|
$taxs = [559292,4932]
|
7
|
-
$
|
8
|
-
|
9
|
-
$biomart_db = 'scerevisiae_gene_ensembl'
|
10
|
-
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
-
$ortholog_key = "yeast_ensembl_gene"
|
12
|
-
|
13
|
-
|
14
|
-
file 'scientific_name' do |t|
|
15
|
-
File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
|
16
|
-
end
|
17
|
-
|
18
|
-
file 'lexicon' do |t|
|
19
|
-
lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
20
|
-
|
21
|
-
lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
22
|
-
|
23
|
-
lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
24
|
-
|
25
|
-
lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
|
7
|
+
$scientific_name = "Saccharomyces cerevisiae"
|
8
|
+
#$ortholog_key = "yeast_ensembl_gene"
|
26
9
|
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
file 'identifiers' do |t|
|
31
|
-
identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
|
32
|
-
|
33
|
-
identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
34
|
-
|
35
|
-
identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
|
36
|
-
[['Associated Gene Name' , "external_gene_id"],
|
37
|
-
['Ensembl Gene ID', "ensembl_gene_id" ],
|
38
|
-
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
39
|
-
['RefSeq Protein ID' , "refseq_peptide"] ,
|
40
|
-
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
41
|
-
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
42
|
-
['Protein ID' , "protein_id"] ,
|
43
|
-
['EMBL (Genbank) ID' , "embl"] ,
|
44
|
-
# Affymetrix
|
45
|
-
['Affy yeast 2',"affy_yeast_2"],
|
46
|
-
['Affy yg s98', "affy_yg_s98"]])
|
47
|
-
|
48
|
-
File.open(t.name, 'w') do |f| f.puts identifiers end
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
task :default => ['name', 'lexicon', 'identifiers']
|
10
|
+
$biomart_db = 'scerevisiae_gene_ensembl'
|
53
11
|
|
12
|
+
$biomart_lexicon = [
|
13
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
14
|
+
]
|
15
|
+
|
16
|
+
$biomart_protein_identifiers = [
|
17
|
+
[ 'Protein ID', "protein_id" ],
|
18
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
19
|
+
[ 'Unigene ID', "unigene" ],
|
20
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
21
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
22
|
+
]
|
23
|
+
|
24
|
+
$biomart_probe_identifiers = [
|
25
|
+
]
|
26
|
+
|
27
|
+
$biomart_identifiers = [
|
28
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
29
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
30
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
31
|
+
[ 'Protein ID', "protein_id" ],
|
32
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
33
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
34
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
35
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
36
|
+
[ 'RefSeq mRNA' , "refseq_mrna"] ,
|
37
|
+
]
|
38
|
+
|
39
|
+
$biomart_go= [
|
40
|
+
["GO ID", 'go_id'],
|
41
|
+
["GO Namespace", 'namespace_1003'],
|
42
|
+
]
|
43
|
+
|
44
|
+
$biomart_go_2009= [
|
45
|
+
["GO BP ID", 'go_biological_process_id'],
|
46
|
+
["GO MF ID", 'go_molecular_function_id'],
|
47
|
+
["GO CC ID", 'go_cellular_component_id'],
|
48
|
+
]
|
49
|
+
|
50
|
+
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
51
|
+
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|