rbbt-sources 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -1,64 +1,65 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/resource'
|
3
|
-
require '
|
3
|
+
require 'spreadsheet'
|
4
4
|
|
5
5
|
module TFacts
|
6
6
|
extend Resource
|
7
7
|
self.subdir = "share/databases/TF"
|
8
8
|
|
9
|
-
|
10
|
-
doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
|
9
|
+
TFacts.claim TFacts.source["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
|
11
10
|
|
12
|
-
|
13
|
-
|
11
|
+
TFacts.claim TFacts.targets, :proc do
|
12
|
+
book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
|
13
|
+
sheet = book.worksheet 0
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name"], :namespace => "Hsa", :type => :flat)
|
16
|
+
sheet.each do |row|
|
17
|
+
target, tf = row.values_at 0, 1
|
18
|
+
tsv[target] ||= []
|
19
|
+
tsv[target] << tf
|
20
|
+
end
|
17
21
|
|
18
|
-
|
19
|
-
rows.shift
|
20
|
-
targets = {}
|
21
|
-
rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
|
22
|
-
targets
|
22
|
+
tsv.to_s
|
23
23
|
end
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
TFacts.claim TFacts.targets_signed, :proc do
|
26
|
+
book = Spreadsheet.open TFacts.source["Catalogues.xls"].produce.find
|
27
|
+
sheet = book.worksheet 0
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Transcription Factor Associated Gene Name", "Sign"], :namespace => "Hsa", :type => :double)
|
30
|
+
sheet.each do |row|
|
31
|
+
target, tf, sign = row.values_at 0, 1, 2
|
32
|
+
tsv[target] ||= [[],[]]
|
33
|
+
tsv[target][0] << tf
|
34
|
+
tsv[target][1] << sign
|
35
|
+
end
|
32
36
|
|
33
|
-
TFacts.claim TFacts.targets, :proc do
|
34
|
-
tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
|
35
|
-
TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
|
36
37
|
tsv.to_s
|
37
38
|
end
|
38
39
|
|
39
|
-
TFacts.claim TFacts.
|
40
|
-
tsv
|
41
|
-
Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
|
42
|
-
tsv[tf] = [targets.keys, targets.values]
|
43
|
-
end
|
44
|
-
tsv.to_s
|
40
|
+
TFacts.claim TFacts.regulators, :proc do
|
41
|
+
TFacts.targets.tsv.reorder("Transcription Factor Associated Gene Name").to_s
|
45
42
|
end
|
43
|
+
|
46
44
|
end
|
47
45
|
|
48
46
|
if defined? Entity and defined? Gene and Entity === Gene
|
49
47
|
|
50
48
|
module Gene
|
51
49
|
property :is_transcription_factor? => :array2single do
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
50
|
+
tfs = TFacts.targets.keys
|
51
|
+
self.name.collect{|gene| tfs.include? gene}
|
52
|
+
end
|
53
|
+
persist :_ary_is_transcription_factor?
|
54
|
+
|
55
|
+
property :transcription_regulators => :array2single do
|
56
|
+
Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
56
57
|
end
|
58
|
+
persist :_ary_transcription_regulators
|
57
59
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|
60
|
+
property :transcription_targets => :array2single do
|
61
|
+
Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
62
|
+
end
|
63
|
+
persist :_ary_transcription_targets
|
63
64
|
end
|
64
65
|
end
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -3,34 +3,32 @@ require 'rbbt/resource'
|
|
3
3
|
require 'rbbt/sources/cath'
|
4
4
|
require 'rbbt/sources/uniprot'
|
5
5
|
|
6
|
-
module
|
6
|
+
module UniProt
|
7
7
|
extend Resource
|
8
|
-
self.subdir = "share/databases/
|
8
|
+
self.subdir = "share/databases/UniProt"
|
9
9
|
|
10
|
-
|
10
|
+
UniProt.claim UniProt.annotated_variants, :proc do
|
11
11
|
url = "http://www.uniprot.org/docs/humsavar.txt"
|
12
12
|
tsv = TSV.open(CMD.cmd('tail -n +31 | head -n -4|grep "[[:alpha:]]"', :in => Open.open(url), :pipe => true),
|
13
|
-
:fix => Proc.new{|line| parts = line.split(/\s+/); (parts[
|
14
|
-
:
|
13
|
+
:fix => Proc.new{|line| parts = line.split(/\s+/); (parts[1..5] + [(parts[6..-1] || []) * " "]) * "\t"},
|
14
|
+
:type => :double,
|
15
|
+
:merge => true,
|
16
|
+
:key_field => "UniProt/SwissProt Accession",
|
17
|
+
:fields => ["UniProt Variant ID", "Amino Acid Mutation", "Type of Variant", "SNP ID", "Disease"])
|
15
18
|
|
16
19
|
tsv.unnamed = true
|
17
|
-
tsv.process "Amino Acid Mutation" do |
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
tsv.process "Amino Acid Mutation" do |mutations|
|
21
|
+
mutations.collect do |mutation|
|
22
|
+
if mutation.match(/p\.(\w{3})(\d+)(\w{3})/)
|
23
|
+
wt = Misc::THREE_TO_ONE_AA_CODE[$1.downcase]
|
24
|
+
mut = Misc::THREE_TO_ONE_AA_CODE[$3.downcase]
|
25
|
+
[wt, $2, mut] * ""
|
26
|
+
else
|
27
|
+
mutation
|
28
|
+
end
|
24
29
|
end
|
25
30
|
end
|
26
|
-
|
27
|
-
uniprot_pos = tsv.identify_field "Uniprot/SwissProt Accession"
|
28
|
-
mutation_pos = tsv.identify_field "Amino Acid Mutation"
|
29
|
-
tsv.add_field "Mutated Isoform" do |key, values|
|
30
|
-
[values[uniprot_pos], values[mutation_pos]] * ":"
|
31
|
-
end
|
32
|
-
|
33
|
-
tsv.reorder("Mutated Isoform").to_s
|
31
|
+
tsv.to_s
|
34
32
|
end
|
35
33
|
|
36
34
|
|
@@ -44,7 +42,12 @@ module Uniprot
|
|
44
42
|
text.split(/\n/).each{|l|
|
45
43
|
next unless l =~ /^DR\s+PDB; (.*)\./
|
46
44
|
id, method, resolution, region = $1.split(";").collect{|v| v.strip}
|
47
|
-
|
45
|
+
begin
|
46
|
+
chains, start, eend = region.match(/(\w+)=(\d+)-(\d+)/).values_at(1,2,3)
|
47
|
+
rescue
|
48
|
+
Log.warn("Error process Uniprot PDB line: #{l}") # `l` is the loop variable for the current text line; `line` is not defined in this scope
|
49
|
+
next
|
50
|
+
end
|
48
51
|
pdb[id.downcase] = {:method => method, :resolution => resolution, :region => (start.to_i..eend.to_i), :chains => chains}
|
49
52
|
}
|
50
53
|
pdb
|
@@ -96,7 +99,6 @@ module Uniprot
|
|
96
99
|
variants
|
97
100
|
end
|
98
101
|
|
99
|
-
|
100
102
|
def self.cath(protein)
|
101
103
|
url = UNIPROT_TEXT.sub "[PROTEIN]", protein
|
102
104
|
text = Open.read(url)
|
@@ -118,7 +120,7 @@ module Uniprot
|
|
118
120
|
end
|
119
121
|
|
120
122
|
def self.pdbs_covering_aa_position(protein, aa_position)
|
121
|
-
|
123
|
+
UniProt.pdbs(protein).select do |pdb, info|
|
122
124
|
info[:region].include? aa_position
|
123
125
|
end
|
124
126
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#: :type=:single
|
2
|
+
#Release build
|
3
|
+
current jul2012
|
4
|
+
release-68 jul2012
|
5
|
+
release-67 may2012
|
6
|
+
release-66 feb2012
|
7
|
+
release-65 dec2011
|
8
|
+
release-64 sep2011
|
9
|
+
release-63 jun2011
|
10
|
+
release-62 apr2011
|
11
|
+
release-61 feb2011
|
12
|
+
release-60 nov2010
|
13
|
+
release-59 aug2010
|
14
|
+
release-58 may2010
|
15
|
+
release-57 mar2010
|
16
|
+
release-56 sep2009
|
17
|
+
release-55 jul2009
|
18
|
+
release-54 may2009
|
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
rule /(.+)/ do |t|
|
4
|
+
require 'net/ftp'
|
5
|
+
chromosome = File.basename(t.name)
|
6
|
+
|
7
|
+
ftp = Genomes1000::URL
|
8
|
+
ftp = Net::FTP.new(Genomes1000::FTP_SERVER)
|
9
|
+
ftp.login
|
10
|
+
ftp.chdir(Genomes1000::FTP_PATH)
|
11
|
+
file = ftp.list("*.chr" + chromosome + ".*").collect{|l| l.split(" ").last}.last
|
12
|
+
ddd file
|
13
|
+
exit
|
14
|
+
|
15
|
+
end
|
@@ -3,7 +3,7 @@ require 'rbbt/util/misc'
|
|
3
3
|
|
4
4
|
def read_chunk(jochem)
|
5
5
|
chunk = ""
|
6
|
-
while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
|
6
|
+
while (not jochem.eof? and not (line = Misc.fixutf8(jochem.gets)).match(/^--/))
|
7
7
|
chunk << line
|
8
8
|
end
|
9
9
|
return nil if chunk.empty?
|
@@ -20,7 +20,7 @@ def process_jochem
|
|
20
20
|
identifiers = File.open('identifiers', 'w')
|
21
21
|
|
22
22
|
identifiers.puts("#: :namespace=JoChem")
|
23
|
-
identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
|
23
|
+
identifiers.puts("#ID\tCompound Name\tPubChem:Substance ID\tPubChem:Coumpound ID\tDrugBank:ID\tChemIDplus:ID\tCAS:ID\tMeSH:Term\tChEBI:ID\tHMDB:ID\tKEGG:Coumpound ID\tKEGG:Drug ID")
|
24
24
|
|
25
25
|
lexicon = File.open('lexicon', 'w')
|
26
26
|
lexicon.puts("#: :namespace=JoChem")
|
@@ -49,12 +49,20 @@ def process_jochem
|
|
49
49
|
tm = info["TM"] || []
|
50
50
|
db = info["DB"] || []
|
51
51
|
|
52
|
+
cheb = db.collect{|code| code.match(/CHEB_(.*)/) ? $1 : nil}.compact
|
53
|
+
chid = db.collect{|code| code.match(/CHID_(.*)/) ? $1 : nil}.compact
|
52
54
|
pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
|
55
|
+
pubs = db.collect{|code| code.match(/PUBS_(.*)/) ? $1 : nil}.compact
|
53
56
|
drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
|
57
|
+
cas = db.collect{|code| code.match(/CAS_(.*)/) ? $1 : nil}.compact
|
58
|
+
mesh = db.collect{|code| code.match(/MESH_(.*)/) ? $1 : nil}.compact
|
59
|
+
hmdb = db.collect{|code| code.match(/HMDB_(.*)/) ? $1 : nil}.compact
|
60
|
+
kegg = db.collect{|code| code.match(/KEGG_(.*)/) ? $1 : nil}.compact
|
61
|
+
kegd = db.collect{|code| code.match(/KEGD_(.*)/) ? $1 : nil}.compact
|
54
62
|
inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
|
55
63
|
|
56
64
|
lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
|
57
|
-
identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
|
65
|
+
identifiers.puts [id, na, pubs * "|", pubc * "|", drug * "|", chid * "|", cas * "|", mesh * "|", cheb * "|", hmdb * "|", kegg * "|", kegd * "|" ] * "\t"
|
58
66
|
inchi.puts [id, inch * "|"] * "\t" if inch.any?
|
59
67
|
definitions.puts [id, df] * "\t" unless df.nil?
|
60
68
|
end
|
data/share/install/NCI/Rakefile
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'rbbt-util'
|
2
3
|
|
3
4
|
module NCI
|
4
|
-
def self.get_pathways(xml, format = "UP")
|
5
|
+
def self.get_pathways(xml, format = "UP", get_short_name = false)
|
5
6
|
doc = Nokogiri::XML(xml)
|
6
7
|
pathways = {}
|
7
8
|
|
@@ -9,36 +10,60 @@ module NCI
|
|
9
10
|
doc.xpath("//Molecule").each do |molecule|
|
10
11
|
id = molecule.attribute('id').value
|
11
12
|
type = molecule.attribute('molecule_type').value
|
12
|
-
next unless type == "protein"
|
13
13
|
names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
|
14
|
-
|
15
|
-
molecules[id] = {:xml => molecule, :uniprot => names.first}
|
14
|
+
molecules[id] = {:xml => molecule, :proteins => names}
|
16
15
|
end
|
17
16
|
|
17
|
+
|
18
18
|
interactions = {}
|
19
19
|
doc.xpath("//Interaction").each do |interaction|
|
20
20
|
id = interaction.attribute('id').value
|
21
|
-
|
21
|
+
type = interaction.attribute('interaction_type').value
|
22
|
+
molecule_ids = interaction.xpath('InteractionComponentList/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}.flatten.compact
|
23
|
+
pathway_ids = interaction.xpath('Abstraction').collect{|c| c.attribute('pathway_idref').value}.flatten.compact
|
22
24
|
|
23
|
-
interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
|
25
|
+
interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids, :pathway_ids => pathway_ids}
|
24
26
|
end
|
25
27
|
|
26
28
|
doc.xpath("//Pathway").each do |pathway|
|
27
29
|
id = pathway.attribute('id').value
|
28
30
|
subnet = pathway.attribute('subnet').value
|
29
31
|
name = pathway.xpath('LongName').first.content
|
32
|
+
short_name = pathway.xpath('ShortName').first.content if get_short_name
|
30
33
|
|
31
|
-
interaction_ids = pathway.xpath("
|
34
|
+
interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
|
32
35
|
|
33
|
-
pathway_interactions = interaction_ids.collect{|
|
34
|
-
pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
|
36
|
+
pathway_interactions = interaction_ids.collect{|i| interactions[i]}
|
37
|
+
pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.compact.flatten
|
35
38
|
|
36
|
-
pathway_uniprot_ids = pathway_molecule_ids.collect do |
|
37
|
-
next unless molecules.include?
|
38
|
-
molecules[
|
39
|
+
pathway_uniprot_ids = pathway_molecule_ids.collect do |i|
|
40
|
+
next unless molecules.include? i
|
41
|
+
molecules[i][:proteins]
|
42
|
+
end
|
43
|
+
if get_short_name
|
44
|
+
pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq], [short_name]]
|
45
|
+
else
|
46
|
+
pathways[id] = [[name], [pathway_uniprot_ids.flatten.compact.uniq]]
|
39
47
|
end
|
40
|
-
pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
|
41
48
|
end
|
49
|
+
|
50
|
+
doc.xpath("//Pathway").each do |pathway|
|
51
|
+
id = pathway.attribute('id').value
|
52
|
+
subnet = pathway.attribute('subnet').value
|
53
|
+
name = pathway.xpath('LongName').first.content
|
54
|
+
|
55
|
+
interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
|
56
|
+
|
57
|
+
pathway_interactions = interaction_ids.collect{|i| interactions[i]}
|
58
|
+
pathway_subnet_ids = pathway_interactions.collect{|info| info[:pathway_ids]}.compact.flatten
|
59
|
+
|
60
|
+
pathway_subnet_ids.collect do |nid|
|
61
|
+
next unless pathways.include? nid
|
62
|
+
new_genes = pathways[id][1] # index 1 is the gene column; `.last` would be the short-name column when get_short_name is true
|
63
|
+
pathways[nid][1] = (pathways[nid][1] + new_genes).uniq
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
42
67
|
pathways
|
43
68
|
end
|
44
69
|
end
|
@@ -60,11 +85,25 @@ file 'biocarta_pathways' do |t|
|
|
60
85
|
|
61
86
|
xml = Open.read(url)
|
62
87
|
|
63
|
-
pathways = NCI.get_pathways(xml, "LL")
|
88
|
+
pathways = NCI.get_pathways(xml, "LL", true)
|
64
89
|
|
65
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
90
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID", "Pathway Short Name"]).to_s)
|
66
91
|
end
|
67
92
|
|
93
|
+
file 'biocarta_pathways_fixed_ids' => 'biocarta_pathways' do |t|
|
94
|
+
orig = TSV.open(Open.open(t.prerequisites.first))
|
95
|
+
tsv = TSV.setup({}, :type => :double, :key_field => "BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID"])
|
96
|
+
|
97
|
+
orig.through do |key, values|
|
98
|
+
name, genes, short = values
|
99
|
+
code = "h_" + short.first
|
100
|
+
tsv[code] = [name, genes]
|
101
|
+
end
|
102
|
+
|
103
|
+
Open.write(t.name, tsv.to_s)
|
104
|
+
end
|
105
|
+
|
106
|
+
|
68
107
|
file 'reactome_pathways' do |t|
|
69
108
|
|
70
109
|
url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
|
@@ -76,4 +115,3 @@ file 'reactome_pathways' do |t|
|
|
76
115
|
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
77
116
|
end
|
78
117
|
|
79
|
-
|
@@ -43,7 +43,7 @@ $biomart_probe_identifiers = [
|
|
43
43
|
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
|
44
44
|
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
45
45
|
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
46
|
-
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
46
|
+
#[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
47
47
|
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
48
48
|
[ 'Codelink ID', 'codelink' ],
|
49
49
|
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
@@ -62,6 +62,7 @@ $biomart_identifiers = [
|
|
62
62
|
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
63
63
|
[ 'HGNC ID', "hgnc_id", 'HGNC'],
|
64
64
|
[ 'EMBL (Genbank) ID' , "embl"] ,
|
65
|
+
[ 'RefSeq mRNA' , "refseq_mrna"] ,
|
65
66
|
|
66
67
|
# Probes
|
67
68
|
[ 'AFFY HC G110', 'affy_hc_g110' ],
|
@@ -80,7 +81,7 @@ $biomart_identifiers = [
|
|
80
81
|
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
|
81
82
|
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
82
83
|
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
83
|
-
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
84
|
+
#[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
84
85
|
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
85
86
|
[ 'Codelink ID', 'codelink' ],
|
86
87
|
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
@@ -19,12 +19,12 @@ $biomart_lexicon = [
|
|
19
19
|
]
|
20
20
|
|
21
21
|
$biomart_identifiers = [
|
22
|
+
['Entrez Gene ID', "entrezgene"],
|
22
23
|
['Associated Gene Name' , "external_gene_id"],
|
23
24
|
['Protein ID' , "protein_id"] ,
|
24
25
|
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
25
26
|
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
26
27
|
['RefSeq Protein ID' , "refseq_peptide"] ,
|
27
|
-
['RefSeq DNA ID' , "refseq_dna"] ,
|
28
28
|
['EMBL (Genbank) ID' , "embl"] ,
|
29
29
|
['RGD ID' , "rgd"] ,
|
30
30
|
['RGD Symbol' , "rgd_symbol"] ,
|
@@ -39,7 +39,6 @@ $biomart_identifiers = [
|
|
39
39
|
['Affy rg u34c', "affy_rg_u34c"],
|
40
40
|
['Affy rn u34', "affy_rn_u34"],
|
41
41
|
['Affy rt u34', "affy_rt_u34"],
|
42
|
-
['Agilent WholeGenome',"agilent_wholegenome" ],
|
43
42
|
['Codelink ID ', "codelink"],
|
44
43
|
]
|
45
44
|
|
@@ -4,50 +4,48 @@ require 'rbbt/sources/entrez'
|
|
4
4
|
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
5
|
|
6
6
|
$taxs = [559292,4932]
|
7
|
-
$
|
8
|
-
|
9
|
-
$biomart_db = 'scerevisiae_gene_ensembl'
|
10
|
-
$biomart_main = ['Entrez Gene ID', 'entrezgene']
|
11
|
-
$ortholog_key = "yeast_ensembl_gene"
|
12
|
-
|
13
|
-
|
14
|
-
file 'scientific_name' do |t|
|
15
|
-
File.open(t.name, 'w') do |f| f.puts "Saccharomyces cerevisiae" end
|
16
|
-
end
|
17
|
-
|
18
|
-
file 'lexicon' do |t|
|
19
|
-
lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
20
|
-
|
21
|
-
lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
22
|
-
|
23
|
-
lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
24
|
-
|
25
|
-
lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
|
7
|
+
$scientific_name = "Saccharomyces cerevisiae"
|
8
|
+
#$ortholog_key = "yeast_ensembl_gene"
|
26
9
|
|
27
|
-
|
28
|
-
end
|
29
|
-
|
30
|
-
file 'identifiers' do |t|
|
31
|
-
identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
|
32
|
-
|
33
|
-
identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
34
|
-
|
35
|
-
identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
|
36
|
-
[['Associated Gene Name' , "external_gene_id"],
|
37
|
-
['Ensembl Gene ID', "ensembl_gene_id" ],
|
38
|
-
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
39
|
-
['RefSeq Protein ID' , "refseq_peptide"] ,
|
40
|
-
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
41
|
-
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
42
|
-
['Protein ID' , "protein_id"] ,
|
43
|
-
['EMBL (Genbank) ID' , "embl"] ,
|
44
|
-
# Affymetrix
|
45
|
-
['Affy yeast 2',"affy_yeast_2"],
|
46
|
-
['Affy yg s98', "affy_yg_s98"]])
|
47
|
-
|
48
|
-
File.open(t.name, 'w') do |f| f.puts identifiers end
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
task :default => ['name', 'lexicon', 'identifiers']
|
10
|
+
$biomart_db = 'scerevisiae_gene_ensembl'
|
53
11
|
|
12
|
+
$biomart_lexicon = [
|
13
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
14
|
+
]
|
15
|
+
|
16
|
+
$biomart_protein_identifiers = [
|
17
|
+
[ 'Protein ID', "protein_id" ],
|
18
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
19
|
+
[ 'Unigene ID', "unigene" ],
|
20
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
21
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
22
|
+
]
|
23
|
+
|
24
|
+
$biomart_probe_identifiers = [
|
25
|
+
]
|
26
|
+
|
27
|
+
$biomart_identifiers = [
|
28
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
29
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
30
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
31
|
+
[ 'Protein ID', "protein_id" ],
|
32
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
33
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
34
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
35
|
+
[ 'EMBL (Genbank) ID' , "embl"] ,
|
36
|
+
[ 'RefSeq mRNA' , "refseq_mrna"] ,
|
37
|
+
]
|
38
|
+
|
39
|
+
$biomart_go= [
|
40
|
+
["GO ID", 'go_id'],
|
41
|
+
["GO Namespace", 'namespace_1003'],
|
42
|
+
]
|
43
|
+
|
44
|
+
$biomart_go_2009= [
|
45
|
+
["GO BP ID", 'go_biological_process_id'],
|
46
|
+
["GO MF ID", 'go_molecular_function_id'],
|
47
|
+
["GO CC ID", 'go_cellular_component_id'],
|
48
|
+
]
|
49
|
+
|
50
|
+
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
51
|
+
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|