rbbt-sources 0.4.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/etc/biomart/missing_in_archive +15 -0
- data/lib/rbbt/sources/COSMIC.rb +14 -0
- data/lib/rbbt/sources/COSTART.rb +1 -1
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/InterPro.rb +17 -0
- data/lib/rbbt/sources/NCI.rb +7 -0
- data/lib/rbbt/sources/biomart.rb +9 -9
- data/lib/rbbt/sources/entrez.rb +44 -17
- data/lib/rbbt/sources/go.rb +10 -7
- data/lib/rbbt/sources/jochem.rb +4 -0
- data/lib/rbbt/sources/organism.rb +24 -25
- data/lib/rbbt/sources/organism/sequence.rb +253 -19
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/lib/rbbt/sources/pubmed.rb +10 -5
- data/lib/rbbt/sources/wgEncodeBroadHmm.rb +37 -0
- data/share/install/InterPro/Rakefile +29 -0
- data/share/install/JoChem/Rakefile +67 -0
- data/share/install/NCI/Rakefile +79 -0
- data/share/install/Organism/Hsa/Rakefile +20 -1
- data/share/install/Organism/Rno/Rakefile +2 -0
- data/share/install/Organism/organism_helpers.rb +134 -77
- data/share/install/lib/helpers.rb +6 -5
- data/test/rbbt/sources/test_biomart.rb +8 -5
- data/test/rbbt/sources/test_organism.rb +23 -19
- metadata +39 -14
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
|
3
3
|
module Polysearch
|
4
|
-
Rbbt.share.Polysearch.organ
|
5
|
-
Rbbt.share.Polysearch.tissue
|
6
|
-
Rbbt.share.Polysearch.location
|
7
|
-
Rbbt.share.Polysearch.disease
|
8
|
-
Rbbt.share.Polysearch.drug
|
4
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.organ, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
|
5
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.tissue, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
|
6
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.location, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
|
7
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.disease, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
|
8
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.drug, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
|
9
9
|
end
|
10
10
|
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt'
|
2
2
|
require 'libxml'
|
3
|
+
require 'rbbt/sources/gscholar'
|
3
4
|
|
4
5
|
# This module offers an interface with PubMed, to perform queries, and
|
5
6
|
# retrieve simple information from articles. It uses the caching
|
@@ -10,12 +11,16 @@ module PubMed
|
|
10
11
|
@@pubmed_lag = 1
|
11
12
|
def self.get_online(pmids)
|
12
13
|
|
13
|
-
|
14
|
-
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
|
14
|
+
pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
|
15
15
|
|
16
|
-
|
16
|
+
articles = []
|
17
|
+
Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
|
18
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
|
17
19
|
|
18
|
-
|
20
|
+
xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
|
21
|
+
|
22
|
+
articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
|
23
|
+
end
|
19
24
|
|
20
25
|
if pmids.is_a? Array
|
21
26
|
list = {}
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
|
3
|
+
# Access to the ENCODE Broad HMM chromatin-state tracks (wgEncodeBroadHmm).
#
# For every tissue in TISSUES a resource is claimed under
# Rbbt.share.databases.EBChromatin; the claim block downloads the BED file
# and reshapes it into a tab-separated table headed
# "#Region ID / Chromosome Name / Start / End / Type".
module EBChromatin
  BASE_URL='http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/'

  TISSUES= %w(Gm12878 H1hesc Hmec Hsmm Huvec Hepg2 K562 Nhek Nhlf)

  TISSUES.each do |tissue|
    file = "wgEncodeBroadHmm#{tissue}HMM.bed.gz"

    # The resource name is recovered from the file name (the part between
    # the "wgEncodeBroadHmm" prefix and the "HMM.bed.gz" suffix).
    resource_name = file.match(/wgEncodeBroadHmm(.*)HMM.bed.gz/)[1]

    Rbbt.claim Rbbt.share.databases.EBChromatin[resource_name], :proc do
      url = File.join(BASE_URL, file)

      # sed builds a "chr:start:end" region id in front of each row, cut
      # keeps the first five columns and awk prepends the header line.
      CMD.cmd('sed \'s/^chr\([[:alnum:]]\+\)\t\([[:digit:]]\+\)\t\([[:digit:]]\+\)/\1:\2:\3\t\1\t\2\t\3/\' | cut -f 1,2,3,4,5 | awk \'BEGIN {print "#Region ID\tChromosome Name\tStart\tEnd\tType"} /./ {print $0}\' ', :in => Open.read(url), :pipe => true).read
    end
  end

  # Looks up +positions+ (a single position or an Array of them) on
  # chromosome +chr+ for +tissue+.
  #
  # Returns one lookup result per position, in order. Raises a RuntimeError
  # when the tissue table holds no rows for the chromosome.
  def self.chromosome(tissue, chr, positions)
    list = positions.is_a?(Array) ? positions : [positions]

    file = Rbbt.share.databases.EBChromatin[tissue]

    # NOTE(review): presumably a persisted range index keyed by region id;
    # the exact semantics of :fwt / :range belong to Persistence -- confirm.
    chromosome_bed = Persistence.persist(file, "EBChromatin[#{tissue}][#{chr}]", :fwt, :chromosome => chr, :range => true) do |bed_file, options|
      chromosome = options[:chromosome]
      tsv = bed_file.tsv(:persist => false, :type => :list, :grep => "^#{chromosome}:\\|^#")

      unless tsv.size > 0
        raise "No chromatin information for chromosome #{ chr } in tissue #{ tissue }"
      end

      tsv.collect do |gene, values|
        [gene, values.values_at("Start", "End").collect{|p| p.to_i}]
      end
    end

    list.collect { |pos| chromosome_bed[pos] }
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
|
5
|
+
# Dataset queried for the InterPro position table.
$interpro_db = 'entry'

# [field name, attribute] pair used as the key of the position table.
$interpro_id = ['InterPro Entry Accession','entry_id']

# [field name, attribute] pairs fetched for each InterPro entry.
$interpro_pos = [
  ["UniProt/SwissProt Accession", "protein_ac"],
  ["Match Start Position", "pos_from"],
  ["Match Stop Position ", "pos_to"]
]

# Protein match positions for every InterPro entry.
# NOTE(review): only biomart and entrez are required above, yet this task
# calls InterPro.tsv -- confirm the InterPro module is loaded by then.
file 'interpro_positions' do |t|
  positions = InterPro.tsv($interpro_db, $interpro_id, $interpro_pos, [], nil, :type => :double, :nocache => true)
  Open.write(t.name, positions.to_s)
end

# Entry accession => full name, taken from the EBI names.dat dump with a
# TSV header prepended.
file 'interpro_names' do |t|
  header = "#: :type=:list\n#InterPro Entry Accession\tName\n"
  Open.write(t.name, header + Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/names.dat"))
end

# Entry accession => short name, taken from the EBI short_names.dat dump
# with a TSV header prepended.
file 'interpro_short_names' do |t|
  header = "#: :type=:list\n#InterPro Entry Accession\tShort Name\n"
  Open.write(t.name, header + Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/short_names.dat"))
end
|
27
|
+
|
28
|
+
|
29
|
+
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/util/misc'
|
3
|
+
|
4
|
+
# Reads the next record from +jochem+, an IO-like object responding to
# +eof?+ and +gets+.
#
# A record is every line up to, but not including, the next line that
# starts with "--", or up to the end of the stream.
#
# Returns the record text, or nil once the stream is exhausted. An empty
# record (a leading separator, or two separators in a row) is returned as
# "" rather than nil, so that a caller looping with
# `while chunk = read_chunk(io)` does not mistake it for the end of input.
def read_chunk(jochem)
  return nil if jochem.eof?

  chunk = String.new
  until jochem.eof?
    line = jochem.gets
    break if line.match(/^--/)
    chunk << line
  end
  chunk
end
|
12
|
+
|
13
|
+
# Returns the first element of +list+, or nil when +list+ is nil or empty.
def first(list)
  if list.nil? || list.empty?
    nil
  else
    list.first
  end
end
|
17
|
+
|
18
|
+
# Downloads the JoChem dictionary and splits it into four TSV files written
# to the current directory: 'identifiers', 'lexicon', 'inchi' and
# 'definitions'.
#
# Records are read with read_chunk; records containing a line that starts
# with "#" or "NS " are skipped. Every output file is closed before
# returning, even if parsing fails, so the files are fully flushed by the
# time the calling rake rule finishes.
def process_jochem
  jochem = Open.open("http://www.biosemantics.org/uploads/file/Jochem/JochemV1_2.zip")
  outputs = []

  begin
    identifiers = File.open('identifiers', 'w')
    outputs << identifiers
    identifiers.puts("#: :namespace=JoChem")
    identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")

    lexicon = File.open('lexicon', 'w')
    outputs << lexicon
    lexicon.puts("#: :namespace=JoChem")
    lexicon.puts("#ID\tSynonyms")

    inchi = File.open('inchi', 'w')
    outputs << inchi
    inchi.puts("#: :namespace=JoChem")
    inchi.puts("#ID\tInChi")

    definitions = File.open('definitions', 'w')
    outputs << definitions
    definitions.puts("#: :namespace=JoChem#:type=:list")
    definitions.puts("#ID\tDefinition")

    while chunk = read_chunk(jochem) do
      next if chunk.empty? or chunk =~ /^#/ or chunk =~ /^NS /

      # Collect the values of each two-letter field code of the record.
      info = {}
      chunk.split(/\n/).each do |line|
        line.sub!(/\t@match.*/,'')
        match = line.match(/([A-Z]*) (.*)/)
        # Lines that do not look like "CODE value" carry no field; skip
        # them instead of failing on a nil match.
        next if match.nil?
        code, value = match.values_at 1, 2
        info[code] ||= []
        info[code] << value
      end

      id = first(info["ID"])
      na = first(info["NA"])
      df = first(info["DF"])
      tm = info["TM"] || []
      db = info["DB"] || []

      pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
      drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
      inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact

      # The name goes first in the synonym list; build a new array rather
      # than mutating the parsed record.
      lexicon.puts([id, ([na] + tm) * "|"] * "\t")
      identifiers.puts([id, na, pubc * "|", drug * "|"] * "\t")
      inchi.puts([id, inch * "|"] * "\t") if inch.any?
      definitions.puts([id, df] * "\t") unless df.nil?
    end
  ensure
    outputs.each { |output| output.close unless output.closed? }
  end
end
|
62
|
+
|
63
|
+
# Any of the four JoChem outputs is produced by one run of process_jochem,
# executed inside the directory that holds the requested file.
rule /identifiers|lexicon|inchi|definitions/ do |t|
  target_dir = File.dirname(t.name)
  Misc.in_dir(target_dir) { process_jochem }
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module NCI
  # Extracts pathway membership from an NCI PID XML document.
  #
  # xml    - the XML document, as accepted by Nokogiri::XML.
  # format - the Name/@name_type to use as protein identifier
  #          (default "UP").
  #
  # Returns a Hash mapping each Pathway id to
  # [[long name], [identifiers]], where identifiers are the unique
  # identifiers of the protein molecules taking part in the pathway's
  # interactions.
  def self.get_pathways(xml, format = "UP")
    doc = Nokogiri::XML(xml)
    pathways = {}

    # Protein molecules that carry a name of the requested type.
    molecules = {}
    doc.xpath("//Molecule").each do |molecule|
      molecule_id = molecule.attribute('id').value
      type = molecule.attribute('molecule_type').value
      next unless type == "protein"
      names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
      next if names.empty?
      molecules[molecule_id] = {:xml => molecule, :uniprot => names.first}
    end

    # Molecule ids referenced by each interaction.
    interactions = {}
    doc.xpath("//Interaction").each do |interaction|
      interaction_id = interaction.attribute('id').value
      molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}

      interactions[interaction_id] = {:xml => interaction, :molecule_ids => molecule_ids}
    end

    doc.xpath("//Pathway").each do |pathway|
      pathway_id = pathway.attribute('id').value
      name = pathway.xpath('LongName').first.content

      interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}

      # Block parameters get their own names so they can never shadow (or,
      # on Ruby 1.8, overwrite) pathway_id. References to interactions that
      # are not in the document are dropped instead of raising on nil.
      pathway_interactions = interaction_ids.collect{|interaction_id| interactions[interaction_id]}.compact
      pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten

      pathway_uniprot_ids = pathway_molecule_ids.collect do |molecule_id|
        next unless molecules.include? molecule_id
        molecules[molecule_id][:uniprot]
      end

      pathways[pathway_id] = [[name], [pathway_uniprot_ids.compact.uniq]]
    end

    pathways
  end
end
|
45
|
+
|
46
|
+
# One file task per NCI PID source: target name, download URL, molecule
# name type handed to NCI.get_pathways, and the label of the identifier
# column in the resulting table.
[
  ['nature_pathways',   "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz", "UP", "UniProt/SwissProt Accession"],
  ['biocarta_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz",           "LL", "Entrez Gene ID"],
  ['reactome_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz",           "UP", "UniProt/SwissProt Accession"],
].each do |target, url, name_type, identifier_field|
  file target do |t|
    xml = Open.read(url)

    pathways = NCI.get_pathways(xml, name_type)

    Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name", identifier_field]).to_s)
  end
end
|
78
|
+
|
79
|
+
|
@@ -7,6 +7,8 @@ $taxs = [9606]
|
|
7
7
|
$scientific_name = "Homo sapiens"
|
8
8
|
|
9
9
|
$biomart_db = 'hsapiens_gene_ensembl'
|
10
|
+
$biomart_db_germline_variation = 'hsapiens_snp'
|
11
|
+
$biomart_db_somatic_variation = 'hsapiens_snp_som'
|
10
12
|
|
11
13
|
$biomart_lexicon = [
|
12
14
|
[ 'Associated Gene Name' , "external_gene_id"],
|
@@ -15,6 +17,14 @@ $biomart_lexicon = [
|
|
15
17
|
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
16
18
|
]
|
17
19
|
|
20
|
+
$biomart_protein_identifiers = [
|
21
|
+
[ 'Protein ID', "protein_id" ],
|
22
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
23
|
+
[ 'Unigene ID', "unigene" ],
|
24
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
25
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
26
|
+
]
|
27
|
+
|
18
28
|
$biomart_identifiers = [
|
19
29
|
[ 'Entrez Gene ID', "entrezgene"],
|
20
30
|
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
@@ -42,7 +52,7 @@ $biomart_identifiers = [
|
|
42
52
|
[ 'AFFY HG U95E', 'affy_hg_u95e' ],
|
43
53
|
[ 'AFFY HG U95A', 'affy_hg_u95a' ],
|
44
54
|
[ 'AFFY HUGENEFL', 'affy_hugenefl' ],
|
45
|
-
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
|
55
|
+
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
|
46
56
|
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
47
57
|
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
48
58
|
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
@@ -52,5 +62,14 @@ $biomart_identifiers = [
|
|
52
62
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
63
|
]
|
54
64
|
|
65
|
+
$biomart_go= [
|
66
|
+
["GO ID", 'go_id'],
|
67
|
+
["GO Namespace", 'namespace_1003'],
|
68
|
+
]
|
69
|
+
|
70
|
+
$biomart_pfam= [
|
71
|
+
["Pfam Domain", 'pfam'],
|
72
|
+
]
|
73
|
+
|
55
74
|
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
56
75
|
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
@@ -7,6 +7,8 @@ $taxs = [10116]
|
|
7
7
|
$scientific_name = "Rattus norvegicus"
|
8
8
|
|
9
9
|
$biomart_db = 'rnorvegicus_gene_ensembl'
|
10
|
+
$biomart_db_germline_variation = 'rnorvegicus_snp'
|
11
|
+
$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
|
10
12
|
|
11
13
|
$biomart_lexicon = [
|
12
14
|
[ 'Associated Gene Name' , "external_gene_id"],
|
@@ -2,8 +2,6 @@ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
|
2
2
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
3
3
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
4
4
|
$biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
|
5
|
-
$biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ]
|
6
|
-
$biomart_germline_variation_id = ['Variation ID', "external_id" ]
|
7
5
|
|
8
6
|
$biomart_gene_positions = [
|
9
7
|
['Chromosome Name','chromosome_name'],
|
@@ -60,42 +58,6 @@ $biomart_exons = [
|
|
60
58
|
['Exon Chr End','exon_chrom_end'],
|
61
59
|
]
|
62
60
|
|
63
|
-
#{{{ Variations
|
64
|
-
|
65
|
-
$biomart_germline_variation_positions = [
|
66
|
-
['Chromosome Location (bp)', "chromosome_location" ],
|
67
|
-
['SNP Chromosome Strand', "snp_chromosome_strand" ],
|
68
|
-
['Transcript location (bp)', "transcript_location" ],
|
69
|
-
['Allele', "allele" ],
|
70
|
-
['Protein Allele', "peptide_shift" ],
|
71
|
-
['CDS Start', "cds_start_2076" ],
|
72
|
-
['CDS End', "cds_end_2076" ],
|
73
|
-
]
|
74
|
-
|
75
|
-
$biomart_germline_variations = [
|
76
|
-
$biomart_ensembl_gene,
|
77
|
-
['Source', "source_name" ],
|
78
|
-
['Validated', "validated" ],
|
79
|
-
['Consequence Type', "synonymous_status" ],
|
80
|
-
]
|
81
|
-
|
82
|
-
$biomart_somatic_variation_positions = [
|
83
|
-
['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
|
84
|
-
['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
|
85
|
-
['Transcript location (bp)' , "somatic_transcript_location" ] ,
|
86
|
-
['Allele' , "somatic_allele" ] ,
|
87
|
-
['Protein Allele' , "somatic_peptide_shift" ] ,
|
88
|
-
['CDS Start' , "somatic_cds_start_2076" ] ,
|
89
|
-
['CDS End' , "somatic_cds_end_2076" ] ,
|
90
|
-
]
|
91
|
-
|
92
|
-
$biomart_somatic_variations = [
|
93
|
-
$biomart_ensembl_gene,
|
94
|
-
['Source' , "somatic_source_name" ] ,
|
95
|
-
['Validated' , "somatic_validated" ] ,
|
96
|
-
['Consequence Type' , "somatic_synonymous_status" ] ,
|
97
|
-
]
|
98
|
-
|
99
61
|
#{{{ Rules
|
100
62
|
|
101
63
|
file 'scientific_name' do |t|
|
@@ -104,15 +66,69 @@ end
|
|
104
66
|
|
105
67
|
file 'identifiers' do |t|
|
106
68
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
|
69
|
+
|
107
70
|
$biomart_identifiers.each do |name, key, prefix|
|
108
71
|
if prefix
|
109
72
|
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
110
73
|
end
|
111
74
|
end
|
112
75
|
|
76
|
+
name_pos = identifiers.identify_field "Associated Gene Name"
|
77
|
+
entrez2name = Entrez.entrez2name($taxs)
|
78
|
+
identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
|
79
|
+
names = values[name_pos]
|
80
|
+
|
81
|
+
matches = entrez.select do |e|
|
82
|
+
entrez2name.include? e and (names & entrez2name[e]).any?
|
83
|
+
end
|
84
|
+
|
85
|
+
if matches.any?
|
86
|
+
matches
|
87
|
+
else
|
88
|
+
entrez
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
|
93
|
+
entrez_synonyms.key_field = "Entrez Gene ID"
|
94
|
+
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
95
|
+
|
96
|
+
identifiers.attach entrez_synonyms
|
97
|
+
|
98
|
+
identifiers.each do |key, values|
|
99
|
+
values.each do |list|
|
100
|
+
list.reject!{|v| v.nil? or v.empty?}
|
101
|
+
list.uniq!
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
File.open(t.name, 'w') do |f| f.puts identifiers end
|
106
|
+
end
|
107
|
+
|
108
|
+
file 'lexicon' => 'identifiers' do |t|
|
109
|
+
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
110
|
+
|
111
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
|
112
|
+
entrez_description.key_field = "Entrez Gene ID"
|
113
|
+
entrez_description.fields = ["Entrez Gene Description"]
|
114
|
+
|
115
|
+
tsv.attach entrez_description
|
116
|
+
Open.write(t.name, tsv.to_s)
|
117
|
+
end
|
118
|
+
|
119
|
+
|
120
|
+
file 'protein_identifiers' do |t|
|
121
|
+
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_identifiers, [], nil, :namespace => $namespace)
|
122
|
+
$biomart_protein_identifiers.each do |name, key, prefix|
|
123
|
+
if prefix
|
124
|
+
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
113
128
|
File.open(t.name, 'w') do |f| f.puts identifiers end
|
114
129
|
end
|
115
130
|
|
131
|
+
|
116
132
|
file 'gene_transcripts' do |t|
|
117
133
|
transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat, :namespace => $namespace)
|
118
134
|
|
@@ -121,7 +137,7 @@ end
|
|
121
137
|
|
122
138
|
file 'transcripts' => 'gene_positions' do |t|
|
123
139
|
transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
|
124
|
-
transcripts.attach TSV.
|
140
|
+
transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
|
125
141
|
|
126
142
|
File.open(t.name, 'w') do |f| f.puts transcripts end
|
127
143
|
end
|
@@ -198,7 +214,7 @@ end
|
|
198
214
|
|
199
215
|
file 'exons' => 'gene_positions' do |t|
|
200
216
|
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
|
201
|
-
exons.attach TSV.
|
217
|
+
exons.attach TSV.open('gene_positions'), "Chromosome Name"
|
202
218
|
|
203
219
|
File.open(t.name, 'w') do |f| f.puts exons end
|
204
220
|
end
|
@@ -227,28 +243,21 @@ file 'transcript_sequence' do |t|
|
|
227
243
|
end
|
228
244
|
|
229
245
|
|
230
|
-
|
231
|
-
#$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
|
232
|
-
$biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
|
246
|
+
#{{{ Variations
|
233
247
|
|
234
|
-
|
235
|
-
|
236
|
-
end
|
248
|
+
$biomart_variation_id = ["SNP ID", "refsnp_id"]
|
249
|
+
$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
|
237
250
|
|
238
|
-
file '
|
239
|
-
|
240
|
-
File.open(t.name, 'w') do |f| f.puts variations.to_s end
|
251
|
+
file 'germline_variations' do |t|
|
252
|
+
BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
|
241
253
|
end
|
242
254
|
|
243
255
|
file 'somatic_variations' do |t|
|
244
|
-
|
245
|
-
File.open(t.name, 'w') do |f| f.puts variations.to_s end
|
256
|
+
BioMart.tsv($biomart_db_somatic_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
|
246
257
|
end
|
247
258
|
|
248
|
-
|
249
|
-
|
250
|
-
File.open(t.name, 'w') do |f| f.puts variations.to_s end
|
251
|
-
end
|
259
|
+
|
260
|
+
# {{{ Other info
|
252
261
|
|
253
262
|
file 'gene_pmids' do |t|
|
254
263
|
tsv = Entrez.entrez2pubmed($taxs)
|
@@ -260,47 +269,95 @@ file 'gene_pmids' do |t|
|
|
260
269
|
Open.write(t.name, text)
|
261
270
|
end
|
262
271
|
|
263
|
-
|
264
|
-
|
272
|
+
# Returns the transcripts of +exon+ that have a protein associated.
#
# exon_transcripts - maps an exon id to a list whose first element is the
#                    list of transcript ids containing that exon.
# transcript_info  - maps a transcript id to a list whose first element is
#                    its protein id (empty when the transcript is
#                    non-coding).
#
# Exons or transcripts missing from the tables yield no transcripts instead
# of raising, so no blanket rescue is needed and unrelated errors are no
# longer swallowed.
def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
  entry = exon_transcripts[exon]
  return [] if entry.nil?

  transcripts = entry.first || []

  transcripts.select do |transcript|
    info = transcript_info[transcript]
    protein = info.nil? ? nil : info.first
    # empty? is available on both String and Array, whereas String#any?
    # only existed on Ruby 1.8.
    !protein.nil? && !protein.empty?
  end
end
|
271
281
|
|
282
|
+
# Computes the offset of +exon+ inside +transcript+: the summed length of
# every exon ranked before it in that transcript.
#
# exons            - exon table; must answer identify_field for
#                    "Exon Chr Start" / "Exon Chr End" and return a row for
#                    an exon id.
# transcript_exons - maps a transcript id to its exon ids and their ranks,
#                    in the layout expected by Misc.zip_fields.
#
# Returns the offset, or nil when +exon+ is not part of +transcript+.
def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
  sizes = [0]
  rank = nil
  start_pos = exons.identify_field "Exon Chr Start"
  end_pos   = exons.identify_field "Exon Chr End"

  Misc.zip_fields(transcript_exons[transcript]).each do |current_exon, current_rank|
    position = current_rank.to_i
    first_base, last_base = exons[current_exon].values_at(start_pos, end_pos)
    # Exon length, both ends inclusive, stored at the index of its rank.
    sizes[position] = last_base.to_i - first_base.to_i + 1
    rank = position if current_exon == exon
  end

  return nil if rank.nil?

  # Slot 0 holds 0, so ranks 0..rank-1 add up the exons before this one.
  sizes[0..rank - 1].inject(0) { |total, size| size + total }
end
|
302
|
+
|
303
|
+
# Builds the 'exon_offsets' table: for every exon, the coding transcripts
# that contain it and the offset of the exon inside each of them (see
# coding_transcripts_for_exon and exon_offset_in_transcript).
file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts) do |t|
  exons = TSV.open('exons')
  exon_transcripts = TSV.open('transcript_exons', :double, :key_field => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true)
  # NOTE(review): gene_transcripts is loaded but not read below; kept in
  # case opening it has side effects -- confirm and drop if not.
  gene_transcripts = TSV.open('gene_transcripts', :flat)
  transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
  transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])

  # The option header and the field header are two separate lines; the
  # first one needs its own line terminator.
  string = "#: :namespace=#{$namespace}\n"
  string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"

  exons.unnamed = true
  exon_transcripts.unnamed = true
  gene_transcripts.unnamed = true
  transcript_info.unnamed = true
  transcript_exons.unnamed = true

  exons.monitor = true
  # NOTE(review): Misc.profile looks like leftover instrumentation; left in
  # place because its effect is not visible from here.
  Misc.profile do
    exons.through do |exon, info|
      transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)

      transcript_offsets = {}
      transcripts.each do |transcript|
        offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
        transcript_offsets[transcript] = offset unless offset.nil?
      end

      string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
    end
  end

  Open.write(t.name, string)
end
|
291
339
|
|
340
|
+
# GO terms (id and namespace) per Ensembl gene, as listed in $biomart_go.
file 'gene_go' do |t|
  go_terms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)

  File.open(t.name, 'w') { |out| out.puts go_terms }
end


# Pfam domains per Ensembl gene, as listed in $biomart_pfam.
file 'gene_pfam' do |t|
  pfam_domains = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)

  File.open(t.name, 'w') { |out| out.puts pfam_domains }
end
|
352
|
+
|
353
|
+
|
292
354
|
# Targets of the form "<archive>/<task>" (e.g. "may2009/identifiers") run
# <task> inside the <archive> directory with BioMart pointed at that
# archive.
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
  archive = $1
  task = $2
  Misc.in_dir(archive) do
    BioMart.set_archive archive
    begin
      Rake::Task[task].invoke
    ensure
      # Always restore the default BioMart source, even when the task
      # fails, so later tasks do not silently query the archive.
      BioMart.unset_archive
    end
  end
end
|