rbbt-sources 0.4.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/etc/biomart/missing_in_archive +15 -0
- data/lib/rbbt/sources/COSMIC.rb +14 -0
- data/lib/rbbt/sources/COSTART.rb +1 -1
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/InterPro.rb +17 -0
- data/lib/rbbt/sources/NCI.rb +7 -0
- data/lib/rbbt/sources/biomart.rb +9 -9
- data/lib/rbbt/sources/entrez.rb +44 -17
- data/lib/rbbt/sources/go.rb +10 -7
- data/lib/rbbt/sources/jochem.rb +4 -0
- data/lib/rbbt/sources/organism.rb +24 -25
- data/lib/rbbt/sources/organism/sequence.rb +253 -19
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/lib/rbbt/sources/pubmed.rb +10 -5
- data/lib/rbbt/sources/wgEncodeBroadHmm.rb +37 -0
- data/share/install/InterPro/Rakefile +29 -0
- data/share/install/JoChem/Rakefile +67 -0
- data/share/install/NCI/Rakefile +79 -0
- data/share/install/Organism/Hsa/Rakefile +20 -1
- data/share/install/Organism/Rno/Rakefile +2 -0
- data/share/install/Organism/organism_helpers.rb +134 -77
- data/share/install/lib/helpers.rb +6 -5
- data/test/rbbt/sources/test_biomart.rb +8 -5
- data/test/rbbt/sources/test_organism.rb +23 -19
- metadata +39 -14
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
|
3
3
|
module Polysearch
|
4
|
-
Rbbt.share.Polysearch.organ
|
5
|
-
Rbbt.share.Polysearch.tissue
|
6
|
-
Rbbt.share.Polysearch.location
|
7
|
-
Rbbt.share.Polysearch.disease
|
8
|
-
Rbbt.share.Polysearch.drug
|
4
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.organ, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
|
5
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.tissue, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
|
6
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.location, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
|
7
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.disease, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
|
8
|
+
Rbbt.claim Rbbt.share.databases.Polysearch.drug, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
|
9
9
|
end
|
10
10
|
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
require 'rbbt
|
1
|
+
require 'rbbt'
|
2
2
|
require 'libxml'
|
3
|
+
require 'rbbt/sources/gscholar'
|
3
4
|
|
4
5
|
# This module offers an interface with PubMed, to perform queries, and
|
5
6
|
# retrieve simple information from articles. It uses the caching
|
@@ -10,12 +11,16 @@ module PubMed
|
|
10
11
|
@@pubmed_lag = 1
|
11
12
|
def self.get_online(pmids)
|
12
13
|
|
13
|
-
|
14
|
-
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
|
14
|
+
pmids_complete = pmids.is_a?(Array) ? pmids : [pmids]
|
15
15
|
|
16
|
-
|
16
|
+
articles = []
|
17
|
+
Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
|
18
|
+
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
|
17
19
|
|
18
|
-
|
20
|
+
xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
|
21
|
+
|
22
|
+
articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
|
23
|
+
end
|
19
24
|
|
20
25
|
if pmids.is_a? Array
|
21
26
|
list = {}
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
|
3
|
+
# Access to the wgEncodeBroadHmm BED files published under BASE_URL (UCSC
# hg19 ENCODE downloads), one file per entry in TISSUES.
module EBChromatin
  BASE_URL='http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/'

  TISSUES= %w(Gm12878 H1hesc Hmec Hsmm Huvec Hepg2 K562 Nhek Nhlf)

  # Register one resource per tissue under Rbbt.share.databases.EBChromatin.
  # The producer block downloads the BED file and reshapes it into a
  # tab-separated table.
  TISSUES.each do |tissue|
    file = "wgEncodeBroadHmm#{tissue}HMM.bed.gz"

    # The resource name is the part of the file name between the fixed prefix
    # and suffix.
    resource_name = file.match(/wgEncodeBroadHmm(.*)HMM.bed.gz/)[1]
    resource = Rbbt.share.databases.EBChromatin[resource_name]

    Rbbt.claim resource, :proc do
      url = File.join(BASE_URL, file)

      # sed:  "chr<C>\t<start>\t<end>" becomes "<C>:<start>:<end>\t<C>\t<start>\t<end>"
      # cut:  keeps the first five columns
      # awk:  prints the header line first, then every non-empty line
      reformat = 'sed \'s/^chr\([[:alnum:]]\+\)\t\([[:digit:]]\+\)\t\([[:digit:]]\+\)/\1:\2:\3\t\1\t\2\t\3/\' | cut -f 1,2,3,4,5 | awk \'BEGIN {print "#Region ID\tChromosome Name\tStart\tEnd\tType"} /./ {print $0}\' '

      CMD.cmd(reformat, :in => Open.read(url), :pipe => true).read
    end
  end

  # Looks up one or several positions of chromosome +chr+ in the table claimed
  # for +tissue+.
  #
  # +positions+ may be a single position or an Array of positions. The result
  # is always an Array with one entry per position, holding whatever the
  # persisted per-chromosome structure returns for that position
  # (presumably the IDs of the regions covering it -- TODO confirm against
  # Persistence's :fwt / :range semantics).
  #
  # Raises a RuntimeError when the table has no rows for the chromosome.
  def self.chromosome(tissue, chr, positions)
    list = positions.is_a?(Array) ? positions : [positions]

    file = Rbbt.share.databases.EBChromatin[tissue]
    chromosome_bed = Persistence.persist(file, "EBChromatin[#{tissue}][#{chr}]", :fwt, :chromosome => chr, :range => true) do |bed_file, options|
      chromosome = options[:chromosome]

      # Only header lines and rows whose key starts with "<chromosome>:" are loaded.
      tsv = bed_file.tsv(:persist => false, :type => :list, :grep => "^#{chromosome}:\\|^#")

      raise "No chromatin information for chromosome #{ chr } in tissue #{ tissue }" unless tsv.size > 0

      # Each row becomes [region id, [start, end]] with integer coordinates.
      tsv.collect do |region, values|
        [region, values.values_at("Start", "End").collect{|p| p.to_i}]
      end
    end

    list.collect{|pos| chromosome_bed[pos]}
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
|
2
|
+
require 'rbbt/sources/biomart'
|
3
|
+
require 'rbbt/sources/entrez'
|
4
|
+
|
5
|
+
$interpro_db = 'entry'
|
6
|
+
|
7
|
+
$interpro_id = ['InterPro Entry Accession','entry_id']
|
8
|
+
|
9
|
+
$interpro_pos = [
|
10
|
+
["UniProt/SwissProt Accession", "protein_ac"],
|
11
|
+
["Match Start Position", "pos_from"],
|
12
|
+
["Match Stop Position ", "pos_to"]
|
13
|
+
]
|
14
|
+
|
15
|
+
file 'interpro_positions' do |t|
|
16
|
+
Open.write(t.name, InterPro.tsv($interpro_db, $interpro_id, $interpro_pos, [], nil, :type => :double, :nocache => true).to_s)
|
17
|
+
end
|
18
|
+
|
19
|
+
file 'interpro_names' do |t|
|
20
|
+
Open.write(t.name, "#: :type=:list\n#InterPro Entry Accession\tName\n" + Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/names.dat"))
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
file 'interpro_short_names' do |t|
|
25
|
+
Open.write(t.name, "#: :type=:list\n#InterPro Entry Accession\tShort Name\n" + Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/short_names.dat"))
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/util/misc'
|
3
|
+
|
4
|
+
# Reads the next record from +jochem+, an IO-like stream whose records are
# separated by lines starting with "--".
#
# Returns the lines of the record concatenated into one String; the separator
# line is consumed but not included. An empty record (a separator reached
# before any content line) is returned as "" so that it does not end a
# `while chunk = read_chunk(io)` loop prematurely. Returns nil only when the
# stream is exhausted and nothing was read.
def read_chunk(jochem)
  chunk = ""
  separator_found = false

  until jochem.eof?
    line = jochem.gets
    if line.match(/^--/)
      separator_found = true
      break
    end
    chunk << line
  end

  # nil signals end of input; an empty record is not end of input.
  return nil if chunk.empty? and not separator_found
  chunk
end
|
12
|
+
|
13
|
+
# Returns the first element of +list+, or nil when +list+ is nil or empty.
def first(list)
  # Calling #first on an empty list already yields nil, so only nil needs a guard.
  list.first unless list.nil?
end
|
17
|
+
|
18
|
+
# Reads the JoChem dictionary and writes four tab-separated files into the
# current directory: 'identifiers', 'lexicon', 'inchi' and 'definitions'.
#
# Records are obtained with read_chunk. A record is skipped when it is empty
# or when any of its lines starts with "#" or "NS ". Every remaining line is
# parsed as "<CODE> <value>" after removing a trailing "\t@match..." part, and
# the values are grouped by code:
#
#   ID - record identifier (first value used)
#   NA - compound name (first value used)
#   DF - definition (first value used)
#   TM - terms, written to 'lexicon' after the name
#   DB - cross references; PUBC_, DRUG_ and INCH_InChI= prefixes are extracted
#
# The output files are closed when the method finishes, including when
# parsing raises.
def process_jochem
  # NOTE(review): the stream returned by Open.open is not closed here because
  # its close semantics are not visible from this file -- confirm and close it
  # if that is safe.
  jochem = Open.open("http://www.biosemantics.org/uploads/file/Jochem/JochemV1_2.zip")
  outputs = []

  begin
    outputs << (identifiers = File.open('identifiers', 'w'))
    identifiers.puts("#: :namespace=JoChem")
    identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")

    outputs << (lexicon = File.open('lexicon', 'w'))
    lexicon.puts("#: :namespace=JoChem")
    lexicon.puts("#ID\tSynonyms")

    outputs << (inchi = File.open('inchi', 'w'))
    inchi.puts("#: :namespace=JoChem")
    inchi.puts("#ID\tInChi")

    outputs << (definitions = File.open('definitions', 'w'))
    definitions.puts("#: :namespace=JoChem#:type=:list")
    definitions.puts("#ID\tDefinition")

    while chunk = read_chunk(jochem) do
      next if chunk.empty? or chunk =~ /^#/ or chunk =~ /^NS /

      # Group the values of the record by their leading code.
      info = {}
      chunk.split(/\n/).each do |line|
        line.sub!(/\t@match.*/,'')
        code, value = line.match(/([A-Z]*) (.*)/).values_at 1, 2
        info[code] ||= []
        info[code] << value
      end

      id = first(info["ID"])
      na = first(info["NA"])
      df = first(info["DF"])
      tm = info["TM"] || []
      db = info["DB"] || []

      pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
      drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
      inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact

      # The compound name is listed first among the synonyms.
      lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
      identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
      inchi.puts [id, inch * "|"] * "\t" if inch.any?
      definitions.puts [id, df] * "\t" unless df.nil?
    end
  ensure
    # Flush and release every file that was successfully opened.
    outputs.each{|file| file.close}
  end
end
|
62
|
+
|
63
|
+
rule /identifiers|lexicon|inchi|definitions/ do |t|
|
64
|
+
Misc.in_dir(File.dirname(t.name)) do
|
65
|
+
process_jochem
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module NCI
  # Extracts pathway membership from an NCI PID XML document.
  #
  # xml    - the XML document, as accepted by Nokogiri::XML.
  # format - the value of the Name element's name_type attribute used to pick
  #          the identifier of each protein molecule (default "UP").
  #
  # Returns a Hash keyed by the Pathway element's id attribute. Each value is
  # [[long name], [identifiers]], where identifiers is the de-duplicated list
  # of identifiers of the protein molecules taking part in the pathway's
  # interactions. Molecules that are not proteins or have no name of the
  # requested type, and references to interactions or molecules that are not
  # declared in the document, are ignored.
  def self.get_pathways(xml, format = "UP")
    doc = Nokogiri::XML(xml)
    pathways = {}

    # Protein molecules that carry a name of the requested type.
    molecules = {}
    doc.xpath("//Molecule").each do |molecule|
      molecule_id = molecule.attribute('id').value
      type = molecule.attribute('molecule_type').value
      next unless type == "protein"
      names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
      next if names.empty?
      # The key is called :uniprot regardless of the name type requested.
      molecules[molecule_id] = {:xml => molecule, :uniprot => names.first}
    end

    # Molecules referenced by each interaction.
    interactions = {}
    doc.xpath("//Interaction").each do |interaction|
      interaction_id = interaction.attribute('id').value
      molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}

      interactions[interaction_id] = {:xml => interaction, :molecule_ids => molecule_ids}
    end

    doc.xpath("//Pathway").each do |pathway|
      pathway_id = pathway.attribute('id').value
      name = pathway.xpath('LongName').first.content

      interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}

      # Block parameters use their own names so the pathway id is never
      # clobbered or shadowed; undeclared interactions are dropped.
      pathway_interactions = interaction_ids.collect{|interaction_id| interactions[interaction_id]}.compact
      pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten

      pathway_identifiers = pathway_molecule_ids.collect do |molecule_id|
        next unless molecules.include? molecule_id
        molecules[molecule_id][:uniprot]
      end

      # The identifier list is kept wrapped in an outer array, as callers
      # receive it today.
      # NOTE(review): confirm this nesting is what TSV.setup expects.
      pathways[pathway_id] = [[name], [pathway_identifiers.compact.uniq]]
    end

    pathways
  end
end
|
45
|
+
|
46
|
+
file 'nature_pathways' do |t|
|
47
|
+
|
48
|
+
url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz"
|
49
|
+
|
50
|
+
xml = Open.read(url)
|
51
|
+
|
52
|
+
pathways = NCI.get_pathways(xml)
|
53
|
+
|
54
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
55
|
+
end
|
56
|
+
|
57
|
+
file 'biocarta_pathways' do |t|
|
58
|
+
|
59
|
+
url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz"
|
60
|
+
|
61
|
+
xml = Open.read(url)
|
62
|
+
|
63
|
+
pathways = NCI.get_pathways(xml, "LL")
|
64
|
+
|
65
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
66
|
+
end
|
67
|
+
|
68
|
+
file 'reactome_pathways' do |t|
|
69
|
+
|
70
|
+
url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
|
71
|
+
|
72
|
+
xml = Open.read(url)
|
73
|
+
|
74
|
+
pathways = NCI.get_pathways(xml, "UP")
|
75
|
+
|
76
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
77
|
+
end
|
78
|
+
|
79
|
+
|
@@ -7,6 +7,8 @@ $taxs = [9606]
|
|
7
7
|
$scientific_name = "Homo sapiens"
|
8
8
|
|
9
9
|
$biomart_db = 'hsapiens_gene_ensembl'
|
10
|
+
$biomart_db_germline_variation = 'hsapiens_snp'
|
11
|
+
$biomart_db_somatic_variation = 'hsapiens_snp_som'
|
10
12
|
|
11
13
|
$biomart_lexicon = [
|
12
14
|
[ 'Associated Gene Name' , "external_gene_id"],
|
@@ -15,6 +17,14 @@ $biomart_lexicon = [
|
|
15
17
|
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
16
18
|
]
|
17
19
|
|
20
|
+
$biomart_protein_identifiers = [
|
21
|
+
[ 'Protein ID', "protein_id" ],
|
22
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
23
|
+
[ 'Unigene ID', "unigene" ],
|
24
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
25
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
26
|
+
]
|
27
|
+
|
18
28
|
$biomart_identifiers = [
|
19
29
|
[ 'Entrez Gene ID', "entrezgene"],
|
20
30
|
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
@@ -42,7 +52,7 @@ $biomart_identifiers = [
|
|
42
52
|
[ 'AFFY HG U95E', 'affy_hg_u95e' ],
|
43
53
|
[ 'AFFY HG U95A', 'affy_hg_u95a' ],
|
44
54
|
[ 'AFFY HUGENEFL', 'affy_hugenefl' ],
|
45
|
-
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
|
55
|
+
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
|
46
56
|
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
47
57
|
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
48
58
|
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
@@ -52,5 +62,14 @@ $biomart_identifiers = [
|
|
52
62
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
63
|
]
|
54
64
|
|
65
|
+
$biomart_go= [
|
66
|
+
["GO ID", 'go_id'],
|
67
|
+
["GO Namespace", 'namespace_1003'],
|
68
|
+
]
|
69
|
+
|
70
|
+
$biomart_pfam= [
|
71
|
+
["Pfam Domain", 'pfam'],
|
72
|
+
]
|
73
|
+
|
55
74
|
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
56
75
|
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
@@ -7,6 +7,8 @@ $taxs = [10116]
|
|
7
7
|
$scientific_name = "Rattus norvegicus"
|
8
8
|
|
9
9
|
$biomart_db = 'rnorvegicus_gene_ensembl'
|
10
|
+
$biomart_db_germline_variation = 'rnorvegicus_snp'
|
11
|
+
$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
|
10
12
|
|
11
13
|
$biomart_lexicon = [
|
12
14
|
[ 'Associated Gene Name' , "external_gene_id"],
|
@@ -2,8 +2,6 @@ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
|
2
2
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
3
3
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
4
4
|
$biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
|
5
|
-
$biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ]
|
6
|
-
$biomart_germline_variation_id = ['Variation ID', "external_id" ]
|
7
5
|
|
8
6
|
$biomart_gene_positions = [
|
9
7
|
['Chromosome Name','chromosome_name'],
|
@@ -60,42 +58,6 @@ $biomart_exons = [
|
|
60
58
|
['Exon Chr End','exon_chrom_end'],
|
61
59
|
]
|
62
60
|
|
63
|
-
#{{{ Variations
|
64
|
-
|
65
|
-
$biomart_germline_variation_positions = [
|
66
|
-
['Chromosome Location (bp)', "chromosome_location" ],
|
67
|
-
['SNP Chromosome Strand', "snp_chromosome_strand" ],
|
68
|
-
['Transcript location (bp)', "transcript_location" ],
|
69
|
-
['Allele', "allele" ],
|
70
|
-
['Protein Allele', "peptide_shift" ],
|
71
|
-
['CDS Start', "cds_start_2076" ],
|
72
|
-
['CDS End', "cds_end_2076" ],
|
73
|
-
]
|
74
|
-
|
75
|
-
$biomart_germline_variations = [
|
76
|
-
$biomart_ensembl_gene,
|
77
|
-
['Source', "source_name" ],
|
78
|
-
['Validated', "validated" ],
|
79
|
-
['Consequence Type', "synonymous_status" ],
|
80
|
-
]
|
81
|
-
|
82
|
-
$biomart_somatic_variation_positions = [
|
83
|
-
['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
|
84
|
-
['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
|
85
|
-
['Transcript location (bp)' , "somatic_transcript_location" ] ,
|
86
|
-
['Allele' , "somatic_allele" ] ,
|
87
|
-
['Protein Allele' , "somatic_peptide_shift" ] ,
|
88
|
-
['CDS Start' , "somatic_cds_start_2076" ] ,
|
89
|
-
['CDS End' , "somatic_cds_end_2076" ] ,
|
90
|
-
]
|
91
|
-
|
92
|
-
$biomart_somatic_variations = [
|
93
|
-
$biomart_ensembl_gene,
|
94
|
-
['Source' , "somatic_source_name" ] ,
|
95
|
-
['Validated' , "somatic_validated" ] ,
|
96
|
-
['Consequence Type' , "somatic_synonymous_status" ] ,
|
97
|
-
]
|
98
|
-
|
99
61
|
#{{{ Rules
|
100
62
|
|
101
63
|
file 'scientific_name' do |t|
|
@@ -104,15 +66,69 @@ end
|
|
104
66
|
|
105
67
|
file 'identifiers' do |t|
|
106
68
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
|
69
|
+
|
107
70
|
$biomart_identifiers.each do |name, key, prefix|
|
108
71
|
if prefix
|
109
72
|
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
110
73
|
end
|
111
74
|
end
|
112
75
|
|
76
|
+
name_pos = identifiers.identify_field "Associated Gene Name"
|
77
|
+
entrez2name = Entrez.entrez2name($taxs)
|
78
|
+
identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
|
79
|
+
names = values[name_pos]
|
80
|
+
|
81
|
+
matches = entrez.select do |e|
|
82
|
+
entrez2name.include? e and (names & entrez2name[e]).any?
|
83
|
+
end
|
84
|
+
|
85
|
+
if matches.any?
|
86
|
+
matches
|
87
|
+
else
|
88
|
+
entrez
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
|
93
|
+
entrez_synonyms.key_field = "Entrez Gene ID"
|
94
|
+
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
95
|
+
|
96
|
+
identifiers.attach entrez_synonyms
|
97
|
+
|
98
|
+
identifiers.each do |key, values|
|
99
|
+
values.each do |list|
|
100
|
+
list.reject!{|v| v.nil? or v.empty?}
|
101
|
+
list.uniq!
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
File.open(t.name, 'w') do |f| f.puts identifiers end
|
106
|
+
end
|
107
|
+
|
108
|
+
file 'lexicon' => 'identifiers' do |t|
|
109
|
+
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
110
|
+
|
111
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
|
112
|
+
entrez_description.key_field = "Entrez Gene ID"
|
113
|
+
entrez_description.fields = ["Entrez Gene Description"]
|
114
|
+
|
115
|
+
tsv.attach entrez_description
|
116
|
+
Open.write(t.name, tsv.to_s)
|
117
|
+
end
|
118
|
+
|
119
|
+
|
120
|
+
file 'protein_identifiers' do |t|
|
121
|
+
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_identifiers, [], nil, :namespace => $namespace)
|
122
|
+
$biomart_protein_identifiers.each do |name, key, prefix|
|
123
|
+
if prefix
|
124
|
+
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
113
128
|
File.open(t.name, 'w') do |f| f.puts identifiers end
|
114
129
|
end
|
115
130
|
|
131
|
+
|
116
132
|
file 'gene_transcripts' do |t|
|
117
133
|
transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat, :namespace => $namespace)
|
118
134
|
|
@@ -121,7 +137,7 @@ end
|
|
121
137
|
|
122
138
|
file 'transcripts' => 'gene_positions' do |t|
|
123
139
|
transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
|
124
|
-
transcripts.attach TSV.
|
140
|
+
transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
|
125
141
|
|
126
142
|
File.open(t.name, 'w') do |f| f.puts transcripts end
|
127
143
|
end
|
@@ -198,7 +214,7 @@ end
|
|
198
214
|
|
199
215
|
file 'exons' => 'gene_positions' do |t|
|
200
216
|
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
|
201
|
-
exons.attach TSV.
|
217
|
+
exons.attach TSV.open('gene_positions'), "Chromosome Name"
|
202
218
|
|
203
219
|
File.open(t.name, 'w') do |f| f.puts exons end
|
204
220
|
end
|
@@ -227,28 +243,21 @@ file 'transcript_sequence' do |t|
|
|
227
243
|
end
|
228
244
|
|
229
245
|
|
230
|
-
|
231
|
-
#$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
|
232
|
-
$biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
|
246
|
+
#{{{ Variations
|
233
247
|
|
234
|
-
|
235
|
-
|
236
|
-
end
|
248
|
+
$biomart_variation_id = ["SNP ID", "refsnp_id"]
|
249
|
+
$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
|
237
250
|
|
238
|
-
file '
|
239
|
-
|
240
|
-
File.open(t.name, 'w') do |f| f.puts variations.to_s end
|
251
|
+
file 'germline_variations' do |t|
|
252
|
+
BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
|
241
253
|
end
|
242
254
|
|
243
255
|
file 'somatic_variations' do |t|
|
244
|
-
|
245
|
-
File.open(t.name, 'w') do |f| f.puts variations.to_s end
|
256
|
+
BioMart.tsv($biomart_db_somatic_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
|
246
257
|
end
|
247
258
|
|
248
|
-
|
249
|
-
|
250
|
-
File.open(t.name, 'w') do |f| f.puts variations.to_s end
|
251
|
-
end
|
259
|
+
|
260
|
+
# {{{ Other info
|
252
261
|
|
253
262
|
file 'gene_pmids' do |t|
|
254
263
|
tsv = Entrez.entrez2pubmed($taxs)
|
@@ -260,47 +269,95 @@ file 'gene_pmids' do |t|
|
|
260
269
|
Open.write(t.name, text)
|
261
270
|
end
|
262
271
|
|
263
|
-
|
264
|
-
|
272
|
+
def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
273
|
+
transcripts = begin
|
274
|
+
exon_transcripts[exon].first
|
275
|
+
rescue
|
276
|
+
[]
|
277
|
+
end
|
265
278
|
|
266
|
-
|
267
|
-
|
268
|
-
gene_transcripts = TSV.new('gene_transcripts', :flat, :persistence => true )
|
269
|
-
transcript_info = TSV.new('transcripts', :list, :persistence => true )
|
270
|
-
transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
|
279
|
+
transcripts.select{|transcript| transcript_info[transcript].first.any?}
|
280
|
+
end
|
271
281
|
|
282
|
+
# Computes the offset of +exon+ inside +transcript+: the summed length of the
# exons that precede it by rank.
#
# exon             - exon identifier to locate.
# transcript       - transcript identifier.
# exons            - table indexed by exon identifier; must answer
#                    identify_field for "Exon Chr Start" and "Exon Chr End".
# transcript_exons - table indexed by transcript identifier whose entry is
#                    turned into [exon, rank] pairs by Misc.zip_fields.
#
# Returns the offset as an Integer, or nil when +exon+ is not listed for
# +transcript+. Ranks missing from the transcript contribute nothing to the
# sum instead of raising.
def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
  start_pos = exons.identify_field "Exon Chr Start"
  end_pos = exons.identify_field "Exon Chr End"

  # sizes[rank] is the length of the exon with that rank; index 0 is a
  # zero-length placeholder so that slicing up to rank - 1 works for rank 1.
  sizes = [0]
  rank = nil

  Misc.zip_fields(transcript_exons[transcript]).each do |_exon, _rank|
    _rank = _rank.to_i
    s, e = exons[_exon].values_at(start_pos, end_pos)
    sizes[_rank] = e.to_i - s.to_i + 1
    rank = _rank if _exon == exon
  end

  return nil if rank.nil?

  # Gaps in the ranks leave nil entries in sizes; drop them before summing.
  sizes[0..rank - 1].compact.inject(0){|total, size| total + size}
end
|
302
|
+
|
303
|
+
file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
|
304
|
+
exons = TSV.open('exons')
|
305
|
+
exon_transcripts = nil
|
306
|
+
exon_transcripts = TSV.open('transcript_exons', :double, :key_field => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true)
|
307
|
+
gene_transcripts = TSV.open('gene_transcripts', :flat)
|
308
|
+
transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
|
309
|
+
transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
|
272
310
|
|
273
311
|
string = "#: :namespace=#{$namespace}"
|
274
312
|
string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
|
275
|
-
exons.each do |exon, info|
|
276
|
-
gene, start, finish, strand, chr = info
|
277
313
|
|
278
|
-
|
314
|
+
exons.unnamed = true
|
315
|
+
exon_transcripts.unnamed = true
|
316
|
+
gene_transcripts.unnamed = true
|
317
|
+
transcript_info.unnamed = true
|
318
|
+
transcript_exons.unnamed = true
|
319
|
+
|
320
|
+
exons.monitor = true
|
321
|
+
Misc.profile do
|
322
|
+
exons.through do |exon, info|
|
323
|
+
gene, start, finish, strand, chr = info
|
279
324
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
325
|
+
transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
326
|
+
|
327
|
+
transcript_offsets = {}
|
328
|
+
transcripts.each do |transcript|
|
329
|
+
offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
|
330
|
+
transcript_offsets[transcript] = offset unless offset.nil?
|
331
|
+
end
|
332
|
+
|
333
|
+
string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
|
284
334
|
end
|
285
|
-
|
286
|
-
string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
|
287
335
|
end
|
288
336
|
|
289
337
|
Open.write(t.name, string)
|
290
338
|
end
|
291
339
|
|
340
|
+
file 'gene_go' do |t|
|
341
|
+
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
|
342
|
+
|
343
|
+
File.open(t.name, 'w') do |f| f.puts goterms end
|
344
|
+
end
|
345
|
+
|
346
|
+
|
347
|
+
file 'gene_pfam' do |t|
|
348
|
+
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
|
349
|
+
|
350
|
+
File.open(t.name, 'w') do |f| f.puts goterms end
|
351
|
+
end
|
352
|
+
|
353
|
+
|
292
354
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
293
355
|
t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
|
294
356
|
archive = $1
|
295
357
|
task = $2
|
296
|
-
|
297
|
-
begin
|
298
|
-
FileUtils.mkdir archive unless File.exists? archive
|
299
|
-
FileUtils.cd File.join(archive)
|
358
|
+
Misc.in_dir(archive) do
|
300
359
|
BioMart.set_archive archive
|
301
360
|
Rake::Task[task].invoke
|
302
361
|
BioMart.unset_archive
|
303
|
-
ensure
|
304
|
-
FileUtils.cd old_pwd
|
305
362
|
end
|
306
363
|
end
|