rbbt 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
@@ -0,0 +1,145 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Homo sapiens"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "Entrez Gene ID"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 9606,
|
10
|
+
:fix => nil,
|
11
|
+
:check => proc{|code| false},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:biomart => {
|
16
|
+
:database => 'hsapiens_gene_ensembl',
|
17
|
+
:main => ['Entrez Gene ID' , "entrezgene"],
|
18
|
+
:extra => [
|
19
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
20
|
+
[ 'HGNC symbol', "hgnc_symbol" ],
|
21
|
+
[ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
|
22
|
+
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
23
|
+
],
|
24
|
+
}
|
25
|
+
|
26
|
+
}
|
27
|
+
|
28
|
+
$identifiers = {
|
29
|
+
:biomart => {
|
30
|
+
:database => 'hsapiens_gene_ensembl',
|
31
|
+
:main => ['Entrez Gene ID' , "entrezgene"],
|
32
|
+
:extra => [
|
33
|
+
[ 'Ensembl Gene ID', "ensembl_gene_id" ],
|
34
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
35
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
36
|
+
[ 'CCDS ID', "ccds" ],
|
37
|
+
[ 'Protein ID', "protein_id" ],
|
38
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
39
|
+
[ 'Unigene ID', "unigene" ],
|
40
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
41
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
42
|
+
[ 'HGNC ID', "hgnc_id", 'HGNC'],
|
43
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
44
|
+
|
45
|
+
# Affymetrix
|
46
|
+
[ 'AFFY HC G110', 'affy_hc_g110' ],
|
47
|
+
[ 'AFFY HG FOCUS', 'affy_hg_focus' ],
|
48
|
+
[ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
|
49
|
+
[ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
|
50
|
+
[ 'AFFY HG U133A', 'affy_hg_u133a' ],
|
51
|
+
[ 'AFFY HG U133B', 'affy_hg_u133b' ],
|
52
|
+
[ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
|
53
|
+
[ 'AFFY HG U95B', 'affy_hg_u95b' ],
|
54
|
+
[ 'AFFY HG U95C', 'affy_hg_u95c' ],
|
55
|
+
[ 'AFFY HG U95D', 'affy_hg_u95d' ],
|
56
|
+
[ 'AFFY HG U95E', 'affy_hg_u95e' ],
|
57
|
+
[ 'AFFY HG U95A', 'affy_hg_u95a' ],
|
58
|
+
[ 'AFFY HUGENEFL', 'affy_hugenefl' ],
|
59
|
+
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
|
60
|
+
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
61
|
+
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
62
|
+
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
63
|
+
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
64
|
+
[ 'Codelink ID', 'codelink' ],
|
65
|
+
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
66
|
+
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
67
|
+
|
68
|
+
],
|
69
|
+
:filter => [],
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
$go = {
|
74
|
+
:url => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
|
75
|
+
:code => 2,
|
76
|
+
:go => 4,
|
77
|
+
:pmid => 5,
|
78
|
+
}
|
79
|
+
|
80
|
+
$query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
|
81
|
+
##########################
|
82
|
+
|
83
|
+
require 'rbbt/util/index'
|
84
|
+
|
85
|
+
# Human-specific replacement for the generic 'gene.go' task.
#
# The GO association file is read keyed on column $go[:code]; each key is then
# translated through an index built from the 'identifiers' file (restricted to
# 'Associated Gene Name'), and one "<translated code>\t<GO term>" line is
# written per distinct GO term. Keys the index cannot translate, and keys
# without any GO term, are dropped.
Rake::Task['gene.go'].clear
file 'gene.go' => ['identifiers'] do
  # File.exist? replaces File.exists?, which was deprecated and finally removed
  # in Ruby 3.2. The guard is still needed: the 'identifiers' task can finish
  # without producing its file (it only prints a notice on Entrez::NoFile).
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    index        = Organism.id_index('human', :other => ['Associated Gene Name'])
    associations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])

    lines = []
    associations.each do |name, value_lists|
      code = index[name]
      next unless code

      # Keep only the GO accession embedded in each reference, once per gene.
      terms = value_lists.flatten.collect{|ref| ref.to_s[/GO:\d+/] }.compact.uniq
      terms.each{|term| lines << "#{code}\t#{term}" }
    end

    Open.write('gene.go', lines.join("\n"))
  end
end
|
106
|
+
|
107
|
+
# Human-specific replacement for the generic 'gene_go.pmid' task.
#
# Reads the GO association file keyed on column $go[:code], translates each key
# through Index.index('identifiers'), and writes one "<translated code>\t<pmid>"
# line per distinct PubMed id found in column $go[:pmid]. Keys the index cannot
# translate, and keys without any PMID reference, are dropped.
Rake::Task['gene_go.pmid'].clear
file 'gene_go.pmid' => ['identifiers'] do
  # File.exist? replaces File.exists?, which was deprecated and finally removed
  # in Ruby 3.2. The guard is still needed: the 'identifiers' task can finish
  # without producing its file (it only prints a notice on Entrez::NoFile).
  if File.exist? 'identifiers'
    index        = Index.index('identifiers')
    associations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])

    lines = []
    associations.each do |name, value_lists|
      code = index[name]
      next unless code

      # References look like "PMID:12345"; only the numeric part is kept.
      pmids = value_lists.flatten.collect{|ref| ref.to_s[/PMID:(\d+)/, 1] }.compact.uniq
      pmids.each{|pmid| lines << "#{code}\t#{pmid}" }
    end

    Open.write('gene_go.pmid', lines.join("\n"))
  end
end
|
127
|
+
|
128
|
+
|
129
|
+
# Human-specific replacement for the generic 'lexicon' task.
#
# Downloads the HGNC name table, translates each row key to an Entrez Gene ID
# through the organism id index (built on 'HGNC ID'), and writes one
# tab-separated line per gene: the Entrez Gene ID followed by the HGNC names.
# Rows whose key has no translation are skipped.
Rake::Task['lexicon'].clear
file 'lexicon' => ['identifiers'] do
  # File.exist? replaces File.exists?, which was deprecated and finally removed
  # in Ruby 3.2. The guard is still needed: the 'identifiers' task can finish
  # without producing its file (it only prints a notice on Entrez::NoFile).
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'

    # The header row of the download starts with "HGNC ID" and is excluded.
    names        = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])

    # The block parameters are deliberately not called `names`: reusing the
    # outer variable's name shadows it (and on Ruby 1.8 overwrote it).
    entries = names.collect{|code, synonyms|
      entrez = translations[code]
      next unless entrez
      ([entrez] + synonyms).join("\t")
    }.compact

    Open.write('lexicon', entries.join("\n"))
  end

end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Mus musculus"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "MGI DB ID"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 10090,
|
10
|
+
:fix => nil,
|
11
|
+
:check => proc{|code| code.match(/^MGI/)},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:file => {
|
16
|
+
:url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
|
17
|
+
:native => 0,
|
18
|
+
:extra => [2,3],
|
19
|
+
:exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
|
20
|
+
},
|
21
|
+
}
|
22
|
+
|
23
|
+
$identifiers = {
|
24
|
+
:file => {
|
25
|
+
:url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
|
26
|
+
:native => 0,
|
27
|
+
:extra => [],
|
28
|
+
:exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
|
29
|
+
},
|
30
|
+
:biomart => {
|
31
|
+
:database => 'mmusculus_gene_ensembl',
|
32
|
+
:main => ['MGI DB ID', 'mgi_id'] ,
|
33
|
+
:extra => [
|
34
|
+
['Associated Gene Name' , "external_gene_id"],
|
35
|
+
['Protein ID' , "protein_id"] ,
|
36
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
37
|
+
['Unigene ID' , "unigene"] ,
|
38
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
39
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
40
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
41
|
+
|
42
|
+
['Affy mg u74a',"affy_mg_u74a" ],
|
43
|
+
['Affy mg u74av2',"affy_mg_u74av2" ],
|
44
|
+
['Affy mg u74b',"affy_mg_u74b" ],
|
45
|
+
['Affy mg u74bv2',"affy_mg_u74bv2" ],
|
46
|
+
['Affy mg u74c',"affy_mg_u74c" ],
|
47
|
+
['Affy mg u74cv2',"affy_mg_u74cv2" ],
|
48
|
+
['Affy moe430a',"affy_moe430a" ],
|
49
|
+
['Affy moe430b',"affy_moe430b" ],
|
50
|
+
['AFFY MoEx',"affy_moex_1_0_st_v1" ],
|
51
|
+
['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
|
52
|
+
['Affy mouse430 2',"affy_mouse430_2" ],
|
53
|
+
['Affy mouse430a 2',"affy_mouse430a_2" ],
|
54
|
+
['Affy mu11ksuba',"affy_mu11ksuba" ],
|
55
|
+
['Affy mu11ksubb',"affy_mu11ksubb" ],
|
56
|
+
['Agilent WholeGenome',"agilent_wholegenome" ],
|
57
|
+
['Codelink ID',"codelink" ],
|
58
|
+
['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
|
59
|
+
['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
|
60
|
+
|
61
|
+
],
|
62
|
+
:filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
$go = {
|
67
|
+
:url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
|
68
|
+
:code => 1,
|
69
|
+
:go => 4,
|
70
|
+
:pmid => 5,
|
71
|
+
}
|
72
|
+
|
73
|
+
$query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
|
74
|
+
##########################
|
75
|
+
|
76
|
+
|
77
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Schizosaccharomyces pombe"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "GeneDB Id"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 4896,
|
10
|
+
:fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
|
11
|
+
:check => proc{|code| code.match(/^SP/)},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:file => {
|
16
|
+
:url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
|
17
|
+
:native => 0,
|
18
|
+
:extra => [1,2,3,4,5,6,7,8]
|
19
|
+
},
|
20
|
+
}
|
21
|
+
|
22
|
+
$identifiers = {
|
23
|
+
:file => {
|
24
|
+
:url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
|
25
|
+
:native => 0,
|
26
|
+
:extra => [],
|
27
|
+
},
|
28
|
+
}
|
29
|
+
|
30
|
+
$go = {
|
31
|
+
:url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
|
32
|
+
:code => 1,
|
33
|
+
:go => 4,
|
34
|
+
:pmid => 5,
|
35
|
+
}
|
36
|
+
|
37
|
+
$query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
|
38
|
+
####
|
39
|
+
|
40
|
+
|
@@ -0,0 +1,258 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/util/arrayHash'
|
4
|
+
require 'rbbt/sources/biomart'
|
5
|
+
require 'rbbt/sources/entrez'
|
6
|
+
require 'rbbt/sources/pubmed'
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
# 'name': stores the organism's display name ($name) in a file called 'name'.
file('name') { Open.write('name', $name) }

# 'all.pmid': runs the organism's PubMed query ($query) and stores the
# resulting ids, one per line.
file('all.pmid') do
  pmids = PubMed.query($query)
  Open.write('all.pmid', pmids.join("\n"))
end
|
17
|
+
|
18
|
+
# Builds the 'lexicon' file: one line per gene, the native id followed by every
# known name, tab separated.
#
# Sources, merged in this order when configured:
#   1. $lexicon[:file]     - a flat file read with Open.to_hash
#   2. $entrez2native      - Entrez Gene ID => native id translations
#   3. $lexicon[:biomart]  - BioMart attributes listed in :extra
#   4. Entrez gene aliases - Entrez.entrez2native(tax, 4)
# The 'Entrez Gene ID' field is only used as a join key and is removed before
# writing. If the Entrez files are not installed (Entrez::NoFile) a notice is
# printed and no file is produced.
file 'lexicon' do
  begin

    data = nil

    # Read from file
    if $lexicon[:file]
      file = Open.to_hash($lexicon[:file][:url], $lexicon[:file])
      data = ArrayHash.new(file, $native_id)
    end

    # Translate from entrez to native if needed
    if $entrez2native
      translations = {}
      Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|k,v|
        translations[k] = [v.join("|")]
      }
      translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
      if data
        data.merge(translations_data)
      else
        data = translations_data
      end
    end

    # Read from Biomart and merge with previous data
    if $lexicon[:biomart]
      config     = $lexicon[:biomart]
      labels     = config[:extra].collect{|v| v[0]}   # field names used in the table
      attributes = config[:extra].collect{|v| v[1]}   # BioMart attribute names

      biomart = {}
      BioMart.query(config[:database], config[:main][1], attributes, config[:filter]).each{|key, values_list|
        biomart[key] = values_list.values_at(*attributes).compact.collect{|list|
          list.select{|e| e.to_s != ""}.uniq.join("|")
        }
      }

      biomart_data = ArrayHash.new(biomart, config[:main][0], labels)

      if data
        # Join on the native id when BioMart provides it, otherwise on Entrez.
        # $native_id is a display label, so it has to be looked up among the
        # labels; the original only looked among the attribute names and could
        # never find it there. The attribute check is kept for compatibility.
        if labels.include?($native_id) || attributes.include?($native_id) || config[:main][0] == $native_id
          field = $native_id
        else
          field = 'Entrez Gene ID'
        end
        data.merge(biomart_data, field)
      else
        data = biomart_data
      end
    end

    # Add the Entrez aliases
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax],4).each{|k,v|
        gene_alias[k] = [v.select{|e| e.to_s != ""}.join("|")]
      }
      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
        data.merge(gene_alias_data, 'Entrez Gene ID')
      end
    end

    # Fail with a readable message instead of a NoMethodError on nil when the
    # organism configuration defines no source at all.
    raise "No lexicon source configured for #{$name}" unless data

    data.remove('Entrez Gene ID')
    data.clean

    lines = data.data.collect{|code, name_lists|
      "#{ code }\t" + name_lists.flatten.select{|n| n.to_s != ""}.uniq.join("\t")
    }
    Open.write('lexicon', lines.join("\n"))

  rescue Entrez::NoFile
    puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
|
95
|
+
|
96
|
+
|
97
|
+
# Builds the 'identifiers' table: a header line ("#<native id>\t<fields...>")
# followed by one tab-separated row per gene.
#
# Sources, merged in this order when configured:
#   1. $identifiers[:file]     - a flat file read with Open.to_hash
#   2. $entrez2native          - Entrez Gene ID => native id translations
#   3. $identifiers[:biomart]  - BioMart attributes listed in :extra; an
#                                optional third element of an :extra entry is
#                                prepended to every value as "<prefix>:<value>"
#   4. Entrez gene aliases     - Entrez.entrez2native(tax, 4)
# If the Entrez files are not installed (Entrez::NoFile) a notice is printed
# and no file is produced.
file 'identifiers' do

  begin
    data = nil

    # Read from file
    if $identifiers[:file]
      file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
      data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
    end

    # Translate from entrez to native if needed
    if $entrez2native
      translations = {}
      Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|k,v|
        translations[k] = [v.join("|")]
      }
      if translations.keys.any?
        translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
        if data
          data.merge(translations_data)
        else
          data = translations_data
        end
      end
    end

    # Read from Biomart and merge with previous data
    if $identifiers[:biomart]
      config     = $identifiers[:biomart]
      labels     = config[:extra].collect{|v| v[0]}   # field names used in the table
      attributes = config[:extra].collect{|v| v[1]}   # BioMart attribute names

      biomart = {}
      BioMart.query(config[:database], config[:main][1], attributes, config[:filter]).each{|key, values_list|
        biomart[key] = values_list.values_at(*attributes).compact.collect{|list|
          list.select{|e| e.to_s != ""}.uniq.join("|")
        }
      }

      biomart_data = ArrayHash.new(biomart, config[:main][0], labels)

      # Apply the optional per-field prefix (e.g. 'HGNC' => "HGNC:<id>").
      config[:extra].each{|values|
        if values[2]
          biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
        end
      }

      if data
        # Join on the native id when BioMart provides it, otherwise on Entrez.
        # $native_id is a display label, so it has to be looked up among the
        # labels; the original only looked among the attribute names and could
        # never find it there. The attribute check is kept for compatibility.
        if labels.include?($native_id) || attributes.include?($native_id) || config[:main][0] == $native_id
          field = $native_id
        else
          field = 'Entrez Gene ID'
        end
        data.merge(biomart_data, field)
      else
        data = biomart_data
      end
    end

    # Add the alias at the end
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax],4).each{|k,v|
        gene_alias[k] = [v.join("|")]
      }
      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
        if data
          data.merge(gene_alias_data, 'Entrez Gene ID')
        else
          data = gene_alias_data
        end
      end
    end

    # Fail with a readable message instead of a NoMethodError on nil when no
    # source produced any data.
    raise "No identifier source produced data for #{$name}" unless data

    # Build the whole content before touching the disk: a failure above must
    # not leave an empty 'identifiers' file behind, because rake would then
    # consider the target up to date. The header is taken before data.clean,
    # in the same order as the original code.
    header = "##{$native_id}\t" + data.fields.join("\t")
    data.clean
    rows = data.data.collect{|code, values| code + "\t" + values.join("\t") }

    # Write ids to file; the block form closes the handle even on error.
    File.open('identifiers', 'w'){|fout|
      fout.puts header
      rows.each{|row| fout.puts row }
    }

  rescue Entrez::NoFile
    puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
|
192
|
+
|
193
|
+
|
194
|
+
# Generic 'gene.go' task: reads the GO association file described by $go and
# writes one "<gene code>\t<GO term>" line per distinct GO term of each gene.
# Genes without any GO term produce no output.
file 'gene.go' do
  associations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])

  lines = []
  associations.each do |code, value_lists|
    # Keep only the GO accession embedded in each reference, once per gene.
    terms = value_lists.flatten.collect{|ref| ref.to_s[/GO:\d+/] }.compact.uniq
    terms.each{|term| lines << "#{code}\t#{term}" }
  end

  Open.write('gene.go', lines.join("\n"))
end
|
209
|
+
|
210
|
+
# Generic 'gene_go.pmid' task: reads the GO association file described by $go
# and writes one "<gene code>\t<pmid>" line per distinct PubMed id referenced
# for each gene. Genes without any PMID reference produce no output.
file 'gene_go.pmid' do
  associations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])

  lines = []
  associations.each do |code, value_lists|
    # References look like "PMID:12345"; only the numeric part is kept.
    pmids = value_lists.flatten.collect{|ref| ref.to_s[/PMID:(\d+)/, 1] }.compact.uniq
    pmids.each{|pmid| lines << "#{code}\t#{pmid}" }
  end

  Open.write('gene_go.pmid', lines.join("\n"))
end
|
223
|
+
|
224
|
+
|
225
|
+
# 'gene.pmid': gene => article associations taken from Entrez.
#
# Writes one "<gene code>\t<pmid>" line per association returned by
# Entrez.entrez2pubmed for the organism's taxon. When the organism's native id
# is not the Entrez Gene ID, each Entrez code is replaced by the first native
# id it translates to, and genes without a translation are skipped.
# If the Entrez files are not installed (Entrez::NoFile) a notice is printed
# and no file is produced.
file 'gene.pmid' do
  begin
    # Left nil when Entrez ids are already the native ids.
    translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"

    data = Entrez.entrez2pubmed($entrez2native[:tax])

    # Lines are collected flat, so a gene with an empty PMID list contributes
    # nothing (the nested joins used before emitted a blank line for it).
    lines = []
    data.each{|code, pmids|
      if translations
        next unless translations[code]
        code = translations[code].first
      end
      pmids.each{|pmid| lines << "#{ code }\t#{pmid}" }
    }

    Open.write('gene.pmid', lines.join("\n"))
  rescue Entrez::NoFile
    # Message corrected: the NCBI file is called gene2pubmed.
    puts "Gene article associations from entrez not produced, install the gene2pubmed file (rbbt_config install entrez)."
  end

end
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
|
249
|
+
# 'all': builds every data file of the organism.
task 'all' => ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']

# 'clean': deletes every generated data file; missing files are ignored.
# FileUtils.rm_f is used instead of shelling out to `rm -f` so the task does
# not depend on a POSIX shell being available.
task 'clean' do
  require 'fileutils'
  FileUtils.rm_f(['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid'])
end

# 'update': rebuilds the data files, removing the existing ones first when the
# global $force is set.
task 'update' do
  Rake::Task['clean'].invoke if $force
  Rake::Task['all'].invoke
end
|
258
|
+
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Rattus norvegicus"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "RGD DB ID"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 10116,
|
10
|
+
:check => proc{|code| code.match(/^RGD/)},
|
11
|
+
}
|
12
|
+
|
13
|
+
$lexicon = {
|
14
|
+
:file => {
|
15
|
+
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
16
|
+
:native => 1,
|
17
|
+
:extra => [2,9],
|
18
|
+
:exclude => proc{|l| !l.match(/^RGD/)}
|
19
|
+
},
|
20
|
+
}
|
21
|
+
|
22
|
+
$identifiers = {
|
23
|
+
:file => {
|
24
|
+
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
25
|
+
:native => 1,
|
26
|
+
:extra => [],
|
27
|
+
:exclude => proc{|l| !l.match(/^RGD/)}
|
28
|
+
},
|
29
|
+
:biomart => {
|
30
|
+
:database => 'rnorvegicus_gene_ensembl',
|
31
|
+
:main => ['Entrez Gene ID' , "entrezgene"],
|
32
|
+
:extra => [
|
33
|
+
['Associated Gene Name' , "external_gene_id"],
|
34
|
+
['Protein ID' , "protein_id"] ,
|
35
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
36
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
37
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
38
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
39
|
+
|
40
|
+
['Affy rae230a', "affy_rae230a"],
|
41
|
+
['Affy rae230b', "affy_rae230b"],
|
42
|
+
['Affy RaGene', "affy_ragene_1_0_st_v1"],
|
43
|
+
['Affy rat230 2', "affy_rat230_2"],
|
44
|
+
['Affy RaEx', "affy_raex_1_0_st_v1"],
|
45
|
+
['Affy rg u34a', "affy_rg_u34a"],
|
46
|
+
['Affy rg u34b', "affy_rg_u34b"],
|
47
|
+
['Affy rg u34c', "affy_rg_u34c"],
|
48
|
+
['Affy rn u34', "affy_rn_u34"],
|
49
|
+
['Affy rt u34', "affy_rt_u34"],
|
50
|
+
['Agilent WholeGenome',"agilent_wholegenome" ],
|
51
|
+
['Codelink ID ', "codelink"],
|
52
|
+
|
53
|
+
|
54
|
+
],
|
55
|
+
:filter => [],
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
$go = {
|
60
|
+
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
61
|
+
:exclude => proc{|l| !l.match(/^RGD/)},
|
62
|
+
:code => 1,
|
63
|
+
:go => 4,
|
64
|
+
:pmid => 5,
|
65
|
+
}
|
66
|
+
|
67
|
+
# PubMed query used to collect the article set for Rattus norvegicus. The
# species clause now targets rats; it had been copied unchanged from the mouse
# configuration ("mice"/mouse).
$query = '(("rats"[TIAB] NOT Medline[SB]) OR "rats"[MeSH Terms] OR rat[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
68
|
+
|
69
|
+
#{{{ Redefines
|
70
|
+
|
71
|
+
# Wraps Open.read so that the RGD gene association file gets its bare numeric
# ids prefixed with "RGD:" (lines starting with "RGD\t<id>" become
# "RGD\tRGD:<id>"). Every other URL is returned untouched.
module Open

  class << self
    # Alias only once: if this Rakefile is loaded a second time, re-aliasing
    # would point old_read at the wrapper itself and recurse forever.
    alias_method :old_read, :read unless method_defined?(:old_read)

    # url     - location handed to the original Open.read
    # options - passed through unchanged
    #
    # Returns the original data, or for the RGD association file the patched
    # lines joined with "\n".
    def read(url, options = {})
      data = old_read(url, options)

      # Dots are escaped so only the literal file name matches.
      return data unless url.to_s =~ /gene_association\.rgd\.gz/

      # String stopped being Enumerable in Ruby 1.9, so String#collect no
      # longer exists; split into lines explicitly. Non-string results are
      # iterated as before.
      # NOTE(review): assumes old_read returns the file content as a String
      # (or a list of lines) - confirm against rbbt/util/open.
      lines = data.respond_to?(:each_line) ? data.each_line.to_a : data.to_a

      lines.collect{|l| l.gsub(/^RGD\t/,"RGD\tRGD:")}.join("\n")
    end
  end
end
|
88
|
+
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Saccharomyces cerevisiae"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "SGD DB Id"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 4932,
|
10
|
+
:fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
11
|
+
:check => proc{|code| code.match(/^S0/)},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:file => {
|
16
|
+
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
17
|
+
:native => 0,
|
18
|
+
:extra => [4,3,5]
|
19
|
+
},
|
20
|
+
:biomart => {
|
21
|
+
:database => 'scerevisiae_gene_ensembl',
|
22
|
+
:main => ['Entrez Gene ID', 'entrezgene'],
|
23
|
+
:extra => [
|
24
|
+
['Interpro Description' , "interpro_description"],
|
25
|
+
],
|
26
|
+
:filter => [],
|
27
|
+
}
|
28
|
+
|
29
|
+
}
|
30
|
+
|
31
|
+
$identifiers = {
|
32
|
+
:file => {
|
33
|
+
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
34
|
+
:native => 0,
|
35
|
+
:extra => [],
|
36
|
+
},
|
37
|
+
:biomart => {
|
38
|
+
:database => 'scerevisiae_gene_ensembl',
|
39
|
+
:main => ['Entrez Gene ID', 'entrezgene'],
|
40
|
+
:extra => [
|
41
|
+
['Associated Gene Name' , "external_gene_id"],
|
42
|
+
['Ensembl Gene ID', "ensembl_gene_id" ],
|
43
|
+
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
44
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
45
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
46
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
47
|
+
['Protein ID' , "protein_id"] ,
|
48
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
49
|
+
# Affymetrix
|
50
|
+
['Affy yeast 2',"affy_yeast_2"],
|
51
|
+
['Affy yg s98', "affy_yg_s98"],
|
52
|
+
],
|
53
|
+
:filter => [],
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
$go = {
|
58
|
+
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
59
|
+
:code => 1,
|
60
|
+
:go => 4,
|
61
|
+
:pmid => 5,
|
62
|
+
}
|
63
|
+
|
64
|
+
$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
65
|
+
|
66
|
+
|