rbbt 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
@@ -0,0 +1,145 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Homo sapiens"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "Entrez Gene ID"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 9606,
|
10
|
+
:fix => nil,
|
11
|
+
:check => proc{|code| false},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:biomart => {
|
16
|
+
:database => 'hsapiens_gene_ensembl',
|
17
|
+
:main => ['Entrez Gene ID' , "entrezgene"],
|
18
|
+
:extra => [
|
19
|
+
[ 'Associated Gene Name' , "external_gene_id"],
|
20
|
+
[ 'HGNC symbol', "hgnc_symbol" ],
|
21
|
+
[ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
|
22
|
+
[ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
|
23
|
+
],
|
24
|
+
}
|
25
|
+
|
26
|
+
}
|
27
|
+
|
28
|
+
$identifiers = {
|
29
|
+
:biomart => {
|
30
|
+
:database => 'hsapiens_gene_ensembl',
|
31
|
+
:main => ['Entrez Gene ID' , "entrezgene"],
|
32
|
+
:extra => [
|
33
|
+
[ 'Ensembl Gene ID', "ensembl_gene_id" ],
|
34
|
+
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
35
|
+
[ 'Associated Gene Name', "external_gene_id" ],
|
36
|
+
[ 'CCDS ID', "ccds" ],
|
37
|
+
[ 'Protein ID', "protein_id" ],
|
38
|
+
[ 'RefSeq Protein ID', "refseq_peptide" ],
|
39
|
+
[ 'Unigene ID', "unigene" ],
|
40
|
+
[ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
|
41
|
+
[ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
|
42
|
+
[ 'HGNC ID', "hgnc_id", 'HGNC'],
|
43
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
44
|
+
|
45
|
+
# Affymetrix
|
46
|
+
[ 'AFFY HC G110', 'affy_hc_g110' ],
|
47
|
+
[ 'AFFY HG FOCUS', 'affy_hg_focus' ],
|
48
|
+
[ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
|
49
|
+
[ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
|
50
|
+
[ 'AFFY HG U133A', 'affy_hg_u133a' ],
|
51
|
+
[ 'AFFY HG U133B', 'affy_hg_u133b' ],
|
52
|
+
[ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
|
53
|
+
[ 'AFFY HG U95B', 'affy_hg_u95b' ],
|
54
|
+
[ 'AFFY HG U95C', 'affy_hg_u95c' ],
|
55
|
+
[ 'AFFY HG U95D', 'affy_hg_u95d' ],
|
56
|
+
[ 'AFFY HG U95E', 'affy_hg_u95e' ],
|
57
|
+
[ 'AFFY HG U95A', 'affy_hg_u95a' ],
|
58
|
+
[ 'AFFY HUGENEFL', 'affy_hugenefl' ],
|
59
|
+
[ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
|
60
|
+
[ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
|
61
|
+
[ 'AFFY U133 X3P', 'affy_u133_x3p' ],
|
62
|
+
[ 'Agilent WholeGenome',"agilent_wholegenome" ],
|
63
|
+
[ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
|
64
|
+
[ 'Codelink ID', 'codelink' ],
|
65
|
+
[ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
|
66
|
+
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
67
|
+
|
68
|
+
],
|
69
|
+
:filter => [],
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
$go = {
|
74
|
+
:url => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
|
75
|
+
:code => 2,
|
76
|
+
:go => 4,
|
77
|
+
:pmid => 5,
|
78
|
+
}
|
79
|
+
|
80
|
+
$query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
|
81
|
+
##########################
|
82
|
+
|
83
|
+
require 'rbbt/util/index'
|
84
|
+
|
85
|
+
# Human-specific replacement for the generic 'gene.go' rule declared in
# rake-include: the previous rule is cleared and redefined so that the gene codes
# read from the GO association file ($go[:url]) are translated through the
# organism id index before being written.
#
# Output: 'gene.go', one "<translated code>\t<GO term>" line per unique pair.
Rake::Task['gene.go'].clear
file 'gene.go' => ['identifiers'] do
  # File.exist? is used because File.exists? is deprecated and no longer
  # available in Ruby 3.2+. Nothing is written when 'identifiers' is missing.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    index = Organism.id_index('human', :other => ['Associated Gene Name'])
    data  = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])

    # Pair each translated code with the GO terms found among its values; drop
    # codes the index cannot translate and codes without any GO term.
    associations = data.collect{|code, value_lists|
      terms = value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}
      [index[code], terms]
    }.select{|gene, terms| gene && terms.any?}

    Open.write('gene.go',
      associations.collect{|gene, terms|
        terms.uniq.collect{|go| "#{gene}\t#{go}"}.join("\n")
      }.join("\n")
    )
  end
end
|
106
|
+
|
107
|
+
# Human-specific replacement for the generic 'gene_go.pmid' rule declared in
# rake-include: the previous rule is cleared and redefined so that gene codes from
# the GO association file ($go[:url]) are translated through the index built from
# the local 'identifiers' file.
#
# Output: 'gene_go.pmid', one "<translated code>\t<PubMed id>" line per unique pair.
Rake::Task['gene_go.pmid'].clear
file 'gene_go.pmid' => ['identifiers'] do
  # File.exist? is used because File.exists? is deprecated and no longer
  # available in Ruby 3.2+. Nothing is written when 'identifiers' is missing.
  if File.exist? 'identifiers'
    index = Index.index('identifiers')
    data  = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])

    # Keep only the numeric part of "PMID:<digits>" references; drop codes the
    # index cannot translate and codes without any PubMed reference.
    associations = data.collect{|code, value_lists|
      pmids = value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}
      [index[code], pmids]
    }.select{|gene, pmids| gene && pmids.any?}

    Open.write('gene_go.pmid',
      associations.collect{|gene, pmids|
        pmids.uniq.collect{|pmid| "#{gene}\t#{pmid}"}.join("\n")
      }.join("\n")
    )
  end
end
|
127
|
+
|
128
|
+
|
129
|
+
# Human-specific replacement for the generic 'lexicon' rule declared in
# rake-include: gene names are downloaded from the HGNC export and keyed by the
# code that the organism id index returns for each HGNC id.
#
# Output: 'lexicon', one tab-separated line per translatable HGNC entry:
# the translated code followed by the values of that entry.
Rake::Task['lexicon'].clear
file 'lexicon' => ['identifiers'] do
  # File.exist? is used because File.exists? is deprecated and no longer
  # available in Ruby 3.2+. Nothing is written when 'identifiers' is missing.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'

    # Guarded so that running the rule twice in one process does not trigger an
    # "already initialized constant" warning.
    HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag' unless defined?(HGNC_URL)

    # The header line of the export (it starts with "HGNC ID") is excluded.
    names        = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])

    # The block parameter is called `synonyms` (not `names`) so it no longer
    # shadows -- or, on Ruby 1.8, overwrites -- the outer `names` variable.
    Open.write('lexicon',
      names.collect{|code, synonyms|
        next unless translations[code]
        ([translations[code]] + synonyms).join("\t")
      }.compact.join("\n")
    )
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Mus musculus"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "MGI DB ID"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 10090,
|
10
|
+
:fix => nil,
|
11
|
+
:check => proc{|code| code.match(/^MGI/)},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:file => {
|
16
|
+
:url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
|
17
|
+
:native => 0,
|
18
|
+
:extra => [2,3],
|
19
|
+
:exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
|
20
|
+
},
|
21
|
+
}
|
22
|
+
|
23
|
+
$identifiers = {
|
24
|
+
:file => {
|
25
|
+
:url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
|
26
|
+
:native => 0,
|
27
|
+
:extra => [],
|
28
|
+
:exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
|
29
|
+
},
|
30
|
+
:biomart => {
|
31
|
+
:database => 'mmusculus_gene_ensembl',
|
32
|
+
:main => ['MGI DB ID', 'mgi_id'] ,
|
33
|
+
:extra => [
|
34
|
+
['Associated Gene Name' , "external_gene_id"],
|
35
|
+
['Protein ID' , "protein_id"] ,
|
36
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
37
|
+
['Unigene ID' , "unigene"] ,
|
38
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
39
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
40
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
41
|
+
|
42
|
+
['Affy mg u74a',"affy_mg_u74a" ],
|
43
|
+
['Affy mg u74av2',"affy_mg_u74av2" ],
|
44
|
+
['Affy mg u74b',"affy_mg_u74b" ],
|
45
|
+
['Affy mg u74bv2',"affy_mg_u74bv2" ],
|
46
|
+
['Affy mg u74c',"affy_mg_u74c" ],
|
47
|
+
['Affy mg u74cv2',"affy_mg_u74cv2" ],
|
48
|
+
['Affy moe430a',"affy_moe430a" ],
|
49
|
+
['Affy moe430b',"affy_moe430b" ],
|
50
|
+
['AFFY MoEx',"affy_moex_1_0_st_v1" ],
|
51
|
+
['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
|
52
|
+
['Affy mouse430 2',"affy_mouse430_2" ],
|
53
|
+
['Affy mouse430a 2',"affy_mouse430a_2" ],
|
54
|
+
['Affy mu11ksuba',"affy_mu11ksuba" ],
|
55
|
+
['Affy mu11ksubb',"affy_mu11ksubb" ],
|
56
|
+
['Agilent WholeGenome',"agilent_wholegenome" ],
|
57
|
+
['Codelink ID',"codelink" ],
|
58
|
+
['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
|
59
|
+
['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
|
60
|
+
|
61
|
+
],
|
62
|
+
:filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
$go = {
|
67
|
+
:url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
|
68
|
+
:code => 1,
|
69
|
+
:go => 4,
|
70
|
+
:pmid => 5,
|
71
|
+
}
|
72
|
+
|
73
|
+
$query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
|
74
|
+
##########################
|
75
|
+
|
76
|
+
|
77
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Schizosaccharomyces pombe"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "GeneDB Id"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 4896,
|
10
|
+
:fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
|
11
|
+
:check => proc{|code| code.match(/^SP/)},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:file => {
|
16
|
+
:url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
|
17
|
+
:native => 0,
|
18
|
+
:extra => [1,2,3,4,5,6,7,8]
|
19
|
+
},
|
20
|
+
}
|
21
|
+
|
22
|
+
$identifiers = {
|
23
|
+
:file => {
|
24
|
+
:url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
|
25
|
+
:native => 0,
|
26
|
+
:extra => [],
|
27
|
+
},
|
28
|
+
}
|
29
|
+
|
30
|
+
$go = {
|
31
|
+
:url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
|
32
|
+
:code => 1,
|
33
|
+
:go => 4,
|
34
|
+
:pmid => 5,
|
35
|
+
}
|
36
|
+
|
37
|
+
$query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
|
38
|
+
####
|
39
|
+
|
40
|
+
|
@@ -0,0 +1,258 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
require 'rbbt/util/arrayHash'
|
4
|
+
require 'rbbt/sources/biomart'
|
5
|
+
require 'rbbt/sources/entrez'
|
6
|
+
require 'rbbt/sources/pubmed'
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
# Rule for the file 'name': stores the value of $name in it.
file('name') { Open.write('name', $name) }
|
13
|
+
|
14
|
+
# Rule for the file 'all.pmid': runs the PubMed search held in $query and
# writes the returned entries newline-separated.
file('all.pmid') do
  pmids = PubMed.query($query)
  Open.write('all.pmid', pmids.join("\n"))
end
|
17
|
+
|
18
|
+
# Generic rule for the file 'lexicon'.
#
# Gathers gene names from up to three sources, in this order, merging each into
# the data accumulated so far:
#   1. the flat file described by $lexicon[:file];
#   2. the Entrez-to-native translation table (when $entrez2native is set);
#   3. the BioMart attributes described by $lexicon[:biomart];
# followed by the Entrez gene aliases (again only when $entrez2native is set).
# The 'Entrez Gene ID' field is then removed and one tab-separated line per code
# is written: the code followed by its unique, non-empty names.
#
# If Entrez raises Entrez::NoFile, nothing is written and a hint is printed.
file 'lexicon' do
  begin
    data = nil

    # Source 1: flat file. The whole :file hash is passed on as options.
    if (file_conf = $lexicon[:file])
      data = ArrayHash.new(Open.to_hash(file_conf[:url], file_conf), $native_id)
    end

    # Source 2: translation from Entrez ids to native ids.
    if $entrez2native
      translations = {}
      entrez_args  = $entrez2native.values_at(:tax, :native, :fix, :check)
      Entrez.entrez2native(*entrez_args).each{|entrez, natives|
        translations[entrez] = [natives.join("|")]
      }

      translations_data = ArrayHash.new(translations, 'Entrez Gene ID', [$native_id])
      if data.nil?
        data = translations_data
      else
        data.merge(translations_data)
      end
    end

    # Source 3: BioMart. Each :extra entry is a [field label, attribute] pair.
    if (mart = $lexicon[:biomart])
      labels     = mart[:extra].collect{|v| v[0]}
      attributes = mart[:extra].collect{|v| v[1]}

      biomart = {}
      # A copy of the attribute list is handed to the query so the list used
      # below is independent of whatever the query does with its argument.
      BioMart.query(mart[:database], mart[:main][1], attributes.dup, mart[:filter]).each{|key, values_list|
        present = values_list.values_at(*attributes).compact
        biomart[key] = present.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
      }

      biomart_data = ArrayHash.new(biomart, mart[:main][0], labels)

      if data.nil?
        data = biomart_data
      else
        # NOTE(review): $native_id is a field label, yet it is looked up among
        # the BioMart attribute names (v[1]) rather than the labels (v[0]), so
        # the first test presumably never succeeds -- confirm the intent
        # against ArrayHash#merge before changing it.
        if attributes.include?($native_id) || mart[:main][0] == $native_id
          field = $native_id
        else
          field = 'Entrez Gene ID'
        end
        data.merge(biomart_data, field)
      end
    end

    # Entrez gene aliases, with empty entries left out.
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax], 4).each{|entrez, aliases|
        gene_alias[entrez] = [aliases.reject{|e| e.to_s == ""}.join("|")]
      }

      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias, 'Entrez Gene ID', ['Entrez Gene Alias'])
        data.merge(gene_alias_data, 'Entrez Gene ID')
      end
    end

    data.remove('Entrez Gene ID')
    data.clean

    lines = data.data.collect{|code, name_lists|
      names = name_lists.flatten.select{|n| n.to_s != ""}.uniq
      "#{ code }\t" + names.join("\t")
    }
    Open.write('lexicon', lines.join("\n"))

  rescue Entrez::NoFile
    puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
|
95
|
+
|
96
|
+
|
97
|
+
# Generic rule for the file 'identifiers'.
#
# Gathers identifier columns from up to three sources, in this order, merging
# each into the data accumulated so far:
#   1. the flat file described by $identifiers[:file];
#   2. the Entrez-to-native translation table (when $entrez2native is set);
#   3. the BioMart attributes described by $identifiers[:biomart];
# followed by the Entrez gene aliases (again only when $entrez2native is set).
#
# Output: 'identifiers', a header line "#<native id>\t<fields...>" followed by
# one tab-separated line per code.
#
# Raises RuntimeError when no source produced any data. If Entrez raises
# Entrez::NoFile, nothing is written and a hint is printed.
file 'identifiers' do

  begin
    data = nil
    if $identifiers[:file]
      file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
      data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
    end

    # Translate from entrez to native if needed
    if $entrez2native
      translations = {}
      Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
        each{|k,v|
          translations[k] = [v.join("|")]
        }
      if translations.keys.any?
        translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
        if data
          data.merge(translations_data)
        else
          data = translations_data
        end
      end
    end

    # Read from Biomart and merge with previous data
    if $identifiers[:biomart]
      biomart = {}

      BioMart.query(
        $identifiers[:biomart][:database],
        $identifiers[:biomart][:main][1],
        $identifiers[:biomart][:extra].collect{|v| v[1]},
        $identifiers[:biomart][:filter]
      ).each{|key, values_list|
        values = values_list.values_at(*$identifiers[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
        biomart[key] = values
      }

      biomart_data = ArrayHash.new(biomart, $identifiers[:biomart][:main][0], $identifiers[:biomart][:extra].collect{|v| v[0]})

      # An optional third element of an :extra entry is a prefix: every value of
      # that field is rewritten as "<prefix>:<value>".
      $identifiers[:biomart][:extra].each{|values|
        if values[2]
          biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
        end
      }

      if data
        # NOTE(review): $native_id is a field label, yet it is looked up among
        # the BioMart attribute names (v[1]) rather than the labels (v[0]), so
        # the first test presumably never succeeds -- confirm the intent
        # against ArrayHash#merge before changing it.
        if $identifiers[:biomart][:extra].collect{|v| v[1]}.include?( $native_id ) || $identifiers[:biomart][:main][0] == $native_id
          field = $native_id
        else
          field = 'Entrez Gene ID'
        end
        data.merge(biomart_data, field)
      else
        data = biomart_data
      end
    end

    # Add the alias at the end
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax],4).
        each{|k,v|
          gene_alias[k] = [v.join("|")]
        }
      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
        if data
          data.merge(gene_alias_data, 'Entrez Gene ID')
        else
          data = gene_alias_data
        end
      end
    end

    # Fail before touching the output: an empty 'identifiers' file left behind
    # would make rake consider the target built on the next run.
    raise "No identifier data could be gathered for #{$name}" if data.nil?

    # The header is taken before data.clean, matching the original order.
    header = "##{$native_id}\t" + data.fields.join("\t")
    data.clean

    # Write ids to file. The block form closes the handle even if a row fails.
    File.open('identifiers', 'w') do |fout|
      fout.puts header
      data.data.each{|code, values|
        fout.puts code + "\t" + values.join("\t")
      }
    end

  rescue Entrez::NoFile
    puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
|
192
|
+
|
193
|
+
|
194
|
+
# Generic rule for the file 'gene.go'.
#
# Reads the association file at $go[:url] and writes one "<code>\t<GO term>"
# line for every unique GO term found among a code's values. Codes without any
# GO term produce no line.
file 'gene.go' do
  associations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])

  lines = []
  associations.each do |code, value_lists|
    terms = []
    value_lists.flatten.each do |ref|
      # Only the "GO:<digits>" part of a matching value is kept.
      terms << ref.match(/(GO:\d+)/)[1] if ref =~ /GO:\d+/
    end
    terms.uniq.each { |go| lines << "#{code}\t#{go}" }
  end

  Open.write('gene.go', lines.join("\n"))
end
|
209
|
+
|
210
|
+
# Generic rule for the file 'gene_go.pmid'.
#
# Reads the association file at $go[:url] and writes one "<code>\t<PubMed id>"
# line for every unique "PMID:<digits>" reference found among a code's values.
# Codes without any such reference produce no line.
file 'gene_go.pmid' do
  associations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])

  lines = []
  associations.each do |code, value_lists|
    pmids = []
    value_lists.flatten.each do |ref|
      # Only the digits after "PMID:" are kept.
      pmids << ref.match(/PMID:(\d+)/)[1] if ref =~ /PMID:\d+/
    end
    pmids.uniq.each { |pmid| lines << "#{code}\t#{pmid}" }
  end

  Open.write('gene_go.pmid', lines.join("\n"))
end
|
223
|
+
|
224
|
+
|
225
|
+
# Generic rule for the file 'gene.pmid'.
#
# Writes one "<code>\t<PubMed id>" line for every gene/article pair returned by
# Entrez.entrez2pubmed for the organism's taxonomy id. When the native id is
# not "Entrez Gene ID", codes are translated to the first native id known for
# them and genes without a translation are skipped.
#
# If Entrez raises Entrez::NoFile, nothing is written and a hint is printed.
file 'gene.pmid' do
  begin
    translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"

    data = Entrez.entrez2pubmed($entrez2native[:tax])

    Open.write('gene.pmid',
      data.collect{|code, pmids|
        next if translations && ! translations[code]
        # A gene without articles would otherwise contribute an empty string,
        # i.e. a blank line in the output.
        next if pmids.empty?
        code = translations[code].first if translations
        pmids.collect{|pmid| "#{ code }\t#{pmid}" }.join("\n")
      }.compact.join("\n")
    )
  rescue Entrez::NoFile
    # The message previously misspelled the file name as "gene2pumbed".
    puts "Gene article associations from entrez not produced, install the gene2pubmed file (rbbt_config install entrez)."
  end
end
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
|
249
|
+
require 'fileutils'

# Files produced by the rules of this Rakefile, in the order 'all' builds them.
# Kept in one place so that 'all' and 'clean' cannot drift apart.
generated_files = ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']

# Builds every generated file.
task 'all' => generated_files

# Deletes every generated file; files that do not exist are ignored.
# FileUtils.rm_f is used instead of shelling out to `rm -f`, so no external
# command is needed.
task 'clean' do
  FileUtils.rm_f(generated_files)
end

# Rebuilds the generated files, removing the existing ones first when $force
# is set.
task 'update' do
  Rake::Task['clean'].invoke if $force
  Rake::Task['all'].invoke
end
|
258
|
+
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Rattus norvegicus"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "RGD DB ID"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 10116,
|
10
|
+
:check => proc{|code| code.match(/^RGD/)},
|
11
|
+
}
|
12
|
+
|
13
|
+
$lexicon = {
|
14
|
+
:file => {
|
15
|
+
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
16
|
+
:native => 1,
|
17
|
+
:extra => [2,9],
|
18
|
+
:exclude => proc{|l| !l.match(/^RGD/)}
|
19
|
+
},
|
20
|
+
}
|
21
|
+
|
22
|
+
$identifiers = {
|
23
|
+
:file => {
|
24
|
+
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
25
|
+
:native => 1,
|
26
|
+
:extra => [],
|
27
|
+
:exclude => proc{|l| !l.match(/^RGD/)}
|
28
|
+
},
|
29
|
+
:biomart => {
|
30
|
+
:database => 'rnorvegicus_gene_ensembl',
|
31
|
+
:main => ['Entrez Gene ID' , "entrezgene"],
|
32
|
+
:extra => [
|
33
|
+
['Associated Gene Name' , "external_gene_id"],
|
34
|
+
['Protein ID' , "protein_id"] ,
|
35
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
36
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
37
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
38
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
39
|
+
|
40
|
+
['Affy rae230a', "affy_rae230a"],
|
41
|
+
['Affy rae230b', "affy_rae230b"],
|
42
|
+
['Affy RaGene', "affy_ragene_1_0_st_v1"],
|
43
|
+
['Affy rat230 2', "affy_rat230_2"],
|
44
|
+
['Affy RaEx', "affy_raex_1_0_st_v1"],
|
45
|
+
['Affy rg u34a', "affy_rg_u34a"],
|
46
|
+
['Affy rg u34b', "affy_rg_u34b"],
|
47
|
+
['Affy rg u34c', "affy_rg_u34c"],
|
48
|
+
['Affy rn u34', "affy_rn_u34"],
|
49
|
+
['Affy rt u34', "affy_rt_u34"],
|
50
|
+
['Agilent WholeGenome',"agilent_wholegenome" ],
|
51
|
+
['Codelink ID ', "codelink"],
|
52
|
+
|
53
|
+
|
54
|
+
],
|
55
|
+
:filter => [],
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
$go = {
|
60
|
+
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
61
|
+
:exclude => proc{|l| !l.match(/^RGD/)},
|
62
|
+
:code => 1,
|
63
|
+
:go => 4,
|
64
|
+
:pmid => 5,
|
65
|
+
}
|
66
|
+
|
67
|
+
# PubMed search for rat gene/protein articles with an English abstract.
# The organism terms previously read "mice"/mouse, carried over from the mouse file.
$query = '(("rats"[TIAB] NOT Medline[SB]) OR "rats"[MeSH Terms] OR rat[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
68
|
+
|
69
|
+
#{{{ Redefines
|
70
|
+
|
71
|
+
# Wraps Open.read so that the RGD gene association file gets its ids prefixed:
# every line starting with "RGD\t" is rewritten to start with "RGD\tRGD:".
# Any other url is returned exactly as the wrapped implementation returns it.
module Open

  class << self
    # Guarded so that loading this file twice does not alias the wrapper to
    # itself, which would make read recurse without end.
    alias_method :old_read, :read unless method_defined?(:old_read)

    # url     - location to read; compared against the RGD association file name.
    # options - passed through untouched to the wrapped implementation.
    #
    # Returns whatever the wrapped read returns, with the RGD prefix rewrite
    # applied when the url names gene_association.rgd.gz.
    # NOTE(review): assumes the wrapped read returns a String -- confirm.
    def read(url, options = {})
      data = old_read(url, options)

      # Dots are escaped so only the literal file name matches.
      return data unless url =~ /gene_association\.rgd\.gz/

      # `^` matches at the start of every line in Ruby, so one gsub over the
      # whole text replaces the per-line String#collect, which does not exist
      # on Ruby 1.9+ and, joined with "\n", doubled every newline on 1.8.
      data.gsub(/^RGD\t/, "RGD\tRGD:")
    end
  end
end
|
88
|
+
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
+
|
3
|
+
$name = "Saccharomyces cerevisiae"
|
4
|
+
|
5
|
+
|
6
|
+
$native_id = "SGD DB Id"
|
7
|
+
|
8
|
+
$entrez2native = {
|
9
|
+
:tax => 4932,
|
10
|
+
:fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
11
|
+
:check => proc{|code| code.match(/^S0/)},
|
12
|
+
}
|
13
|
+
|
14
|
+
$lexicon = {
|
15
|
+
:file => {
|
16
|
+
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
17
|
+
:native => 0,
|
18
|
+
:extra => [4,3,5]
|
19
|
+
},
|
20
|
+
:biomart => {
|
21
|
+
:database => 'scerevisiae_gene_ensembl',
|
22
|
+
:main => ['Entrez Gene ID', 'entrezgene'],
|
23
|
+
:extra => [
|
24
|
+
['Interpro Description' , "interpro_description"],
|
25
|
+
],
|
26
|
+
:filter => [],
|
27
|
+
}
|
28
|
+
|
29
|
+
}
|
30
|
+
|
31
|
+
$identifiers = {
|
32
|
+
:file => {
|
33
|
+
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
34
|
+
:native => 0,
|
35
|
+
:extra => [],
|
36
|
+
},
|
37
|
+
:biomart => {
|
38
|
+
:database => 'scerevisiae_gene_ensembl',
|
39
|
+
:main => ['Entrez Gene ID', 'entrezgene'],
|
40
|
+
:extra => [
|
41
|
+
['Associated Gene Name' , "external_gene_id"],
|
42
|
+
['Ensembl Gene ID', "ensembl_gene_id" ],
|
43
|
+
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
44
|
+
['RefSeq Protein ID' , "refseq_peptide"] ,
|
45
|
+
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
46
|
+
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
47
|
+
['Protein ID' , "protein_id"] ,
|
48
|
+
['EMBL (Genbank) ID' , "embl"] ,
|
49
|
+
# Affymetrix
|
50
|
+
['Affy yeast 2',"affy_yeast_2"],
|
51
|
+
['Affy yg s98', "affy_yg_s98"],
|
52
|
+
],
|
53
|
+
:filter => [],
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
$go = {
|
58
|
+
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
59
|
+
:code => 1,
|
60
|
+
:go => 4,
|
61
|
+
:pmid => 5,
|
62
|
+
}
|
63
|
+
|
64
|
+
$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
65
|
+
|
66
|
+
|