rbbt 1.2.5 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
@@ -1,10 +0,0 @@
|
|
1
|
-
# Cue definitions: each entry names a cue and supplies a block that turns a
# name `w` into the list of keys generated for that cue.
# NOTE(review): the cue DSL itself (equal/standard/...) is defined outside this
# file; how the returned lists are consumed should be confirmed there.

# The name exactly as written.
equal do |w|
  [w]
end

# Lower-cased words, sorted and glued together without separators.
standard do |w|
  [w.downcase.split(/\s+/).sort.join("")]
end

# Lower-cased name without anything after the first comma, without a
# parenthesised section, and without an "s" that is followed by a non-word
# character. The lookahead was previously written (?:=\W), which only matched
# a literal "s=" and therefore never stripped plurals.
cleaned do |w|
  [w.downcase.sub(/,.*/, '').sub(/\(.*\)/, '').gsub(/s(?=\W)/, '')]
end

# Every word flagged by String#is_special?, lower-cased and with a trailing
# "p" removed.
special do |w|
  w.split.select { |word| word.is_special? }.collect { |word| word.downcase.sub(/p$/, '') }
end

# Alphabetic runs of two or more letters, longest first, lower-cased. A name
# ending in "I" is first expanded to "<name>I <name>" so the stem is scanned too.
words do |w|
  w.sub(/(.*)I$/, '\1I \1').
    scan(/[a-z][a-z]+/i).
    sort { |a, b| b.length <=> a.length }.
    collect { |n| n.downcase }
end
|
@@ -1,86 +0,0 @@
|
|
1
|
-
require 'rbbt/util/misc'
|
2
|
-
|
3
|
-
|
4
|
-
# Strips a single trailing "s" from a token; shared by the transform rules below.
plural = Proc.new { |token| token.sub(/s$/, '') }
|
5
|
-
|
6
|
-
# Token type definitions. A type is declared with a regular expression, with a
# block that receives the word, or bare.
# NOTE(review): how bare declarations are matched, and whether declaration
# order matters, is decided by the DSL outside this file.
tokens do

  # Some (possible) single letters first
  receptor(/^(?:receptor|r)s?$/i)
  protein(/^(?:protein|p)s?$/i)
  roman(/^[IV]+$/)
  greek_letter { |w| !$inverse_greek[w.downcase].nil? }


  # Some words for removal
  stopword { |w| $stopwords.include?(w.downcase_first) }
  gene(/genes?/i)
  dna
  cdna
  rna
  mrna
  trna
  cdna # declared a second time in the original list; kept as is
  component
  exon
  intron
  domain
  family


  # Important words
  number(/^(?:\d+[.,]?\d+|\d)$/)
  greek { |w| !$greek[w.downcase].nil? }
  special { |w| w.is_special? }
  promoter
  similar(/^(homolog.*|like|related|associated)$/)
  ase(/ase$/)
  in_end(/in$/)
end
|
40
|
-
|
41
|
-
# Scoring and transformation rules keyed by token type.
# NOTE(review): the meaning of same/miss/extr/diff weights is defined by the
# DSL outside this file; the comments below only restate the values.
comparisons do

  # Score for the :number tokens of two names (l1 and l2 are lists).
  compare.number do |l1, l2|
    if l1.empty? && l2.empty?
      0                                   # neither side has numbers
    elsif l1.sort.uniq == l2.sort.uniq
      3                                   # same set of numbers
    elsif l1.any? && l1[0] == l2[0]
      -3                                  # only the first number agrees
    elsif l1.empty? && l2 == ['1']
      -5                                  # a lone '1' on the second side only
    else
      -10
    end
  end

  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # `plural` is the proc defined at the top of this file.
  transform.receptor(plural)
  transform.protein(plural)

  transform.roman { |t| [t.arabic, :number] }
  transform.greek_letter { |t| [$inverse_greek[t.downcase], :greek] }
  transform.ase { |t| [t, :special] }
  transform.in_end { |t| [t, :special] }
  # Unknown tokens shorter than four characters are re-typed as :special.
  transform.unknown { |t| [t, (t.length < 4 ? :special : :unknown)] }
end
|
86
|
-
|
@@ -1,23 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# norm ORGANISM DATASET NER [RAKE_ARGS...]
#
# Removes results/ORGANISM_DATASET, rebuilds results/ORGANISM_DATASET.eval with
# rake (stdout goes to ORGANISM_DATASET.log_NER) and prints the tail of the
# evaluation file. Any remaining arguments are passed on to rake.
function norm(){
  organism=$1
  shift
  dataset=$1
  shift
  ner=$1
  shift

  # CMD is only a description of what is about to run, echoed for the user.
  CMD="rm results/${organism}_$dataset; rake results/${organism}_$dataset.eval ner=$ner $* > ${organism}_$dataset.log_$ner; tail results/${organism}_$dataset.eval"
  echo "$CMD"

  # Run the steps as real commands. Expanding $CMD unquoted (as before) does not
  # re-parse ';' or '>', so everything after "rm" was handed to rm as literal
  # arguments and neither rake nor tail was ever executed.
  rm "results/${organism}_$dataset"
  rake "results/${organism}_$dataset.eval" "ner=$ner" "$@" > "${organism}_$dataset.log_$ner"
  tail "results/${organism}_$dataset.eval"
}
|
14
|
-
|
15
|
-
|
16
|
-
# norm_2 NER [RAKE_ARGS...]
#
# Same as norm, but for the fixed bc2gn result set: removes results/bc2gn,
# rebuilds results/bc2gn.eval with rake (stdout goes to bc2gn.log_NER) and
# prints the tail of the evaluation file.
function norm_2(){
  ner=$1
  shift

  # CMD is only a description of what is about to run, echoed for the user.
  CMD="rm results/bc2gn; rake results/bc2gn.eval ner=$ner $* > bc2gn.log_$ner; tail results/bc2gn.eval"
  echo "$CMD"

  # Run the steps as real commands. Expanding $CMD unquoted (as before) passed
  # the ';' and '>' to rm as literal arguments instead of acting on them.
  rm results/bc2gn
  rake results/bc2gn.eval "ner=$ner" "$@" > "bc2gn.log_$ner"
  tail results/bc2gn.eval
}
|
@@ -1,55 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Arabidopsis thaliana.
# NOTE(review): these globals are presumably read by the tasks in the required
# rake-include file — confirm there before renaming any key.
$name = "Arabidopsis thaliana"

# Name of the identifier format used as the native one for this organism.
$native_id = "TAIR Locus"

$entrez2native = {
  :tax   => 3702,
  :fix   => proc { |code| code.sub(/^TAIR:/, '') }, # drop the "TAIR:" prefix
  :check => proc { |code| true },                   # every code is accepted
}

$lexicon = {
  :file => {
    :url    => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20100413",
    :native => 0,
    :extra  => [1, 2],
  },
}

$identifiers = {
  :file => {
    :url    => "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2009-7-29.txt",
    :native => 4,
    :extra  => [0],
    :fields => ["Affymetrix"],
  },
  :biomart => {
    :database => 'athaliana_eg_gene',
    :main     => ['TAIR Locus', 'tair_locus'],
    :extra    => [
      ['Associated Gene Name', "external_gene_id"],
      ['Gramene Gene ID', "ensembl_gene_id"],
      ['RefSeq peptide', "refseq_peptide"],
      ['Unigene', "unigene"],
      ['Interpro ID', "interpro"],
    ],
    :filter   => ['with_tair_locus'], # This is needed as the filter is not with_mgi_id as was expected
  }
}

$go = {
  :url  => "ftp://ftp.arabidopsis.org/home/tair/Ontologies/Gene_Ontology/ATH_GO_GOSLIM.txt",
  :code => 0,
  :go   => 5,
  :pmid => 12,
}

$query = '("arabidopsis"[MeSH Terms] OR Arabidopsis[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]))'
|
54
|
-
|
55
|
-
|
@@ -1,84 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Candida albicans.
# NOTE(review): these globals are presumably read by the tasks in the required
# rake-include file — confirm there before renaming any key.
$name = "Candida albicans"

# Name of the identifier format used as the native one for this organism.
$native_id = "Systematic Name"

$entrez2native = {
  :tax    => 237561,
  :fix    => proc { |code| code.sub(/^CaO/, 'orf') }, # rewrite a leading "CaO" as "orf"
  :check  => proc { |code| code.match(/^orf/) },      # keep only codes starting with "orf"
  :native => 3
}

$lexicon = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8, 1, 2],
    # NOTE(review): a line starting with "!" can never start with "orf", so the
    # second condition is redundant as written — confirm whether "||" was meant.
    :exclude => proc { |l| l.match(/^!/) && !l.match(/^orf/) }
  },
}

$identifiers = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8, 1, 2],
    :exclude => proc { |l| l.match(/^!/) },
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  # Keep only the first "|"-separated value of the eleventh tab-separated field.
  :fix  => proc { |l|
    fields = l.split(/\t/)
    fields[10] = (fields[10] || "").split('|').first
    fields.join("\t")
  }
}

$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
43
|
-
|
44
|
-
####
|
45
|
-
|
46
|
-
#Rake::Task['identifiers'].clear
|
47
|
-
#file 'identifiers' => ['lexicon'] do |t|
|
48
|
-
# identifiers = {}
|
49
|
-
# if $identifiers[:file]
|
50
|
-
# identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
|
51
|
-
# end
|
52
|
-
#
|
53
|
-
# orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
|
54
|
-
#
|
55
|
-
# translations = {}
|
56
|
-
#
|
57
|
-
# Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
|
58
|
-
# orfs.each{|orf|
|
59
|
-
# translations[orf] ||= []
|
60
|
-
# translations[orf] << entrez
|
61
|
-
# }
|
62
|
-
# }
|
63
|
-
#
|
64
|
-
# orf2native.each{|orf, native|
|
65
|
-
# next unless identifiers[native]
|
66
|
-
# identifiers[native] << [orf]
|
67
|
-
# if translations[orf]
|
68
|
-
# identifiers[native] << translations[orf]
|
69
|
-
# else
|
70
|
-
# identifiers[native] << []
|
71
|
-
# end
|
72
|
-
#
|
73
|
-
# }
|
74
|
-
#
|
75
|
-
# header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
|
76
|
-
# Open.write('identifiers',
|
77
|
-
# header +
|
78
|
-
# identifiers.collect{|code, name_lists|
|
79
|
-
# "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
|
80
|
-
# }.join("\n")
|
81
|
-
# )
|
82
|
-
#end
|
83
|
-
#
|
84
|
-
#
|
@@ -1,109 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Caenorhabditis elegans.
# NOTE(review): these globals are presumably read by the tasks in the required
# rake-include file — confirm there before renaming any key.
$name = "Caenorhabditis elegans"

# Name of the identifier format used as the native one for this organism.
$native_id = "WormBase ID"

$entrez2native = {
  :tax   => 6239,
  :fix   => proc { |code| code.sub(/^WormBase:/, '') }, # drop the "WormBase:" prefix
  :check => proc { |code| code.match(/^WBGene/) },      # keep only WBGene codes
}

$lexicon = {
  :file => {
    :url    => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
    :native => 0,
    :extra  => [1, 2],

    # Alternative source, currently disabled:
    # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
    # :native => 0,
    # :extra => [2,3,4,5],
  },
}

$identifiers = {
  :file => {
    :url    => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
    :native => 0,
    :extra  => [1, 2],

    # Alternative source, currently disabled:
    # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
    # :native => 0,
    # :extra => [2,3,4,5],
  },

  :biomart => {
    :database => 'celegans_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    # Several labels end in a space; they are runtime strings and kept verbatim.
    :extra    => [
      ['WormBase gene', "wormbase_gene"],
      ['Associated Gene Name ', "external_gene_id"],
      ['WormPep id', "wormpep_id"],
      ['Ensembl Gene ID', "ensembl_gene_id"],
      ['Ensembl Protein ID', "ensembl_peptide_id"],
      ['Protein ID ', "protein_id"],
      ['RefSeq Protein ID ', "refseq_peptide"],
      ['Unigene ID ', "unigene"],
      ['UniProt/SwissProt ID', "uniprot_swissprot"],
      ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
      ['EMBL (Genbank) ID', "embl"],
    ],
    :filter   => [],
  }
}

$go = {
  :url  => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
  :code => 0,
  :go   => 3,
  :pmid => 3, # NOTE(review): same column as :go — confirm this is intended
}

$query = '"caenorhabditis elegans"[MeSH Terms] OR Caenorhabditis elegans[Text Word]'
|
72
|
-
##########################
|
73
|
-
|
74
|
-
|
75
|
-
# Reopens Open to post-process the WormBase downloads used by this Rakefile.
module Open

  class << self
    # Keep the original implementation reachable under another name.
    alias_method :old_read, :read

    # Reads +url+ through the original Open.read and rewrites the content
    # depending on the URL:
    # * URL matches /GO/       -> one tab-separated row per gene/GO pair
    #                             (see wormbase_go_rows)
    # * URL matches /gene_ids/ -> every comma replaced by a tab
    # * anything else          -> returned untouched
    def read(url, options = {})
      content = old_read(url, options)

      if url =~ /GO/
        wormbase_go_rows(content)
      elsif url =~ /gene_ids/
        content.gsub(/,/, "\t")
      else
        content
      end
    end

    private

    # Turns the GO annotation text into rows of "id\tname\textra\tGO:nnn".
    #
    # Lines containing "...:...(GO:nnn)" are first reduced to the bare GO id
    # and folded onto the preceding line with "|" separators. Each resulting
    # line is then split into its name part ("id name (extra)") and its GO ids.
    #
    # String#collect only existed on Ruby 1.8, so the lines are iterated
    # explicitly with each_line. Blank lines, which used to raise
    # NoMethodError on nil, are skipped.
    def wormbase_go_rows(content)
      folded = content.gsub(/.*:.*\((GO:\d+)\)/, '\1').gsub(/\nGO/, "|GO")

      rows = folded.each_line.collect { |line|
        # The first "|" separates the name part from the GO ids.
        names, gos = line.sub(/\|/, "\t").chomp.split(/\t/)
        next if names.nil?

        id, name, extra = names.split(/ /)
        extra = extra.gsub(/[()]/, '') if extra

        if gos
          gos.split(/\|/).collect { |go| [id, name, extra, go].join("\t") }.join("\n")
        else
          # NOTE(review): the trailing newline yields an empty line after the
          # outer join; kept because consumers may rely on the current output.
          [id, name, extra].join("\t") + "\n"
        end
      }

      rows.compact.join("\n")
    end
  end
end
|
109
|
-
|
@@ -1,140 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Homo sapiens.
# NOTE(review): these globals are presumably read by the tasks in the required
# rake-include file — confirm there before renaming any key.
$name = "Homo sapiens"

# Name of the identifier format used as the native one for this organism.
$native_id = "Entrez Gene ID"

$entrez2native = {
  :tax   => 9606,
  :fix   => nil,
  :check => proc { |code| false }, # no code is accepted
}

$lexicon = {
  :biomart => {
    :database => 'hsapiens_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    :extra    => [
      ['Associated Gene Name', "external_gene_id"],
      ['HGNC symbol', "hgnc_symbol"],
      ['HGNC automatic gene name', "hgnc_automatic_gene_name"],
      ['HGNC curated gene name ', "hgnc_curated_gene_name"], # label ends in a space; kept verbatim
    ],
  }
}

$identifiers = {
  :biomart => {
    :database => 'hsapiens_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    :extra    => [
      ['Ensembl Gene ID', "ensembl_gene_id"],
      ['Ensembl Protein ID', "ensembl_peptide_id"],
      ['Associated Gene Name', "external_gene_id"],
      ['CCDS ID', "ccds"],
      ['Protein ID', "protein_id"],
      ['RefSeq Protein ID', "refseq_peptide"],
      ['Unigene ID', "unigene"],
      ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
      ['HGNC ID', "hgnc_id", 'HGNC'],
      ['EMBL (Genbank) ID', "embl"],

      # Affymetrix
      ['AFFY HC G110', 'affy_hc_g110'],
      ['AFFY HG FOCUS', 'affy_hg_focus'],
      ['AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2'],
      ['AFFY HG U133A_2', 'affy_hg_u133a_2'],
      ['AFFY HG U133A', 'affy_hg_u133a'],
      ['AFFY HG U133B', 'affy_hg_u133b'],
      ['AFFY HG U95AV2', 'affy_hg_u95av2'],
      ['AFFY HG U95B', 'affy_hg_u95b'],
      ['AFFY HG U95C', 'affy_hg_u95c'],
      ['AFFY HG U95D', 'affy_hg_u95d'],
      ['AFFY HG U95E', 'affy_hg_u95e'],
      ['AFFY HG U95A', 'affy_hg_u95a'],
      ['AFFY HUGENEFL', 'affy_hugenefl'],
      ['AFFY HuEx', 'affy_huex_1_0_st_v2'],
      ['AFFY HuGene', 'affy_hugene_1_0_st_v1'],
      ['AFFY U133 X3P', 'affy_u133_x3p'],
      ['Agilent WholeGenome', "agilent_wholegenome"],
      ['Agilent CGH 44b', 'agilent_cgh_44b'],
      ['Codelink ID', 'codelink'],
      ['Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2'],
      ['Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3'],
    ],
    :filter   => [],
  }
}

$go = {
  :url  => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
  :code => 2,
  :go   => 4,
  :pmid => 5,
}

$query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
|
80
|
-
##########################
|
81
|
-
|
82
|
-
require 'rbbt/util/index'
|
83
|
-
|
84
|
-
# Drop any previously defined 'gene.go' task and define it again for human:
# writes one line per gene with its GO terms ("gene<TAB>GO:1|GO:2").
Rake::Task['gene.go'].clear
file 'gene.go' => ['identifiers'] do
  # File.exist? rather than File.exists?: the latter was deprecated and has
  # been removed in Ruby 3.2.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    # NOTE(review): presumably translates the codes found in the GO file into
    # the native id via 'Associated Gene Name' — confirm in Organism.id_index.
    index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
    annotations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])

    rows = annotations.collect { |code, value_lists|
      go_terms = value_lists.flatten.
        select { |ref| ref =~ /GO:\d+/ }.
        collect { |ref| ref.match(/(GO:\d+)/)[1] }
      [index[code], go_terms]
    }

    # Keep only genes that could be translated and have at least one GO term.
    rows = rows.select { |gene, go_terms| gene && go_terms.any? }

    Open.write('gene.go',
               rows.collect { |gene, go_terms|
                 "#{gene}\t#{go_terms.uniq.join("|")}"
               }.join("\n")
              )
  end
end
|
103
|
-
|
104
|
-
# Drop any previously defined 'gene_go.pmid' task and define it again for
# human: writes one line per gene with the PubMed ids referenced by its GO
# annotations ("gene<TAB>pmid1|pmid2").
Rake::Task['gene_go.pmid'].clear
file 'gene_go.pmid' => ['identifiers'] do
  # File.exist? rather than File.exists?: the latter was deprecated and has
  # been removed in Ruby 3.2.
  if File.exist? 'identifiers'
    # NOTE(review): presumably maps any identifier in the 'identifiers' file to
    # its native id — confirm in Index.index.
    index = Index.index('identifiers')
    annotations = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])

    rows = annotations.collect { |code, value_lists|
      pmids = value_lists.flatten.
        select { |ref| ref =~ /PMID:\d+/ }.
        collect { |ref| ref.match(/PMID:(\d+)/)[1] }
      [index[code], pmids]
    }

    # Keep only genes that could be translated and have at least one PMID.
    rows = rows.select { |gene, pmids| gene && pmids.any? }

    Open.write('gene_go.pmid',
               rows.collect { |gene, pmids|
                 "#{gene}\t#{pmids.uniq.join("|")}"
               }.join("\n")
              )
  end
end
|
122
|
-
|
123
|
-
|
124
|
-
# Drop any previously defined 'lexicon' task and define it again for human:
# the lexicon is built from the HGNC download, one line per gene starting with
# the translated code followed by the HGNC names, tab-separated.
Rake::Task['lexicon'].clear
file 'lexicon' => ['identifiers'] do
  # File.exist? rather than File.exists?: the latter was deprecated and has
  # been removed in Ruby 3.2.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
    # The header line of the download starts with "HGNC ID" and is skipped.
    hgnc_names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
    translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])

    # The block parameter no longer reuses the name of the hash being iterated
    # (on Ruby 1.8 that overwrote the outer variable).
    lines = hgnc_names.collect { |code, synonyms|
      next unless translations[code] # entries without a translation are dropped
      ([translations[code]] + synonyms).join("\t")
    }

    Open.write('lexicon', lines.compact.join("\n"))
  end

end
|