rbbt 1.2.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
@@ -1,10 +0,0 @@
|
|
1
|
-
# Default cue definitions. Each entry names a cue and gives a block that maps
# a name string to the list of keys produced for it.

# The name itself, untouched.
equal { |name| [name] }

# Lower-cased, whitespace-separated parts sorted and glued back together.
standard { |name| [name.downcase.split(/\s+/).sort.join("")] }

# Lower-cased, with everything after the first comma and any parenthesised
# text removed.
cleaned do |name|
  stripped = name.downcase.sub(/,.*/, '').sub(/\(.*\)/, '')
  # NOTE(review): (?:=\W) is a non-capturing group matching a literal "=";
  # this looks like it was meant to be the lookahead (?=\W) -- confirm.
  [stripped.gsub(/s(?:=\W)/, '')]
end

# Parts flagged by String#is_special? (defined elsewhere), lower-cased and
# with one trailing "p" dropped.
special do |name|
  name.split.select { |part| part.is_special? }.collect { |part| part.downcase.sub(/p$/, '') }
end

# Alphabetic runs of two or more letters, longest first, lower-cased. A name
# ending in "I" is first expanded to "<name> <name without the I>".
words do |name|
  candidates = name.sub(/(.*)I$/, '\1I \1').scan(/[a-z][a-z]+/i)
  candidates.sort { |a, b| b.length <=> a.length }.collect { |word| word.downcase }
end
|
@@ -1,86 +0,0 @@
|
|
1
|
-
require 'rbbt/util/misc'
|
2
|
-
|
3
|
-
|
4
|
-
# Removes a single trailing "s" from a token; passed to transform.receptor and
# transform.protein further down in this file.
plural = Proc.new { |token| token.sub(/s$/, '') }
|
5
|
-
|
6
|
-
# Token type declarations. Each line names a token type and gives either a
# regexp, a predicate block, or nothing at all.
# NOTE(review): a bare name presumably matches the word itself, and the
# declaration order presumably decides precedence -- confirm against the DSL.
tokens do

  # Some (possible) single letters first
  receptor(/^(?:receptor|r)s?$/i)
  protein(/^(?:protein|p)s?$/i)
  roman(/^[IV]+$/)
  greek_letter { |word| !$inverse_greek[word.downcase].nil? }

  # Some words for removal
  stopword { |word| $stopwords.include?(word.downcase_first) }
  gene(/genes?/i)
  dna
  cdna
  rna
  mrna
  trna
  cdna # NOTE(review): already declared above; kept to preserve behaviour
  component
  exon
  intron
  domain
  family

  # Important words
  number(/^(?:\d+[.,]?\d+|\d)$/)
  greek { |word| !$greek[word.downcase].nil? }
  special { |word| word.is_special? }
  promoter
  similar(/^(homolog.*|like|related|associated)$/)
  ase(/ase$/)
  in_end(/in$/)
end
|
40
|
-
|
41
|
-
# Scoring rules and token transformations used when two tokenised names are
# compared.
# NOTE(review): same/miss/extr/diff presumably score shared, missing, extra
# and differing tokens of a type -- confirm against the DSL implementation.
comparisons do

  # Custom score for the two lists of :number tokens.
  compare.number do |left, right|
    if left.empty? && right.empty?
      0
    elsif left.sort.uniq == right.sort.uniq
      3
    elsif left.any? && left[0] == right[0]
      -3
    elsif left.empty? && right == ['1']
      -5
    else
      -10
    end
  end

  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # Receptor and protein tokens are reduced with the `plural` proc.
  transform.receptor(plural)
  transform.protein(plural)

  # The remaining transforms return [new_token, new_type] pairs.
  transform.roman { |token| [token.arabic, :number] }
  transform.greek_letter { |token| [$inverse_greek[token.downcase], :greek] }
  transform.ase { |token| [token, :special] }
  transform.in_end { |token| [token, :special] }
  # Unknown tokens shorter than four characters are treated as :special.
  transform.unknown { |token| [token, (token.length < 4 ? :special : :unknown)] }
end
|
86
|
-
|
@@ -1,23 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# Rebuilds and shows the normalization evaluation for one organism/dataset.
#   $1  organism
#   $2  dataset
#   $3  ner, passed to rake as ner=<value>
#   $4+ forwarded verbatim to rake
# rake's stdout goes to <organism>_<dataset>.log_<ner>; the tail of the .eval
# file is printed afterwards.
function norm(){
  organism=$1
  shift
  dataset=$1
  shift
  ner=$1
  shift

  # CMD is only used for display. Executing it via a bare $CMD does not work:
  # the expansion is run as one simple command, so ';' and '>' would be handed
  # to rm as literal arguments instead of being interpreted by the shell.
  CMD="rm results/${organism}_$dataset; rake results/${organism}_$dataset.eval ner=$ner $* > ${organism}_$dataset.log_$ner; tail results/${organism}_$dataset.eval"
  echo "$CMD"

  rm "results/${organism}_$dataset"
  rake "results/${organism}_$dataset.eval" "ner=$ner" "$@" > "${organism}_$dataset.log_$ner"
  tail "results/${organism}_$dataset.eval"
}
|
14
|
-
|
15
|
-
|
16
|
-
# Rebuilds and shows the bc2gn evaluation.
#   $1  ner, passed to rake as ner=<value>
#   $2+ forwarded verbatim to rake
# rake's stdout goes to bc2gn.log_<ner>; the tail of results/bc2gn.eval is
# printed afterwards.
function norm_2(){
  ner=$1
  shift

  # CMD is only used for display. Executing it via a bare $CMD does not work:
  # the expansion is run as one simple command, so ';' and '>' would be handed
  # to rm as literal arguments instead of being interpreted by the shell.
  CMD="rm results/bc2gn; rake results/bc2gn.eval ner=$ner $* > bc2gn.log_$ner; tail results/bc2gn.eval"
  echo "$CMD"

  rm results/bc2gn
  rake results/bc2gn.eval "ner=$ner" "$@" > "bc2gn.log_$ner"
  tail results/bc2gn.eval
}
|
@@ -1,55 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Arabidopsis thaliana, exposed as globals.
# NOTE(review): :native / :extra / :code / :go / :pmid look like column
# positions in the downloaded files -- confirm against the consuming tasks.
$name      = "Arabidopsis thaliana"
$native_id = "TAIR Locus"

# Entrez settings: taxonomy id, a proc that strips the "TAIR:" prefix from a
# code, and a check proc that accepts every code.
$entrez2native = {
  :tax   => 3702,
  :fix   => proc { |code| code.sub(/^TAIR:/, '') },
  :check => proc { |code| true },
}

$lexicon = {
  :file => {
    :url    => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20100413",
    :native => 0,
    :extra  => [1, 2],
  },
}

$identifiers = {
  :file => {
    :url    => "ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2009-7-29.txt",
    :native => 4,
    :extra  => [0],
    :fields => ["Affymetrix"],
  },
  :biomart => {
    :database => 'athaliana_eg_gene',
    :main     => ['TAIR Locus', 'tair_locus'],
    :extra    => [
      ['Associated Gene Name', "external_gene_id"],
      ['Gramene Gene ID', "ensembl_gene_id"],
      ['RefSeq peptide', "refseq_peptide"],
      ['Unigene', "unigene"],
      ['Interpro ID', "interpro"],
    ],
    # The filter is named explicitly because it is not the one expected by
    # default. NOTE(review): the original comment mentioned "with_mgi_id",
    # which looks copied from another organism file.
    :filter   => ['with_tair_locus'],
  },
}

$go = {
  :url  => "ftp://ftp.arabidopsis.org/home/tair/Ontologies/Gene_Ontology/ATH_GO_GOSLIM.txt",
  :code => 0,
  :go   => 5,
  :pmid => 12,
}

# Literature search query for this organism.
$query = '("arabidopsis"[MeSH Terms] OR Arabidopsis[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]))'
|
54
|
-
|
55
|
-
|
@@ -1,84 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Candida albicans, exposed as globals.
# NOTE(review): :native / :extra / :code / :go / :pmid look like column
# positions in the downloaded files -- confirm against the consuming tasks.
$name      = "Candida albicans"
$native_id = "Systematic Name"

# Entrez settings: codes have a leading "CaO" rewritten to "orf", and only
# codes starting with "orf" pass the check.
$entrez2native = {
  :tax    => 237561,
  :fix    => proc { |code| code.sub(/^CaO/, 'orf') },
  :check  => proc { |code| code.match(/^orf/) },
  :native => 3
}

$lexicon = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8, 1, 2],
    # NOTE(review): a line starting with "!" can never start with "orf", so
    # the second test is always true when the first matches -- confirm intent.
    :exclude => proc { |line| line.match(/^!/) && !line.match(/^orf/) }
  },
}

$identifiers = {
  :file => {
    :url     => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
    :native  => 0,
    :extra   => [8, 1, 2],
    :exclude => proc { |line| line.match(/^!/) },
    # NOTE(review): "GCD ID" may be a typo for "CGD ID"; left unchanged
    # because it is data read at runtime.
    :fields  => ["GCD ID", "Gene Name", "Gene Alias"]
  },
}

$go = {
  :url  => "http://www.candidagenome.org/go/gene_association.cgd.gz",
  :code => 10,
  :go   => 4,
  :pmid => 5,
  # Keeps only the first "|"-separated value of the tab field at index 10.
  :fix  => proc { |line|
    fields = line.split(/\t/)
    fields[10] = (fields[10] || "").split('|').first
    fields.join("\t")
  }
}

# Literature search query for this organism.
$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
43
|
-
|
44
|
-
####
|
45
|
-
|
46
|
-
#Rake::Task['identifiers'].clear
|
47
|
-
#file 'identifiers' => ['lexicon'] do |t|
|
48
|
-
# identifiers = {}
|
49
|
-
# if $identifiers[:file]
|
50
|
-
# identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
|
51
|
-
# end
|
52
|
-
#
|
53
|
-
# orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
|
54
|
-
#
|
55
|
-
# translations = {}
|
56
|
-
#
|
57
|
-
# Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
|
58
|
-
# orfs.each{|orf|
|
59
|
-
# translations[orf] ||= []
|
60
|
-
# translations[orf] << entrez
|
61
|
-
# }
|
62
|
-
# }
|
63
|
-
#
|
64
|
-
# orf2native.each{|orf, native|
|
65
|
-
# next unless identifiers[native]
|
66
|
-
# identifiers[native] << [orf]
|
67
|
-
# if translations[orf]
|
68
|
-
# identifiers[native] << translations[orf]
|
69
|
-
# else
|
70
|
-
# identifiers[native] << []
|
71
|
-
# end
|
72
|
-
#
|
73
|
-
# }
|
74
|
-
#
|
75
|
-
# header = "#" + [$native_id, 'Gene Name', 'Orf', "Entrez Gene ID"].uniq.join("\t") + "\n"
|
76
|
-
# Open.write('identifiers',
|
77
|
-
# header +
|
78
|
-
# identifiers.collect{|code, name_lists|
|
79
|
-
# "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
|
80
|
-
# }.join("\n")
|
81
|
-
# )
|
82
|
-
#end
|
83
|
-
#
|
84
|
-
#
|
@@ -1,109 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Caenorhabditis elegans, exposed as globals.
# NOTE(review): :native / :extra / :code / :go / :pmid look like column
# positions in the downloaded files -- confirm against the consuming tasks.
$name      = "Caenorhabditis elegans"
$native_id = "WormBase ID"

# Entrez settings: the "WormBase:" prefix is stripped from codes, and only
# codes starting with "WBGene" pass the check.
$entrez2native = {
  :tax   => 6239,
  :fix   => proc { |code| code.sub(/^WormBase:/, '') },
  :check => proc { |code| code.match(/^WBGene/) },
}

$lexicon = {
  :file => {
    :url    => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
    :native => 0,
    :extra  => [1, 2],

    # Alternative source, currently disabled:
    # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
    # :native => 0,
    # :extra => [2,3,4,5],
  },
}

$identifiers = {
  :file => {
    :url    => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
    :native => 0,
    :extra  => [1, 2],

    # Alternative source, currently disabled:
    # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
    # :native => 0,
    # :extra => [2,3,4,5],
  },

  :biomart => {
    :database => 'celegans_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    # NOTE(review): several labels below end with a space; kept byte-for-byte
    # because they are runtime data, but this looks unintentional -- confirm.
    :extra    => [
      ['WormBase gene', "wormbase_gene"],
      ['Associated Gene Name ', "external_gene_id"],
      ['WormPep id', "wormpep_id"],
      ['Ensembl Gene ID', "ensembl_gene_id"],
      ['Ensembl Protein ID', "ensembl_peptide_id"],
      ['Protein ID ', "protein_id"],
      ['RefSeq Protein ID ', "refseq_peptide"],
      ['Unigene ID ', "unigene"],
      ['UniProt/SwissProt ID', "uniprot_swissprot"],
      ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
      ['EMBL (Genbank) ID', "embl"],
    ],
    :filter   => [],
  }
}

# NOTE(review): :go and :pmid are both 3 here -- confirm this is intended.
$go = {
  :url  => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
  :code => 0,
  :go   => 3,
  :pmid => 3,
}

# Literature search query for this organism.
$query = '"caenorhabditis elegans"[MeSH Terms] OR Caenorhabditis elegans[Text Word]'
|
72
|
-
##########################
|
73
|
-
|
74
|
-
|
75
|
-
# Wraps Open.read so that the WormBase downloads used by this file come back
# as tab-separated text.
module Open
  class << self
    alias_method :old_read, :read

    # Reads +url+ with the original Open.read and post-processes the text
    # depending on the URL:
    # * URL matching /GO/: reshaped by go_annotations_to_tsv;
    # * URL matching /gene_ids/: every comma becomes a tab;
    # * anything else: returned untouched.
    #
    # url::     location handed to the original Open.read
    # options:: options handed through unchanged
    def read(url, options = {})
      content = old_read(url, options)

      if url =~ /GO/
        go_annotations_to_tsv(content)
      elsif url =~ /gene_ids/
        content.gsub(/,/, "\t")
      else
        content
      end
    end

    private

    # Turns the GO annotation listing into rows of "id<TAB>name<TAB>extra<TAB>GO:n",
    # one row per GO term. Lines of the form "...:...(GO:n)" are first reduced
    # to the bare "GO:n" and appended, "|"-separated, to the preceding line.
    def go_annotations_to_tsv(content)
      merged = content.gsub(/.*:.*\((GO:\d+)\)/, '\1').gsub(/\nGO/, "|GO")

      # String stopped being Enumerable in Ruby 1.9, so String#collect is not
      # available there; each_line iterates the same lines.
      merged.each_line.collect { |line|
        names, gos = line.sub(/\|/, "\t").chomp.split(/\t/)

        id, name, extra = names.split(/ /)
        extra = extra.gsub(/[()]/, '') if extra

        if gos
          gos.split(/\|/).collect { |go| [id, name, extra, go].join("\t") }.join("\n")
        else
          [id, name, extra].join("\t") + "\n"
        end
      }.join("\n")
    end
  end
end
|
109
|
-
|
@@ -1,140 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
# Organism settings for Homo sapiens, exposed as globals.
# NOTE(review): :code / :go / :pmid look like column positions in the
# downloaded file -- confirm against the consuming tasks.
$name      = "Homo sapiens"
$native_id = "Entrez Gene ID"

# Entrez settings: no fix proc, and a check proc that rejects every code.
$entrez2native = {
  :tax   => 9606,
  :fix   => nil,
  :check => proc { |code| false },
}

$lexicon = {
  :biomart => {
    :database => 'hsapiens_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    :extra    => [
      ['Associated Gene Name', "external_gene_id"],
      ['HGNC symbol', "hgnc_symbol"],
      ['HGNC automatic gene name', "hgnc_automatic_gene_name"],
      ['HGNC curated gene name ', "hgnc_curated_gene_name"],
    ],
  }
}

$identifiers = {
  :biomart => {
    :database => 'hsapiens_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    :extra    => [
      ['Ensembl Gene ID', "ensembl_gene_id"],
      ['Ensembl Protein ID', "ensembl_peptide_id"],
      ['Associated Gene Name', "external_gene_id"],
      ['CCDS ID', "ccds"],
      ['Protein ID', "protein_id"],
      ['RefSeq Protein ID', "refseq_peptide"],
      ['Unigene ID', "unigene"],
      ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
      ['HGNC ID', "hgnc_id", 'HGNC'],
      ['EMBL (Genbank) ID', "embl"],

      # Microarray platform identifiers (Affymetrix and others)
      ['AFFY HC G110', 'affy_hc_g110'],
      ['AFFY HG FOCUS', 'affy_hg_focus'],
      ['AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2'],
      ['AFFY HG U133A_2', 'affy_hg_u133a_2'],
      ['AFFY HG U133A', 'affy_hg_u133a'],
      ['AFFY HG U133B', 'affy_hg_u133b'],
      ['AFFY HG U95AV2', 'affy_hg_u95av2'],
      ['AFFY HG U95B', 'affy_hg_u95b'],
      ['AFFY HG U95C', 'affy_hg_u95c'],
      ['AFFY HG U95D', 'affy_hg_u95d'],
      ['AFFY HG U95E', 'affy_hg_u95e'],
      ['AFFY HG U95A', 'affy_hg_u95a'],
      ['AFFY HUGENEFL', 'affy_hugenefl'],
      ['AFFY HuEx', 'affy_huex_1_0_st_v2'],
      ['AFFY HuGene', 'affy_hugene_1_0_st_v1'],
      ['AFFY U133 X3P', 'affy_u133_x3p'],
      ['Agilent WholeGenome', "agilent_wholegenome"],
      ['Agilent CGH 44b', 'agilent_cgh_44b'],
      ['Codelink ID', 'codelink'],
      ['Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2'],
      ['Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3'],
    ],
    :filter   => [],
  }
}

$go = {
  :url  => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
  :code => 2,
  :go   => 4,
  :pmid => 5,
}

# Literature search query for this organism.
$query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
|
80
|
-
##########################
|
81
|
-
|
82
|
-
require 'rbbt/util/index'
|
83
|
-
|
84
|
-
# Replaces the previously defined 'gene.go' task. For every code in the GO
# association file ($go[:url]) it looks the code up in the 'Hsa' identifier
# index and writes "<gene>\t<GO:n>|<GO:n>..." lines to 'gene.go'. Codes that
# are not in the index, or that have no GO term, are skipped.
Rake::Task['gene.go'].clear
file 'gene.go' => ['identifiers'] do
  # File.exist? instead of File.exists?: the latter was deprecated and has
  # been removed from current Ruby versions.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
    data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])

    rows = data.collect { |code, value_lists|
      terms = value_lists.flatten.
        select { |ref| ref =~ /GO:\d+/ }.
        collect { |ref| ref[/GO:\d+/] }
      [index[code], terms]
    }.select { |gene, terms| gene && terms.any? }

    Open.write('gene.go',
      rows.collect { |gene, terms|
        "#{gene}\t#{terms.uniq.join("|")}"
      }.join("\n")
    )
  end
end
|
103
|
-
|
104
|
-
# Replaces the previously defined 'gene_go.pmid' task. For every code in the
# GO association file ($go[:url]) it looks the code up in the index built from
# the 'identifiers' file and writes "<gene>\t<pmid>|<pmid>..." lines to
# 'gene_go.pmid'. Codes that are not in the index, or that have no PMID
# reference, are skipped.
Rake::Task['gene_go.pmid'].clear
file 'gene_go.pmid' => ['identifiers'] do
  # File.exist? instead of File.exists?: the latter was deprecated and has
  # been removed from current Ruby versions.
  if File.exist? 'identifiers'
    index = Index.index('identifiers')
    data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])

    rows = data.collect { |code, value_lists|
      pmids = value_lists.flatten.
        select { |ref| ref =~ /PMID:\d+/ }.
        collect { |ref| ref[/PMID:(\d+)/, 1] }
      [index[code], pmids]
    }.select { |gene, pmids| gene && pmids.any? }

    Open.write('gene_go.pmid',
      rows.collect { |gene, pmids|
        "#{gene}\t#{pmids.uniq.join("|")}"
      }.join("\n")
    )
  end
end
|
122
|
-
|
123
|
-
|
124
|
-
# Replaces the previously defined 'lexicon' task. Downloads the HGNC name
# table, translates each HGNC ID through the 'Hsa' identifier index and
# writes "<translated id>\t<name>\t<name>..." lines to 'lexicon'. HGNC IDs
# without a translation are skipped.
Rake::Task['lexicon'].clear
file 'lexicon' => ['identifiers'] do
  # File.exist? instead of File.exists?: the latter was deprecated and has
  # been removed from current Ruby versions.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
    # The header row (starting with "HGNC ID") is excluded.
    names = Open.to_hash(HGNC_URL, :exclude => proc { |l| l.match(/^HGNC ID/) }, :flatten => true)
    translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])

    Open.write('lexicon',
      # The block parameter is not called `names`, so it cannot shadow (or,
      # on Ruby 1.8, overwrite) the hash being iterated.
      names.collect { |code, synonyms|
        next unless translations[code]
        ([translations[code]] + synonyms).join("\t")
      }.compact.join("\n")
    )
  end
end
|