rbbt 1.2.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +69 -214
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -245
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -140
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -86
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Ath.Rakefile +0 -55
- data/install_scripts/organisms/Cal.Rakefile +0 -84
- data/install_scripts/organisms/Cel.Rakefile +0 -109
- data/install_scripts/organisms/Hsa.Rakefile +0 -140
- data/install_scripts/organisms/Mmu.Rakefile +0 -77
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/Rno.Rakefile +0 -88
- data/install_scripts/organisms/Sce.Rakefile +0 -66
- data/install_scripts/organisms/Spo.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -252
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -83
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -85
- data/lib/rbbt/sources/gscholar.rb +0 -74
- data/lib/rbbt/sources/organism.rb +0 -241
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -248
- data/lib/rbbt/util/arrayHash.rb +0 -266
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -251
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -35
- data/tasks/install.rake +0 -124
- data/test/rbbt/bow/test_bow.rb +0 -33
- data/test/rbbt/bow/test_classifier.rb +0 -72
- data/test/rbbt/bow/test_dictionary.rb +0 -91
- data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
- data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
- data/test/rbbt/ner/test_abner.rb +0 -17
- data/test/rbbt/ner/test_banner.rb +0 -17
- data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
- data/test/rbbt/ner/test_regexpNER.rb +0 -33
- data/test/rbbt/ner/test_rner.rb +0 -126
- data/test/rbbt/ner/test_rnorm.rb +0 -47
- data/test/rbbt/sources/test_biocreative.rb +0 -38
- data/test/rbbt/sources/test_biomart.rb +0 -31
- data/test/rbbt/sources/test_entrez.rb +0 -49
- data/test/rbbt/sources/test_go.rb +0 -24
- data/test/rbbt/sources/test_organism.rb +0 -59
- data/test/rbbt/sources/test_polysearch.rb +0 -27
- data/test/rbbt/sources/test_pubmed.rb +0 -39
- data/test/rbbt/util/test_arrayHash.rb +0 -257
- data/test/rbbt/util/test_filecache.rb +0 -37
- data/test/rbbt/util/test_index.rb +0 -31
- data/test/rbbt/util/test_misc.rb +0 -20
- data/test/rbbt/util/test_open.rb +0 -110
- data/test/rbbt/util/test_simpleDSL.rb +0 -57
- data/test/rbbt/util/test_tmpfile.rb +0 -21
- data/test/test_helper.rb +0 -4
- data/test/test_rbbt.rb +0 -11
@@ -1,77 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
$name = "Mus musculus"
|
4
|
-
|
5
|
-
|
6
|
-
$native_id = "MGI DB ID"
|
7
|
-
|
8
|
-
$entrez2native = {
|
9
|
-
:tax => 10090,
|
10
|
-
:fix => nil,
|
11
|
-
:check => proc{|code| code.match(/^MGI/)},
|
12
|
-
}
|
13
|
-
|
14
|
-
$lexicon = {
|
15
|
-
:file => {
|
16
|
-
:url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
|
17
|
-
:native => 0,
|
18
|
-
:extra => [2,3],
|
19
|
-
:exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
|
20
|
-
},
|
21
|
-
}
|
22
|
-
|
23
|
-
$identifiers = {
|
24
|
-
:file => {
|
25
|
-
:url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
|
26
|
-
:native => 0,
|
27
|
-
:extra => [],
|
28
|
-
:exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
|
29
|
-
},
|
30
|
-
:biomart => {
|
31
|
-
:database => 'mmusculus_gene_ensembl',
|
32
|
-
:main => ['MGI DB ID', 'mgi_id'] ,
|
33
|
-
:extra => [
|
34
|
-
['Associated Gene Name' , "external_gene_id"],
|
35
|
-
['Protein ID' , "protein_id"] ,
|
36
|
-
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
37
|
-
['Unigene ID' , "unigene"] ,
|
38
|
-
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
39
|
-
['RefSeq Protein ID' , "refseq_peptide"] ,
|
40
|
-
['EMBL (Genbank) ID' , "embl"] ,
|
41
|
-
|
42
|
-
['Affy mg u74a',"affy_mg_u74a" ],
|
43
|
-
['Affy mg u74av2',"affy_mg_u74av2" ],
|
44
|
-
['Affy mg u74b',"affy_mg_u74b" ],
|
45
|
-
['Affy mg u74bv2',"affy_mg_u74bv2" ],
|
46
|
-
['Affy mg u74c',"affy_mg_u74c" ],
|
47
|
-
['Affy mg u74cv2',"affy_mg_u74cv2" ],
|
48
|
-
['Affy moe430a',"affy_moe430a" ],
|
49
|
-
['Affy moe430b',"affy_moe430b" ],
|
50
|
-
['AFFY MoEx',"affy_moex_1_0_st_v1" ],
|
51
|
-
['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
|
52
|
-
['Affy mouse430 2',"affy_mouse430_2" ],
|
53
|
-
['Affy mouse430a 2',"affy_mouse430a_2" ],
|
54
|
-
['Affy mu11ksuba',"affy_mu11ksuba" ],
|
55
|
-
['Affy mu11ksubb',"affy_mu11ksubb" ],
|
56
|
-
['Agilent WholeGenome',"agilent_wholegenome" ],
|
57
|
-
['Codelink ID',"codelink" ],
|
58
|
-
['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
|
59
|
-
['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
|
60
|
-
|
61
|
-
],
|
62
|
-
:filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
|
63
|
-
}
|
64
|
-
}
|
65
|
-
|
66
|
-
$go = {
|
67
|
-
:url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
|
68
|
-
:code => 1,
|
69
|
-
:go => 4,
|
70
|
-
:pmid => 5,
|
71
|
-
}
|
72
|
-
|
73
|
-
$query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
|
74
|
-
##########################
|
75
|
-
|
76
|
-
|
77
|
-
|
@@ -1,43 +0,0 @@
|
|
1
|
-
$org = [$org, ENV['organism'],nil].reject{|e| e.nil? }.first
|
2
|
-
|
3
|
-
task 'names' do
|
4
|
-
orgs = Dir.glob('*').
|
5
|
-
select{|t|
|
6
|
-
File.directory?(t ) &&
|
7
|
-
File.exist?(t + '/Rakefile')
|
8
|
-
}
|
9
|
-
|
10
|
-
orgs.each{|org|
|
11
|
-
pid = Process.fork{
|
12
|
-
Dir.chdir(org)
|
13
|
-
load 'Rakefile'
|
14
|
-
Rake::Task['name'].invoke
|
15
|
-
}
|
16
|
-
Process.waitpid pid
|
17
|
-
}
|
18
|
-
|
19
|
-
end
|
20
|
-
|
21
|
-
task 'default' do
|
22
|
-
if $org
|
23
|
-
orgs = [$org]
|
24
|
-
else
|
25
|
-
|
26
|
-
orgs = Dir.glob('*').
|
27
|
-
select{|t|
|
28
|
-
File.directory?(t ) &&
|
29
|
-
File.exist?(t + '/Rakefile')
|
30
|
-
}
|
31
|
-
end
|
32
|
-
|
33
|
-
orgs.each{|org|
|
34
|
-
puts "Updating #{ org }"
|
35
|
-
pid = Process.fork{
|
36
|
-
Dir.chdir(org)
|
37
|
-
load 'Rakefile'
|
38
|
-
Rake::Task['update'].invoke
|
39
|
-
}
|
40
|
-
Process.waitpid pid
|
41
|
-
}
|
42
|
-
end
|
43
|
-
|
@@ -1,88 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
$name = "Rattus norvegicus"
|
4
|
-
|
5
|
-
|
6
|
-
$native_id = "RGD DB ID"
|
7
|
-
|
8
|
-
$entrez2native = {
|
9
|
-
:tax => 10116,
|
10
|
-
:check => proc{|code| code.match(/^RGD/)},
|
11
|
-
}
|
12
|
-
|
13
|
-
$lexicon = {
|
14
|
-
:file => {
|
15
|
-
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
16
|
-
:native => 1,
|
17
|
-
:extra => [2,9],
|
18
|
-
:exclude => proc{|l| !l.match(/^RGD/)}
|
19
|
-
},
|
20
|
-
}
|
21
|
-
|
22
|
-
$identifiers = {
|
23
|
-
:file => {
|
24
|
-
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
25
|
-
:native => 1,
|
26
|
-
:extra => [],
|
27
|
-
:exclude => proc{|l| !l.match(/^RGD/)}
|
28
|
-
},
|
29
|
-
:biomart => {
|
30
|
-
:database => 'rnorvegicus_gene_ensembl',
|
31
|
-
:main => ['Entrez Gene ID' , "entrezgene"],
|
32
|
-
:extra => [
|
33
|
-
['Associated Gene Name' , "external_gene_id"],
|
34
|
-
['Protein ID' , "protein_id"] ,
|
35
|
-
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
36
|
-
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
37
|
-
['RefSeq Protein ID' , "refseq_peptide"] ,
|
38
|
-
['EMBL (Genbank) ID' , "embl"] ,
|
39
|
-
|
40
|
-
['Affy rae230a', "affy_rae230a"],
|
41
|
-
['Affy rae230b', "affy_rae230b"],
|
42
|
-
['Affy RaGene', "affy_ragene_1_0_st_v1"],
|
43
|
-
['Affy rat230 2', "affy_rat230_2"],
|
44
|
-
['Affy RaEx', "affy_raex_1_0_st_v1"],
|
45
|
-
['Affy rg u34a', "affy_rg_u34a"],
|
46
|
-
['Affy rg u34b', "affy_rg_u34b"],
|
47
|
-
['Affy rg u34c', "affy_rg_u34c"],
|
48
|
-
['Affy rn u34', "affy_rn_u34"],
|
49
|
-
['Affy rt u34', "affy_rt_u34"],
|
50
|
-
['Agilent WholeGenome',"agilent_wholegenome" ],
|
51
|
-
['Codelink ID ', "codelink"],
|
52
|
-
|
53
|
-
|
54
|
-
],
|
55
|
-
:filter => [],
|
56
|
-
}
|
57
|
-
}
|
58
|
-
|
59
|
-
$go = {
|
60
|
-
:url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
|
61
|
-
:exclude => proc{|l| !l.match(/^RGD/)},
|
62
|
-
:code => 1,
|
63
|
-
:go => 4,
|
64
|
-
:pmid => 5,
|
65
|
-
}
|
66
|
-
|
67
|
-
$query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
68
|
-
|
69
|
-
#{{{ Redefines
|
70
|
-
|
71
|
-
module Open
|
72
|
-
|
73
|
-
class << self
|
74
|
-
alias_method :old_read, :read
|
75
|
-
|
76
|
-
def read(url, options = {})
|
77
|
-
data = old_read(url, options)
|
78
|
-
|
79
|
-
if url =~ /gene_association.rgd.gz/
|
80
|
-
return data.collect{|l| l.gsub(/^RGD\t/,"RGD\tRGD:")}.join("\n")
|
81
|
-
else
|
82
|
-
return data
|
83
|
-
end
|
84
|
-
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
@@ -1,66 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
$name = "Saccharomyces cerevisiae"
|
4
|
-
|
5
|
-
|
6
|
-
$native_id = "SGD DB Id"
|
7
|
-
|
8
|
-
$entrez2native = {
|
9
|
-
:tax => 559292,
|
10
|
-
:fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
11
|
-
:check => proc{|code| code.match(/^S0/)},
|
12
|
-
}
|
13
|
-
|
14
|
-
$lexicon = {
|
15
|
-
:file => {
|
16
|
-
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
17
|
-
:native => 0,
|
18
|
-
:extra => [4,3,5]
|
19
|
-
},
|
20
|
-
:biomart => {
|
21
|
-
:database => 'scerevisiae_gene_ensembl',
|
22
|
-
:main => ['Entrez Gene ID', 'entrezgene'],
|
23
|
-
:extra => [
|
24
|
-
['Interpro Description' , "interpro_description"],
|
25
|
-
],
|
26
|
-
:filter => [],
|
27
|
-
}
|
28
|
-
|
29
|
-
}
|
30
|
-
|
31
|
-
$identifiers = {
|
32
|
-
:file => {
|
33
|
-
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
34
|
-
:native => 0,
|
35
|
-
:extra => [],
|
36
|
-
},
|
37
|
-
:biomart => {
|
38
|
-
:database => 'scerevisiae_gene_ensembl',
|
39
|
-
:main => ['Entrez Gene ID', 'entrezgene'],
|
40
|
-
:extra => [
|
41
|
-
['Associated Gene Name' , "external_gene_id"],
|
42
|
-
['Ensembl Gene ID', "ensembl_gene_id" ],
|
43
|
-
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
44
|
-
['RefSeq Protein ID' , "refseq_peptide"] ,
|
45
|
-
['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
46
|
-
['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
47
|
-
['Protein ID' , "protein_id"] ,
|
48
|
-
['EMBL (Genbank) ID' , "embl"] ,
|
49
|
-
# Affymetrix
|
50
|
-
['Affy yeast 2',"affy_yeast_2"],
|
51
|
-
['Affy yg s98', "affy_yg_s98"],
|
52
|
-
],
|
53
|
-
:filter => [],
|
54
|
-
}
|
55
|
-
}
|
56
|
-
|
57
|
-
$go = {
|
58
|
-
:url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
59
|
-
:code => 1,
|
60
|
-
:go => 4,
|
61
|
-
:pmid => 5,
|
62
|
-
}
|
63
|
-
|
64
|
-
$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
65
|
-
|
66
|
-
|
@@ -1,40 +0,0 @@
|
|
1
|
-
require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
2
|
-
|
3
|
-
$name = "Schizosaccharomyces pombe"
|
4
|
-
|
5
|
-
|
6
|
-
$native_id = "GeneDB Id"
|
7
|
-
|
8
|
-
$entrez2native = {
|
9
|
-
:tax => 4896,
|
10
|
-
:fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
|
11
|
-
:check => proc{|code| code.match(/^SP/)},
|
12
|
-
}
|
13
|
-
|
14
|
-
$lexicon = {
|
15
|
-
:file => {
|
16
|
-
:url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
|
17
|
-
:native => 0,
|
18
|
-
:extra => [1,2,3,4,5,6,7,8]
|
19
|
-
},
|
20
|
-
}
|
21
|
-
|
22
|
-
$identifiers = {
|
23
|
-
:file => {
|
24
|
-
:url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
|
25
|
-
:native => 0,
|
26
|
-
:extra => [],
|
27
|
-
},
|
28
|
-
}
|
29
|
-
|
30
|
-
$go = {
|
31
|
-
:url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
|
32
|
-
:code => 1,
|
33
|
-
:go => 4,
|
34
|
-
:pmid => 5,
|
35
|
-
}
|
36
|
-
|
37
|
-
$query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
|
38
|
-
####
|
39
|
-
|
40
|
-
|
@@ -1,252 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
require 'rbbt/util/arrayHash'
|
4
|
-
require 'rbbt/sources/biomart'
|
5
|
-
require 'rbbt/sources/entrez'
|
6
|
-
require 'rbbt/sources/pubmed'
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
file 'name' do
|
11
|
-
Open.write('name', $name)
|
12
|
-
end
|
13
|
-
|
14
|
-
file 'all.pmid' do
|
15
|
-
Open.write('all.pmid', PubMed.query($query).join("\n"))
|
16
|
-
end
|
17
|
-
|
18
|
-
file 'lexicon' do
|
19
|
-
begin
|
20
|
-
|
21
|
-
data = nil
|
22
|
-
# Read from file
|
23
|
-
if $lexicon[:file]
|
24
|
-
file = Open.to_hash($lexicon[:file][:url], $lexicon[:file])
|
25
|
-
data = ArrayHash.new(file, $native_id)
|
26
|
-
end
|
27
|
-
|
28
|
-
# Translate from entrez to native if needed
|
29
|
-
if $entrez2native
|
30
|
-
translations = {}
|
31
|
-
Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
|
32
|
-
each{|k,v|
|
33
|
-
translations[k] = [v.join("|")]
|
34
|
-
}
|
35
|
-
translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
|
36
|
-
if data
|
37
|
-
data.merge(translations_data)
|
38
|
-
else
|
39
|
-
data = translations_data
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
|
45
|
-
# Read from Biomart and merge with previous data
|
46
|
-
if $lexicon[:biomart]
|
47
|
-
biomart = {}
|
48
|
-
|
49
|
-
BioMart.query(
|
50
|
-
$lexicon[:biomart][:database],
|
51
|
-
$lexicon[:biomart][:main][1],
|
52
|
-
$lexicon[:biomart][:extra].collect{|v| v[1]},
|
53
|
-
$lexicon[:biomart][:filter]
|
54
|
-
).each{|key, values_list|
|
55
|
-
values = values_list.values_at(*$lexicon[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
|
56
|
-
biomart[key] = values
|
57
|
-
}
|
58
|
-
|
59
|
-
biomart_data = ArrayHash.new(biomart, $lexicon[:biomart][:main][0], $lexicon[:biomart][:extra].collect{|v| v[0]})
|
60
|
-
|
61
|
-
if data
|
62
|
-
if $lexicon[:biomart][:extra].collect{|v| v[1]}.include?( $native_id )|| $lexicon[:biomart][:main][0] == $native_id
|
63
|
-
field = $native_id
|
64
|
-
else
|
65
|
-
field = 'Entrez Gene ID'
|
66
|
-
end
|
67
|
-
data.merge(biomart_data, field)
|
68
|
-
else
|
69
|
-
data = biomart_data
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
if $entrez2native
|
74
|
-
gene_alias = {}
|
75
|
-
Entrez.entrez2native($entrez2native[:tax],4).
|
76
|
-
each{|k,v|
|
77
|
-
gene_alias[k] = [v.select{|e| e.to_s != ""}.join("|")]
|
78
|
-
}
|
79
|
-
if gene_alias.keys.any?
|
80
|
-
gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
|
81
|
-
data.merge(gene_alias_data, 'Entrez Gene ID')
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
data.remove('Entrez Gene ID')
|
86
|
-
data.clean
|
87
|
-
Open.write('lexicon', data.data.collect{|code, name_lists|
|
88
|
-
"#{ code }\t" + name_lists.flatten.select{|n| n.to_s != ""}.uniq.join("\t")
|
89
|
-
}.join("\n"))
|
90
|
-
|
91
|
-
rescue Entrez::NoFileError
|
92
|
-
puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
file 'identifiers' do
|
98
|
-
|
99
|
-
begin
|
100
|
-
data = nil
|
101
|
-
if $identifiers[:file]
|
102
|
-
file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
|
103
|
-
data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
|
104
|
-
end
|
105
|
-
|
106
|
-
# Translate from entrez to native if needed
|
107
|
-
if $entrez2native
|
108
|
-
translations = {}
|
109
|
-
Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
|
110
|
-
each{|k,v| translations[k] = [v.join("|")] }
|
111
|
-
|
112
|
-
if translations.keys.any?
|
113
|
-
translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
|
114
|
-
if data
|
115
|
-
data.merge(translations_data, $native_id)
|
116
|
-
else
|
117
|
-
data = translations_data
|
118
|
-
end
|
119
|
-
else
|
120
|
-
puts "No translations from Entrez to #{ $native_id }"
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
|
125
|
-
# Read from Biomart and merge with previous data
|
126
|
-
if $identifiers[:biomart]
|
127
|
-
biomart = {}
|
128
|
-
|
129
|
-
BioMart.query(
|
130
|
-
$identifiers[:biomart][:database],
|
131
|
-
$identifiers[:biomart][:main][1],
|
132
|
-
$identifiers[:biomart][:extra].collect{|v| v[1]},
|
133
|
-
$identifiers[:biomart][:filter]
|
134
|
-
).each{|key, values_list|
|
135
|
-
values = values_list.values_at(*$identifiers[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
|
136
|
-
biomart[key] = values
|
137
|
-
}
|
138
|
-
|
139
|
-
biomart_data = ArrayHash.new(biomart, $identifiers[:biomart][:main][0], $identifiers[:biomart][:extra].collect{|v| v[0]})
|
140
|
-
$identifiers[:biomart][:extra].each{|values|
|
141
|
-
if values[2]
|
142
|
-
biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
|
143
|
-
end
|
144
|
-
}
|
145
|
-
|
146
|
-
|
147
|
-
if data
|
148
|
-
if $identifiers[:biomart][:extra].collect{|v| v[1]}.include?( $native_id ) || $identifiers[:biomart][:main][0] == $native_id
|
149
|
-
field = $native_id
|
150
|
-
else
|
151
|
-
field = 'Entrez Gene ID'
|
152
|
-
end
|
153
|
-
data.merge(biomart_data, field)
|
154
|
-
else
|
155
|
-
data = biomart_data
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
|
160
|
-
# Add the alias at the end
|
161
|
-
if $entrez2native
|
162
|
-
gene_alias = {}
|
163
|
-
Entrez.entrez2native($entrez2native[:tax],4).
|
164
|
-
each{|k,v|
|
165
|
-
gene_alias[k] = [v.join("|")]
|
166
|
-
}
|
167
|
-
if gene_alias.keys.any?
|
168
|
-
gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
|
169
|
-
if data
|
170
|
-
data.merge(gene_alias_data, 'Entrez Gene ID')
|
171
|
-
else
|
172
|
-
data = gene_alias_data
|
173
|
-
end
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
# Write ids to file
|
178
|
-
fout = File.open('identifiers', 'w')
|
179
|
-
fout.puts "##{$native_id}\t" + data.fields.join("\t")
|
180
|
-
data.clean
|
181
|
-
data.data.each{|code, values|
|
182
|
-
fout.puts code + "\t" + values.join("\t")
|
183
|
-
}
|
184
|
-
fout.close
|
185
|
-
|
186
|
-
rescue Entrez::NoFileError
|
187
|
-
puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
|
192
|
-
file 'gene.go' do
|
193
|
-
data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
|
194
|
-
|
195
|
-
Open.write('gene.go', data.collect { |gene, values|
|
196
|
-
goterms = values.select{|v| v =~ /GO:/}.collect{|v| v.match(/(GO:\d+)/)[1]}
|
197
|
-
goterms.empty? ? nil : "%s\t%s" % [gene, values.uniq.join("|")]
|
198
|
-
}.compact.join("\n"))
|
199
|
-
|
200
|
-
end
|
201
|
-
|
202
|
-
|
203
|
-
file 'gene_go.pmid' do
|
204
|
-
data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
|
205
|
-
|
206
|
-
data = data.collect{|code, value_lists|
|
207
|
-
[code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
|
208
|
-
}.select{|p| p[1].any?}
|
209
|
-
|
210
|
-
Open.write('gene_go.pmid',
|
211
|
-
data.collect{|p|
|
212
|
-
next if p[1].empty?
|
213
|
-
"#{p[0]}\t#{p[1].uniq.join("|")}"
|
214
|
-
}.compact.join("\n")
|
215
|
-
)
|
216
|
-
end
|
217
|
-
|
218
|
-
|
219
|
-
file 'gene.pmid' do
|
220
|
-
begin
|
221
|
-
translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"
|
222
|
-
|
223
|
-
data = Entrez.entrez2pubmed($entrez2native[:tax])
|
224
|
-
|
225
|
-
Open.write('gene.pmid',
|
226
|
-
data.collect{|code,pmids|
|
227
|
-
next if translations && ! translations[code]
|
228
|
-
code = translations[code].first if translations
|
229
|
-
"#{code}\t#{pmids.uniq.join("|")}"
|
230
|
-
}.compact.join("\n")
|
231
|
-
)
|
232
|
-
rescue Entrez::NoFileError
|
233
|
-
puts "Gene article associations from entrez not produced, install the gene2pumbed file (rbbt_config install entrez)."
|
234
|
-
end
|
235
|
-
|
236
|
-
end
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
task 'all' => ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']
|
242
|
-
task 'clean' do
|
243
|
-
`rm -f 'name' 'lexicon' 'identifiers' 'gene_go.pmid' 'gene.pmid' 'gene.go' 'all.pmid'`
|
244
|
-
end
|
245
|
-
|
246
|
-
task 'update' do
|
247
|
-
Rake::Task['clean'].invoke if $force
|
248
|
-
Rake::Task['all'].invoke
|
249
|
-
end
|
250
|
-
|
251
|
-
task 'default' => 'all'
|
252
|
-
|