rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,77 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Mus musculus"
4
-
5
-
6
- $native_id = "MGI DB ID"
7
-
8
- $entrez2native = {
9
- :tax => 10090,
10
- :fix => nil,
11
- :check => proc{|code| code.match(/^MGI/)},
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
17
- :native => 0,
18
- :extra => [2,3],
19
- :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
20
- },
21
- }
22
-
23
- $identifiers = {
24
- :file => {
25
- :url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
26
- :native => 0,
27
- :extra => [],
28
- :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
29
- },
30
- :biomart => {
31
- :database => 'mmusculus_gene_ensembl',
32
- :main => ['MGI DB ID', 'mgi_id'] ,
33
- :extra => [
34
- ['Associated Gene Name' , "external_gene_id"],
35
- ['Protein ID' , "protein_id"] ,
36
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
37
- ['Unigene ID' , "unigene"] ,
38
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
39
- ['RefSeq Protein ID' , "refseq_peptide"] ,
40
- ['EMBL (Genbank) ID' , "embl"] ,
41
-
42
- ['Affy mg u74a',"affy_mg_u74a" ],
43
- ['Affy mg u74av2',"affy_mg_u74av2" ],
44
- ['Affy mg u74b',"affy_mg_u74b" ],
45
- ['Affy mg u74bv2',"affy_mg_u74bv2" ],
46
- ['Affy mg u74c',"affy_mg_u74c" ],
47
- ['Affy mg u74cv2',"affy_mg_u74cv2" ],
48
- ['Affy moe430a',"affy_moe430a" ],
49
- ['Affy moe430b',"affy_moe430b" ],
50
- ['AFFY MoEx',"affy_moex_1_0_st_v1" ],
51
- ['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
52
- ['Affy mouse430 2',"affy_mouse430_2" ],
53
- ['Affy mouse430a 2',"affy_mouse430a_2" ],
54
- ['Affy mu11ksuba',"affy_mu11ksuba" ],
55
- ['Affy mu11ksubb',"affy_mu11ksubb" ],
56
- ['Agilent WholeGenome',"agilent_wholegenome" ],
57
- ['Codelink ID',"codelink" ],
58
- ['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
59
- ['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
60
-
61
- ],
62
- :filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
63
- }
64
- }
65
-
66
- $go = {
67
- :url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
68
- :code => 1,
69
- :go => 4,
70
- :pmid => 5,
71
- }
72
-
73
- $query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
74
- ##########################
75
-
76
-
77
-
@@ -1,43 +0,0 @@
1
- $org = [$org, ENV['organism'],nil].reject{|e| e.nil? }.first
2
-
3
- task 'names' do
4
- orgs = Dir.glob('*').
5
- select{|t|
6
- File.directory?(t ) &&
7
- File.exist?(t + '/Rakefile')
8
- }
9
-
10
- orgs.each{|org|
11
- pid = Process.fork{
12
- Dir.chdir(org)
13
- load 'Rakefile'
14
- Rake::Task['name'].invoke
15
- }
16
- Process.waitpid pid
17
- }
18
-
19
- end
20
-
21
- task 'default' do
22
- if $org
23
- orgs = [$org]
24
- else
25
-
26
- orgs = Dir.glob('*').
27
- select{|t|
28
- File.directory?(t ) &&
29
- File.exist?(t + '/Rakefile')
30
- }
31
- end
32
-
33
- orgs.each{|org|
34
- puts "Updating #{ org }"
35
- pid = Process.fork{
36
- Dir.chdir(org)
37
- load 'Rakefile'
38
- Rake::Task['update'].invoke
39
- }
40
- Process.waitpid pid
41
- }
42
- end
43
-
@@ -1,88 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Rattus norvegicus"
4
-
5
-
6
- $native_id = "RGD DB ID"
7
-
8
- $entrez2native = {
9
- :tax => 10116,
10
- :check => proc{|code| code.match(/^RGD/)},
11
- }
12
-
13
- $lexicon = {
14
- :file => {
15
- :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
16
- :native => 1,
17
- :extra => [2,9],
18
- :exclude => proc{|l| !l.match(/^RGD/)}
19
- },
20
- }
21
-
22
- $identifiers = {
23
- :file => {
24
- :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
25
- :native => 1,
26
- :extra => [],
27
- :exclude => proc{|l| !l.match(/^RGD/)}
28
- },
29
- :biomart => {
30
- :database => 'rnorvegicus_gene_ensembl',
31
- :main => ['Entrez Gene ID' , "entrezgene"],
32
- :extra => [
33
- ['Associated Gene Name' , "external_gene_id"],
34
- ['Protein ID' , "protein_id"] ,
35
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
36
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
37
- ['RefSeq Protein ID' , "refseq_peptide"] ,
38
- ['EMBL (Genbank) ID' , "embl"] ,
39
-
40
- ['Affy rae230a', "affy_rae230a"],
41
- ['Affy rae230b', "affy_rae230b"],
42
- ['Affy RaGene', "affy_ragene_1_0_st_v1"],
43
- ['Affy rat230 2', "affy_rat230_2"],
44
- ['Affy RaEx', "affy_raex_1_0_st_v1"],
45
- ['Affy rg u34a', "affy_rg_u34a"],
46
- ['Affy rg u34b', "affy_rg_u34b"],
47
- ['Affy rg u34c', "affy_rg_u34c"],
48
- ['Affy rn u34', "affy_rn_u34"],
49
- ['Affy rt u34', "affy_rt_u34"],
50
- ['Agilent WholeGenome',"agilent_wholegenome" ],
51
- ['Codelink ID ', "codelink"],
52
-
53
-
54
- ],
55
- :filter => [],
56
- }
57
- }
58
-
59
- $go = {
60
- :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
61
- :exclude => proc{|l| !l.match(/^RGD/)},
62
- :code => 1,
63
- :go => 4,
64
- :pmid => 5,
65
- }
66
-
67
- $query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
68
-
69
- #{{{ Redefines
70
-
71
- module Open
72
-
73
- class << self
74
- alias_method :old_read, :read
75
-
76
- def read(url, options = {})
77
- data = old_read(url, options)
78
-
79
- if url =~ /gene_association.rgd.gz/
80
- return data.collect{|l| l.gsub(/^RGD\t/,"RGD\tRGD:")}.join("\n")
81
- else
82
- return data
83
- end
84
-
85
- end
86
- end
87
- end
88
-
@@ -1,66 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Saccharomyces cerevisiae"
4
-
5
-
6
- $native_id = "SGD DB Id"
7
-
8
- $entrez2native = {
9
- :tax => 559292,
10
- :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
11
- :check => proc{|code| code.match(/^S0/)},
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
17
- :native => 0,
18
- :extra => [4,3,5]
19
- },
20
- :biomart => {
21
- :database => 'scerevisiae_gene_ensembl',
22
- :main => ['Entrez Gene ID', 'entrezgene'],
23
- :extra => [
24
- ['Interpro Description' , "interpro_description"],
25
- ],
26
- :filter => [],
27
- }
28
-
29
- }
30
-
31
- $identifiers = {
32
- :file => {
33
- :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
34
- :native => 0,
35
- :extra => [],
36
- },
37
- :biomart => {
38
- :database => 'scerevisiae_gene_ensembl',
39
- :main => ['Entrez Gene ID', 'entrezgene'],
40
- :extra => [
41
- ['Associated Gene Name' , "external_gene_id"],
42
- ['Ensembl Gene ID', "ensembl_gene_id" ],
43
- ['Ensembl Protein ID', "ensembl_peptide_id" ],
44
- ['RefSeq Protein ID' , "refseq_peptide"] ,
45
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
46
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
47
- ['Protein ID' , "protein_id"] ,
48
- ['EMBL (Genbank) ID' , "embl"] ,
49
- # Affymetrix
50
- ['Affy yeast 2',"affy_yeast_2"],
51
- ['Affy yg s98', "affy_yg_s98"],
52
- ],
53
- :filter => [],
54
- }
55
- }
56
-
57
- $go = {
58
- :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
59
- :code => 1,
60
- :go => 4,
61
- :pmid => 5,
62
- }
63
-
64
- $query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
65
-
66
-
@@ -1,40 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Schizosaccharomyces pombe"
4
-
5
-
6
- $native_id = "GeneDB Id"
7
-
8
- $entrez2native = {
9
- :tax => 4896,
10
- :fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
11
- :check => proc{|code| code.match(/^SP/)},
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
17
- :native => 0,
18
- :extra => [1,2,3,4,5,6,7,8]
19
- },
20
- }
21
-
22
- $identifiers = {
23
- :file => {
24
- :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
25
- :native => 0,
26
- :extra => [],
27
- },
28
- }
29
-
30
- $go = {
31
- :url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
32
- :code => 1,
33
- :go => 4,
34
- :pmid => 5,
35
- }
36
-
37
- $query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
38
- ####
39
-
40
-
@@ -1,252 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/arrayHash'
4
- require 'rbbt/sources/biomart'
5
- require 'rbbt/sources/entrez'
6
- require 'rbbt/sources/pubmed'
7
-
8
-
9
-
10
- file 'name' do
11
- Open.write('name', $name)
12
- end
13
-
14
- file 'all.pmid' do
15
- Open.write('all.pmid', PubMed.query($query).join("\n"))
16
- end
17
-
18
- file 'lexicon' do
19
- begin
20
-
21
- data = nil
22
- # Read from file
23
- if $lexicon[:file]
24
- file = Open.to_hash($lexicon[:file][:url], $lexicon[:file])
25
- data = ArrayHash.new(file, $native_id)
26
- end
27
-
28
- # Translate from entrez to native if needed
29
- if $entrez2native
30
- translations = {}
31
- Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
32
- each{|k,v|
33
- translations[k] = [v.join("|")]
34
- }
35
- translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
36
- if data
37
- data.merge(translations_data)
38
- else
39
- data = translations_data
40
- end
41
-
42
- end
43
-
44
-
45
- # Read from Biomart and merge with previous data
46
- if $lexicon[:biomart]
47
- biomart = {}
48
-
49
- BioMart.query(
50
- $lexicon[:biomart][:database],
51
- $lexicon[:biomart][:main][1],
52
- $lexicon[:biomart][:extra].collect{|v| v[1]},
53
- $lexicon[:biomart][:filter]
54
- ).each{|key, values_list|
55
- values = values_list.values_at(*$lexicon[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
56
- biomart[key] = values
57
- }
58
-
59
- biomart_data = ArrayHash.new(biomart, $lexicon[:biomart][:main][0], $lexicon[:biomart][:extra].collect{|v| v[0]})
60
-
61
- if data
62
- if $lexicon[:biomart][:extra].collect{|v| v[1]}.include?( $native_id )|| $lexicon[:biomart][:main][0] == $native_id
63
- field = $native_id
64
- else
65
- field = 'Entrez Gene ID'
66
- end
67
- data.merge(biomart_data, field)
68
- else
69
- data = biomart_data
70
- end
71
- end
72
-
73
- if $entrez2native
74
- gene_alias = {}
75
- Entrez.entrez2native($entrez2native[:tax],4).
76
- each{|k,v|
77
- gene_alias[k] = [v.select{|e| e.to_s != ""}.join("|")]
78
- }
79
- if gene_alias.keys.any?
80
- gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
81
- data.merge(gene_alias_data, 'Entrez Gene ID')
82
- end
83
- end
84
-
85
- data.remove('Entrez Gene ID')
86
- data.clean
87
- Open.write('lexicon', data.data.collect{|code, name_lists|
88
- "#{ code }\t" + name_lists.flatten.select{|n| n.to_s != ""}.uniq.join("\t")
89
- }.join("\n"))
90
-
91
- rescue Entrez::NoFileError
92
- puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
93
- end
94
- end
95
-
96
-
97
- file 'identifiers' do
98
-
99
- begin
100
- data = nil
101
- if $identifiers[:file]
102
- file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
103
- data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
104
- end
105
-
106
- # Translate from entrez to native if needed
107
- if $entrez2native
108
- translations = {}
109
- Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
110
- each{|k,v| translations[k] = [v.join("|")] }
111
-
112
- if translations.keys.any?
113
- translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
114
- if data
115
- data.merge(translations_data, $native_id)
116
- else
117
- data = translations_data
118
- end
119
- else
120
- puts "No translations from Entrez to #{ $native_id }"
121
- end
122
- end
123
-
124
-
125
- # Read from Biomart and merge with previous data
126
- if $identifiers[:biomart]
127
- biomart = {}
128
-
129
- BioMart.query(
130
- $identifiers[:biomart][:database],
131
- $identifiers[:biomart][:main][1],
132
- $identifiers[:biomart][:extra].collect{|v| v[1]},
133
- $identifiers[:biomart][:filter]
134
- ).each{|key, values_list|
135
- values = values_list.values_at(*$identifiers[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
136
- biomart[key] = values
137
- }
138
-
139
- biomart_data = ArrayHash.new(biomart, $identifiers[:biomart][:main][0], $identifiers[:biomart][:extra].collect{|v| v[0]})
140
- $identifiers[:biomart][:extra].each{|values|
141
- if values[2]
142
- biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
143
- end
144
- }
145
-
146
-
147
- if data
148
- if $identifiers[:biomart][:extra].collect{|v| v[1]}.include?( $native_id ) || $identifiers[:biomart][:main][0] == $native_id
149
- field = $native_id
150
- else
151
- field = 'Entrez Gene ID'
152
- end
153
- data.merge(biomart_data, field)
154
- else
155
- data = biomart_data
156
- end
157
- end
158
-
159
-
160
- # Add the alias at the end
161
- if $entrez2native
162
- gene_alias = {}
163
- Entrez.entrez2native($entrez2native[:tax],4).
164
- each{|k,v|
165
- gene_alias[k] = [v.join("|")]
166
- }
167
- if gene_alias.keys.any?
168
- gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
169
- if data
170
- data.merge(gene_alias_data, 'Entrez Gene ID')
171
- else
172
- data = gene_alias_data
173
- end
174
- end
175
- end
176
-
177
- # Write ids to file
178
- fout = File.open('identifiers', 'w')
179
- fout.puts "##{$native_id}\t" + data.fields.join("\t")
180
- data.clean
181
- data.data.each{|code, values|
182
- fout.puts code + "\t" + values.join("\t")
183
- }
184
- fout.close
185
-
186
- rescue Entrez::NoFileError
187
- puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
188
- end
189
- end
190
-
191
-
192
- file 'gene.go' do
193
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
194
-
195
- Open.write('gene.go', data.collect { |gene, values|
196
- goterms = values.select{|v| v =~ /GO:/}.collect{|v| v.match(/(GO:\d+)/)[1]}
197
- goterms.empty? ? nil : "%s\t%s" % [gene, values.uniq.join("|")]
198
- }.compact.join("\n"))
199
-
200
- end
201
-
202
-
203
- file 'gene_go.pmid' do
204
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
205
-
206
- data = data.collect{|code, value_lists|
207
- [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
208
- }.select{|p| p[1].any?}
209
-
210
- Open.write('gene_go.pmid',
211
- data.collect{|p|
212
- next if p[1].empty?
213
- "#{p[0]}\t#{p[1].uniq.join("|")}"
214
- }.compact.join("\n")
215
- )
216
- end
217
-
218
-
219
- file 'gene.pmid' do
220
- begin
221
- translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"
222
-
223
- data = Entrez.entrez2pubmed($entrez2native[:tax])
224
-
225
- Open.write('gene.pmid',
226
- data.collect{|code,pmids|
227
- next if translations && ! translations[code]
228
- code = translations[code].first if translations
229
- "#{code}\t#{pmids.uniq.join("|")}"
230
- }.compact.join("\n")
231
- )
232
- rescue Entrez::NoFileError
233
- puts "Gene article associations from entrez not produced, install the gene2pumbed file (rbbt_config install entrez)."
234
- end
235
-
236
- end
237
-
238
-
239
-
240
-
241
- task 'all' => ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']
242
- task 'clean' do
243
- `rm -f 'name' 'lexicon' 'identifiers' 'gene_go.pmid' 'gene.pmid' 'gene.go' 'all.pmid'`
244
- end
245
-
246
- task 'update' do
247
- Rake::Task['clean'].invoke if $force
248
- Rake::Task['all'].invoke
249
- end
250
-
251
- task 'default' => 'all'
252
-