rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,77 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Mus musculus"
4
-
5
-
6
- $native_id = "MGI DB ID"
7
-
8
- $entrez2native = {
9
- :tax => 10090,
10
- :fix => nil,
11
- :check => proc{|code| code.match(/^MGI/)},
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
17
- :native => 0,
18
- :extra => [2,3],
19
- :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
20
- },
21
- }
22
-
23
- $identifiers = {
24
- :file => {
25
- :url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
26
- :native => 0,
27
- :extra => [],
28
- :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
29
- },
30
- :biomart => {
31
- :database => 'mmusculus_gene_ensembl',
32
- :main => ['MGI DB ID', 'mgi_id'] ,
33
- :extra => [
34
- ['Associated Gene Name' , "external_gene_id"],
35
- ['Protein ID' , "protein_id"] ,
36
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
37
- ['Unigene ID' , "unigene"] ,
38
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
39
- ['RefSeq Protein ID' , "refseq_peptide"] ,
40
- ['EMBL (Genbank) ID' , "embl"] ,
41
-
42
- ['Affy mg u74a',"affy_mg_u74a" ],
43
- ['Affy mg u74av2',"affy_mg_u74av2" ],
44
- ['Affy mg u74b',"affy_mg_u74b" ],
45
- ['Affy mg u74bv2',"affy_mg_u74bv2" ],
46
- ['Affy mg u74c',"affy_mg_u74c" ],
47
- ['Affy mg u74cv2',"affy_mg_u74cv2" ],
48
- ['Affy moe430a',"affy_moe430a" ],
49
- ['Affy moe430b',"affy_moe430b" ],
50
- ['AFFY MoEx',"affy_moex_1_0_st_v1" ],
51
- ['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
52
- ['Affy mouse430 2',"affy_mouse430_2" ],
53
- ['Affy mouse430a 2',"affy_mouse430a_2" ],
54
- ['Affy mu11ksuba',"affy_mu11ksuba" ],
55
- ['Affy mu11ksubb',"affy_mu11ksubb" ],
56
- ['Agilent WholeGenome',"agilent_wholegenome" ],
57
- ['Codelink ID',"codelink" ],
58
- ['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
59
- ['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
60
-
61
- ],
62
- :filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
63
- }
64
- }
65
-
66
- $go = {
67
- :url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
68
- :code => 1,
69
- :go => 4,
70
- :pmid => 5,
71
- }
72
-
73
- $query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
74
- ##########################
75
-
76
-
77
-
@@ -1,43 +0,0 @@
1
- $org = [$org, ENV['organism'],nil].reject{|e| e.nil? }.first
2
-
3
- task 'names' do
4
- orgs = Dir.glob('*').
5
- select{|t|
6
- File.directory?(t ) &&
7
- File.exist?(t + '/Rakefile')
8
- }
9
-
10
- orgs.each{|org|
11
- pid = Process.fork{
12
- Dir.chdir(org)
13
- load 'Rakefile'
14
- Rake::Task['name'].invoke
15
- }
16
- Process.waitpid pid
17
- }
18
-
19
- end
20
-
21
- task 'default' do
22
- if $org
23
- orgs = [$org]
24
- else
25
-
26
- orgs = Dir.glob('*').
27
- select{|t|
28
- File.directory?(t ) &&
29
- File.exist?(t + '/Rakefile')
30
- }
31
- end
32
-
33
- orgs.each{|org|
34
- puts "Updating #{ org }"
35
- pid = Process.fork{
36
- Dir.chdir(org)
37
- load 'Rakefile'
38
- Rake::Task['update'].invoke
39
- }
40
- Process.waitpid pid
41
- }
42
- end
43
-
@@ -1,88 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Rattus norvegicus"
4
-
5
-
6
- $native_id = "RGD DB ID"
7
-
8
- $entrez2native = {
9
- :tax => 10116,
10
- :check => proc{|code| code.match(/^RGD/)},
11
- }
12
-
13
- $lexicon = {
14
- :file => {
15
- :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
16
- :native => 1,
17
- :extra => [2,9],
18
- :exclude => proc{|l| !l.match(/^RGD/)}
19
- },
20
- }
21
-
22
- $identifiers = {
23
- :file => {
24
- :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
25
- :native => 1,
26
- :extra => [],
27
- :exclude => proc{|l| !l.match(/^RGD/)}
28
- },
29
- :biomart => {
30
- :database => 'rnorvegicus_gene_ensembl',
31
- :main => ['Entrez Gene ID' , "entrezgene"],
32
- :extra => [
33
- ['Associated Gene Name' , "external_gene_id"],
34
- ['Protein ID' , "protein_id"] ,
35
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
36
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
37
- ['RefSeq Protein ID' , "refseq_peptide"] ,
38
- ['EMBL (Genbank) ID' , "embl"] ,
39
-
40
- ['Affy rae230a', "affy_rae230a"],
41
- ['Affy rae230b', "affy_rae230b"],
42
- ['Affy RaGene', "affy_ragene_1_0_st_v1"],
43
- ['Affy rat230 2', "affy_rat230_2"],
44
- ['Affy RaEx', "affy_raex_1_0_st_v1"],
45
- ['Affy rg u34a', "affy_rg_u34a"],
46
- ['Affy rg u34b', "affy_rg_u34b"],
47
- ['Affy rg u34c', "affy_rg_u34c"],
48
- ['Affy rn u34', "affy_rn_u34"],
49
- ['Affy rt u34', "affy_rt_u34"],
50
- ['Agilent WholeGenome',"agilent_wholegenome" ],
51
- ['Codelink ID ', "codelink"],
52
-
53
-
54
- ],
55
- :filter => [],
56
- }
57
- }
58
-
59
- $go = {
60
- :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
61
- :exclude => proc{|l| !l.match(/^RGD/)},
62
- :code => 1,
63
- :go => 4,
64
- :pmid => 5,
65
- }
66
-
67
- $query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
68
-
69
- #{{{ Redefines
70
-
71
- module Open
72
-
73
- class << self
74
- alias_method :old_read, :read
75
-
76
- def read(url, options = {})
77
- data = old_read(url, options)
78
-
79
- if url =~ /gene_association.rgd.gz/
80
- return data.collect{|l| l.gsub(/^RGD\t/,"RGD\tRGD:")}.join("\n")
81
- else
82
- return data
83
- end
84
-
85
- end
86
- end
87
- end
88
-
@@ -1,66 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Saccharomyces cerevisiae"
4
-
5
-
6
- $native_id = "SGD DB Id"
7
-
8
- $entrez2native = {
9
- :tax => 559292,
10
- :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
11
- :check => proc{|code| code.match(/^S0/)},
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
17
- :native => 0,
18
- :extra => [4,3,5]
19
- },
20
- :biomart => {
21
- :database => 'scerevisiae_gene_ensembl',
22
- :main => ['Entrez Gene ID', 'entrezgene'],
23
- :extra => [
24
- ['Interpro Description' , "interpro_description"],
25
- ],
26
- :filter => [],
27
- }
28
-
29
- }
30
-
31
- $identifiers = {
32
- :file => {
33
- :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
34
- :native => 0,
35
- :extra => [],
36
- },
37
- :biomart => {
38
- :database => 'scerevisiae_gene_ensembl',
39
- :main => ['Entrez Gene ID', 'entrezgene'],
40
- :extra => [
41
- ['Associated Gene Name' , "external_gene_id"],
42
- ['Ensembl Gene ID', "ensembl_gene_id" ],
43
- ['Ensembl Protein ID', "ensembl_peptide_id" ],
44
- ['RefSeq Protein ID' , "refseq_peptide"] ,
45
- ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
46
- ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
47
- ['Protein ID' , "protein_id"] ,
48
- ['EMBL (Genbank) ID' , "embl"] ,
49
- # Affymetrix
50
- ['Affy yeast 2',"affy_yeast_2"],
51
- ['Affy yg s98', "affy_yg_s98"],
52
- ],
53
- :filter => [],
54
- }
55
- }
56
-
57
- $go = {
58
- :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
59
- :code => 1,
60
- :go => 4,
61
- :pmid => 5,
62
- }
63
-
64
- $query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
65
-
66
-
@@ -1,40 +0,0 @@
1
- require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
-
3
- $name = "Schizosaccharomyces pombe"
4
-
5
-
6
- $native_id = "GeneDB Id"
7
-
8
- $entrez2native = {
9
- :tax => 4896,
10
- :fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
11
- :check => proc{|code| code.match(/^SP/)},
12
- }
13
-
14
- $lexicon = {
15
- :file => {
16
- :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
17
- :native => 0,
18
- :extra => [1,2,3,4,5,6,7,8]
19
- },
20
- }
21
-
22
- $identifiers = {
23
- :file => {
24
- :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
25
- :native => 0,
26
- :extra => [],
27
- },
28
- }
29
-
30
- $go = {
31
- :url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
32
- :code => 1,
33
- :go => 4,
34
- :pmid => 5,
35
- }
36
-
37
- $query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
38
- ####
39
-
40
-
@@ -1,252 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/arrayHash'
4
- require 'rbbt/sources/biomart'
5
- require 'rbbt/sources/entrez'
6
- require 'rbbt/sources/pubmed'
7
-
8
-
9
-
10
- file 'name' do
11
- Open.write('name', $name)
12
- end
13
-
14
- file 'all.pmid' do
15
- Open.write('all.pmid', PubMed.query($query).join("\n"))
16
- end
17
-
18
- file 'lexicon' do
19
- begin
20
-
21
- data = nil
22
- # Read from file
23
- if $lexicon[:file]
24
- file = Open.to_hash($lexicon[:file][:url], $lexicon[:file])
25
- data = ArrayHash.new(file, $native_id)
26
- end
27
-
28
- # Translate from entrez to native if needed
29
- if $entrez2native
30
- translations = {}
31
- Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
32
- each{|k,v|
33
- translations[k] = [v.join("|")]
34
- }
35
- translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
36
- if data
37
- data.merge(translations_data)
38
- else
39
- data = translations_data
40
- end
41
-
42
- end
43
-
44
-
45
- # Read from Biomart and merge with previous data
46
- if $lexicon[:biomart]
47
- biomart = {}
48
-
49
- BioMart.query(
50
- $lexicon[:biomart][:database],
51
- $lexicon[:biomart][:main][1],
52
- $lexicon[:biomart][:extra].collect{|v| v[1]},
53
- $lexicon[:biomart][:filter]
54
- ).each{|key, values_list|
55
- values = values_list.values_at(*$lexicon[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
56
- biomart[key] = values
57
- }
58
-
59
- biomart_data = ArrayHash.new(biomart, $lexicon[:biomart][:main][0], $lexicon[:biomart][:extra].collect{|v| v[0]})
60
-
61
- if data
62
- if $lexicon[:biomart][:extra].collect{|v| v[1]}.include?( $native_id )|| $lexicon[:biomart][:main][0] == $native_id
63
- field = $native_id
64
- else
65
- field = 'Entrez Gene ID'
66
- end
67
- data.merge(biomart_data, field)
68
- else
69
- data = biomart_data
70
- end
71
- end
72
-
73
- if $entrez2native
74
- gene_alias = {}
75
- Entrez.entrez2native($entrez2native[:tax],4).
76
- each{|k,v|
77
- gene_alias[k] = [v.select{|e| e.to_s != ""}.join("|")]
78
- }
79
- if gene_alias.keys.any?
80
- gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
81
- data.merge(gene_alias_data, 'Entrez Gene ID')
82
- end
83
- end
84
-
85
- data.remove('Entrez Gene ID')
86
- data.clean
87
- Open.write('lexicon', data.data.collect{|code, name_lists|
88
- "#{ code }\t" + name_lists.flatten.select{|n| n.to_s != ""}.uniq.join("\t")
89
- }.join("\n"))
90
-
91
- rescue Entrez::NoFileError
92
- puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
93
- end
94
- end
95
-
96
-
97
- file 'identifiers' do
98
-
99
- begin
100
- data = nil
101
- if $identifiers[:file]
102
- file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
103
- data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
104
- end
105
-
106
- # Translate from entrez to native if needed
107
- if $entrez2native
108
- translations = {}
109
- Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
110
- each{|k,v| translations[k] = [v.join("|")] }
111
-
112
- if translations.keys.any?
113
- translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
114
- if data
115
- data.merge(translations_data, $native_id)
116
- else
117
- data = translations_data
118
- end
119
- else
120
- puts "No translations from Entrez to #{ $native_id }"
121
- end
122
- end
123
-
124
-
125
- # Read from Biomart and merge with previous data
126
- if $identifiers[:biomart]
127
- biomart = {}
128
-
129
- BioMart.query(
130
- $identifiers[:biomart][:database],
131
- $identifiers[:biomart][:main][1],
132
- $identifiers[:biomart][:extra].collect{|v| v[1]},
133
- $identifiers[:biomart][:filter]
134
- ).each{|key, values_list|
135
- values = values_list.values_at(*$identifiers[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
136
- biomart[key] = values
137
- }
138
-
139
- biomart_data = ArrayHash.new(biomart, $identifiers[:biomart][:main][0], $identifiers[:biomart][:extra].collect{|v| v[0]})
140
- $identifiers[:biomart][:extra].each{|values|
141
- if values[2]
142
- biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
143
- end
144
- }
145
-
146
-
147
- if data
148
- if $identifiers[:biomart][:extra].collect{|v| v[1]}.include?( $native_id ) || $identifiers[:biomart][:main][0] == $native_id
149
- field = $native_id
150
- else
151
- field = 'Entrez Gene ID'
152
- end
153
- data.merge(biomart_data, field)
154
- else
155
- data = biomart_data
156
- end
157
- end
158
-
159
-
160
- # Add the alias at the end
161
- if $entrez2native
162
- gene_alias = {}
163
- Entrez.entrez2native($entrez2native[:tax],4).
164
- each{|k,v|
165
- gene_alias[k] = [v.join("|")]
166
- }
167
- if gene_alias.keys.any?
168
- gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
169
- if data
170
- data.merge(gene_alias_data, 'Entrez Gene ID')
171
- else
172
- data = gene_alias_data
173
- end
174
- end
175
- end
176
-
177
- # Write ids to file
178
- fout = File.open('identifiers', 'w')
179
- fout.puts "##{$native_id}\t" + data.fields.join("\t")
180
- data.clean
181
- data.data.each{|code, values|
182
- fout.puts code + "\t" + values.join("\t")
183
- }
184
- fout.close
185
-
186
- rescue Entrez::NoFileError
187
- puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
188
- end
189
- end
190
-
191
-
192
- file 'gene.go' do
193
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
194
-
195
- Open.write('gene.go', data.collect { |gene, values|
196
- goterms = values.select{|v| v =~ /GO:/}.collect{|v| v.match(/(GO:\d+)/)[1]}
197
- goterms.empty? ? nil : "%s\t%s" % [gene, values.uniq.join("|")]
198
- }.compact.join("\n"))
199
-
200
- end
201
-
202
-
203
- file 'gene_go.pmid' do
204
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
205
-
206
- data = data.collect{|code, value_lists|
207
- [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
208
- }.select{|p| p[1].any?}
209
-
210
- Open.write('gene_go.pmid',
211
- data.collect{|p|
212
- next if p[1].empty?
213
- "#{p[0]}\t#{p[1].uniq.join("|")}"
214
- }.compact.join("\n")
215
- )
216
- end
217
-
218
-
219
- file 'gene.pmid' do
220
- begin
221
- translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"
222
-
223
- data = Entrez.entrez2pubmed($entrez2native[:tax])
224
-
225
- Open.write('gene.pmid',
226
- data.collect{|code,pmids|
227
- next if translations && ! translations[code]
228
- code = translations[code].first if translations
229
- "#{code}\t#{pmids.uniq.join("|")}"
230
- }.compact.join("\n")
231
- )
232
- rescue Entrez::NoFileError
233
- puts "Gene article associations from entrez not produced, install the gene2pumbed file (rbbt_config install entrez)."
234
- end
235
-
236
- end
237
-
238
-
239
-
240
-
241
- task 'all' => ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']
242
- task 'clean' do
243
- `rm -f 'name' 'lexicon' 'identifiers' 'gene_go.pmid' 'gene.pmid' 'gene.go' 'all.pmid'`
244
- end
245
-
246
- task 'update' do
247
- Rake::Task['clean'].invoke if $force
248
- Rake::Task['all'].invoke
249
- end
250
-
251
- task 'default' => 'all'
252
-